diff --git a/docs/features/prompt-tools.md b/docs/features/prompt-tools.md new file mode 100644 index 00000000000..5b00bfa4956 --- /dev/null +++ b/docs/features/prompt-tools.md @@ -0,0 +1,50 @@ +# LLM Prompt Tools + +InvokeAI includes two built-in tools that use local language models to help you write better prompts. Both tools appear as small buttons in the top-right corner of the positive prompt area and are only visible when you have a compatible model installed. + +## Expand Prompt + +Takes your short prompt and expands it into a detailed, vivid description suitable for image generation. + +**How to use:** + +1. Type a brief prompt (e.g. "a cat in a garden") +2. Click the sparkle button in the prompt area +3. Select a Text LLM model from the dropdown +4. Click **Expand** +5. Your prompt is replaced with the expanded version + +**Compatible models:** Any HuggingFace model with a `ForCausalLM` architecture. Recommended options: + +| Model | Size | HuggingFace ID | +|-------|------|----------------| +| Qwen2.5 1.5B Instruct | ~3 GB | `Qwen/Qwen2.5-1.5B-Instruct` | +| Phi-3 Mini Instruct | ~7.5 GB | `microsoft/Phi-3-mini-4k-instruct` | +| TinyLlama Chat | ~2 GB | `TinyLlama/TinyLlama-1.1B-Chat-v1.0` | + +Install by pasting the HuggingFace ID into the Model Manager. The model is automatically detected as a **Text LLM** type. + +## Image to Prompt + +Upload an image and generate a descriptive prompt from it using a vision-language model. + +**How to use:** + +1. Click the image button in the prompt area +2. Select a LLaVA OneVision model from the dropdown +3. Click **Upload Image** and select an image +4. Click **Generate Prompt** +5. The generated description is set as your prompt + +**Compatible models:** LLaVA OneVision models (already supported by InvokeAI). + +## Undo + +Both tools overwrite your current prompt. 
You can undo this change: + +- Press **Ctrl+Z** (or **Cmd+Z** on macOS) in the prompt textarea within 30 seconds +- The undo state is cleared when you start typing manually + +## Workflow Node + +A **Text LLM** node is also available in the workflow editor for use in automated pipelines. It accepts a prompt string and model selection as inputs and outputs the expanded text as a string. diff --git a/invokeai/app/api/routers/utilities.py b/invokeai/app/api/routers/utilities.py index 921645b1d86..f77f77a8534 100644 --- a/invokeai/app/api/routers/utilities.py +++ b/invokeai/app/api/routers/utilities.py @@ -1,13 +1,32 @@ +import asyncio +import logging +import threading +from pathlib import Path from typing import Optional, Union +import torch from dynamicprompts.generators import CombinatorialPromptGenerator, RandomPromptGenerator -from fastapi import Body +from fastapi import Body, HTTPException from fastapi.routing import APIRouter -from pydantic import BaseModel +from pydantic import BaseModel, Field from pyparsing import ParseException +from transformers import AutoProcessor, AutoTokenizer, LlavaOnevisionForConditionalGeneration, LlavaOnevisionProcessor + +from invokeai.app.api.dependencies import ApiDependencies +from invokeai.app.services.image_files.image_files_common import ImageFileNotFoundException +from invokeai.app.services.model_records.model_records_base import UnknownModelException +from invokeai.backend.llava_onevision_pipeline import LlavaOnevisionPipeline +from invokeai.backend.model_manager.taxonomy import ModelType +from invokeai.backend.text_llm_pipeline import DEFAULT_SYSTEM_PROMPT, TextLLMPipeline +from invokeai.backend.util.devices import TorchDevice + +logger = logging.getLogger(__name__) utilities_router = APIRouter(prefix="/v1/utilities", tags=["utilities"]) +# The underlying model loader is not thread-safe, so we serialize load_model calls. 
_model_load_lock = threading.Lock()


# --- Expand Prompt ---


class ExpandPromptRequest(BaseModel):
    # Short prompt text that should be expanded.
    prompt: str
    # Key of an installed model; it must resolve to a TextLLM model config.
    model_key: str
    # Upper bound on the number of newly generated tokens.
    max_tokens: int = Field(default=300, ge=1, le=2048)
    # Optional override; DEFAULT_SYSTEM_PROMPT is used when omitted or empty.
    system_prompt: str | None = None


class ExpandPromptResponse(BaseModel):
    expanded_prompt: str
    # Not populated by the endpoint in this section; failures are reported as HTTP errors.
    error: str | None = None


def _resolve_model_path(model_config_path: str) -> Path:
    """Resolve a model config path to an absolute path.

    Absolute paths are only normalised; relative paths are interpreted relative to the
    configured models directory.
    """
    model_path = Path(model_config_path)
    if model_path.is_absolute():
        return model_path.resolve()
    base_models_path = ApiDependencies.invoker.services.configuration.models_path
    return (base_models_path / model_path).resolve()


def _run_expand_prompt(prompt: str, model_key: str, max_tokens: int, system_prompt: str | None) -> str:
    """Run text LLM inference synchronously (called from thread).

    Args:
        prompt: The user prompt to expand.
        model_key: Key of the model record to use.
        max_tokens: Maximum number of new tokens to generate.
        system_prompt: Optional system prompt; falls back to DEFAULT_SYSTEM_PROMPT when falsy.

    Returns:
        The text produced by the pipeline.

    Raises:
        ValueError: If the model record is not of type TextLLM.
        Whatever the model store raises for an unknown key (the endpoint maps
        UnknownModelException to a 404).
    """
    model_manager = ApiDependencies.invoker.services.model_manager
    model_config = model_manager.store.get_model(model_key)

    if model_config.type != ModelType.TextLLM:
        raise ValueError(f"Model '{model_key}' is not a TextLLM model (got {model_config.type})")

    # Read the tokenizer before touching the model: it is the cheap step, so a broken
    # model directory fails fast instead of after the model has been loaded and while
    # it is being held on the device.
    model_abs_path = _resolve_model_path(model_config.path)
    tokenizer = AutoTokenizer.from_pretrained(model_abs_path, local_files_only=True)

    # Only the load call is serialized; inference itself runs outside the lock.
    with _model_load_lock:
        loaded_model = model_manager.load.load_model(model_config)

    with torch.no_grad(), loaded_model.model_on_device() as (_, model):
        pipeline = TextLLMPipeline(model, tokenizer)
        # Run on whatever device the model's parameters were placed on.
        model_device = next(model.parameters()).device
        output = pipeline.run(
            prompt=prompt,
            system_prompt=system_prompt or DEFAULT_SYSTEM_PROMPT,
            max_new_tokens=max_tokens,
            device=model_device,
            dtype=TorchDevice.choose_torch_dtype(),
        )

    return output


@utilities_router.post(
    "/expand-prompt",
    operation_id="expand_prompt",
    responses={
        200: {"model": ExpandPromptResponse},
    },
)
async def expand_prompt(body: ExpandPromptRequest) -> ExpandPromptResponse:
    """Expand a brief prompt into a detailed image generation prompt using a text LLM."""
    # Inference is blocking, so it is pushed to a worker thread to keep the event loop free.
    # Error mapping: unknown model -> 404, ValueError -> 422, anything else -> 500.
    try:
        expanded = await asyncio.to_thread(
            _run_expand_prompt,
            body.prompt,
            body.model_key,
            body.max_tokens,
            body.system_prompt,
        )
    except UnknownModelException as e:
        raise HTTPException(status_code=404, detail=f"Model '{body.model_key}' not found") from e
    except ValueError as e:
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error(f"...") discarded.
        logger.exception("Error expanding prompt: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
    return ExpandPromptResponse(expanded_prompt=expanded)


# --- Image to Prompt ---


class ImageToPromptRequest(BaseModel):
    image_name: str
    model_key: str
    instruction: str = "Describe this image in detail for use as an AI image generation prompt."
class ImageToPromptResponse(BaseModel):
    prompt: str
    # Not populated by the endpoint in this section; failures are reported as HTTP errors.
    error: str | None = None


def _run_image_to_prompt(image_name: str, model_key: str, instruction: str) -> str:
    """Run LLaVA OneVision inference synchronously (called from thread).

    Args:
        image_name: Name of the image in InvokeAI's image store.
        model_key: Key of the model record to use.
        instruction: Text instruction passed to the pipeline alongside the image.

    Returns:
        The text produced by the pipeline.

    Raises:
        ValueError: If the model record is not of type LlavaOnevision.
        TypeError: If the loaded model or processor is not the expected LLaVA OneVision class.
        Whatever the model store / image service raise for unknown keys or names (the
        endpoint maps UnknownModelException and ImageFileNotFoundException to a 404).
    """
    model_manager = ApiDependencies.invoker.services.model_manager
    model_config = model_manager.store.get_model(model_key)

    if model_config.type != ModelType.LlavaOnevision:
        raise ValueError(f"Model '{model_key}' is not a LLaVA OneVision model (got {model_config.type})")

    # Do the cheap, failure-prone steps first so that a bad image name or a broken
    # processor directory is reported without first loading the model.
    # Load the image from InvokeAI's image store
    image = ApiDependencies.invoker.services.images.get_pil_image(image_name)
    image = image.convert("RGB")

    model_abs_path = _resolve_model_path(model_config.path)
    processor = AutoProcessor.from_pretrained(model_abs_path, local_files_only=True)
    if not isinstance(processor, LlavaOnevisionProcessor):
        raise TypeError(f"Expected LlavaOnevisionProcessor, got {type(processor).__name__}")

    # Only the load call is serialized; inference itself runs outside the lock.
    with _model_load_lock:
        loaded_model = model_manager.load.load_model(model_config)

    with torch.no_grad(), loaded_model.model_on_device() as (_, model):
        if not isinstance(model, LlavaOnevisionForConditionalGeneration):
            raise TypeError(f"Expected LlavaOnevisionForConditionalGeneration, got {type(model).__name__}")

        pipeline = LlavaOnevisionPipeline(model, processor)
        # Run on whatever device the model's parameters were placed on.
        model_device = next(model.parameters()).device
        output = pipeline.run(
            prompt=instruction,
            images=[image],
            device=model_device,
            dtype=TorchDevice.choose_torch_dtype(),
        )

    return output


@utilities_router.post(
    "/image-to-prompt",
    operation_id="image_to_prompt",
    responses={
        200: {"model": ImageToPromptResponse},
    },
)
async def image_to_prompt(body: ImageToPromptRequest) -> ImageToPromptResponse:
    """Generate a descriptive prompt from an image using a vision-language model."""
    # Inference is blocking, so it is pushed to a worker thread to keep the event loop free.
    # Error mapping: unknown model / image -> 404, ValueError or TypeError -> 422, else -> 500.
    try:
        prompt = await asyncio.to_thread(
            _run_image_to_prompt,
            body.image_name,
            body.model_key,
            body.instruction,
        )
    except UnknownModelException as e:
        raise HTTPException(status_code=404, detail=f"Model '{body.model_key}' not found") from e
    except ImageFileNotFoundException as e:
        raise HTTPException(status_code=404, detail=f"Image '{body.image_name}' not found") from e
    except (ValueError, TypeError) as e:
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error(f"...") discarded.
        logger.exception("Error generating prompt from image: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
    return ImageToPromptResponse(prompt=prompt)
@invocation(
    "text_llm",
    title="Text LLM",
    tags=["llm", "text", "prompt"],
    category="llm",
    version="1.0.0",
    classification=Classification.Beta,
)
class TextLLMInvocation(BaseInvocation):
    """Run a text language model to generate or expand text (e.g. for prompt expansion)."""

    # Text handed to the model as the user turn.
    prompt: str = InputField(
        default="",
        description="Input text prompt.",
        ui_component=UIComponent.Textarea,
    )
    # Instruction text; defaults to the prompt-expansion system prompt.
    system_prompt: str = InputField(
        default=DEFAULT_SYSTEM_PROMPT,
        description="System prompt that guides the model's behavior.",
        ui_component=UIComponent.Textarea,
    )
    # Which installed TextLLM model to run.
    text_llm_model: ModelIdentifierField = InputField(
        title="Text LLM Model",
        description=FieldDescriptions.text_llm_model,
        ui_model_type=ModelType.TextLLM,
    )
    # Generation budget, bounded to 1..2048 new tokens.
    max_tokens: int = InputField(
        default=300,
        ge=1,
        le=2048,
        description="Maximum number of tokens to generate.",
    )

    @torch.no_grad()
    def invoke(self, context: InvocationContext) -> StringOutput:
        """Load the selected model and its tokenizer, generate text, and return it as a string output."""
        config = context.models.get_config(self.text_llm_model)
        loaded = context.models.load(self.text_llm_model)

        with loaded.model_on_device() as (_, llm):
            # The tokenizer is read from the model's own directory on disk.
            tokenizer_dir = context.models.get_absolute_path(config)
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, local_files_only=True)

            runner = TextLLMPipeline(llm, tokenizer)
            # Run on whatever device the model's parameters currently live on.
            target_device = next(llm.parameters()).device
            generated = runner.run(
                prompt=self.prompt,
                system_prompt=self.system_prompt,
                max_new_tokens=self.max_tokens,
                device=target_device,
                dtype=TorchDevice.choose_torch_dtype(),
            )

        return StringOutput(value=generated)
class TextLLM_Diffusers_Config(Diffusers_Config_Base, Config_Base):
    """Model config for text-only causal language models (e.g. Llama, Phi, Qwen, Mistral)."""

    type: Literal[ModelType.TextLLM] = Field(default=ModelType.TextLLM)
    base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
    cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

    @classmethod
    def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
        """Build a config for the model at ``mod`` or raise if it does not look like a text LLM.

        The model is accepted when the class name read from its config ends in
        ``ForCausalLM`` and at least one known tokenizer file is present in the model
        directory; otherwise ``NotAMatchError`` is raised.
        """
        raise_if_not_dir(mod)

        raise_for_override_fields(cls, override_fields)

        # Any "*ForCausalLM" architecture qualifies (LlamaForCausalLM, PhiForCausalLM,
        # Phi3ForCausalLM, Qwen2ForCausalLM, MistralForCausalLM, GemmaForCausalLM,
        # GPTNeoXForCausalLM, etc.).
        architecture = get_class_name_from_config_dict_or_raise(common_config_paths(mod.path))
        is_causal_lm = architecture.endswith("ForCausalLM")
        if not is_causal_lm:
            raise NotAMatchError(f"model architecture '{architecture}' is not a causal language model")

        # Reject the model now if it ships no tokenizer, rather than failing at runtime.
        candidate_names = ("tokenizer.json", "tokenizer.model", "tokenizer_config.json")
        for candidate in candidate_names:
            if (mod.path / candidate).exists():
                break
        else:
            expected = ", ".join(sorted(candidate_names))
            raise NotAMatchError(
                f"no tokenizer files found in '{mod.path}' "
                f"(expected at least one of: {expected})"
            )

        return cls(**override_fields)
DEFAULT_SYSTEM_PROMPT = (
    "You are an expert prompt writer for AI image generation. "
    "Given a brief description, expand it into a detailed, vivid prompt suitable for generating high-quality images. "
    "Only output the expanded prompt, nothing else."
)


class TextLLMPipeline:
    """A wrapper for a causal language model + tokenizer for text generation."""

    def __init__(self, model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase):
        self._model = model
        self._tokenizer = tokenizer

    def _format_prompt(self, prompt: str, system_prompt: str) -> str:
        """Turn the user prompt and optional system prompt into the text fed to the tokenizer."""
        # Use getattr for both lookups: a tokenizer object that offers apply_chat_template
        # but has no chat_template attribute must fall through to the raw-prompt path
        # instead of raising AttributeError.
        apply_chat_template = getattr(self._tokenizer, "apply_chat_template", None)
        chat_template = getattr(self._tokenizer, "chat_template", None)

        if apply_chat_template is not None and chat_template is not None:
            messages = []
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            messages.append({"role": "user", "content": prompt})
            return apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # Fallback for models without chat template
        if system_prompt:
            return f"{system_prompt}\n\nUser: {prompt}\nAssistant:"
        return prompt

    def run(
        self,
        prompt: str,
        system_prompt: str = DEFAULT_SYSTEM_PROMPT,
        max_new_tokens: int = 300,
        device: torch.device = torch.device("cpu"),
        dtype: torch.dtype = torch.float16,
    ) -> str:
        """Generate a completion for ``prompt`` and return only the newly generated text.

        Args:
            prompt: The user text.
            system_prompt: Instruction text; skipped when empty.
            max_new_tokens: Maximum number of tokens to generate.
            device: Device the tokenized inputs are moved to before generation.
            dtype: Currently unused by this method; kept so existing callers that pass
                it keep working.

        Returns:
            The decoded continuation with special tokens removed and surrounding
            whitespace stripped.
        """
        formatted_prompt = self._format_prompt(prompt, system_prompt)

        inputs = self._tokenizer(formatted_prompt, return_tensors="pt").to(device=device)
        # Sampling is enabled, so repeated calls with the same input may differ.
        output = self._model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

        # Decode only the newly generated tokens (exclude the input prompt tokens).
        input_length = inputs["input_ids"].shape[1]
        generated_tokens = output[0][input_length:]
        return self._tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
+ "searchPrompts": "Search...", + "imageToPrompt": "Image to Prompt", + "selectVisionModel": "Select Vision Model...", + "changeImage": "Change Image", + "uploadImage": "Upload Image", + "generatePrompt": "Generate Prompt", + "expandPromptWithLLM": "Expand Prompt with LLM", + "expandPrompt": "Expand Prompt", + "selectTextLLM": "Select Text LLM...", + "expand": "Expand" }, "queue": { "queue": "Queue", @@ -1284,6 +1293,7 @@ }, "controlLora": "Control LoRA", "llavaOnevision": "LLaVA OneVision", + "textLLM": "Text LLM", "syncModels": "Sync Models", "syncModelsTooltip": "Identify and remove unused model files in the InvokeAI root directory.", "syncModelsDirectory": "Synchronize Models Directory", @@ -3324,6 +3334,10 @@ "whatsNew": { "whatsNewInInvoke": "What's New in Invoke", "items": [ + "LLM Prompt Tools: Use local language models to expand prompts or generate prompts from images. Install a Text LLM model (e.g. Qwen2.5-1.5B-Instruct) to get started.", + "FLUX.2 Klein Support: InvokeAI now supports the new FLUX.2 Klein models (4B and 9B variants) with GGUF, FP8, and Diffusers formats. Features include txt2img, img2img, inpainting, and outpainting. See 'Starter Models' to get started.", + "DyPE support for FLUX models improves high-resolution (>1536 px up to 4K) images. Go to the 'Advanced Options' section to activate.", + "Z-Image Turbo diversity: Activate 'Seed Variance Enhancer' under 'Advanced Options' to add diversity to your ZiT gens.", "Multi-user mode supports multiple isolated users on the same server.", "Enhanced support for Z-Image and FLUX.2 Models.", "Multiple user interface enhancements and new canvas features." 
diff --git a/invokeai/frontend/web/src/features/modelManagerV2/hooks/useEncoderModelSettings.ts b/invokeai/frontend/web/src/features/modelManagerV2/hooks/useEncoderModelSettings.ts index b1521f55fce..6b3e9d71010 100644 --- a/invokeai/frontend/web/src/features/modelManagerV2/hooks/useEncoderModelSettings.ts +++ b/invokeai/frontend/web/src/features/modelManagerV2/hooks/useEncoderModelSettings.ts @@ -7,6 +7,7 @@ import type { Qwen3EncoderModelConfig, SigLIPModelConfig, T5EncoderModelConfig, + TextLLMModelConfig, } from 'services/api/types'; type EncoderModelConfig = @@ -15,7 +16,8 @@ type EncoderModelConfig = | Qwen3EncoderModelConfig | CLIPVisionModelConfig | SigLIPModelConfig - | LlavaOnevisionModelConfig; + | LlavaOnevisionModelConfig + | TextLLMModelConfig; export const useEncoderModelSettings = (modelConfig: EncoderModelConfig) => { const encoderModelSettingsDefaults = useMemo(() => { diff --git a/invokeai/frontend/web/src/features/modelManagerV2/models.ts b/invokeai/frontend/web/src/features/modelManagerV2/models.ts index 7cdba474bbf..1c0d2b20c7c 100644 --- a/invokeai/frontend/web/src/features/modelManagerV2/models.ts +++ b/invokeai/frontend/web/src/features/modelManagerV2/models.ts @@ -17,6 +17,7 @@ import { isSpandrelImageToImageModelConfig, isT2IAdapterModelConfig, isT5EncoderModelConfig, + isTextLLMModelConfig, isTIModelConfig, isUnknownModelConfig, isVAEModelConfig, @@ -122,6 +123,11 @@ const MODEL_CATEGORIES: Record = { i18nKey: 'modelManager.llavaOnevision', filter: isLLaVAModelConfig, }, + text_llm: { + category: 'text_llm', + i18nKey: 'modelManager.textLLM', + filter: isTextLLMModelConfig, + }, external_image_generator: { category: 'external_image_generator', i18nKey: 'modelManager.externalImageGenerator', @@ -176,6 +182,7 @@ export const MODEL_TYPE_TO_LONG_NAME: Record = { clip_embed: 'CLIP Embed', siglip: 'SigLIP', flux_redux: 'FLUX Redux', + text_llm: 'Text LLM', external_image_generator: 'External Image Generator', unknown: 'Unknown', }; diff --git 
a/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelPanel/EncoderModelSettings/EncoderModelSettings.tsx b/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelPanel/EncoderModelSettings/EncoderModelSettings.tsx index e10766214f4..9bfe3974ddf 100644 --- a/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelPanel/EncoderModelSettings/EncoderModelSettings.tsx +++ b/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelPanel/EncoderModelSettings/EncoderModelSettings.tsx @@ -19,6 +19,7 @@ import type { Qwen3EncoderModelConfig, SigLIPModelConfig, T5EncoderModelConfig, + TextLLMModelConfig, } from 'services/api/types'; export type EncoderModelSettingsFormData = { @@ -31,7 +32,8 @@ type EncoderModelConfig = | Qwen3EncoderModelConfig | CLIPVisionModelConfig | SigLIPModelConfig - | LlavaOnevisionModelConfig; + | LlavaOnevisionModelConfig + | TextLLMModelConfig; type Props = { modelConfig: EncoderModelConfig; diff --git a/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelPanel/ModelView.tsx b/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelPanel/ModelView.tsx index d29d330facd..365f7cff4b8 100644 --- a/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelPanel/ModelView.tsx +++ b/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelPanel/ModelView.tsx @@ -21,6 +21,7 @@ import { type Qwen3EncoderModelConfig, type SigLIPModelConfig, type T5EncoderModelConfig, + type TextLLMModelConfig, } from 'services/api/types'; import { isExternalModel } from './isExternalModel'; @@ -37,7 +38,8 @@ type EncoderModelConfig = | Qwen3EncoderModelConfig | CLIPVisionModelConfig | SigLIPModelConfig - | LlavaOnevisionModelConfig; + | LlavaOnevisionModelConfig + | TextLLMModelConfig; const isEncoderModel = (modelConfig: AnyModelConfigWithExternal): modelConfig is EncoderModelConfig => { return ( @@ -46,7 +48,8 @@ const isEncoderModel = (modelConfig: AnyModelConfigWithExternal): modelConfig is 
modelConfig.type === 'qwen3_encoder' || modelConfig.type === 'clip_vision' || modelConfig.type === 'siglip' || - modelConfig.type === 'llava_onevision' + modelConfig.type === 'llava_onevision' || + modelConfig.type === 'text_llm' ); }; diff --git a/invokeai/frontend/web/src/features/nodes/types/common.ts b/invokeai/frontend/web/src/features/nodes/types/common.ts index 75c3415cefb..d1aa0523a43 100644 --- a/invokeai/frontend/web/src/features/nodes/types/common.ts +++ b/invokeai/frontend/web/src/features/nodes/types/common.ts @@ -121,6 +121,7 @@ export const zModelType = z.enum([ 'vae', 'lora', 'llava_onevision', + 'text_llm', 'control_lora', 'controlnet', 't2i_adapter', diff --git a/invokeai/frontend/web/src/features/parameters/components/Core/ParamPositivePrompt.tsx b/invokeai/frontend/web/src/features/parameters/components/Core/ParamPositivePrompt.tsx index 89169b5ea54..5167dd1527b 100644 --- a/invokeai/frontend/web/src/features/parameters/components/Core/ParamPositivePrompt.tsx +++ b/invokeai/frontend/web/src/features/parameters/components/Core/ParamPositivePrompt.tsx @@ -1,3 +1,5 @@ +import { combine } from '@atlaskit/pragmatic-drag-and-drop/combine'; +import { dropTargetForElements, monitorForElements } from '@atlaskit/pragmatic-drag-and-drop/element/adapter'; import { Box, Flex, Textarea } from '@invoke-ai/ui-library'; import { useAppDispatch, useAppSelector, useAppStore } from 'app/store/storeHooks'; import { usePersistedTextAreaSize } from 'common/hooks/usePersistedTextareaSize'; @@ -7,6 +9,9 @@ import { selectPositivePrompt, selectPositivePromptHistory, } from 'features/controlLayers/store/paramsSlice'; +import { singleImageDndSource } from 'features/dnd/dnd'; +import { DndDropOverlay } from 'features/dnd/DndDropOverlay'; +import type { DndTargetState } from 'features/dnd/types'; import { ShowDynamicPromptsPreviewButton } from 'features/dynamicPrompts/components/ShowDynamicPromptsPreviewButton'; import { NegativePromptToggleButton } from 
'features/parameters/components/Core/NegativePromptToggleButton'; import { PromptLabel } from 'features/parameters/components/Prompts/PromptLabel'; @@ -14,7 +19,10 @@ import { PromptOverlayButtonWrapper } from 'features/parameters/components/Promp import { PromptResizeHandle } from 'features/parameters/components/Prompts/PromptResizeHandle'; import { ViewModePrompt } from 'features/parameters/components/Prompts/ViewModePrompt'; import { AddPromptTriggerButton } from 'features/prompt/AddPromptTriggerButton'; +import { ExpandPromptButton } from 'features/prompt/ExpandPromptButton'; +import { ImageToPromptButton } from 'features/prompt/ImageToPromptButton'; import { PromptPopover } from 'features/prompt/PromptPopover'; +import { clearPromptUndo, consumePromptUndo } from 'features/prompt/promptUndo'; import { usePrompt } from 'features/prompt/usePrompt'; import { usePromptAttentionHotkeys } from 'features/prompt/usePromptAttentionHotkeys'; import { @@ -22,11 +30,13 @@ import { selectStylePresetViewMode, } from 'features/stylePresets/store/stylePresetSlice'; import { useRegisteredHotkeys } from 'features/system/components/HotkeysModal/useHotkeyData'; -import React, { memo, useCallback, useRef } from 'react'; +import React, { memo, useCallback, useEffect, useRef, useState } from 'react'; import type { HotkeyCallback } from 'react-hotkeys-hook'; import { useTranslation } from 'react-i18next'; import { useClickAway } from 'react-use'; import { useListStylePresetsQuery } from 'services/api/endpoints/stylePresets'; +import { useLlavaModels } from 'services/api/hooks/modelsByType'; +import type { ImageDTO } from 'services/api/types'; import { PositivePromptHistoryIconButton } from './PositivePromptHistory'; @@ -116,6 +126,8 @@ export const ParamPositivePrompt = memo(() => { const viewMode = useAppSelector(selectStylePresetViewMode); const activeStylePresetId = useAppSelector(selectStylePresetActivePresetId); const modelSupportsNegativePrompt = 
useAppSelector(selectModelSupportsNegativePrompt); + const [llavaModels] = useLlavaModels(); + const hasLlavaModels = llavaModels.length > 0; const promptHistoryApi = usePromptHistory(); @@ -139,15 +151,41 @@ export const ParamPositivePrompt = memo(() => { // When the user changes the prompt, reset the prompt history state. This event is not fired when the prompt is // changed via the prompt history navigation. promptHistoryApi.reset(); + // Clear LLM undo state when the user types manually + clearPromptUndo(); }, [dispatch, promptHistoryApi] ); - const { onChange, isOpen, onClose, onOpen, onSelect, onKeyDown, onFocus } = usePrompt({ + const { + onChange, + isOpen, + onClose, + onOpen, + onSelect, + onKeyDown: onKeyDownPrompt, + onFocus, + } = usePrompt({ prompt, textareaRef: textareaRef, onChange: handleChange, }); + const onKeyDown = useCallback( + (e: React.KeyboardEvent) => { + // Intercept Ctrl+Z to undo LLM prompt changes + if (e.key === 'z' && (e.ctrlKey || e.metaKey) && !e.shiftKey) { + const previousPrompt = consumePromptUndo(); + if (previousPrompt !== null) { + e.preventDefault(); + dispatch(positivePromptChanged(previousPrompt)); + return; + } + } + onKeyDownPrompt(e); + }, + [dispatch, onKeyDownPrompt] + ); + // When the user clicks away from the textarea, reset the prompt history state. 
useClickAway(textareaRef, promptHistoryApi.reset); @@ -201,8 +239,44 @@ export const ParamPositivePrompt = memo(() => { onPromptChange: (prompt) => dispatch(positivePromptChanged(prompt)), }); + // Drop target for gallery images -> Image to Prompt + const dropTargetRef = useRef(null); + const [droppedImage, setDroppedImage] = useState(undefined); + const [dndState, setDndState] = useState('idle'); + + const clearDroppedImage = useCallback(() => { + setDroppedImage(undefined); + }, []); + + useEffect(() => { + const element = dropTargetRef.current; + if (!element || !hasLlavaModels) { + return; + } + + return combine( + dropTargetForElements({ + element, + canDrop: ({ source }) => singleImageDndSource.typeGuard(source.data), + onDragEnter: () => setDndState('over'), + onDragLeave: () => setDndState('potential'), + onDrop: ({ source }) => { + setDndState('idle'); + if (singleImageDndSource.typeGuard(source.data)) { + setDroppedImage(source.data.payload.imageDTO); + } + }, + }), + monitorForElements({ + canMonitor: ({ source }) => singleImageDndSource.typeGuard(source.data), + onDragStart: () => setDndState('potential'), + onDrop: () => setDndState('idle'), + }) + ); + }, [hasLlavaModels]); + return ( - +