feat(multi-gpu): surface per-session GPU number in logs and UI

lstein · claude · lstein · commit 8cef3cf6ca48 · 2026-06-11T22:41:33.000-04:00
Help users track which CUDA device is processing each session:

- Model-load log: "Loaded model ... onto cuda device #N in ..s"
- Denoise progress bars: "Denoising (#N)" across all architectures
  (SD1.5/SDXL, FLUX, FLUX2, Z-Image, Anima, SD3, CogView4)
- Progress preview circle: GPU number centered in the ring, via a new
  `device` field on InvocationProgressEvent (resolved from the worker's
  thread-local session device)
- Session Queue: new "GPU #" column between STATUS and TIME, backed by a
  `device` column on session_queue (migration_32) recorded when a worker
  claims an item

Adds TorchDevice.get_session_device_label()/get_session_device_index()
helpers and a frontend getCudaDeviceIndex() parser (with tests). Shows the
number on CUDA only; CPU/MPS show nothing.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
diff --git a/invokeai/app/invocations/anima_denoise.py b/invokeai/app/invocations/anima_denoise.py
@@ -608,7 +608,7 @@ def _run_transformer(ctx: torch.Tensor, x: torch.Tensor, t: torch.Tensor) -> tor
 
             if driver is not None:
                 user_step = 0
-                pbar = tqdm(total=total_steps, desc="Denoising (Anima)")
+                pbar = tqdm(total=total_steps, desc=f"Denoising (Anima){TorchDevice.get_session_device_label()}")
                 for it in driver.iterations():
                     timestep = torch.tensor(
                         [it.sigma_curr * ANIMA_MULTIPLIER], device=device, dtype=inference_dtype
@@ -655,7 +655,9 @@ def _run_transformer(ctx: torch.Tensor, x: torch.Tensor, t: torch.Tensor) -> tor
                 pbar.close()
             else:
                 # Built-in Euler implementation (default for Anima)
-                for step_idx in tqdm(range(total_steps), desc="Denoising (Anima)"):
+                for step_idx in tqdm(
+                    range(total_steps), desc=f"Denoising (Anima){TorchDevice.get_session_device_label()}"
+                ):
                     sigma_curr = sigmas[step_idx]
                     sigma_prev = sigmas[step_idx + 1]
 
diff --git a/invokeai/app/invocations/cogview4_denoise.py b/invokeai/app/invocations/cogview4_denoise.py
@@ -294,7 +294,7 @@ def _run_diffusion(
             assert isinstance(transformer, CogView4Transformer2DModel)
 
             # Denoising loop
-            for step_idx in tqdm(range(total_steps)):
+            for step_idx in tqdm(range(total_steps), desc=f"Denoising{TorchDevice.get_session_device_label()}"):
                 t_curr = timesteps[step_idx]
                 sigma_curr = sigmas[step_idx]
                 sigma_prev = sigmas[step_idx + 1]
diff --git a/invokeai/app/invocations/sd3_denoise.py b/invokeai/app/invocations/sd3_denoise.py
@@ -284,7 +284,10 @@ def _run_diffusion(
             assert isinstance(transformer, SD3Transformer2DModel)
 
             # 6. Denoising loop
-            for step_idx, (t_curr, t_prev) in tqdm(list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True)))):
+            for step_idx, (t_curr, t_prev) in tqdm(
+                list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True))),
+                desc=f"Denoising{TorchDevice.get_session_device_label()}",
+            ):
                 # Expand the latents if we are doing CFG.
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                 # Expand the timestep to match the latent model input.
diff --git a/invokeai/app/invocations/z_image_denoise.py b/invokeai/app/invocations/z_image_denoise.py
@@ -569,7 +569,7 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor:
                 # Use diffusers scheduler for stepping
                 # Use tqdm with total_steps (user-facing steps) not num_scheduler_steps (internal steps)
                 # This ensures progress bar shows 1/8, 2/8, etc. even when scheduler uses more internal steps
-                pbar = tqdm(total=total_steps, desc="Denoising")
+                pbar = tqdm(total=total_steps, desc=f"Denoising{TorchDevice.get_session_device_label()}")
                 for step_index in range(num_scheduler_steps):
                     sched_timestep = scheduler.timesteps[step_index]
                     # Convert scheduler timestep (0-1000) to normalized sigma (0-1)
@@ -686,7 +686,7 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor:
                 pbar.close()
             else:
                 # Original Euler implementation (default, optimized for Z-Image)
-                for step_idx in tqdm(range(total_steps)):
+                for step_idx in tqdm(range(total_steps), desc=f"Denoising{TorchDevice.get_session_device_label()}"):
                     sigma_curr = sigmas[step_idx]
                     sigma_prev = sigmas[step_idx + 1]
 
diff --git a/invokeai/app/services/events/events_common.py b/invokeai/app/services/events/events_common.py
@@ -138,6 +138,10 @@ class InvocationProgressEvent(InvocationEventBase):
     image: ProgressImage | None = Field(
         default=None, description="An image representing the current state of the progress"
     )
+    device: str | None = Field(
+        default=None,
+        description="The device processing this session, e.g. 'cuda:1' (set only when running on a CUDA GPU)",
+    )
 
     @classmethod
     def build(
@@ -148,6 +152,13 @@ def build(
         percentage: float | None = None,
         image: ProgressImage | None = None,
     ) -> "InvocationProgressEvent":
+        # This is emitted from the session-processor worker thread, which pins its CUDA device via
+        # TorchDevice.set_session_device(). Resolve that here so the UI can label progress by GPU.
+        from invokeai.backend.util.devices import TorchDevice
+
+        session_device = TorchDevice.get_session_device()
+        device = str(session_device) if session_device is not None and session_device.type == "cuda" else None
+
         return cls(
             queue_id=queue_item.queue_id,
             item_id=queue_item.item_id,
@@ -161,6 +172,7 @@ def build(
             percentage=percentage,
             image=image,
             message=message,
+            device=device,
         )
 
 
diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py
@@ -529,8 +529,11 @@ def _process(
                         break
 
                     # Get the next session to process. dequeue() atomically claims the item, so concurrent
-                    # workers never receive the same item.
-                    worker.queue_item = self._invoker.services.session_queue.dequeue()
+                    # workers never receive the same item. Pass this worker's device so the item is
+                    # tagged with the GPU that ran it (None in single-device/legacy mode).
+                    worker.queue_item = self._invoker.services.session_queue.dequeue(
+                        device=str(worker.device) if worker.device is not None else None
+                    )
 
                     if worker.queue_item is None:
                         # The queue was empty, wait for next polling interval or event to try again
diff --git a/invokeai/app/services/session_queue/session_queue_base.py b/invokeai/app/services/session_queue/session_queue_base.py
@@ -31,8 +31,8 @@ class SessionQueueBase(ABC):
     """Base class for session queue"""
 
     @abstractmethod
-    def dequeue(self) -> Optional[SessionQueueItem]:
-        """Dequeues the next session queue item."""
+    def dequeue(self, device: Optional[str] = None) -> Optional[SessionQueueItem]:
+        """Dequeues the next session queue item, recording the processing device (e.g. 'cuda:1') if given."""
         pass
 
     @abstractmethod
diff --git a/invokeai/app/services/session_queue/session_queue_common.py b/invokeai/app/services/session_queue/session_queue_common.py
@@ -262,6 +262,10 @@ class SessionQueueItem(BaseModel):
     retried_from_item_id: Optional[int] = Field(
         default=None, description="The item_id of the queue item that this item was retried from"
     )
+    device: Optional[str] = Field(
+        default=None,
+        description="The device that processed this queue item, e.g. 'cuda:1' (set only when running on a CUDA GPU)",
+    )
     session: GraphExecutionState = Field(description="The fully-populated session to be executed")
     workflow: Optional[WorkflowWithoutID] = Field(
         default=None, description="The workflow associated with this queue item"
diff --git a/invokeai/app/services/session_queue/session_queue_sqlite.py b/invokeai/app/services/session_queue/session_queue_sqlite.py
@@ -216,7 +216,7 @@ async def enqueue_batch(
         self.__invoker.services.events.emit_batch_enqueued(enqueue_result, user_id=user_id)
         return enqueue_result
 
-    def dequeue(self) -> Optional[SessionQueueItem]:
+    def dequeue(self, device: Optional[str] = None) -> Optional[SessionQueueItem]:
         # Hold the dequeue lock across the select-then-claim so concurrent workers (multi-GPU)
         # cannot select and claim the same pending item. `_set_queue_item_status` already no-ops
         # if the item was concurrently moved to a terminal state (e.g. canceled), so we only need
@@ -242,7 +242,8 @@ def dequeue(self) -> Optional[SessionQueueItem]:
             if result is None:
                 return None
             queue_item = SessionQueueItem.queue_item_from_dict(dict(result))
-            queue_item = self._set_queue_item_status(item_id=queue_item.item_id, status="in_progress")
+            # Record the claiming worker's device so the UI can label the item by GPU.
+            queue_item = self._set_queue_item_status(item_id=queue_item.item_id, status="in_progress", device=device)
         return queue_item
 
     def get_next(self, queue_id: str) -> Optional[SessionQueueItem]:
@@ -299,6 +300,7 @@ def _set_queue_item_status(
         error_type: Optional[str] = None,
         error_message: Optional[str] = None,
         error_traceback: Optional[str] = None,
+        device: Optional[str] = None,
     ) -> SessionQueueItem:
         with self._db.transaction() as cursor:
             cursor.execute(
@@ -320,10 +322,10 @@ def _set_queue_item_status(
             cursor.execute(
                 """--sql
                 UPDATE session_queue
-                SET status = ?, status_sequence = COALESCE(status_sequence, 0) + 1, error_type = ?, error_message = ?, error_traceback = ?
+                SET status = ?, status_sequence = COALESCE(status_sequence, 0) + 1, error_type = ?, error_message = ?, error_traceback = ?, device = COALESCE(?, device)
                 WHERE item_id = ?
                 """,
-                (status, error_type, error_message, error_traceback, item_id),
+                (status, error_type, error_message, error_traceback, device, item_id),
             )
 
         queue_item = self.get_queue_item(item_id)
diff --git a/invokeai/app/services/shared/sqlite/sqlite_util.py b/invokeai/app/services/shared/sqlite/sqlite_util.py
@@ -34,6 +34,7 @@
 from invokeai.app.services.shared.sqlite_migrator.migrations.migration_29 import build_migration_29
 from invokeai.app.services.shared.sqlite_migrator.migrations.migration_30 import build_migration_30
 from invokeai.app.services.shared.sqlite_migrator.migrations.migration_31 import build_migration_31
+from invokeai.app.services.shared.sqlite_migrator.migrations.migration_32 import build_migration_32
 from invokeai.app.services.shared.sqlite_migrator.sqlite_migrator_impl import SqliteMigrator
 
 
@@ -85,6 +86,7 @@ def init_db(config: InvokeAIAppConfig, logger: Logger, image_files: ImageFileSto
     migrator.register_migration(build_migration_29())
     migrator.register_migration(build_migration_30())
     migrator.register_migration(build_migration_31())
+    migrator.register_migration(build_migration_32())
     migrator.run_migrations()
 
     return db
diff --git a/invokeai/app/services/shared/sqlite_migrator/migrations/migration_32.py b/invokeai/app/services/shared/sqlite_migrator/migrations/migration_32.py
@@ -0,0 +1,36 @@
+"""Migration 32: Add device column to session_queue table.
+
+This records which device (e.g. 'cuda:1') processed a queue item, so the UI can show a per-item
+GPU number in the Session Queue. Existing rows get NULL (unknown device).
+"""
+
+import sqlite3
+
+from invokeai.app.services.shared.sqlite_migrator.sqlite_migrator_common import Migration
+
+
+class Migration32Callback:
+    """Migration to add a device column to the session_queue table."""
+
+    def __call__(self, cursor: sqlite3.Cursor) -> None:
+        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='session_queue';")
+        if cursor.fetchone() is None:
+            return
+
+        cursor.execute("PRAGMA table_info(session_queue);")
+        columns = [row[1] for row in cursor.fetchall()]
+
+        if "device" not in columns:
+            cursor.execute("ALTER TABLE session_queue ADD COLUMN device TEXT;")
+
+
+def build_migration_32() -> Migration:
+    """Builds the migration object for migrating from version 31 to version 32.
+
+    This migration adds a device column to the session_queue table.
+    """
+    return Migration(
+        from_version=31,
+        to_version=32,
+        callback=Migration32Callback(),
+    )
diff --git a/invokeai/backend/flux/denoise.py b/invokeai/backend/flux/denoise.py
@@ -15,6 +15,7 @@
 from invokeai.backend.flux.model import Flux
 from invokeai.backend.rectified_flow.rectified_flow_inpaint_extension import RectifiedFlowInpaintExtension
 from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState
+from invokeai.backend.util.devices import TorchDevice
 
 
 def denoise(
@@ -95,7 +96,7 @@ def denoise(
             # Use diffusers scheduler for stepping
             # Use tqdm with total_steps (user-facing steps) not num_scheduler_steps (internal steps)
             # This ensures progress bar shows 1/8, 2/8, etc. even when scheduler uses more internal steps
-            pbar = tqdm(total=total_steps, desc="Denoising")
+            pbar = tqdm(total=total_steps, desc=f"Denoising{TorchDevice.get_session_device_label()}")
             for step_index in range(num_scheduler_steps):
                 timestep = scheduler.timesteps[step_index]
                 # Convert scheduler timestep (0-1000) to normalized (0-1) for the model
@@ -266,7 +267,10 @@ def denoise(
             return img
 
         # Original Euler implementation (when scheduler is None)
-        for step_index, (t_curr, t_prev) in tqdm(list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True)))):
+        for step_index, (t_curr, t_prev) in tqdm(
+            list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True))),
+            desc=f"Denoising{TorchDevice.get_session_device_label()}",
+        ):
             # DyPE: Update step state for timestep-dependent scaling
             if dype_extension is not None and dype_embedder is not None:
                 dype_extension.update_step_state(
diff --git a/invokeai/backend/flux2/denoise.py b/invokeai/backend/flux2/denoise.py
@@ -14,6 +14,7 @@
 
 from invokeai.backend.rectified_flow.rectified_flow_inpaint_extension import RectifiedFlowInpaintExtension
 from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState
+from invokeai.backend.util.devices import TorchDevice
 
 
 def denoise(
@@ -118,7 +119,7 @@ def denoise(
         is_heun = hasattr(scheduler, "state_in_first_order")
         user_step = 0
 
-        pbar = tqdm(total=total_steps, desc="Denoising")
+        pbar = tqdm(total=total_steps, desc=f"Denoising{TorchDevice.get_session_device_label()}")
         for step_index in range(num_scheduler_steps):
             timestep = scheduler.timesteps[step_index]
             # Convert scheduler timestep (0-1000) to normalized (0-1) for the model
@@ -226,7 +227,10 @@ def denoise(
         pbar.close()
     else:
         # Manual Euler stepping (original behavior)
-        for step_index, (t_curr, t_prev) in tqdm(list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True)))):
+        for step_index, (t_curr, t_prev) in tqdm(
+            list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True))),
+            desc=f"Denoising{TorchDevice.get_session_device_label()}",
+        ):
             t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
 
             # Run the transformer model (matching diffusers: guidance=guidance, return_dict=False)
diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache.py b/invokeai/backend/model_manager/load/model_cache/model_cache.py
@@ -619,9 +619,13 @@ def _load_locked_model(self, cache_entry: CacheRecord, working_mem_bytes: Option
         loaded_percent = model_cur_vram_bytes / model_total_bytes if model_total_bytes > 0 else 0
         # Use the model's actual compute_device for logging, not the cache's default
         model_device = cache_entry.cached_model.compute_device
+        if model_device.type == "cuda":
+            device_label = f"cuda device #{model_device.index}" if model_device.index is not None else "cuda device"
+        else:
+            device_label = f"{model_device.type} device"
         self._logger.info(
             f"Loaded model '{cache_entry.key}' ({cache_entry.cached_model.model.__class__.__name__}) onto "
-            f"{model_device.type} device in {(time.time() - start_time):.2f}s. "
+            f"{device_label} in {(time.time() - start_time):.2f}s. "
             f"Total model size: {model_total_bytes / MB:.2f}MB, "
             f"VRAM: {model_cur_vram_bytes / MB:.2f}MB ({loaded_percent:.1%})"
         )
diff --git a/invokeai/backend/stable_diffusion/diffusion_backend.py b/invokeai/backend/stable_diffusion/diffusion_backend.py
@@ -10,6 +10,7 @@
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningMode
 from invokeai.backend.stable_diffusion.extension_callback_type import ExtensionCallbackType
 from invokeai.backend.stable_diffusion.extensions_manager import ExtensionsManager
+from invokeai.backend.util.devices import TorchDevice
 
 
 class StableDiffusionBackend:
@@ -44,7 +45,9 @@ def latents_from_embeddings(self, ctx: DenoiseContext, ext_manager: ExtensionsMa
         # ext: preview[pre_denoise_loop, priority=low]
         ext_manager.run_callback(ExtensionCallbackType.PRE_DENOISE_LOOP, ctx)
 
-        for ctx.step_index, ctx.timestep in enumerate(tqdm(ctx.inputs.timesteps)):  # noqa: B020
+        for ctx.step_index, ctx.timestep in enumerate(  # noqa: B020
+            tqdm(ctx.inputs.timesteps, desc=f"Denoising{TorchDevice.get_session_device_label()}")
+        ):
             # ext: inpaint (apply mask to latents on non-inpaint models)
             ext_manager.run_callback(ExtensionCallbackType.PRE_STEP, ctx)
 
diff --git a/invokeai/backend/util/devices.py b/invokeai/backend/util/devices.py
@@ -69,6 +69,23 @@ def clear_session_device(cls) -> None:
         if hasattr(cls._session_device, "device"):
             del cls._session_device.device
 
+    @classmethod
+    def get_session_device_index(cls) -> Optional[int]:
+        """Return the CUDA index of the calling thread's effective device, or None if not on CUDA.
+
+        Resolves the thread-local session device when a worker has pinned one (multi-GPU), otherwise
+        falls back to choose_torch_device(). Used to tag progress output with the GPU number.
+        NOTE(review): a "cuda" device whose .index is None also yields None - confirm that is intended.
+        """
+        device = cls.get_session_device() or cls.choose_torch_device()
+        return device.index if device.type == "cuda" else None
+
+    @classmethod
+    def get_session_device_label(cls) -> str:
+        """Return a ``" (#N)"`` suffix for the calling thread's CUDA device, or ``""`` when not on CUDA."""
+        index = cls.get_session_device_index()
+        return f" (#{index})" if index is not None else ""
+
     @classmethod
     def choose_torch_device(cls) -> torch.device:
         """Return the torch.device to use for accelerated inference."""
diff --git a/invokeai/frontend/web/public/locales/en.json b/invokeai/frontend/web/public/locales/en.json
@@ -443,6 +443,7 @@
         "next": "Next",
         "status": "Status",
         "total": "Total",
+        "gpu": "GPU #",
         "time": "Time",
         "credits": "Credits",
         "pending": "Pending",
diff --git a/invokeai/frontend/web/src/common/util/getCudaDeviceIndex.test.ts b/invokeai/frontend/web/src/common/util/getCudaDeviceIndex.test.ts
@@ -0,0 +1,29 @@
+import { describe, expect, it } from 'vitest';
+
+import { getCudaDeviceIndex } from './getCudaDeviceIndex';
+
+describe('getCudaDeviceIndex', () => {
+  it('parses the index from a cuda device string', () => {
+    expect(getCudaDeviceIndex('cuda:0')).toBe(0);
+    expect(getCudaDeviceIndex('cuda:1')).toBe(1);
+    expect(getCudaDeviceIndex('cuda:11')).toBe(11);
+  });
+
+  it('returns null for non-cuda devices', () => {
+    expect(getCudaDeviceIndex('cpu')).toBeNull();
+    expect(getCudaDeviceIndex('mps')).toBeNull();
+  });
+
+  it('returns null for null/undefined/empty', () => {
+    expect(getCudaDeviceIndex(null)).toBeNull();
+    expect(getCudaDeviceIndex(undefined)).toBeNull();
+    expect(getCudaDeviceIndex('')).toBeNull();
+  });
+
+  it('returns null for malformed cuda strings', () => {
+    expect(getCudaDeviceIndex('cuda')).toBeNull();
+    expect(getCudaDeviceIndex('cuda:')).toBeNull();
+    expect(getCudaDeviceIndex('cuda:x')).toBeNull();
+    expect(getCudaDeviceIndex('cuda:0:0')).toBeNull();
+  });
+});
diff --git a/invokeai/frontend/web/src/common/util/getCudaDeviceIndex.ts b/invokeai/frontend/web/src/common/util/getCudaDeviceIndex.ts
@@ -0,0 +1,13 @@
+/**
+ * Parse the CUDA device index from a device string (e.g. `"cuda:1"` → `1`).
+ *
+ * Returns `null` for null/undefined/empty input and for any string that is not exactly `cuda:<digits>`
+ * (e.g. `"cpu"`, `"mps"`, `"cuda"`, `"cuda:x"`). Used to label previews and queue items by GPU number.
+ */
+export const getCudaDeviceIndex = (device: string | null | undefined): number | null => {
+  if (!device) {
+    return null;
+  }
+  const match = /^cuda:(\d+)$/.exec(device);
+  return match ? Number(match[1]) : null;
+};
diff --git a/invokeai/frontend/web/src/features/gallery/components/ImageViewer/ProgressIndicator2.tsx b/invokeai/frontend/web/src/features/gallery/components/ImageViewer/ProgressIndicator2.tsx
diff --git a/invokeai/frontend/web/src/features/queue/components/QueueList/QueueItemComponent.tsx b/invokeai/frontend/web/src/features/queue/components/QueueList/QueueItemComponent.tsx
diff --git a/invokeai/frontend/web/src/features/queue/components/QueueList/QueueListHeader.tsx b/invokeai/frontend/web/src/features/queue/components/QueueList/QueueListHeader.tsx
diff --git a/invokeai/frontend/web/src/features/queue/components/QueueList/constants.ts b/invokeai/frontend/web/src/features/queue/components/QueueList/constants.ts
diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts
diff --git a/tests/app/services/session_queue/test_session_queue_dequeue_concurrency.py b/tests/app/services/session_queue/test_session_queue_dequeue_concurrency.py