Fix VAE offload encode device mismatch in DreamBooth scripts (#13417)

azolotenkov · sayakpaul · web-flow · commit d31061b2aca3 · 2026-04-06T16:53:06.000+02:00
Co-authored-by: Sayak Paul &lt;spsayakpaul@gmail.com&gt;
diff --git a/examples/dreambooth/train_dreambooth_lora_flux2.py b/examples/dreambooth/train_dreambooth_lora_flux2.py
@@ -1749,8 +1749,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                     model_input = latents_cache[step].mode()
                 else:
                     with offload_models(vae, device=accelerator.device, offload=args.offload):
-                        pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
-                    model_input = vae.encode(pixel_values).latent_dist.mode()
+                        pixel_values = batch["pixel_values"].to(device=accelerator.device, dtype=vae.dtype)
+                        model_input = vae.encode(pixel_values).latent_dist.mode()
 
                 model_input = Flux2Pipeline._patchify_latents(model_input)
                 model_input = (model_input - latents_bn_mean) / latents_bn_std
diff --git a/examples/dreambooth/train_dreambooth_lora_flux2_img2img.py b/examples/dreambooth/train_dreambooth_lora_flux2_img2img.py
@@ -1686,11 +1686,10 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                     cond_model_input = cond_latents_cache[step].mode()
                 else:
                     with offload_models(vae, device=accelerator.device, offload=args.offload):
-                        pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
-                        cond_pixel_values = batch["cond_pixel_values"].to(dtype=vae.dtype)
-
-                    model_input = vae.encode(pixel_values).latent_dist.mode()
-                    cond_model_input = vae.encode(cond_pixel_values).latent_dist.mode()
+                        pixel_values = batch["pixel_values"].to(device=accelerator.device, dtype=vae.dtype)
+                        cond_pixel_values = batch["cond_pixel_values"].to(device=accelerator.device, dtype=vae.dtype)
+                        model_input = vae.encode(pixel_values).latent_dist.mode()
+                        cond_model_input = vae.encode(cond_pixel_values).latent_dist.mode()
 
                     # model_input = Flux2Pipeline._encode_vae_image(pixel_values)
 
diff --git a/examples/dreambooth/train_dreambooth_lora_flux2_klein.py b/examples/dreambooth/train_dreambooth_lora_flux2_klein.py
@@ -1689,8 +1689,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                     model_input = latents_cache[step].mode()
                 else:
                     with offload_models(vae, device=accelerator.device, offload=args.offload):
-                        pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
-                    model_input = vae.encode(pixel_values).latent_dist.mode()
+                        pixel_values = batch["pixel_values"].to(device=accelerator.device, dtype=vae.dtype)
+                        model_input = vae.encode(pixel_values).latent_dist.mode()
 
                 model_input = Flux2KleinPipeline._patchify_latents(model_input)
                 model_input = (model_input - latents_bn_mean) / latents_bn_std
diff --git a/examples/dreambooth/train_dreambooth_lora_flux2_klein_img2img.py b/examples/dreambooth/train_dreambooth_lora_flux2_klein_img2img.py
@@ -1634,11 +1634,10 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                     cond_model_input = cond_latents_cache[step].mode()
                 else:
                     with offload_models(vae, device=accelerator.device, offload=args.offload):
-                        pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
-                        cond_pixel_values = batch["cond_pixel_values"].to(dtype=vae.dtype)
-
-                    model_input = vae.encode(pixel_values).latent_dist.mode()
-                    cond_model_input = vae.encode(cond_pixel_values).latent_dist.mode()
+                        pixel_values = batch["pixel_values"].to(device=accelerator.device, dtype=vae.dtype)
+                        cond_pixel_values = batch["cond_pixel_values"].to(device=accelerator.device, dtype=vae.dtype)
+                        model_input = vae.encode(pixel_values).latent_dist.mode()
+                        cond_model_input = vae.encode(cond_pixel_values).latent_dist.mode()
 
                 model_input = Flux2KleinPipeline._patchify_latents(model_input)
                 model_input = (model_input - latents_bn_mean) / latents_bn_std
diff --git a/examples/dreambooth/train_dreambooth_lora_z_image.py b/examples/dreambooth/train_dreambooth_lora_z_image.py
@@ -1665,8 +1665,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                     model_input = latents_cache[step].mode()
                 else:
                     with offload_models(vae, device=accelerator.device, offload=args.offload):
-                        pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
-                    model_input = vae.encode(pixel_values).latent_dist.mode()
+                        pixel_values = batch["pixel_values"].to(device=accelerator.device, dtype=vae.dtype)
+                        model_input = vae.encode(pixel_values).latent_dist.mode()
 
                 model_input = (model_input - vae_config_shift_factor) * vae_config_scaling_factor
                 # Sample noise that we'll add to the latents