Skip to content

Commit 7aaa068

Browse files
committed
Guard against zero std_cfg in rescale_noise_cfg (#13425)
`rescale_noise_cfg` divides `std_text` by `std_cfg` without any numerical guard. When `noise_cfg` has zero variance (e.g. a constant or zero-valued guided prediction at the start of a schedule), `std_cfg` is `0`, the division produces `nan`/`inf`, and `noise_cfg * (std_text / 0)` silently corrupts the diffusion output (#13425). Clamp `std_cfg` to `torch.finfo(noise_cfg.dtype).eps` before dividing so that the division by zero is avoided. For an all-zero `noise_cfg` the rescaling then becomes a no-op (`0 * <finite> == 0`) instead of `nan`; note that for a constant non-zero `noise_cfg` the result is no longer `nan`/`inf` from a zero divisor, but it is scaled by `std_text / eps` and is therefore not a no-op. Propagated to all 32 in-tree copies via `python utils/check_copies.py --fix_and_overwrite`.
1 parent 48f39c2 commit 7aaa068

32 files changed

Lines changed: 160 additions & 0 deletions

src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -141,6 +141,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
141141
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
142142
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
143143
# rescale the results from guidance (fixes overexposure)
144+
# Guard against `std_cfg == 0` (constant/zero `noise_cfg`, which can happen at the
145+
# beginning of a schedule or in numerical edge cases): a raw division would produce
146+
# `nan`/`inf` and silently corrupt the diffusion output (issue #13425). When the
147+
# standard deviation of the guided prediction is zero, the rescaling is a no-op.
148+
std_cfg = std_cfg.clamp(min=torch.finfo(noise_cfg.dtype).eps)
144149
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
145150
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
146151
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg

src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -161,6 +161,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
161161
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
162162
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
163163
# rescale the results from guidance (fixes overexposure)
164+
# Guard against `std_cfg == 0` (constant/zero `noise_cfg`, which can happen at the
165+
# beginning of a schedule or in numerical edge cases): a raw division would produce
166+
# `nan`/`inf` and silently corrupt the diffusion output (issue #13425). When the
167+
# standard deviation of the guided prediction is zero, the rescaling is a no-op.
168+
std_cfg = std_cfg.clamp(min=torch.finfo(noise_cfg.dtype).eps)
164169
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
165170
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
166171
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg

src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -151,6 +151,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
151151
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
152152
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
153153
# rescale the results from guidance (fixes overexposure)
154+
# Guard against `std_cfg == 0` (constant/zero `noise_cfg`, which can happen at the
155+
# beginning of a schedule or in numerical edge cases): a raw division would produce
156+
# `nan`/`inf` and silently corrupt the diffusion output (issue #13425). When the
157+
# standard deviation of the guided prediction is zero, the rescaling is a no-op.
158+
std_cfg = std_cfg.clamp(min=torch.finfo(noise_cfg.dtype).eps)
154159
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
155160
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
156161
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg

src/diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -156,6 +156,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
156156
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
157157
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
158158
# rescale the results from guidance (fixes overexposure)
159+
# Guard against `std_cfg == 0` (constant/zero `noise_cfg`, which can happen at the
160+
# beginning of a schedule or in numerical edge cases): a raw division would produce
161+
# `nan`/`inf` and silently corrupt the diffusion output (issue #13425). When the
162+
# standard deviation of the guided prediction is zero, the rescaling is a no-op.
163+
std_cfg = std_cfg.clamp(min=torch.finfo(noise_cfg.dtype).eps)
159164
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
160165
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
161166
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg

src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
8484
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
8585
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
8686
# rescale the results from guidance (fixes overexposure)
87+
# Guard against `std_cfg == 0` (constant/zero `noise_cfg`, which can happen at the
88+
# beginning of a schedule or in numerical edge cases): a raw division would produce
89+
# `nan`/`inf` and silently corrupt the diffusion output (issue #13425). When the
90+
# standard deviation of the guided prediction is zero, the rescaling is a no-op.
91+
std_cfg = std_cfg.clamp(min=torch.finfo(noise_cfg.dtype).eps)
8792
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
8893
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
8994
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg

src/diffusers/pipelines/deprecated/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
9494
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
9595
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
9696
# rescale the results from guidance (fixes overexposure)
97+
# Guard against `std_cfg == 0` (constant/zero `noise_cfg`, which can happen at the
98+
# beginning of a schedule or in numerical edge cases): a raw division would produce
99+
# `nan`/`inf` and silently corrupt the diffusion output (issue #13425). When the
100+
# standard deviation of the guided prediction is zero, the rescaling is a no-op.
101+
std_cfg = std_cfg.clamp(min=torch.finfo(noise_cfg.dtype).eps)
97102
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
98103
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
99104
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg

src/diffusers/pipelines/deprecated/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -89,6 +89,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
8989
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
9090
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
9191
# rescale the results from guidance (fixes overexposure)
92+
# Guard against `std_cfg == 0` (constant/zero `noise_cfg`, which can happen at the
93+
# beginning of a schedule or in numerical edge cases): a raw division would produce
94+
# `nan`/`inf` and silently corrupt the diffusion output (issue #13425). When the
95+
# standard deviation of the guided prediction is zero, the rescaling is a no-op.
96+
std_cfg = std_cfg.clamp(min=torch.finfo(noise_cfg.dtype).eps)
9297
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
9398
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
9499
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg

src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -335,6 +335,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
335335
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
336336
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
337337
# rescale the results from guidance (fixes overexposure)
338+
# Guard against `std_cfg == 0` (constant/zero `noise_cfg`, which can happen at the
339+
# beginning of a schedule or in numerical edge cases): a raw division would produce
340+
# `nan`/`inf` and silently corrupt the diffusion output (issue #13425). When the
341+
# standard deviation of the guided prediction is zero, the rescaling is a no-op.
342+
std_cfg = std_cfg.clamp(min=torch.finfo(noise_cfg.dtype).eps)
338343
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
339344
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
340345
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg

src/diffusers/pipelines/easyanimate/pipeline_easyanimate.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
117117
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
118118
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
119119
# rescale the results from guidance (fixes overexposure)
120+
# Guard against `std_cfg == 0` (constant/zero `noise_cfg`, which can happen at the
121+
# beginning of a schedule or in numerical edge cases): a raw division would produce
122+
# `nan`/`inf` and silently corrupt the diffusion output (issue #13425). When the
123+
# standard deviation of the guided prediction is zero, the rescaling is a no-op.
124+
std_cfg = std_cfg.clamp(min=torch.finfo(noise_cfg.dtype).eps)
120125
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
121126
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
122127
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg

src/diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -193,6 +193,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
193193
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
194194
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
195195
# rescale the results from guidance (fixes overexposure)
196+
# Guard against `std_cfg == 0` (constant/zero `noise_cfg`, which can happen at the
197+
# beginning of a schedule or in numerical edge cases): a raw division would produce
198+
# `nan`/`inf` and silently corrupt the diffusion output (issue #13425). When the
199+
# standard deviation of the guided prediction is zero, the rescaling is a no-op.
200+
std_cfg = std_cfg.clamp(min=torch.finfo(noise_cfg.dtype).eps)
196201
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
197202
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
198203
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg

0 commit comments

Comments
 (0)