Fix BN backward: apply gamma factor to dX's dbeta/dgamma terms

runwangdl · runwangdl · commit 84a9fa51ca94 · 2026-04-19T21:41:14.000Z
The canonical BatchNorm backward formula is
    dX = (gamma * inv_std / N) * (N * dY - dbeta - x_hat * dgamma)
The gamma factor must multiply ALL three terms inside the parentheses,
not just the N*dY term. Two sites had the same error:

  1. PULP_BNGradNormalize_fp32 (split BN backward, second pass)
  2. PULP_BatchNormGrad_fp32   (monolithic BN backward)

Fix: pull gamma out into the scale factor
    scale = gamma * inv_std / N_total
so that
    dX = scale * (N * dY - dbeta - x_hat * dgamma)
applies gamma uniformly.

Impact on MobileNetV1 training (4 steps, random-init):
  before fix: step 3 loss diff 0.017 (fail)
  after fix : step 3 loss diff 0.003 (pass at TOL=0.01)
The bug was masked at step 0 because gamma is initialized to 1, so
gamma × anything = anything. Visible only after the optimizer starts
updating gamma.

Verification: instrumented PULP_BatchNormGrad_fp32 with a per-call
signature print and compared against PyTorch's autograd dgamma/dbeta
across all 27 BN layers at step 0 — agreement within FP32 rounding
(max 1% rel diff on ~1e-8 magnitude grads, <0.1% on all larger grads).
diff --git a/TargetLibraries/PULPOpen/src/BatchNorm.c b/TargetLibraries/PULPOpen/src/BatchNorm.c
@@ -234,16 +234,18 @@ void PULP_BNGradNormalize_fp32(const float32_t *dY, const float32_t *X,
     float32_t g = gamma[c];
     float32_t dg = dgamma[c];
     float32_t db = dbeta[c];
-    float32_t scale = inv_std * N_total_inv;
+    /* scale = gamma * inv_std / N_total; gamma applies to all three terms of
+       the canonical BN backward formula
+         dX = (g * inv_std / N) * (N*dY - dbeta - x_hat * dgamma) */
+    float32_t scale = g * inv_std * N_total_inv;
 
     for (uint32_t n = 0; n < N; n++) {
       const float32_t *x_nc = X + (n * C + c) * N_hw;
       const float32_t *dy_nc = dY + (n * C + c) * N_hw;
       float32_t *dx_nc = dX + (n * C + c) * N_hw;
       for (uint32_t hw = 0; hw < N_hw; hw++) {
         float32_t x_hat = (x_nc[hw] - mean) * inv_std;
-        float32_t dx_hat = dy_nc[hw] * g;
-        dx_nc[hw] = scale * (N_total_f * dx_hat - db - x_hat * dg);
+        dx_nc[hw] = scale * (N_total_f * dy_nc[hw] - db - x_hat * dg);
       }
     }
   }
@@ -288,16 +290,17 @@ void PULP_BatchNormGrad_fp32(const float32_t *dY, const float32_t *X,
     dbeta[c] = sum_dbeta;
 
     /* ── Second pass: compute dX ─────────────────────────────────────────── */
-    float32_t scale = inv_std * inv_N;
+    /* scale = gamma * inv_std / N_total; gamma applies to all three terms:
+         dX = (g * inv_std / N) * (N*dY - dbeta - x_hat * dgamma) */
+    float32_t scale = g * inv_std * inv_N;
 
     for (uint32_t n = 0; n < N; n++) {
       const float32_t *x_nc = X + (n * C + c) * N_hw;
       const float32_t *dy_nc = dY + (n * C + c) * N_hw;
       float32_t *dx_nc = dX + (n * C + c) * N_hw;
       for (uint32_t hw = 0; hw < N_hw; hw++) {
         float32_t x_hat = (x_nc[hw] - mean) * inv_std;
-        float32_t dx_hat = dy_nc[hw] * g;
-        dx_nc[hw] = scale * ((float32_t)N_total * dx_hat - sum_dbeta - x_hat * sum_dgamma);
+        dx_nc[hw] = scale * ((float32_t)N_total * dy_nc[hw] - sum_dbeta - x_hat * sum_dgamma);
       }
     }
   }