We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 7dd1659 commit bf566b9 (Copy full SHA for bf566b9)
1 file changed
bionemo-recipes/recipes/esm2_native_te/train_mfsdp.py
@@ -153,7 +153,8 @@ def main(args: DictConfig) -> float | None:
153
loss.backward()
154
155
# Compute and clip gradient norms.
156
- total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0).item()
+ # This is causing training to hang in 25.12 torch base image for multi-process mFSDP.
157
+ # total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0).item()
158
159
# Step optimizer.
160
optimizer.step()
@@ -164,7 +165,7 @@ def main(args: DictConfig) -> float | None:
164
165
step=step,
166
batch=batch,
167
outputs=outputs,
- grad_norm=total_norm,
168
+ grad_norm=0.0, # total_norm,
169
lr=optimizer.param_groups[0]["lr"],
170
)
171
0 commit comments