Merge pull request #376 from bigict/trainer

chungongyu · web-flow · commit 704861868bd8 · 2025-12-31T11:57:24.000+08:00
feat: add --train_apply_sqrt_length_scale to multiply the final training loss by the square root of the mean number of residues after cropping, normalized by max_crop_len.
diff --git a/profold2/command/trainer.py b/profold2/command/trainer.py
@@ -347,6 +347,11 @@ def _step(data_loader, it, writer, stage='train', batch_callback=None):
           '%d %d %d seq.shape: %s pid: %s, clips: %s', epoch, it, jt, seq.shape,
           ','.join(batch['pid']), batch.get('clip')
       )
+      length_scaler = 1.0
+      if args.train_apply_sqrt_length_scale:
+        length_scaler = torch.sqrt(
+            (torch.mean(torch.sum(batch['mask'], dim=-1) + 1e-6)) / args.max_crop_len
+        )
 
       # maybe sync or not
       with no_sync_ctx(
@@ -360,7 +365,7 @@ def _step(data_loader, it, writer, stage='train', batch_callback=None):
                   shard_size=args.model_shard_size
               )
           )
-        grad_scaler.scale(r.loss * loss_scaler).backward()
+        grad_scaler.scale(r.loss * loss_scaler * length_scaler).backward()
 
       # running loss
       running_loss += MetricDict({'all': r.loss})
@@ -542,6 +547,12 @@ def add_arguments(parser):  # pylint: disable=redefined-outer-name
   parser.add_argument(
       '--max_crop_len', type=int, default=255, help='crop protein whose length>LEN.'
   )
+  parser.add_argument(
+      '--train_apply_sqrt_length_scale',
+      action='store_true',
+      help='multiply the final loss of each training example by the sqrt of the number '
+           'of residues after cropping.'
+  )
   parser.add_argument(
       '--crop_algorithm',
       type=str,