Streamlines the augment_dataloader method in DPO (#134)

trias702 · pre-commit-ci[bot] · web-flow · commit 062dcb0f3708 · 2024-03-25T20:59:46.000-05:00
* Initial commit of DPO augment cleanup
  Signed-off-by: Daniel Egert <degert@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* minor fixes for PR review
  Signed-off-by: Daniel Egert <degert@nvidia.com>

---------

Signed-off-by: Daniel Egert <degert@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/nemo_aligner/algorithms/dpo.py b/nemo_aligner/algorithms/dpo.py
@@ -308,27 +308,18 @@ def load_state_dict(self, state_dict):
     def augment_dataloader(self, dataloader):
         """Augment dataloader with ref policy log prob"""
         iter_dataloader = iter(dataloader)
-        buffer = []
-        done = False
-        while not done:
+        while True:
             try:
                 batch = next(iter_dataloader)
+                logprobs = self.model.get_ref_policy_logprobs(batch).cpu()
+                chosen_logps, reject_logps = torch.split(logprobs, len(logprobs) // 2, dim=0)
+                batch["ref_policy_log_probs_chosen"] = chosen_logps
+                batch["ref_policy_log_probs_rejected"] = reject_logps
+
+                yield batch
+                del logprobs, chosen_logps, reject_logps
             except StopIteration:
-                done = True
-            else:
-                buffer.append(batch)
-            if (done and buffer) or len(buffer) == 1:
-                logprobs = self.model.get_ref_policy_logprobs(buffer).cpu()
-                start = 0
-                for batch in buffer:
-                    batch_size = len(batch["chosen"])
-                    assert len(batch["rejected"]) == batch_size
-                    for key in ("chosen", "rejected"):
-                        batch[f"ref_policy_log_probs_{key}"] = logprobs[start : start + batch_size]
-                        start += batch_size
-                    yield batch
-                buffer.clear()
-                del logprobs
+                break
 
     @property
     def epoch(self):
diff --git a/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py b/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py
@@ -355,15 +355,11 @@ def get_logprob_batch(self, global_batch):
 
         return logprobs
 
-    def get_ref_policy_logprobs(self, list_of_batches):
-        tokens = torch.cat([torch.cat((b["chosen"], b["rejected"]), dim=0) for b in list_of_batches], dim=0)
-        masks = torch.cat(
-            [torch.cat((b["attention_mask"], b["attention_mask"]), dim=0) for b in list_of_batches], dim=0
-        )
-        pos_ids = torch.cat([torch.cat((b["position_ids"], b["position_ids"]), dim=0) for b in list_of_batches], dim=0)
-        labels = torch.cat(
-            [torch.cat((b["chosen_labels"], b["rejected_labels"]), dim=0) for b in list_of_batches], dim=0
-        )
+    def get_ref_policy_logprobs(self, batch):
+        tokens = torch.cat((batch["chosen"], batch["rejected"]), dim=0)
+        masks = torch.cat((batch["attention_mask"], batch["attention_mask"]), dim=0)
+        pos_ids = torch.cat((batch["position_ids"], batch["position_ids"]), dim=0)
+        labels = torch.cat((batch["chosen_labels"], batch["rejected_labels"]), dim=0)
         global_batch = [tokens, masks, pos_ids, labels]
         with cpu_weight_swap(self, self.ref_policy_state_dict, megatron_amp_O2=self.megatron_amp_O2):
             ref_log_probs = self.get_logprob_batch(global_batch)