We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent ad23558, commit b543cc8 (Copy full SHA for b543cc8)
2 files changed
deepmd/pt/train/training.py
@@ -956,10 +956,15 @@ def log_loss_valid(_task_key="Default"):
956
self.wrapper.train()
957
self.t0 = time.time()
958
self.total_train_time = 0.0
959
- for step_id in range(self.start_step, self.num_steps):
960
- step(step_id)
961
- if JIT:
962
- break
+ try:
+ torch.cuda.memory._record_memory_history()
+ for step_id in range(self.start_step, self.num_steps):
+ step(step_id)
963
+ if JIT:
964
+ break
965
+ finally:
966
+ torch.cuda.memory._dump_snapshot("mem.pickle")
967
+ logging.warning("Memory snapshot dumped to mem.pickle")
968
969
if self.change_bias_after_training and (self.rank == 0 or dist.get_rank() == 0):
970
if not self.multi_task:
mem.pickle
22.3 MB
0 commit comments