Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
475 changes: 415 additions & 60 deletions deepmd/dpmodel/utils/learning_rate.py

Large diffs are not rendered by default.

40 changes: 16 additions & 24 deletions deepmd/pd/train/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def get_sample() -> dict[str, Any]:
return get_sample

def get_lr(lr_params: dict[str, Any]) -> BaseLR:
lr_params["stop_steps"] = self.num_steps - self.warmup_steps
lr_params["num_steps"] = self.num_steps
lr_schedule = BaseLR(**lr_params)
return lr_schedule

Expand Down Expand Up @@ -475,17 +475,15 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR:
)

# Learning rate
self.warmup_steps = training_params.get("warmup_steps", 0)
self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0)
assert self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0, (
"Warm up steps must be less than total training steps!"
)
if self.multi_task and config.get("learning_rate_dict", None) is not None:
self.lr_exp = {}
self.lr_schedule = {}
for model_key in self.model_keys:
self.lr_exp[model_key] = get_lr(config["learning_rate_dict"][model_key])
self.lr_schedule[model_key] = get_lr(
config["learning_rate_dict"][model_key]
)
else:
self.lr_exp = get_lr(config["learning_rate"])
self.lr_schedule = get_lr(config["learning_rate"])

# JIT
if JIT:
Expand Down Expand Up @@ -668,18 +666,15 @@ def single_model_finetune(

# TODO add lr warmups for multitask
# author: iProzd
def warm_up_linear(step: int, warmup_steps: int) -> float:
if step < warmup_steps:
return step / warmup_steps
else:
return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr

# TODO add optimizers for multitask
# author: iProzd
if self.opt_type == "Adam":
self.scheduler = paddle.optimizer.lr.LambdaDecay(
learning_rate=self.lr_exp.start_lr,
lr_lambda=lambda step: warm_up_linear(step, self.warmup_steps),
learning_rate=self.lr_schedule.start_lr,
lr_lambda=lambda step: (
self.lr_schedule.value(step + self.start_step)
/ self.lr_schedule.start_lr
),
)
self.optimizer = paddle.optimizer.Adam(
learning_rate=self.scheduler, parameters=self.wrapper.parameters()
Expand Down Expand Up @@ -811,10 +806,10 @@ def step(_step_id: int, task_key: str = "Default") -> None:
# Paddle Profiler
if enable_profiling:
core.nvprof_nvtx_push(f"Training step {_step_id}")
if isinstance(self.lr_exp, dict):
_lr = self.lr_exp[task_key]
if isinstance(self.lr_schedule, dict):
_lr = self.lr_schedule[task_key]
else:
_lr = self.lr_exp
_lr = self.lr_schedule
cur_lr = _lr.value(_step_id)
pref_lr = cur_lr

Expand All @@ -828,10 +823,7 @@ def step(_step_id: int, task_key: str = "Default") -> None:
fout1.flush()
if self.opt_type == "Adam":
cur_lr = self.scheduler.get_lr()
if _step_id < self.warmup_steps:
pref_lr = _lr.start_lr
else:
pref_lr = cur_lr
pref_lr = cur_lr

# disable synchronization in forward-backward manually
# as derivatives exist in model forward
Expand Down Expand Up @@ -1072,7 +1064,7 @@ def log_loss_valid(_task_key: str = "Default") -> dict:
_bias_adjust_mode="change-by-statistic",
)
self.latest_model = Path(self.save_ckpt + f"-{self.num_steps}.pd")
cur_lr = self.lr_exp.value(self.num_steps - 1)
cur_lr = self.lr_schedule.value(self.num_steps - 1)
self.save_model(self.latest_model, lr=cur_lr, step=self.num_steps - 1)
log.info(f"Saved model to {self.latest_model}")
symlink_prefix_files(self.latest_model.stem, self.save_ckpt)
Expand Down
4 changes: 3 additions & 1 deletion deepmd/pd/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

from .env import (
DEVICE,
GLOBAL_NP_FLOAT_PRECISION,
)
from .env import PRECISION_DICT as PD_PRECISION_DICT

Expand Down Expand Up @@ -257,7 +258,8 @@ def to_numpy_array(
):
if xx is None:
return None
assert xx is not None
if isinstance(xx, (float, int)):
return np.array(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
# Create a reverse mapping of PD_PRECISION_DICT
Comment thread
OutisLi marked this conversation as resolved.
reverse_precision_dict = {v: k for k, v in PD_PRECISION_DICT.items()}
# Use the reverse mapping to find keys with the desired value
Expand Down
81 changes: 35 additions & 46 deletions deepmd/pt/train/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ def get_sample() -> Any:
return get_sample

def get_lr(lr_params: dict[str, Any]) -> BaseLR:
lr_params["stop_steps"] = self.num_steps - self.warmup_steps
lr_params["num_steps"] = self.num_steps
lr_schedule = BaseLR(**lr_params)
return lr_schedule

Expand Down Expand Up @@ -547,33 +547,15 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR:
)

# Learning rate
warmup_steps = training_params.get("warmup_steps", None)
warmup_ratio = training_params.get("warmup_ratio", None)
if warmup_steps is not None:
self.warmup_steps = warmup_steps
elif warmup_ratio is not None:
if not 0 <= warmup_ratio < 1:
raise ValueError(f"warmup_ratio must be in [0, 1), got {warmup_ratio}")
self.warmup_steps = int(warmup_ratio * self.num_steps)
if self.warmup_steps == 0 and warmup_ratio > 0:
log.warning(
f"warmup_ratio {warmup_ratio} results in 0 warmup steps "
f"due to truncation. Consider using a larger ratio or "
f"specify warmup_steps directly."
)
else:
self.warmup_steps = 0
self.warmup_start_factor = training_params.get("warmup_start_factor", 0.0)
self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0)
assert self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0, (
"Warm up steps must be less than total training steps!"
)
if self.multi_task and config.get("learning_rate_dict", None) is not None:
self.lr_exp = {}
self.lr_schedule = {}
for model_key in self.model_keys:
self.lr_exp[model_key] = get_lr(config["learning_rate_dict"][model_key])
self.lr_schedule[model_key] = get_lr(
config["learning_rate_dict"][model_key]
)
else:
self.lr_exp = get_lr(config["learning_rate"])
self.lr_schedule = get_lr(config["learning_rate"])

# JIT
if JIT:
Expand Down Expand Up @@ -807,34 +789,32 @@ def single_model_finetune(

# TODO add lr warmups for multitask
# author: iProzd
def warm_up_linear(step: int, warmup_steps: int) -> float:
if step < warmup_steps:
return self.warmup_start_factor + (1.0 - self.warmup_start_factor) * (
step / warmup_steps
)
else:
return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr

# TODO add optimizers for multitask
# author: iProzd
initial_lr = self.lr_schedule.value(self.start_step)
if self.opt_type in ["Adam", "AdamW"]:
# Initialize optimizer with the actual learning rate at start_step
# to ensure warmup is applied from the first step
if self.opt_type == "Adam":
self.optimizer = self._create_optimizer(
torch.optim.Adam,
lr=self.lr_exp.start_lr,
lr=initial_lr,
fused=DEVICE.type != "cpu",
)
else:
self.optimizer = self._create_optimizer(
torch.optim.AdamW,
lr=self.lr_exp.start_lr,
lr=initial_lr,
weight_decay=float(self.opt_param["weight_decay"]),
fused=DEVICE.type != "cpu",
)
self._load_optimizer_state(optimizer_state_dict)
self.scheduler = torch.optim.lr_scheduler.LambdaLR(
self.optimizer,
lambda step: warm_up_linear(step + self.start_step, self.warmup_steps),
lambda step: (
self.lr_schedule.value(step + self.start_step) / initial_lr
),
last_epoch=self.start_step - 1,
)
elif self.opt_type == "LKF":
self.optimizer = LKFOptimizer(
Expand All @@ -843,7 +823,7 @@ def warm_up_linear(step: int, warmup_steps: int) -> float:
elif self.opt_type == "AdaMuon":
self.optimizer = self._create_optimizer(
AdaMuonOptimizer,
lr=self.lr_exp.start_lr,
lr=initial_lr,
momentum=float(self.opt_param["momentum"]),
weight_decay=float(self.opt_param["weight_decay"]),
adam_betas=(
Expand All @@ -853,10 +833,19 @@ def warm_up_linear(step: int, warmup_steps: int) -> float:
lr_adjust=float(self.opt_param["lr_adjust"]),
lr_adjust_coeff=float(self.opt_param["lr_adjust_coeff"]),
)
if optimizer_state_dict is not None and self.restart_training:
self.optimizer.load_state_dict(optimizer_state_dict)
self.scheduler = torch.optim.lr_scheduler.LambdaLR(
self.optimizer,
lambda step: (
self.lr_schedule.value(step + self.start_step) / initial_lr
),
last_epoch=self.start_step - 1,
)
elif self.opt_type == "HybridMuon":
self.optimizer = self._create_optimizer(
HybridMuonOptimizer,
lr=self.lr_exp.start_lr,
lr=initial_lr,
momentum=float(self.opt_param["momentum"]),
weight_decay=float(self.opt_param["weight_decay"]),
adam_betas=(
Expand All @@ -872,7 +861,10 @@ def warm_up_linear(step: int, warmup_steps: int) -> float:
self._load_optimizer_state(optimizer_state_dict)
self.scheduler = torch.optim.lr_scheduler.LambdaLR(
self.optimizer,
lambda step: warm_up_linear(step + self.start_step, self.warmup_steps),
lambda step: (
self.lr_schedule.value(step + self.start_step) / initial_lr
),
last_epoch=self.start_step - 1,
)
else:
raise ValueError(f"Not supported optimizer type '{self.opt_type}'")
Expand Down Expand Up @@ -1034,10 +1026,10 @@ def step(_step_id: int, task_key: str = "Default") -> None:
# PyTorch Profiler
if self.enable_profiler or self.profiling:
prof.step()
if isinstance(self.lr_exp, dict):
_lr = self.lr_exp[task_key]
if isinstance(self.lr_schedule, dict):
_lr = self.lr_schedule[task_key]
else:
_lr = self.lr_exp
_lr = self.lr_schedule
cur_lr = _lr.value(_step_id)
pref_lr = cur_lr
self.optimizer.zero_grad(set_to_none=True)
Expand All @@ -1050,10 +1042,7 @@ def step(_step_id: int, task_key: str = "Default") -> None:
fout1.flush()
if self.opt_type in ["Adam", "AdamW", "AdaMuon", "HybridMuon"]:
cur_lr = self.scheduler.get_last_lr()[0]
if _step_id < self.warmup_steps:
pref_lr = _lr.start_lr
else:
pref_lr = cur_lr
pref_lr = cur_lr
model_pred, loss, more_loss = self.wrapper(
**input_dict, cur_lr=pref_lr, label=label_dict, task_key=task_key
)
Expand Down Expand Up @@ -1446,7 +1435,7 @@ def log_loss_valid(_task_key: str = "Default") -> dict:
_bias_adjust_mode="change-by-statistic",
)
self.latest_model = Path(self.save_ckpt + f"-{self.num_steps}.pt")
cur_lr = self.lr_exp.value(self.num_steps - 1)
cur_lr = self.lr_schedule.value(self.num_steps - 1)
self.save_model(self.latest_model, lr=cur_lr, step=self.num_steps - 1)
log.info(f"Saved model to {self.latest_model}")
symlink_prefix_files(self.latest_model.stem, self.save_ckpt)
Expand Down
17 changes: 15 additions & 2 deletions deepmd/pt/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from .env import (
DEVICE,
GLOBAL_NP_FLOAT_PRECISION,
)
from .env import PRECISION_DICT as PT_PRECISION_DICT

Expand Down Expand Up @@ -218,6 +219,14 @@
raise RuntimeError(f"activation function {self.activation} not supported")


@overload
def to_numpy_array(xx: np.ndarray) -> np.ndarray: ...

Check notice

Code scanning / CodeQL

Statement has no effect Note

This statement has no effect.


@overload
def to_numpy_array(xx: float) -> np.ndarray: ...

Check notice

Code scanning / CodeQL

Statement has no effect Note

This statement has no effect.


@overload
def to_numpy_array(xx: torch.Tensor) -> np.ndarray: ...

Expand All @@ -227,18 +236,22 @@


def to_numpy_array(
xx: torch.Tensor | None,
xx: torch.Tensor | np.ndarray | float | None,
Comment thread
OutisLi marked this conversation as resolved.
) -> np.ndarray | None:
if xx is None:
return None
assert xx is not None
if isinstance(xx, (float, int)):
return np.array(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
if isinstance(xx, np.ndarray):
return xx.astype(GLOBAL_NP_FLOAT_PRECISION)
Comment thread
OutisLi marked this conversation as resolved.
# Create a reverse mapping of PT_PRECISION_DICT
reverse_precision_dict = {v: k for k, v in PT_PRECISION_DICT.items()}
# Use the reverse mapping to find keys with the desired value
prec = reverse_precision_dict.get(xx.dtype, None)
prec = NP_PRECISION_DICT.get(prec, None)
if prec is None:
raise ValueError(f"unknown precision {xx.dtype}")
assert isinstance(xx, torch.Tensor)
Comment thread
OutisLi marked this conversation as resolved.
if xx.dtype == torch.bfloat16:
# https://github.com/pytorch/pytorch/issues/109873
xx = xx.float()
Expand Down
2 changes: 1 addition & 1 deletion deepmd/tf/fit/dipole.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ def get_loss(self, loss: dict, lr: LearningRateExp) -> Loss:
----------
loss : dict
the loss dict
lr : LearningRateExp
lr : LearningRateSchedule
the learning rate

Returns
Expand Down
4 changes: 2 additions & 2 deletions deepmd/tf/fit/dos.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
)

if TYPE_CHECKING:
from deepmd.tf.train.learning_rate import (
from deepmd.tf.utils.learning_rate import (
LearningRateExp,
)
from deepmd.utils.version import (
Expand Down Expand Up @@ -668,7 +668,7 @@ def get_loss(self, loss: dict, lr: "LearningRateExp") -> Loss:
----------
loss : dict
the loss dict
lr : LearningRateExp
lr : LearningRateSchedule
the learning rate

Returns
Expand Down
2 changes: 1 addition & 1 deletion deepmd/tf/fit/ener.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,7 +864,7 @@ def get_loss(self, loss: dict, lr: LearningRateExp) -> Loss:
----------
loss : dict
The loss function parameters.
lr : LearningRateExp
lr : LearningRateSchedule
The learning rate.

Returns
Expand Down
2 changes: 1 addition & 1 deletion deepmd/tf/fit/fitting.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def get_loss(self, loss: dict, lr: LearningRateExp) -> Loss:
----------
loss : dict
the loss dict
lr : LearningRateExp
lr : LearningRateSchedule
the learning rate

Returns
Expand Down
4 changes: 2 additions & 2 deletions deepmd/tf/fit/polar.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
)

if TYPE_CHECKING:
from deepmd.tf.train.learning_rate import (
from deepmd.tf.utils.learning_rate import (
LearningRateExp,
)

Expand Down Expand Up @@ -880,7 +880,7 @@ def get_loss(self, loss: dict, lr: "LearningRateExp") -> Loss:
----------
loss : dict
the loss dict
lr : LearningRateExp
lr : LearningRateSchedule
Comment thread
OutisLi marked this conversation as resolved.
the learning rate

Returns
Expand Down
2 changes: 1 addition & 1 deletion deepmd/tf/model/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
Self,
)

from deepmd.tf.train.learning_rate import (
from deepmd.tf.utils.learning_rate import (
LearningRateExp,
)

Expand Down
Loading