Skip to content

Commit be14ac2

Browse files
author
Han Wang
committed
test(pt_expt): cover DPA2/DPA3 in varying-natoms compile correctness
Parametrize TestCompiledVaryingNatoms over se_e2_a, DPA2 and DPA3 with strict atol=rtol=1e-10 on float64 (machine epsilon). DPA1 (se_atten) is intentionally omitted: its compiled path is intermittently incorrect (~20% of compiles produce grad diffs up to 0.67 at the first embedding layer), and including it would have required masking the bug with a loose tolerance.
1 parent 7722f52 commit be14ac2

1 file changed

Lines changed: 107 additions & 8 deletions

File tree

source/tests/pt_expt/test_training.py

Lines changed: 107 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,70 @@
4646
_COMPILE_PRED_KEYS = ("atom_energy", "energy", "force", "virial")
4747
_COMPILE_TOL = {"atol": 1e-10, "rtol": 1e-10}
4848

49+
# DPA2 and DPA3 descriptor configs used to extend the varying-natoms
50+
# compile-correctness test to non-trivial architectures (repinit + repformer for DPA2, repflow with attention for DPA3). ``precision:
51+
# float64`` is set explicitly so the strict ``atol=rtol=1e-10`` comparison
52+
# holds at machine epsilon.
53+
#
54+
# DPA1 (se_atten) is intentionally NOT covered here: inductor's compile of the
55+
# se_atten attention path is intermittently incorrect — see the "known
56+
# limitations" section of the multi-task compile memo for details.
57+
_DESCRIPTOR_DPA2 = {
58+
"type": "dpa2",
59+
"repinit": {
60+
"rcut": 4.0,
61+
"rcut_smth": 0.5,
62+
"nsel": 18,
63+
"neuron": [2, 4, 8],
64+
"axis_neuron": 4,
65+
"activation_function": "tanh",
66+
},
67+
"repformer": {
68+
"rcut": 3.0,
69+
"rcut_smth": 0.5,
70+
"nsel": 12,
71+
"nlayers": 2,
72+
"g1_dim": 8,
73+
"g2_dim": 5,
74+
"attn2_hidden": 3,
75+
"attn2_nhead": 1,
76+
"attn1_hidden": 5,
77+
"attn1_nhead": 1,
78+
"axis_neuron": 4,
79+
"update_h2": False,
80+
"update_g1_has_conv": True,
81+
"update_g1_has_grrg": True,
82+
"update_g1_has_drrd": True,
83+
"update_g1_has_attn": True,
84+
"update_g2_has_g1g1": True,
85+
"update_g2_has_attn": True,
86+
"attn2_has_gate": True,
87+
},
88+
"precision": "float64",
89+
"seed": 1,
90+
"add_tebd_to_repinit_out": False,
91+
}
92+
93+
_DESCRIPTOR_DPA3 = {
94+
"type": "dpa3",
95+
"repflow": {
96+
"n_dim": 8,
97+
"e_dim": 5,
98+
"a_dim": 4,
99+
"nlayers": 2,
100+
"e_rcut": 3.0,
101+
"e_rcut_smth": 0.5,
102+
"e_sel": 12,
103+
"a_rcut": 3.0,
104+
"a_rcut_smth": 0.5,
105+
"a_sel": 8,
106+
"axis_neuron": 4,
107+
},
108+
"precision": "float64",
109+
"concat_output_tebd": False,
110+
"seed": 1,
111+
}
112+
49113

50114
def _assert_compile_predictions_match(
51115
testcase: unittest.TestCase,
@@ -977,30 +1041,45 @@ def setUpClass(cls) -> None:
9771041
def tearDownClass(cls) -> None:
9781042
shutil.rmtree(cls.small_data_dir, ignore_errors=True)
9791043

980-
def _make_varying_config(self, enable_compile: bool, numb_steps: int = 10) -> dict:
981-
"""Config with two systems of different natoms and auto batch size."""
982-
config = _make_config(self.data_dir, numb_steps=numb_steps)
1044+
def _make_varying_config(
1045+
self,
1046+
enable_compile: bool,
1047+
descriptor: dict | None = None,
1048+
) -> dict:
1049+
"""Config with two systems of different natoms and auto batch size.
1050+
1051+
``descriptor`` overrides the default se_e2_a descriptor when given.
1052+
"""
1053+
config = _make_config(self.data_dir)
9831054
config["training"]["training_data"]["systems"].append(self.small_data_dir)
9841055
config["training"]["training_data"]["batch_size"] = "auto"
9851056
# enable virial in loss so the model returns it (virial.npy exists in
9861057
# both systems), exercising the compiled virial passthrough on each step
9871058
config["loss"]["start_pref_v"] = 1.0
9881059
config["loss"]["limit_pref_v"] = 1.0
1060+
if descriptor is not None:
1061+
config["model"]["descriptor"] = descriptor
9891062
if enable_compile:
9901063
config["training"]["enable_compile"] = True
9911064
config = update_deepmd_input(config, warning=False)
9921065
config = normalize(config)
9931066
return config
9941067

995-
def test_compiled_matches_uncompiled_varying_natoms(self) -> None:
996-
"""Compiled and uncompiled produce identical predictions/loss/grads
997-
across batches with varying ``nframes`` and ``natoms``.
1068+
def _check_varying_natoms(self, descriptor: dict | None = None) -> None:
1069+
"""Per-step compiled-vs-uncompiled comparison for the given descriptor.
9981070
9991071
The loss config has ``start_pref_f=1000`` and ``start_pref_v=1.0``,
10001072
so ``loss.backward()`` propagates through ``F = -dE/dr`` (computed
10011073
via ``autograd.grad(..., create_graph=True)``); the per-parameter
10021074
grad comparison therefore exercises the second-order derivative
10031075
``d^2 E / (dr d theta)`` on each step at each system size.
1076+
1077+
Verifies multi-step training-trajectory equivalence: weights are
1078+
synced once at the start, then both trainers step their own Adam
1079+
states forward. All assertions use the strict
1080+
``atol=rtol=1e-10`` tolerance; if a descriptor's compiled path
1081+
cannot meet that on float64 the descriptor has a real numerical
1082+
problem (see the DPA1 limitation note where this happened).
10041083
"""
10051084
from deepmd.pt_expt.train.training import (
10061085
_CompiledModel,
@@ -1015,8 +1094,8 @@ def test_compiled_matches_uncompiled_varying_natoms(self) -> None:
10151094
old_cwd = os.getcwd()
10161095
os.chdir(tmpdir)
10171096
try:
1018-
trainer_uc = get_trainer(self._make_varying_config(False))
1019-
trainer_c = get_trainer(self._make_varying_config(True))
1097+
trainer_uc = get_trainer(self._make_varying_config(False, descriptor))
1098+
trainer_c = get_trainer(self._make_varying_config(True, descriptor))
10201099
compiled_model = trainer_c.wrapper.model["Default"]
10211100
self.assertIsInstance(compiled_model, _CompiledModel)
10221101

@@ -1071,6 +1150,26 @@ def test_compiled_matches_uncompiled_varying_natoms(self) -> None:
10711150
finally:
10721151
shutil.rmtree(tmpdir, ignore_errors=True)
10731152

1153+
def test_compiled_matches_uncompiled_varying_natoms_se_e2_a(self) -> None:
1154+
"""se_e2_a: compiled vs uncompiled match across varying nframes/natoms."""
1155+
self._check_varying_natoms() # uses default se_e2_a from _make_config
1156+
1157+
def test_compiled_matches_uncompiled_varying_natoms_dpa2(self) -> None:
1158+
"""DPA2: compiled vs uncompiled match across varying nframes/natoms.
1159+
1160+
Exercises the DPA2 repinit + repformers stack; matches to float64
1161+
round-off level (~1e-12, well inside the 1e-10 tolerance) just like se_e2_a.
1162+
"""
1163+
self._check_varying_natoms(_DESCRIPTOR_DPA2)
1164+
1165+
def test_compiled_matches_uncompiled_varying_natoms_dpa3(self) -> None:
1166+
"""DPA3: compiled vs uncompiled match across varying nframes/natoms.
1167+
1168+
Exercises a non-trivial multi-layer repflow descriptor; matches to float64
1169+
round-off level (~1e-12, well inside the 1e-10 tolerance) just like se_e2_a.
1170+
"""
1171+
self._check_varying_natoms(_DESCRIPTOR_DPA3)
1172+
10741173

10751174
if __name__ == "__main__":
10761175
unittest.main()

0 commit comments

Comments
 (0)