From d69632ed12a6e8515617d7995d3003c00015da47 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 12 Apr 2026 17:32:40 +0800 Subject: [PATCH 1/4] fix(pt_expt): centralize default-device guard in AOTInductor compilation AOTInductor's lowering code creates tensors without explicit device=, inheriting any active torch.set_default_device. This caused compilation failures when tests/pt/__init__.py set a fake CUDA device. Move the set_default_device(None) guard into _deserialize_to_file_pt2 so all callers (tests, dp freeze, dp compress) are protected, and remove the 12 scattered workarounds from test files. --- deepmd/pt_expt/utils/serialization.py | 15 ++++++-- source/tests/pt_expt/infer/test_deep_eval.py | 34 +++++-------------- .../pt_expt/infer/test_deep_eval_spin.py | 23 ++----------- source/tests/pt_expt/test_change_bias.py | 9 +---- 4 files changed, 25 insertions(+), 56 deletions(-) diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index f59c397525..d47d82505a 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -554,8 +554,19 @@ def _deserialize_to_file_pt2( data, model_json_override ) - # Compile via AOTInductor into a .pt2 package - aoti_compile_and_package(exported, package_path=model_file) + # AOTInductor's lowering code internally creates tensors (e.g. + # ``torch.zeros``) without an explicit ``device=`` argument. If a + # non-CPU default device is active (e.g. tests/pt/__init__.py sets + # ``torch.set_default_device("cuda:9999999")``), the compilation fails + # on CPU-only builds. Temporarily clear the default device so the + # inductor always targets CPU. + prev_device = torch.get_default_device() + torch.set_default_device(None) + try: + # Compile via AOTInductor into a .pt2 package + aoti_compile_and_package(exported, package_path=model_file) + finally: + torch.set_default_device(prev_device) # Embed metadata into the .pt2 ZIP archive model_def_script = data.get("model_def_script") or {} diff --git a/source/tests/pt_expt/infer/test_deep_eval.py b/source/tests/pt_expt/infer/test_deep_eval.py index 6797fa2c03..c932627672 100644 --- a/source/tests/pt_expt/infer/test_deep_eval.py +++ b/source/tests/pt_expt/infer/test_deep_eval.py @@ -543,13 +543,7 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() - # Temporarily clear default device to avoid poisoning AOTInductor - # compilation (tests/pt/__init__.py sets it to "cuda:9999999"). 
- torch.set_default_device(None) - try: - deserialize_to_file(cls.tmpfile.name, cls.model_data) - finally: - torch.set_default_device("cuda:9999999") + deserialize_to_file(cls.tmpfile.name, cls.model_data) # Also save to .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) @@ -606,15 +600,11 @@ def test_get_model_def_script_with_params(self) -> None: with tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) as f: tmpfile2 = f.name try: - torch.set_default_device(None) - try: - data_with_config = { - **self.model_data, - "model_def_script": training_config, - } - deserialize_to_file(tmpfile2, data_with_config) - finally: - torch.set_default_device("cuda:9999999") + data_with_config = { + **self.model_data, + "model_def_script": training_config, + } + deserialize_to_file(tmpfile2, data_with_config) dp2 = DeepPot(tmpfile2) mds = dp2.deep_eval.get_model_def_script() self.assertEqual(mds, training_config) @@ -970,11 +960,7 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() - torch.set_default_device(None) - try: - deserialize_to_file(cls.tmpfile.name, cls.model_data) - finally: - torch.set_default_device("cuda:9999999") + deserialize_to_file(cls.tmpfile.name, cls.model_data) # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) @@ -1185,11 +1171,7 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() - torch.set_default_device(None) - try: - deserialize_to_file(cls.tmpfile.name, cls.model_data) - finally: - torch.set_default_device("cuda:9999999") + deserialize_to_file(cls.tmpfile.name, cls.model_data) # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) diff --git a/source/tests/pt_expt/infer/test_deep_eval_spin.py b/source/tests/pt_expt/infer/test_deep_eval_spin.py index 829b1f5666..64eb169dc9 100644 --- a/source/tests/pt_expt/infer/test_deep_eval_spin.py +++ b/source/tests/pt_expt/infer/test_deep_eval_spin.py @@ -154,14 +154,7 @@ def spin_model_files(): tmpdir = tempfile.mkdtemp() for ext in (".pt2", ".pte"): path = os.path.join(tmpdir, f"spin_test{ext}") - # AOTInductor (.pt2) internally creates tensors using the PyTorch - # default device. Clear it so compilation stays on CPU. 
- prev = torch.get_default_device() - torch.set_default_device(None) - try: - deserialize_to_file(path, copy.deepcopy(data)) - finally: - torch.set_default_device(prev) + deserialize_to_file(path, copy.deepcopy(data)) files[ext] = path yield files, ref_pbc, ref_nopbc for path in files.values(): @@ -362,12 +355,7 @@ def spin_fparam_model_files(): tmpdir = tempfile.mkdtemp() for ext in (".pt2", ".pte"): path = os.path.join(tmpdir, f"spin_fparam_test{ext}") - prev = torch.get_default_device() - torch.set_default_device(None) - try: - deserialize_to_file(path, copy.deepcopy(data)) - finally: - torch.set_default_device(prev) + deserialize_to_file(path, copy.deepcopy(data)) files[ext] = path yield files for path in files.values(): @@ -426,12 +414,7 @@ def spin_aparam_model_files(): tmpdir = tempfile.mkdtemp() for ext in (".pt2", ".pte"): path = os.path.join(tmpdir, f"spin_aparam_test{ext}") - prev = torch.get_default_device() - torch.set_default_device(None) - try: - deserialize_to_file(path, copy.deepcopy(data)) - finally: - torch.set_default_device(prev) + deserialize_to_file(path, copy.deepcopy(data)) files[ext] = path yield files for path in files.values(): diff --git a/source/tests/pt_expt/test_change_bias.py b/source/tests/pt_expt/test_change_bias.py index 03329642e9..1fb3c5dd5d 100644 --- a/source/tests/pt_expt/test_change_bias.py +++ b/source/tests/pt_expt/test_change_bias.py @@ -156,14 +156,7 @@ def setUpClass(cls) -> None: cls.shared_pte = os.path.join(cls.tmpdir, "shared.pte") freeze(model=cls.model_path, output=cls.shared_pte) cls.shared_pt2 = os.path.join(cls.tmpdir, "shared.pt2") - # Clear default device: tests/pt/__init__.py may set a fake device - # for CPU fallback, which poisons AOTInductor compilation. - saved_device = torch.get_default_device() - torch.set_default_device(None) - try: - freeze(model=cls.model_path, output=cls.shared_pt2) - finally: - torch.set_default_device(saved_device) + freeze(model=cls.model_path, output=cls.shared_pt2) @classmethod def tearDownClass(cls) -> None: From 016ea5b809d1bb2c9aea124eb30f69aae75b0bba Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 12 Apr 2026 22:24:10 +0800 Subject: [PATCH 2/4] test(pt_expt): reduce AOTInductor compile time with fast configs Set inductor configs in conftest to skip expensive C++ optimizations during .pt2 compilation: max_fusion_size=8, epilogue_fusion=False, pattern_matcher=False, package_cpp_only=True, compile_opt_level=O0. Tests only validate correctness so runtime performance is irrelevant. Cuts per-model compile time from ~50s to ~30s. --- source/tests/pt_expt/conftest.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/source/tests/pt_expt/conftest.py b/source/tests/pt_expt/conftest.py index f2a9b07a6a..3b48d2e51d 100644 --- a/source/tests/pt_expt/conftest.py +++ b/source/tests/pt_expt/conftest.py @@ -12,11 +12,21 @@ """ import pytest +import torch._inductor.config as _inductor_config import torch.utils._device as _device from torch.overrides import ( _get_current_function_mode_stack, ) +# Reduce AOTInductor (.pt2) compile time for unit tests. +# Tests only validate correctness, not runtime performance, so we can +# skip expensive C++ optimizations. This cuts compile time by ~50%. 
+_inductor_config.max_fusion_size = 8 +_inductor_config.epilogue_fusion = False +_inductor_config.pattern_matcher = False +_inductor_config.aot_inductor.package_cpp_only = True +_inductor_config.aot_inductor.compile_wrapper_opt_level = "O0" + def _pop_device_contexts() -> list: """Pop all stale DeviceContext modes from the torch function mode stack.""" From 2a687ce2221e8f14cff7ebaf7e5607b574434d87 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 13 Apr 2026 00:09:08 +0800 Subject: [PATCH 3/4] fix(pt_expt): revert centralized device guard, keep per-test workarounds Centralizing set_default_device(None) in _deserialize_to_file_pt2 re-pushes a stale DeviceContext on restore (torch.get_default_device returns a stale value after DeviceContext is popped from mode stack), breaking subsequent training (Adam optimizer creates tensors without device=). Revert to per-test workarounds which don't have this issue. --- deepmd/pt_expt/utils/serialization.py | 15 ++------ source/tests/pt_expt/infer/test_deep_eval.py | 34 ++++++++++++++----- .../pt_expt/infer/test_deep_eval_spin.py | 23 +++++++++++-- source/tests/pt_expt/test_change_bias.py | 9 ++++- 4 files changed, 56 insertions(+), 25 deletions(-) diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index d47d82505a..f59c397525 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -554,19 +554,8 @@ def _deserialize_to_file_pt2( data, model_json_override ) - # AOTInductor's lowering code internally creates tensors (e.g. - # ``torch.zeros``) without an explicit ``device=`` argument. If a - # non-CPU default device is active (e.g. tests/pt/__init__.py sets - # ``torch.set_default_device("cuda:9999999")``), the compilation fails - # on CPU-only builds. Temporarily clear the default device so the - # inductor always targets CPU. - prev_device = torch.get_default_device() - torch.set_default_device(None) - try: - # Compile via AOTInductor into a .pt2 package - aoti_compile_and_package(exported, package_path=model_file) - finally: - torch.set_default_device(prev_device) + # Compile via AOTInductor into a .pt2 package + aoti_compile_and_package(exported, package_path=model_file) # Embed metadata into the .pt2 ZIP archive model_def_script = data.get("model_def_script") or {} diff --git a/source/tests/pt_expt/infer/test_deep_eval.py b/source/tests/pt_expt/infer/test_deep_eval.py index c932627672..6797fa2c03 100644 --- a/source/tests/pt_expt/infer/test_deep_eval.py +++ b/source/tests/pt_expt/infer/test_deep_eval.py @@ -543,7 +543,13 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() - deserialize_to_file(cls.tmpfile.name, cls.model_data) + # Temporarily clear default device to avoid poisoning AOTInductor + # compilation (tests/pt/__init__.py sets it to "cuda:9999999"). 
+ torch.set_default_device(None) + try: + deserialize_to_file(cls.tmpfile.name, cls.model_data) + finally: + torch.set_default_device("cuda:9999999") # Also save to .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) @@ -600,11 +606,15 @@ def test_get_model_def_script_with_params(self) -> None: with tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) as f: tmpfile2 = f.name try: - data_with_config = { - **self.model_data, - "model_def_script": training_config, - } - deserialize_to_file(tmpfile2, data_with_config) + torch.set_default_device(None) + try: + data_with_config = { + **self.model_data, + "model_def_script": training_config, + } + deserialize_to_file(tmpfile2, data_with_config) + finally: + torch.set_default_device("cuda:9999999") dp2 = DeepPot(tmpfile2) mds = dp2.deep_eval.get_model_def_script() self.assertEqual(mds, training_config) @@ -960,7 +970,11 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() - deserialize_to_file(cls.tmpfile.name, cls.model_data) + torch.set_default_device(None) + try: + deserialize_to_file(cls.tmpfile.name, cls.model_data) + finally: + torch.set_default_device("cuda:9999999") # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) @@ -1171,7 +1185,11 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() - deserialize_to_file(cls.tmpfile.name, cls.model_data) + torch.set_default_device(None) + try: + deserialize_to_file(cls.tmpfile.name, cls.model_data) + finally: + torch.set_default_device("cuda:9999999") # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) diff --git a/source/tests/pt_expt/infer/test_deep_eval_spin.py b/source/tests/pt_expt/infer/test_deep_eval_spin.py index 64eb169dc9..829b1f5666 100644 --- a/source/tests/pt_expt/infer/test_deep_eval_spin.py +++ b/source/tests/pt_expt/infer/test_deep_eval_spin.py @@ -154,7 +154,14 @@ def spin_model_files(): tmpdir = tempfile.mkdtemp() for ext in (".pt2", ".pte"): path = os.path.join(tmpdir, f"spin_test{ext}") - deserialize_to_file(path, copy.deepcopy(data)) + # AOTInductor (.pt2) internally creates tensors using the PyTorch + # default device. Clear it so compilation stays on CPU. 
+ prev = torch.get_default_device() + torch.set_default_device(None) + try: + deserialize_to_file(path, copy.deepcopy(data)) + finally: + torch.set_default_device(prev) files[ext] = path yield files, ref_pbc, ref_nopbc for path in files.values(): @@ -355,7 +362,12 @@ def spin_fparam_model_files(): tmpdir = tempfile.mkdtemp() for ext in (".pt2", ".pte"): path = os.path.join(tmpdir, f"spin_fparam_test{ext}") - deserialize_to_file(path, copy.deepcopy(data)) + prev = torch.get_default_device() + torch.set_default_device(None) + try: + deserialize_to_file(path, copy.deepcopy(data)) + finally: + torch.set_default_device(prev) files[ext] = path yield files for path in files.values(): @@ -414,7 +426,12 @@ def spin_aparam_model_files(): tmpdir = tempfile.mkdtemp() for ext in (".pt2", ".pte"): path = os.path.join(tmpdir, f"spin_aparam_test{ext}") - deserialize_to_file(path, copy.deepcopy(data)) + prev = torch.get_default_device() + torch.set_default_device(None) + try: + deserialize_to_file(path, copy.deepcopy(data)) + finally: + torch.set_default_device(prev) files[ext] = path yield files for path in files.values(): diff --git a/source/tests/pt_expt/test_change_bias.py b/source/tests/pt_expt/test_change_bias.py index 1fb3c5dd5d..03329642e9 100644 --- a/source/tests/pt_expt/test_change_bias.py +++ b/source/tests/pt_expt/test_change_bias.py @@ -156,7 +156,14 @@ def setUpClass(cls) -> None: cls.shared_pte = os.path.join(cls.tmpdir, "shared.pte") freeze(model=cls.model_path, output=cls.shared_pte) cls.shared_pt2 = os.path.join(cls.tmpdir, "shared.pt2") - freeze(model=cls.model_path, output=cls.shared_pt2) + # Clear default device: tests/pt/__init__.py may set a fake device + # for CPU fallback, which poisons AOTInductor compilation. + saved_device = torch.get_default_device() + torch.set_default_device(None) + try: + freeze(model=cls.model_path, output=cls.shared_pt2) + finally: + torch.set_default_device(saved_device) @classmethod def tearDownClass(cls) -> None: From 8050d470c20db5605b4de5dd61a3b879fbc4848e Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 13 Apr 2026 00:14:43 +0800 Subject: [PATCH 4/4] fix(test): replace hardcoded cuda:9999999 with torch.get_default_device() The device workaround in test_deep_eval.py hardcoded "cuda:9999999" when restoring the default device after AOTInductor compilation. Use torch.get_default_device() to save/restore the actual previous device instead, making the tests resilient to changes in the fake device value set by tests/pt/__init__.py. --- source/tests/pt_expt/infer/test_deep_eval.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/source/tests/pt_expt/infer/test_deep_eval.py b/source/tests/pt_expt/infer/test_deep_eval.py index 6797fa2c03..0bc9a90a79 100644 --- a/source/tests/pt_expt/infer/test_deep_eval.py +++ b/source/tests/pt_expt/infer/test_deep_eval.py @@ -544,12 +544,13 @@ def setUpClass(cls) -> None: cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() # Temporarily clear default device to avoid poisoning AOTInductor - # compilation (tests/pt/__init__.py sets it to "cuda:9999999"). + # compilation (tests/pt/__init__.py may set a fake CUDA device). 
+ prev = torch.get_default_device() torch.set_default_device(None) try: deserialize_to_file(cls.tmpfile.name, cls.model_data) finally: - torch.set_default_device("cuda:9999999") + torch.set_default_device(prev) # Also save to .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) @@ -606,6 +607,7 @@ def test_get_model_def_script_with_params(self) -> None: with tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) as f: tmpfile2 = f.name try: + prev = torch.get_default_device() torch.set_default_device(None) try: data_with_config = { @@ -614,7 +616,7 @@ def test_get_model_def_script_with_params(self) -> None: } deserialize_to_file(tmpfile2, data_with_config) finally: - torch.set_default_device("cuda:9999999") + torch.set_default_device(prev) dp2 = DeepPot(tmpfile2) mds = dp2.deep_eval.get_model_def_script() self.assertEqual(mds, training_config) @@ -970,11 +972,12 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() + prev = torch.get_default_device() torch.set_default_device(None) try: deserialize_to_file(cls.tmpfile.name, cls.model_data) finally: - torch.set_default_device("cuda:9999999") + torch.set_default_device(prev) # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) @@ -1185,11 +1188,12 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() + prev = torch.get_default_device() torch.set_default_device(None) try: deserialize_to_file(cls.tmpfile.name, cls.model_data) finally: - torch.set_default_device("cuda:9999999") + torch.set_default_device(prev) # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False)
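
For reference, the device-guard pattern that patches 3 and 4 converge on — save the active default device, clear it around AOTInductor compilation, restore it afterwards — is sketched below in isolation. This is an illustrative summary only, not part of the series: the import path of deserialize_to_file is assumed from the file layout above (deepmd/pt_expt/utils/serialization.py), and the rationale is taken from the commit messages (tests/pt/__init__.py may set a fake CUDA default device that poisons AOTInductor on CPU-only builds).

import tempfile

import torch

# Assumed import path, inferred from the file deepmd/pt_expt/utils/serialization.py
# shown in the diffs above.
from deepmd.pt_expt.utils.serialization import deserialize_to_file


def compile_pt2_with_cpu_default_device(model_data: dict) -> str:
    """Compile a model to a .pt2 package with the default device cleared.

    AOTInductor's lowering creates tensors without an explicit ``device=``,
    so an active fake CUDA default device would make compilation fail on
    CPU-only builds.
    """
    tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False)
    tmpfile.close()
    # Save whatever default device is currently active instead of hardcoding
    # a value (patch 4), then clear it for the duration of the compilation.
    prev = torch.get_default_device()
    torch.set_default_device(None)
    try:
        deserialize_to_file(tmpfile.name, model_data)
    finally:
        # Restore the previous default device even if compilation fails.
        torch.set_default_device(prev)
    return tmpfile.name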