From d69632ed12a6e8515617d7995d3003c00015da47 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 12 Apr 2026 17:32:40 +0800 Subject: [PATCH 1/4] fix(pt_expt): centralize default-device guard in AOTInductor compilation AOTInductor's lowering code creates tensors without explicit device=, inheriting any active torch.set_default_device. This caused compilation failures when tests/pt/__init__.py set a fake CUDA device. Move the set_default_device(None) guard into _deserialize_to_file_pt2 so all callers (tests, dp freeze, dp compress) are protected, and remove the 12 scattered workarounds from test files. --- deepmd/pt_expt/utils/serialization.py | 15 ++++++-- source/tests/pt_expt/infer/test_deep_eval.py | 34 +++++-------------- .../pt_expt/infer/test_deep_eval_spin.py | 23 ++----------- source/tests/pt_expt/test_change_bias.py | 9 +---- 4 files changed, 25 insertions(+), 56 deletions(-) diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index f59c397525..d47d82505a 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -554,8 +554,19 @@ def _deserialize_to_file_pt2( data, model_json_override ) - # Compile via AOTInductor into a .pt2 package - aoti_compile_and_package(exported, package_path=model_file) + # AOTInductor's lowering code internally creates tensors (e.g. + # ``torch.zeros``) without an explicit ``device=`` argument. If a + # non-CPU default device is active (e.g. tests/pt/__init__.py sets + # ``torch.set_default_device("cuda:9999999")``), the compilation fails + # on CPU-only builds. Temporarily clear the default device so the + # inductor always targets CPU. + prev_device = torch.get_default_device() + torch.set_default_device(None) + try: + # Compile via AOTInductor into a .pt2 package + aoti_compile_and_package(exported, package_path=model_file) + finally: + torch.set_default_device(prev_device) # Embed metadata into the .pt2 ZIP archive model_def_script = data.get("model_def_script") or {} diff --git a/source/tests/pt_expt/infer/test_deep_eval.py b/source/tests/pt_expt/infer/test_deep_eval.py index 6797fa2c03..c932627672 100644 --- a/source/tests/pt_expt/infer/test_deep_eval.py +++ b/source/tests/pt_expt/infer/test_deep_eval.py @@ -543,13 +543,7 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() - # Temporarily clear default device to avoid poisoning AOTInductor - # compilation (tests/pt/__init__.py sets it to "cuda:9999999"). 
- torch.set_default_device(None) - try: - deserialize_to_file(cls.tmpfile.name, cls.model_data) - finally: - torch.set_default_device("cuda:9999999") + deserialize_to_file(cls.tmpfile.name, cls.model_data) # Also save to .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) @@ -606,15 +600,11 @@ def test_get_model_def_script_with_params(self) -> None: with tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) as f: tmpfile2 = f.name try: - torch.set_default_device(None) - try: - data_with_config = { - **self.model_data, - "model_def_script": training_config, - } - deserialize_to_file(tmpfile2, data_with_config) - finally: - torch.set_default_device("cuda:9999999") + data_with_config = { + **self.model_data, + "model_def_script": training_config, + } + deserialize_to_file(tmpfile2, data_with_config) dp2 = DeepPot(tmpfile2) mds = dp2.deep_eval.get_model_def_script() self.assertEqual(mds, training_config) @@ -970,11 +960,7 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() - torch.set_default_device(None) - try: - deserialize_to_file(cls.tmpfile.name, cls.model_data) - finally: - torch.set_default_device("cuda:9999999") + deserialize_to_file(cls.tmpfile.name, cls.model_data) # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) @@ -1185,11 +1171,7 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() - torch.set_default_device(None) - try: - deserialize_to_file(cls.tmpfile.name, cls.model_data) - finally: - torch.set_default_device("cuda:9999999") + deserialize_to_file(cls.tmpfile.name, cls.model_data) # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) diff --git a/source/tests/pt_expt/infer/test_deep_eval_spin.py b/source/tests/pt_expt/infer/test_deep_eval_spin.py index 829b1f5666..64eb169dc9 100644 --- a/source/tests/pt_expt/infer/test_deep_eval_spin.py +++ b/source/tests/pt_expt/infer/test_deep_eval_spin.py @@ -154,14 +154,7 @@ def spin_model_files(): tmpdir = tempfile.mkdtemp() for ext in (".pt2", ".pte"): path = os.path.join(tmpdir, f"spin_test{ext}") - # AOTInductor (.pt2) internally creates tensors using the PyTorch - # default device. Clear it so compilation stays on CPU. 
- prev = torch.get_default_device() - torch.set_default_device(None) - try: - deserialize_to_file(path, copy.deepcopy(data)) - finally: - torch.set_default_device(prev) + deserialize_to_file(path, copy.deepcopy(data)) files[ext] = path yield files, ref_pbc, ref_nopbc for path in files.values(): @@ -362,12 +355,7 @@ def spin_fparam_model_files(): tmpdir = tempfile.mkdtemp() for ext in (".pt2", ".pte"): path = os.path.join(tmpdir, f"spin_fparam_test{ext}") - prev = torch.get_default_device() - torch.set_default_device(None) - try: - deserialize_to_file(path, copy.deepcopy(data)) - finally: - torch.set_default_device(prev) + deserialize_to_file(path, copy.deepcopy(data)) files[ext] = path yield files for path in files.values(): @@ -426,12 +414,7 @@ def spin_aparam_model_files(): tmpdir = tempfile.mkdtemp() for ext in (".pt2", ".pte"): path = os.path.join(tmpdir, f"spin_aparam_test{ext}") - prev = torch.get_default_device() - torch.set_default_device(None) - try: - deserialize_to_file(path, copy.deepcopy(data)) - finally: - torch.set_default_device(prev) + deserialize_to_file(path, copy.deepcopy(data)) files[ext] = path yield files for path in files.values(): diff --git a/source/tests/pt_expt/test_change_bias.py b/source/tests/pt_expt/test_change_bias.py index 03329642e9..1fb3c5dd5d 100644 --- a/source/tests/pt_expt/test_change_bias.py +++ b/source/tests/pt_expt/test_change_bias.py @@ -156,14 +156,7 @@ def setUpClass(cls) -> None: cls.shared_pte = os.path.join(cls.tmpdir, "shared.pte") freeze(model=cls.model_path, output=cls.shared_pte) cls.shared_pt2 = os.path.join(cls.tmpdir, "shared.pt2") - # Clear default device: tests/pt/__init__.py may set a fake device - # for CPU fallback, which poisons AOTInductor compilation. - saved_device = torch.get_default_device() - torch.set_default_device(None) - try: - freeze(model=cls.model_path, output=cls.shared_pt2) - finally: - torch.set_default_device(saved_device) + freeze(model=cls.model_path, output=cls.shared_pt2) @classmethod def tearDownClass(cls) -> None: From 016ea5b809d1bb2c9aea124eb30f69aae75b0bba Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 12 Apr 2026 22:24:10 +0800 Subject: [PATCH 2/4] test(pt_expt): reduce AOTInductor compile time with fast configs Set inductor configs in conftest to skip expensive C++ optimizations during .pt2 compilation: max_fusion_size=8, epilogue_fusion=False, pattern_matcher=False, package_cpp_only=True, compile_opt_level=O0. Tests only validate correctness so runtime performance is irrelevant. Cuts per-model compile time from ~50s to ~30s. --- source/tests/pt_expt/conftest.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/source/tests/pt_expt/conftest.py b/source/tests/pt_expt/conftest.py index f2a9b07a6a..3b48d2e51d 100644 --- a/source/tests/pt_expt/conftest.py +++ b/source/tests/pt_expt/conftest.py @@ -12,11 +12,21 @@ """ import pytest +import torch._inductor.config as _inductor_config import torch.utils._device as _device from torch.overrides import ( _get_current_function_mode_stack, ) +# Reduce AOTInductor (.pt2) compile time for unit tests. +# Tests only validate correctness, not runtime performance, so we can +# skip expensive C++ optimizations. This cuts compile time by ~50%. 
+_inductor_config.max_fusion_size = 8 +_inductor_config.epilogue_fusion = False +_inductor_config.pattern_matcher = False +_inductor_config.aot_inductor.package_cpp_only = True +_inductor_config.aot_inductor.compile_wrapper_opt_level = "O0" + def _pop_device_contexts() -> list: """Pop all stale DeviceContext modes from the torch function mode stack.""" From 2a687ce2221e8f14cff7ebaf7e5607b574434d87 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 13 Apr 2026 00:09:08 +0800 Subject: [PATCH 3/4] fix(pt_expt): revert centralized device guard, keep per-test workarounds Centralizing set_default_device(None) in _deserialize_to_file_pt2 re-pushes a stale DeviceContext on restore (torch.get_default_device returns a stale value after DeviceContext is popped from mode stack), breaking subsequent training (Adam optimizer creates tensors without device=). Revert to per-test workarounds which don't have this issue. --- deepmd/pt_expt/utils/serialization.py | 15 ++------ source/tests/pt_expt/infer/test_deep_eval.py | 34 ++++++++++++++----- .../pt_expt/infer/test_deep_eval_spin.py | 23 +++++++++++-- source/tests/pt_expt/test_change_bias.py | 9 ++++- 4 files changed, 56 insertions(+), 25 deletions(-) diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index d47d82505a..f59c397525 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -554,19 +554,8 @@ def _deserialize_to_file_pt2( data, model_json_override ) - # AOTInductor's lowering code internally creates tensors (e.g. - # ``torch.zeros``) without an explicit ``device=`` argument. If a - # non-CPU default device is active (e.g. tests/pt/__init__.py sets - # ``torch.set_default_device("cuda:9999999")``), the compilation fails - # on CPU-only builds. Temporarily clear the default device so the - # inductor always targets CPU. - prev_device = torch.get_default_device() - torch.set_default_device(None) - try: - # Compile via AOTInductor into a .pt2 package - aoti_compile_and_package(exported, package_path=model_file) - finally: - torch.set_default_device(prev_device) + # Compile via AOTInductor into a .pt2 package + aoti_compile_and_package(exported, package_path=model_file) # Embed metadata into the .pt2 ZIP archive model_def_script = data.get("model_def_script") or {} diff --git a/source/tests/pt_expt/infer/test_deep_eval.py b/source/tests/pt_expt/infer/test_deep_eval.py index c932627672..6797fa2c03 100644 --- a/source/tests/pt_expt/infer/test_deep_eval.py +++ b/source/tests/pt_expt/infer/test_deep_eval.py @@ -543,7 +543,13 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() - deserialize_to_file(cls.tmpfile.name, cls.model_data) + # Temporarily clear default device to avoid poisoning AOTInductor + # compilation (tests/pt/__init__.py sets it to "cuda:9999999"). 
+ torch.set_default_device(None) + try: + deserialize_to_file(cls.tmpfile.name, cls.model_data) + finally: + torch.set_default_device("cuda:9999999") # Also save to .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) @@ -600,11 +606,15 @@ def test_get_model_def_script_with_params(self) -> None: with tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) as f: tmpfile2 = f.name try: - data_with_config = { - **self.model_data, - "model_def_script": training_config, - } - deserialize_to_file(tmpfile2, data_with_config) + torch.set_default_device(None) + try: + data_with_config = { + **self.model_data, + "model_def_script": training_config, + } + deserialize_to_file(tmpfile2, data_with_config) + finally: + torch.set_default_device("cuda:9999999") dp2 = DeepPot(tmpfile2) mds = dp2.deep_eval.get_model_def_script() self.assertEqual(mds, training_config) @@ -960,7 +970,11 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() - deserialize_to_file(cls.tmpfile.name, cls.model_data) + torch.set_default_device(None) + try: + deserialize_to_file(cls.tmpfile.name, cls.model_data) + finally: + torch.set_default_device("cuda:9999999") # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) @@ -1171,7 +1185,11 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() - deserialize_to_file(cls.tmpfile.name, cls.model_data) + torch.set_default_device(None) + try: + deserialize_to_file(cls.tmpfile.name, cls.model_data) + finally: + torch.set_default_device("cuda:9999999") # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) diff --git a/source/tests/pt_expt/infer/test_deep_eval_spin.py b/source/tests/pt_expt/infer/test_deep_eval_spin.py index 64eb169dc9..829b1f5666 100644 --- a/source/tests/pt_expt/infer/test_deep_eval_spin.py +++ b/source/tests/pt_expt/infer/test_deep_eval_spin.py @@ -154,7 +154,14 @@ def spin_model_files(): tmpdir = tempfile.mkdtemp() for ext in (".pt2", ".pte"): path = os.path.join(tmpdir, f"spin_test{ext}") - deserialize_to_file(path, copy.deepcopy(data)) + # AOTInductor (.pt2) internally creates tensors using the PyTorch + # default device. Clear it so compilation stays on CPU. 
+ prev = torch.get_default_device() + torch.set_default_device(None) + try: + deserialize_to_file(path, copy.deepcopy(data)) + finally: + torch.set_default_device(prev) files[ext] = path yield files, ref_pbc, ref_nopbc for path in files.values(): @@ -355,7 +362,12 @@ def spin_fparam_model_files(): tmpdir = tempfile.mkdtemp() for ext in (".pt2", ".pte"): path = os.path.join(tmpdir, f"spin_fparam_test{ext}") - deserialize_to_file(path, copy.deepcopy(data)) + prev = torch.get_default_device() + torch.set_default_device(None) + try: + deserialize_to_file(path, copy.deepcopy(data)) + finally: + torch.set_default_device(prev) files[ext] = path yield files for path in files.values(): @@ -414,7 +426,12 @@ def spin_aparam_model_files(): tmpdir = tempfile.mkdtemp() for ext in (".pt2", ".pte"): path = os.path.join(tmpdir, f"spin_aparam_test{ext}") - deserialize_to_file(path, copy.deepcopy(data)) + prev = torch.get_default_device() + torch.set_default_device(None) + try: + deserialize_to_file(path, copy.deepcopy(data)) + finally: + torch.set_default_device(prev) files[ext] = path yield files for path in files.values(): diff --git a/source/tests/pt_expt/test_change_bias.py b/source/tests/pt_expt/test_change_bias.py index 1fb3c5dd5d..03329642e9 100644 --- a/source/tests/pt_expt/test_change_bias.py +++ b/source/tests/pt_expt/test_change_bias.py @@ -156,7 +156,14 @@ def setUpClass(cls) -> None: cls.shared_pte = os.path.join(cls.tmpdir, "shared.pte") freeze(model=cls.model_path, output=cls.shared_pte) cls.shared_pt2 = os.path.join(cls.tmpdir, "shared.pt2") - freeze(model=cls.model_path, output=cls.shared_pt2) + # Clear default device: tests/pt/__init__.py may set a fake device + # for CPU fallback, which poisons AOTInductor compilation. + saved_device = torch.get_default_device() + torch.set_default_device(None) + try: + freeze(model=cls.model_path, output=cls.shared_pt2) + finally: + torch.set_default_device(saved_device) @classmethod def tearDownClass(cls) -> None: From 8050d470c20db5605b4de5dd61a3b879fbc4848e Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 13 Apr 2026 00:14:43 +0800 Subject: [PATCH 4/4] fix(test): replace hardcoded cuda:9999999 with torch.get_default_device() The device workaround in test_deep_eval.py hardcoded "cuda:9999999" when restoring the default device after AOTInductor compilation. Use torch.get_default_device() to save/restore the actual previous device instead, making the tests resilient to changes in the fake device value set by tests/pt/__init__.py. --- source/tests/pt_expt/infer/test_deep_eval.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/source/tests/pt_expt/infer/test_deep_eval.py b/source/tests/pt_expt/infer/test_deep_eval.py index 6797fa2c03..0bc9a90a79 100644 --- a/source/tests/pt_expt/infer/test_deep_eval.py +++ b/source/tests/pt_expt/infer/test_deep_eval.py @@ -544,12 +544,13 @@ def setUpClass(cls) -> None: cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() # Temporarily clear default device to avoid poisoning AOTInductor - # compilation (tests/pt/__init__.py sets it to "cuda:9999999"). + # compilation (tests/pt/__init__.py may set a fake CUDA device). 
+ prev = torch.get_default_device() torch.set_default_device(None) try: deserialize_to_file(cls.tmpfile.name, cls.model_data) finally: - torch.set_default_device("cuda:9999999") + torch.set_default_device(prev) # Also save to .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) @@ -606,6 +607,7 @@ def test_get_model_def_script_with_params(self) -> None: with tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) as f: tmpfile2 = f.name try: + prev = torch.get_default_device() torch.set_default_device(None) try: data_with_config = { @@ -614,7 +616,7 @@ def test_get_model_def_script_with_params(self) -> None: } deserialize_to_file(tmpfile2, data_with_config) finally: - torch.set_default_device("cuda:9999999") + torch.set_default_device(prev) dp2 = DeepPot(tmpfile2) mds = dp2.deep_eval.get_model_def_script() self.assertEqual(mds, training_config) @@ -970,11 +972,12 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() + prev = torch.get_default_device() torch.set_default_device(None) try: deserialize_to_file(cls.tmpfile.name, cls.model_data) finally: - torch.set_default_device("cuda:9999999") + torch.set_default_device(prev) # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False) @@ -1185,11 +1188,12 @@ def setUpClass(cls) -> None: cls.model_data = {"model": cls.model.serialize()} cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) cls.tmpfile.close() + prev = torch.get_default_device() torch.set_default_device(None) try: deserialize_to_file(cls.tmpfile.name, cls.model_data) finally: - torch.set_default_device("cuda:9999999") + torch.set_default_device(prev) # Also save .pte for cross-format comparison cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False)
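
For reference, the device-guard pattern that patches 3 and 4 converge on — save the active default device, clear it around AOTInductor compilation, restore it afterwards — is sketched below in isolation. This is an illustrative summary only, not part of the series: the import path of deserialize_to_file is assumed from the file layout above (deepmd/pt_expt/utils/serialization.py), and the rationale is taken from the commit messages (tests/pt/__init__.py may set a fake CUDA default device that poisons AOTInductor on CPU-only builds).

import tempfile

import torch

# Assumed import path, inferred from the file deepmd/pt_expt/utils/serialization.py
# shown in the diffs above.
from deepmd.pt_expt.utils.serialization import deserialize_to_file


def compile_pt2_with_cpu_default_device(model_data: dict) -> str:
    """Compile a model to a .pt2 package with the default device cleared.

    AOTInductor's lowering creates tensors without an explicit ``device=``,
    so an active fake CUDA default device would make compilation fail on
    CPU-only builds.
    """
    tmpfile = tempfile.NamedTemporaryFile(suffix=".pt2", delete=False)
    tmpfile.close()
    # Save whatever default device is currently active instead of hardcoding
    # a value (patch 4), then clear it for the duration of the compilation.
    prev = torch.get_default_device()
    torch.set_default_device(None)
    try:
        deserialize_to_file(tmpfile.name, model_data)
    finally:
        # Restore the previous default device even if compilation fails.
        torch.set_default_device(prev)
    return tmpfile.name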