deepmodeling · njzjz · May 23, 2025 · Mar 7, 2025 · Mar 29, 2025 · May 8, 2025
diff --git a/.devcontainer/download_libtorch.sh b/.devcontainer/download_libtorch.sh
@@ -4,5 +4,5 @@ set -ev
 SCRIPT_PATH=$(dirname $(realpath -s $0))
 cd ${SCRIPT_PATH}/..
 
-wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.6.0%2Bcpu.zip -O ~/libtorch.zip
+wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.7.0%2Bcpu.zip -O ~/libtorch.zip
 unzip ~/libtorch.zip
diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml
@@ -52,7 +52,7 @@ jobs:
       env:
         DEBIAN_FRONTEND: noninteractive
     - run: |
-         echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.3/ jammy main' | sudo tee /etc/apt/sources.list.d/rocm.list \
+         echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/6.3/ jammy main' | sudo tee /etc/apt/sources.list.d/rocm.list \
          && printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 \
          && curl -s https://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add - \
          && sudo apt-get update \

diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml
@@ -36,7 +36,7 @@ jobs:
       run: source/tests/infer/convert-models.sh
     - name: Download libtorch
       run: |
-         wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.1.2%2Bcpu.zip -O libtorch.zip
+         wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.7.0%2Bcpu.zip -O libtorch.zip
          unzip libtorch.zip
     # https://github.com/actions/runner-images/issues/9491
     - name: Fix kernel mmap rnd bits

diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
@@ -47,7 +47,7 @@ jobs:
          && sudo apt-get -y install cuda-12-3 libcudnn8=8.9.5.*-1+cuda12.3
       if: false  # skip as we use nvidia image
     - run: python -m pip install -U uv
-    - run: source/install/uv_with_retry.sh pip install --system "tensorflow~=2.18.0rc2" "torch~=2.6.0" "jax[cuda12]==0.5.0"
+    - run: source/install/uv_with_retry.sh pip install --system "tensorflow~=2.18.0rc2" "torch~=2.7.0" "jax[cuda12]==0.5.0"
     - run: |
         export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
         export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
@@ -67,7 +67,7 @@ jobs:
       run: source/tests/infer/convert-models.sh
     - name: Download libtorch
       run: |
-         wget https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.6.0%2Bcu124.zip -O libtorch.zip
+         wget https://download.pytorch.org/libtorch/cu126/libtorch-cxx11-abi-shared-with-deps-2.7.0%2Bcu126.zip -O libtorch.zip
          unzip libtorch.zip
     - run: |
         export CMAKE_PREFIX_PATH=$GITHUB_WORKSPACE/libtorch

diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
@@ -30,6 +30,7 @@ jobs:
         export TENSORFLOW_ROOT=$(python -c 'import tensorflow;print(tensorflow.__path__[0])')
         export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
         source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py "jax==0.5.0;python_version>='3.10'"
+        source/install/uv_with_retry.sh pip install --system -U setuptools
         source/install/uv_with_retry.sh pip install --system horovod --no-build-isolation
       env:
         # Please note that uv has some issues with finding
@@ -42,6 +43,8 @@ jobs:
         HOROVOD_WITH_TENSORFLOW: 1
         HOROVOD_WITHOUT_PYTORCH: 1
         HOROVOD_WITH_MPI: 1
+        # https://cmake.org/cmake/help/latest/variable/CMAKE_POLICY_VERSION_MINIMUM.html
+        CMAKE_POLICY_VERSION_MINIMUM: 3.5
     - run: dp --version
     - name: Get durations from cache
       uses: actions/cache@v4

diff --git a/backend/find_pytorch.py b/backend/find_pytorch.py
@@ -116,7 +116,7 @@ def get_pt_requirement(pt_version: str = "") -> dict:
         cuda_version = os.environ.get("CUDA_VERSION", "12.2")
         if cuda_version == "" or cuda_version in SpecifierSet(">=12,<13"):
             # CUDA 12.2, cudnn 9
-            pt_version = "2.6.0"
+            pt_version = "2.7.0"
         elif cuda_version in SpecifierSet(">=11,<12"):
             # CUDA 11.8, cudnn 8
             pt_version = "2.3.1"

diff --git a/deepmd/calculator.py b/deepmd/calculator.py
@@ -138,12 +138,13 @@
         self.results["virial"] = v[0].reshape(3, 3)
 
         # convert virial into stress for lattice relaxation
-        if "stress" in properties:
-            if sum(atoms.get_pbc()) > 0:
-                # the usual convention (tensile stress is positive)
-                # stress = -virial / volume
-                stress = -0.5 * (v[0].copy() + v[0].copy().T) / atoms.get_volume()
-                # Voigt notation
-                self.results["stress"] = stress.flat[[0, 4, 8, 5, 2, 1]]
-            else:
-                raise PropertyNotImplementedError
+        if cell is not None:
+            # the usual convention (tensile stress is positive)
+            # stress = -virial / volume
+            stress = -0.5 * (v[0].copy() + v[0].copy().T) / atoms.get_volume()
+            # Voigt notation
+            self.results["stress"] = stress.flat[[0, 4, 8, 5, 2, 1]]
+        elif "stress" in properties:
+            raise PropertyNotImplementedError
+        else:
+            pass
diff --git a/deepmd/dpmodel/utils/network.py b/deepmd/dpmodel/utils/network.py
@@ -34,6 +34,19 @@
 )
 
 
+def sigmoid_t(x: np.ndarray) -> np.ndarray:
+    """Compute sigmoid(x) = 1 / (1 + exp(-x)); JAX arrays use jax.nn.sigmoid."""
+    if array_api_compat.is_jax_array(x):
+        from deepmd.jax.env import (
+            jax,
+        )
+
+        # see https://github.com/jax-ml/jax/discussions/15617
+        return jax.nn.sigmoid(x)
+    xp = array_api_compat.array_namespace(x)
+    return 1 / (1 + xp.exp(-x))
+
+
 class Identity(NativeOP):
     def __init__(self) -> None:
         super().__init__()
@@ -313,9 +326,8 @@ def fn(x):
     elif activation_function == "sigmoid":
 
         def fn(x):
-            xp = array_api_compat.array_namespace(x)
             # generated by GitHub Copilot
-            return 1 / (1 + xp.exp(-x))
+            return sigmoid_t(x)
 
         return fn
     elif activation_function.lower() in ("none", "linear"):

diff --git a/deepmd/jax/env.py b/deepmd/jax/env.py
@@ -12,6 +12,8 @@
 
 jax.config.update("jax_enable_x64", True)
 # jax.config.update("jax_debug_nans", True)
+# https://github.com/jax-ml/jax/issues/24909
+jax.config.update("jax_default_matmul_precision", "tensorfloat32")
 
 if os.environ.get("DP_DTYPE_PROMOTION_STRICT") == "1":
     jax.config.update("jax_numpy_dtype_promotion", "strict")

diff --git a/deepmd/jax/model/base_model.py b/deepmd/jax/model/base_model.py
@@ -47,7 +47,7 @@ def forward_common_atomic(
             kk_redu = get_reduce_name(kk)
             model_predict[kk_redu] = jnp.sum(vv, axis=atom_axis)
             kk_derv_r, kk_derv_c = get_deriv_name(kk)
-            if vdef.c_differentiable:
+            if vdef.r_differentiable:
 
                 def eval_output(
                     cc_ext,

diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py
@@ -491,13 +491,13 @@ def forward(
                     torch.tensor(
                         real_nloc,
                         dtype=torch.int32,
-                        device=env.DEVICE,
-                    ),  # should be int of c++
+                        device=torch.device("cpu"),
+                    ),  # should be int of c++, placed on cpu
                     torch.tensor(
                         real_nall - real_nloc,
                         dtype=torch.int32,
-                        device=env.DEVICE,
-                    ),  # should be int of c++
+                        device=torch.device("cpu"),
+                    ),  # should be int of c++, placed on cpu
                 )
                 g1_ext = ret[0].unsqueeze(0)
                 if has_spin:

diff --git a/deepmd/pt/utils/auto_batch_size.py b/deepmd/pt/utils/auto_batch_size.py
@@ -49,11 +49,16 @@
         # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error,
         # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924
         # (the meaningless error message should be considered as a bug in cusolver)
-        if isinstance(e, RuntimeError) and (
-            "CUDA out of memory." in e.args[0]
-            or "CUDA driver error: out of memory" in e.args[0]
-            or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0]
-        ):
+        if (
+            isinstance(e, RuntimeError)
+            and (
+                "CUDA out of memory." in e.args[0]
+                or "CUDA driver error: out of memory" in e.args[0]
+                or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0]
+                # https://github.com/deepmodeling/deepmd-kit/issues/4594
+                or "CUDA error: out of memory" in e.args[0]
+            )
+        ) or isinstance(e, torch.cuda.OutOfMemoryError):
             # Release all unoccupied cached memory
             torch.cuda.empty_cache()
             return True

diff --git a/deepmd/tf/infer/deep_eval.py b/deepmd/tf/infer/deep_eval.py
@@ -259,7 +259,9 @@ def _init_attr(self) -> None:
             self.numb_dos = 0
         self.tmap = tmap.decode("utf-8").split()
         if self.tensors["modifier_type"] is not None:
-            self.modifier_type = run_sess(self.sess, [self.tensors["modifier_type"]])[0]
+            self.modifier_type = run_sess(self.sess, [self.tensors["modifier_type"]])[
+                0
+            ].decode()
         else:
             self.modifier_type = None
 
@@ -761,15 +763,17 @@ def eval(
             odef.name: oo for oo, odef in zip(output, self.output_def.var_defs.values())
         }
         # ugly!!
-        if self.modifier_type is not None and isinstance(self.model_type, DeepPot):
+        if self.modifier_type is not None and issubclass(self.model_type, DeepPot):
             if atomic:
                 raise RuntimeError("modifier does not support atomic modification")
             me, mf, mv = self.dm.eval(coords, cells, atom_types)
-            output = list(output)  # tuple to list
-            e, f, v = output[:3]
-            output_dict["energy_redu"] += me.reshape(e.shape)
-            output_dict["energy_deri_r"] += mf.reshape(f.shape)
-            output_dict["energy_deri_c_redu"] += mv.reshape(v.shape)
+            output_dict["energy_redu"] += me.reshape(output_dict["energy_redu"].shape)
+            output_dict["energy_derv_r"] += mf.reshape(
+                output_dict["energy_derv_r"].shape
+            )
+            output_dict["energy_derv_c_redu"] += mv.reshape(
+                output_dict["energy_derv_c_redu"].shape
+            )
         return output_dict
 
     def _prepare_feed_dict(
@@ -1350,6 +1354,8 @@ def sort_input(
             natoms = atom_type[0].size
             idx_map = np.arange(natoms)  # pylint: disable=no-explicit-dtype
             return coord, atom_type, idx_map
+        if atom_type.ndim > 1:
+            atom_type = atom_type[0]
         if sel_atoms is not None:
             selection = [False] * np.size(atom_type)
             for ii in sel_atoms:

diff --git a/deepmd/tf/model/tensor.py b/deepmd/tf/model/tensor.py
@@ -6,6 +6,7 @@
 
 from deepmd.tf.env import (
     MODEL_VERSION,
+    global_cvt_2_ener_float,
     tf,
 )
 from deepmd.tf.utils.type_embed import (
@@ -173,7 +174,7 @@ def build(
         if "global" not in self.model_type:
             gname = "global_" + self.model_type
             atom_out = tf.reshape(output, [-1, natomsel, nout])
-            global_out = tf.reduce_sum(atom_out, axis=1)
+            global_out = tf.reduce_sum(global_cvt_2_ener_float(atom_out), axis=1)
             global_out = tf.reshape(global_out, [-1, nout], name="o_" + gname + suffix)
 
             out_cpnts = tf.split(atom_out, nout, axis=-1)

diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py
@@ -89,6 +89,11 @@ def __init__(
         # enforce type_map if necessary
         self.enforce_type_map = False
         if type_map is not None and self.type_map is not None and len(type_map):
+            missing_elements = [elem for elem in self.type_map if elem not in type_map]
+            if missing_elements:
+                raise ValueError(
+                    f"Elements {missing_elements} are not present in the provided `type_map`."
+                )
             if not self.mixed_type:
                 atom_type_ = [
                     type_map.index(self.type_map[ii]) for ii in self.atom_type

diff --git a/pyproject.toml b/pyproject.toml
@@ -255,7 +255,7 @@ CMAKE_ARGS = "-DCMAKE_DISABLE_FIND_PACKAGE_OpenMP=1"
 # error: 'value' is unavailable: introduced in macOS 10.13
 select = "*-macosx_x86_64"
 inherit.environment = "append"
-environment.MACOSX_DEPLOYMENT_TARGET = "10.13"
+environment.MACOSX_DEPLOYMENT_TARGET = "11.0"
 
 [tool.cibuildwheel.linux]
 repair-wheel-command = "auditwheel repair --exclude libtensorflow_framework.so.2 --exclude libtensorflow_framework.so.1 --exclude libtensorflow_framework.so --exclude _pywrap_tensorflow_internal.so --exclude libtensorflow_cc.so.2 --exclude libc10.so --exclude libtorch.so --exclude libtorch_cpu.so -w {dest_dir} {wheel}"

diff --git a/source/cmake/googletest.cmake.in b/source/cmake/googletest.cmake.in
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.2)
+cmake_minimum_required(VERSION 3.5)
 
 project(googletest-download NONE)