feat(jax): add training (#5460)

njzjz-bot · web-flow · commit 53f98134d30c · 2026-05-28T03:11:59.000Z
## Summary This PR ports the JAX training entrypoint from the `parallel` branch onto the current `deepmodeling/deepmd-kit` master as a local-only training path. The change keeps the useful JAX trainer/CLI pieces while deliberately removing the parallel/distributed parts requested for cleanup: - add a JAX `train` entrypoint and wire it into the JAX backend command path - add local JAX trainer infrastructure for model initialization, data statistics, loss setup, training, validation, checkpointing, and model export - use the current dpmodel `compute_or_load_stat` data-stat practice from master - remove parallel/sharding-specific behavior from the training path - remove Hessian-specific behavior from the training path - map the lower-interface model outputs into the keys expected by `EnergyLoss` - use `communicate_extended_output` so extended/ghost atom force contributions are scattered back to local atoms correctly - add regression coverage for the local JAX training entrypoint and cleanup constraints ## Tests - `/tmp/deepmd-jax-venv/bin/python -m pytest -q source/tests/jax/test_training.py` - GitHub Actions `Test Python`: https://github.com/njzjz-bothub/deepmd-kit/actions/runs/26464854510 Authored by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5)  ## Summary by CodeRabbit * **New Features** * Added a CLI entrypoint to run JAX train/freeze commands. * Backend hook now returns the JAX entrypoint, enabling invocation as the JAX backend. * New JAX trainer providing training/validation flow, checkpointing, learning-rate scheduling, input preparation and data conversion. * Enhanced runtime summary and logging (device/backend info, GPU counts, JAX version). * **Tests** * Added an end-to-end test that runs a single-step JAX training workflow and verifies produced artifacts.  [![Review Change Stack](https://storage.googleapis.com/coderabbit_public_assets/review-stack-in-coderabbit-ui.svg)](https://app.coderabbit.ai/change-stack/deepmodeling/deepmd-kit/pull/5460?utm_source=github_walkthrough&utm_medium=github&utm_campaign=change_stack)
diff --git a/deepmd/backend/jax.py b/deepmd/backend/jax.py
@@ -62,7 +62,11 @@ def entry_point_hook(self) -> Callable[["Namespace"], None]:
         Callable[[Namespace], None]
             The entry point hook of the backend.
         """
-        raise NotImplementedError
+        from deepmd.jax.entrypoints.main import (
+            main,
+        )
+
+        return main
 
     @property
     def deep_eval(self) -> type["DeepEvalBackend"]:
diff --git a/deepmd/jax/entrypoints/__init__.py b/deepmd/jax/entrypoints/__init__.py
@@ -0,0 +1 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
diff --git a/deepmd/jax/entrypoints/freeze.py b/deepmd/jax/entrypoints/freeze.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Freeze utilities for the JAX backend."""
+
+from pathlib import (
+    Path,
+)
+
+from deepmd.backend.suffix import (
+    format_model_suffix,
+)
+from deepmd.jax.utils.serialization import (
+    deserialize_to_file,
+    serialize_from_file,
+)
+
+
+def freeze(
+    *,
+    checkpoint_folder: str,
+    output: str,
+    **kwargs: object,
+) -> None:
+    """Freeze a JAX checkpoint into a serialized model file.
+
+    Parameters
+    ----------
+    checkpoint_folder : str
+        Location of either the checkpoint directory or a folder containing the
+        stable ``checkpoint`` pointer.
+    output : str
+        Output model filename or prefix. The JAX model suffix is added when the
+        filename has no supported backend suffix.
+    **kwargs
+        Other CLI arguments accepted for backend entry-point compatibility.
+    """
+    del kwargs
+
+    checkpoint_path = Path(checkpoint_folder)
+    if (checkpoint_path / "checkpoint").is_file():
+        checkpoint_pointer = (checkpoint_path / "checkpoint").read_text().strip()
+        checkpoint_folder = str(checkpoint_path / checkpoint_pointer)
+
+    output = format_model_suffix(
+        output,
+        preferred_backend="jax",
+        strict_prefer=True,
+    )
+    data = serialize_from_file(checkpoint_folder)
+    deserialize_to_file(output, data)
diff --git a/deepmd/jax/entrypoints/main.py b/deepmd/jax/entrypoints/main.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""DeePMD-Kit entry point module."""
+
+import argparse
+from pathlib import (
+    Path,
+)
+
+from deepmd.jax.entrypoints.freeze import (
+    freeze,
+)
+from deepmd.jax.entrypoints.train import (
+    train,
+)
+from deepmd.loggers.loggers import (
+    set_log_handles,
+)
+from deepmd.main import (
+    parse_args,
+)
+
+__all__ = ["main"]
+
+
+def main(args: list[str] | argparse.Namespace | None = None) -> None:
+    """DeePMD-Kit entry point.
+
+    Parameters
+    ----------
+    args : list[str] or argparse.Namespace, optional
+        list of command line arguments, used to avoid calling from the subprocess,
+        as it is quite slow to import tensorflow; if Namespace is given, it will
+        be used directly
+
+    Raises
+    ------
+    RuntimeError
+        if no command was input
+    """
+    if not isinstance(args, argparse.Namespace):
+        args = parse_args(args=args)
+
+    dict_args = vars(args)
+    set_log_handles(
+        args.log_level,
+        Path(args.log_path) if args.log_path else None,
+        mpi_log=None,
+    )
+
+    if args.command == "train":
+        train(**dict_args)
+    elif args.command == "freeze":
+        freeze(**dict_args)
+    elif args.command is None:
+        pass
+    else:
+        raise RuntimeError(f"unknown command {args.command}")
diff --git a/deepmd/jax/entrypoints/train.py b/deepmd/jax/entrypoints/train.py
@@ -0,0 +1,203 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""DeePMD training entrypoint script.
+
+Can handle local training.
+"""
+
+import json
+import logging
+import time
+from typing import (
+    Any,
+)
+
+from deepmd.common import (
+    j_loader,
+)
+from deepmd.jax.env import (
+    jax,
+    jax_export,
+)
+from deepmd.jax.train.trainer import (
+    DPTrainer,
+)
+from deepmd.utils import random as dp_random
+from deepmd.utils.argcheck import (
+    normalize,
+)
+from deepmd.utils.compat import (
+    update_deepmd_input,
+)
+from deepmd.utils.data_system import (
+    get_data,
+)
+from deepmd.utils.summary import SummaryPrinter as BaseSummaryPrinter
+
+__all__ = ["train"]
+
+log = logging.getLogger(__name__)
+
+
+class SummaryPrinter(BaseSummaryPrinter):
+    """Summary printer for JAX."""
+
+    def is_built_with_cuda(self) -> bool:
+        """Check if the backend is built with CUDA."""
+        return jax_export.default_export_platform() == "cuda"
+
+    def is_built_with_rocm(self) -> bool:
+        """Check if the backend is built with ROCm."""
+        return jax_export.default_export_platform() == "rocm"
+
+    def get_compute_device(self) -> str:
+        """Get Compute device."""
+        return jax.default_backend()
+
+    def get_ngpus(self) -> int:
+        """Get the number of GPUs."""
+        return jax.device_count()
+
+    def get_backend_info(self) -> dict:
+        """Get backend information."""
+        return {
+            "Backend": "JAX",
+            "JAX ver": jax.__version__,
+        }
+
+    def get_device_name(self) -> str:
+        """Get the name of the device."""
+        devices = jax.devices()
+        if devices:
+            return devices[0].device_kind
+        else:
+            return "Unknown"
+
+
+def train(
+    *,
+    INPUT: str,
+    init_model: str | None,
+    restart: str | None,
+    output: str,
+    init_frz_model: str | None,
+    mpi_log: str,
+    log_level: int,
+    log_path: str | None,
+    skip_neighbor_stat: bool = False,
+    finetune: str | None = None,
+    use_pretrain_script: bool = False,
+    **kwargs: Any,
+) -> None:
+    """Run DeePMD model training.
+
+    Parameters
+    ----------
+    INPUT : str
+        json/yaml control file
+    init_model : Optional[str]
+        path prefix of checkpoint files or None
+    restart : Optional[str]
+        path prefix of checkpoint files or None
+    output : str
+        path for dump file with arguments
+    init_frz_model : str | None
+        path to frozen model, or None if no frozen model is used
+    mpi_log : str
+        mpi logging mode
+    log_level : int
+        logging level defined by int 0-3
+    log_path : Optional[str]
+        logging file path or None if logs are to be output only to stdout
+    skip_neighbor_stat : bool, default=False
+        skip checking neighbor statistics
+    finetune : Optional[str]
+        path to pretrained model or None
+    use_pretrain_script : bool
+        Whether to use model script in pretrained model when doing init-model or init-frz-model.
+        Note that this option is true and unchangeable for fine-tuning.
+    **kwargs
+        additional arguments
+
+    Raises
+    ------
+    RuntimeError
+        if the training command fails.
+    """
+    # load json database
+    jdata = j_loader(INPUT)
+
+    if init_frz_model:
+        raise NotImplementedError("JAX training does not support init_frz_model yet")
+    if finetune:
+        raise NotImplementedError("JAX training does not support finetune yet")
+    if use_pretrain_script:
+        raise NotImplementedError(
+            "JAX training does not support use_pretrain_script yet"
+        )
+
+    jdata = update_deepmd_input(jdata, warning=True, dump="input_v2_compat.json")
+
+    jdata = normalize(jdata)
+    if not skip_neighbor_stat:
+        jdata = update_sel(jdata)
+
+    with open(output, "w") as fp:
+        json.dump(jdata, fp, indent=4)
+    SummaryPrinter()()
+
+    # make necessary checks
+    assert "training" in jdata
+
+    # init the model
+
+    model = DPTrainer(
+        jdata,
+        init_model=init_model,
+        restart=restart,
+    )
+    rcut = model.model.get_rcut()
+    type_map = model.model.get_type_map()
+    if len(type_map) == 0:
+        ipt_type_map = None
+    else:
+        ipt_type_map = type_map
+
+    # init random seed of data systems
+    seed = jdata["training"].get("seed", None)
+    if seed is not None:
+        seed += jax.process_index()
+        seed = seed % (2**32)
+    dp_random.seed(seed)
+
+    # init data
+    train_data = get_data(jdata["training"]["training_data"], rcut, ipt_type_map, None)
+    train_data.add_data_requirements(model.data_requirements)
+    train_data.print_summary("training")
+    if jdata["training"].get("validation_data", None) is not None:
+        valid_data = get_data(
+            jdata["training"]["validation_data"],
+            rcut,
+            train_data.type_map,
+            None,
+        )
+        valid_data.add_data_requirements(model.data_requirements)
+        valid_data.print_summary("validation")
+    else:
+        valid_data = None
+
+    # train the model with the provided systems in a cyclic way
+    start_time = time.time()
+    model.train(train_data, valid_data)
+    end_time = time.time()
+    log.info("finished training")
+    log.info(f"wall time: {(end_time - start_time):.3f} s")
+
+
+def update_sel(jdata: dict) -> dict:
+    """Update descriptor selections from neighbor statistics when available."""
+    log.info(
+        "Skip neighbor statistics update for JAX training; "
+        "BaseModel.update_sel currently needs more memory than expected."
+    )
+    # TODO: Restore BaseModel.update_sel once the JAX data path avoids OOM.
+    return jdata.copy()
diff --git a/deepmd/jax/train/__init__.py b/deepmd/jax/train/__init__.py
@@ -0,0 +1 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
diff --git a/deepmd/jax/train/trainer.py b/deepmd/jax/train/trainer.py
diff --git a/source/tests/jax/test_training.py b/source/tests/jax/test_training.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+# SPDX-License-Identifier: LGPL-3.0-or-later`