diff --git a/deepmd/loggers/training.py b/deepmd/loggers/training.py index c7fe94e24d..c79e414da7 100644 --- a/deepmd/loggers/training.py +++ b/deepmd/loggers/training.py @@ -8,7 +8,7 @@ def format_training_message( eta: int | None = None, ) -> str: """Format a training message.""" - msg = f"batch {batch:7d}: total wall time = {wall_time:.2f} s" + msg = f"Batch {batch:7d}: total wall time = {wall_time:.2f} s" if isinstance(eta, int): msg += f", eta = {datetime.timedelta(seconds=int(eta))!s}" return msg @@ -29,7 +29,7 @@ def format_training_message_per_task( # sort rmse rmse = dict(sorted(rmse.items())) return ( - f"batch {batch:7d}: {task_name}" + f"Batch {batch:7d}: {task_name}" f"{', '.join([f'{kk} = {vv:8.2e}' for kk, vv in rmse.items()])}" f"{lr}" ) diff --git a/deepmd/pd/entrypoints/main.py b/deepmd/pd/entrypoints/main.py index 8600d73bc9..184072da2f 100644 --- a/deepmd/pd/entrypoints/main.py +++ b/deepmd/pd/entrypoints/main.py @@ -219,8 +219,8 @@ def get_backend_info(self) -> dict: op_info = {} return { "Backend": "Paddle", - "PD ver": f"v{paddle.__version__}-g{paddle.version.commit[:11]}", - "Enable custom OP": False, + "PD Ver": f"v{paddle.__version__}-g{paddle.version.commit[:11]}", + "Custom OP Enabled": False, **op_info, } diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index 75efdd8c9f..e6b3862da5 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -239,16 +239,16 @@ def get_backend_info(self) -> dict: """Get backend information.""" if ENABLE_CUSTOMIZED_OP: op_info = { - "build with PT ver": GLOBAL_CONFIG["pt_version"], - "build with PT inc": GLOBAL_CONFIG["pt_include_dir"].replace(";", "\n"), - "build with PT lib": GLOBAL_CONFIG["pt_libs"].replace(";", "\n"), + "Built With PT Ver": GLOBAL_CONFIG["pt_version"], + "Built With PT Inc": GLOBAL_CONFIG["pt_include_dir"].replace(";", "\n"), + "Built With PT Lib": GLOBAL_CONFIG["pt_libs"].replace(";", "\n"), } else: op_info = {} return { "Backend": "PyTorch", - "PT ver": f"v{torch.__version__}-g{torch.version.git_version[:11]}", - "Enable custom OP": ENABLE_CUSTOMIZED_OP, + "PT Ver": f"v{torch.__version__}-g{torch.version.git_version[:11]}", + "Custom OP Enabled": ENABLE_CUSTOMIZED_OP, **op_info, } diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index d98b23d25c..804daf897d 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -721,6 +721,51 @@ def warm_up_linear(step: int, warmup_steps: int) -> float: self.profiling = training_params.get("profiling", False) self.profiling_file = training_params.get("profiling_file", "timeline.json") + # Log model summary info (descriptor type and parameter count) + if self.rank == 0: + self._log_model_summary() + + def _log_model_summary(self) -> None: + """Log model summary information including descriptor type and parameter count.""" + + def get_descriptor_type(model: Any) -> str: + """Get the descriptor type name from model.""" + # Standard models have get_descriptor method + if hasattr(model, "get_descriptor"): + descriptor = model.get_descriptor() + serialized = descriptor.serialize() + if isinstance(serialized, dict) and "type" in serialized: + return serialized["type"].upper() + # ZBL models: descriptor is in atomic_model.models[0] + if hasattr(model, "atomic_model") and hasattr(model.atomic_model, "models"): + models = model.atomic_model.models + if models: # Check non-empty + dp_model = models[0] + if hasattr(dp_model, "descriptor"): + serialized = dp_model.descriptor.serialize() + if isinstance(serialized, dict) and "type" in serialized: + return serialized["type"].upper() + " (with ZBL)" + return "UNKNOWN" + + def count_parameters(model: Any) -> int: + """Count the total number of trainable parameters.""" + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + if not self.multi_task: + desc_type = get_descriptor_type(self.model) + num_params = count_parameters(self.model) + log.info("") + log.info(f"Descriptor: {desc_type}") + log.info(f"Model Params: {num_params / 1e6:.3f} M") + else: + # For multi-task, log each model's info + for model_key in self.model_keys: + desc_type = get_descriptor_type(self.model[model_key]) + num_params = count_parameters(self.model[model_key]) + log.info("") + log.info(f"Descriptor [{model_key}]: {desc_type}") + log.info(f"Model Params [{model_key}]: {num_params / 1e6:.3f} M") + def run(self) -> None: fout = ( open( diff --git a/deepmd/tf/train/run_options.py b/deepmd/tf/train/run_options.py index 0b5c3b1b43..ad60efc002 100644 --- a/deepmd/tf/train/run_options.py +++ b/deepmd/tf/train/run_options.py @@ -67,10 +67,10 @@ def get_backend_info(self) -> dict: """Get backend information.""" return { "Backend": "TensorFlow", - "TF ver": tf.version.GIT_VERSION, - "build with TF ver": TF_VERSION, - "build with TF inc": GLOBAL_CONFIG["tf_include_dir"].replace(";", "\n"), - "build with TF lib": GLOBAL_CONFIG["tf_libs"].replace(";", "\n"), + "TF Ver": tf.version.GIT_VERSION, + "Built With TF Ver": TF_VERSION, + "Built With TF Inc": GLOBAL_CONFIG["tf_include_dir"].replace(";", "\n"), + "Built With TF Lib": GLOBAL_CONFIG["tf_libs"].replace(";", "\n"), } diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index 82ef5ec25d..37da2fa1c1 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -713,9 +713,9 @@ def print_summary( # width 65 sys_width = 42 log.info( - f"---Summary of DataSystem: {name:13s}-----------------------------------------------" + f"---Summary of DataSystem: {name.capitalize():13s}-----------------------------------------------" ) - log.info("found %d system(s):", nsystems) + log.info("Found %d System(s):", nsystems) log.info( "%s %6s %6s %6s %9s %3s", _format_name_length("system", sys_width), diff --git a/deepmd/utils/summary.py b/deepmd/utils/summary.py index c00e6deb9e..f0fcc52ca0 100644 --- a/deepmd/utils/summary.py +++ b/deepmd/utils/summary.py @@ -47,13 +47,13 @@ class SummaryPrinter(ABC): ) BUILD: ClassVar = { - "installed to": "\n".join(deepmd.__path__), - "source": GLOBAL_CONFIG["git_summ"], - "source branch": GLOBAL_CONFIG["git_branch"], - "source commit": GLOBAL_CONFIG["git_hash"], - "source commit at": GLOBAL_CONFIG["git_date"], - "use float prec": global_float_prec, - "build variant": GLOBAL_CONFIG["dp_variant"], + "Installed To": "\n".join(deepmd.__path__), + "Source": GLOBAL_CONFIG["git_summ"], + "Source Branch": GLOBAL_CONFIG["git_branch"], + "Source Commit": GLOBAL_CONFIG["git_hash"], + "Source Commit At": GLOBAL_CONFIG["git_date"], + "Float Precision": global_float_prec.capitalize(), + "Build Variant": GLOBAL_CONFIG["dp_variant"].upper(), } def __call__(self) -> None: @@ -64,16 +64,41 @@ def __call__(self) -> None: if len(nodelist) > 1: build_info.update( { - "world size": str(len(nodelist)), - "node list": ", ".join(set(nodelist)), + "World Size": str(len(nodelist)), + "Node List": ", ".join(set(nodelist)), } ) build_info.update( { - "running on": nodename, - "computing device": self.get_compute_device(), + "Running On": nodename, + "Computing Device": self.get_compute_device().upper(), } ) + backend = build_info.get("Backend") + device_name = None + try: + if backend == "PyTorch": + import torch + + if torch.cuda.is_available(): + device_name = torch.cuda.get_device_name(0) + elif backend == "TensorFlow": + import tensorflow as tf + + gpus = tf.config.list_physical_devices("GPU") + if gpus: + # Use the first physical GPU device identifier as the device name + device_name = gpus[0].name + elif backend == "Paddle": + import paddle + + # Use Paddle's current device string (e.g., "gpu:0") as a device identifier + device_name = paddle.get_device() + except Exception: + # Best-effort device name detection; ignore failures silently + pass + if device_name: + build_info["Device Name"] = device_name if self.is_built_with_cuda(): env_value = os.environ.get("CUDA_VISIBLE_DEVICES", "unset") build_info["CUDA_VISIBLE_DEVICES"] = env_value @@ -81,13 +106,13 @@ def __call__(self) -> None: env_value = os.environ.get("HIP_VISIBLE_DEVICES", "unset") build_info["HIP_VISIBLE_DEVICES"] = env_value if self.is_built_with_cuda() or self.is_built_with_rocm(): - build_info["Count of visible GPUs"] = str(self.get_ngpus()) + build_info["Visible GPU Count"] = str(self.get_ngpus()) intra, inter = get_default_nthreads() build_info.update( { - "num_intra_threads": str(intra), - "num_inter_threads": str(inter), + "NUM_INTRA_THREADS": str(intra), + "NUM_INTER_THREADS": str(inter), } ) # count the maximum characters in the keys and values diff --git a/source/tests/pt/test_model_summary.py b/source/tests/pt/test_model_summary.py new file mode 100644 index 0000000000..d3aba80a82 --- /dev/null +++ b/source/tests/pt/test_model_summary.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Tests for model summary display functions.""" + +import unittest +from unittest.mock import ( + MagicMock, +) + +import torch + + +class TestGetDescriptorType(unittest.TestCase): + """Test get_descriptor_type helper function.""" + + @staticmethod + def get_descriptor_type(model): + """Replicate the logic from training.py for testing.""" + # Standard models have get_descriptor method + if hasattr(model, "get_descriptor"): + descriptor = model.get_descriptor() + serialized = descriptor.serialize() + if isinstance(serialized, dict) and "type" in serialized: + return serialized["type"].upper() + # ZBL models: descriptor is in atomic_model.models[0] + if hasattr(model, "atomic_model") and hasattr(model.atomic_model, "models"): + models = model.atomic_model.models + if models: # Check non-empty + dp_model = models[0] + if hasattr(dp_model, "descriptor"): + serialized = dp_model.descriptor.serialize() + if isinstance(serialized, dict) and "type" in serialized: + return serialized["type"].upper() + " (with ZBL)" + return "UNKNOWN" + + def test_standard_model(self): + """Test descriptor type detection for standard models.""" + mock_descriptor = MagicMock() + mock_descriptor.serialize.return_value = {"type": "se_e2_a"} + + mock_model = MagicMock() + mock_model.get_descriptor.return_value = mock_descriptor + + result = self.get_descriptor_type(mock_model) + self.assertEqual(result, "SE_E2_A") + + def test_zbl_model(self): + """Test descriptor type detection for ZBL models.""" + mock_descriptor = MagicMock() + mock_descriptor.serialize.return_value = {"type": "dpa1"} + + mock_dp_model = MagicMock() + mock_dp_model.descriptor = mock_descriptor + + mock_atomic_model = MagicMock() + mock_atomic_model.models = [mock_dp_model] + + mock_model = MagicMock(spec=[]) # No get_descriptor + mock_model.atomic_model = mock_atomic_model + + result = self.get_descriptor_type(mock_model) + self.assertEqual(result, "DPA1 (with ZBL)") + + def test_empty_models_list(self): + """Test handling of empty models list in ZBL model.""" + mock_atomic_model = MagicMock() + mock_atomic_model.models = [] + + mock_model = MagicMock(spec=[]) + mock_model.atomic_model = mock_atomic_model + + result = self.get_descriptor_type(mock_model) + self.assertEqual(result, "UNKNOWN") + + def test_missing_type_key(self): + """Test handling of serialize() without 'type' key.""" + mock_descriptor = MagicMock() + mock_descriptor.serialize.return_value = {"other_key": "value"} + + mock_model = MagicMock() + mock_model.get_descriptor.return_value = mock_descriptor + + result = self.get_descriptor_type(mock_model) + self.assertEqual(result, "UNKNOWN") + + def test_serialize_returns_non_dict(self): + """Test handling of serialize() returning non-dict.""" + mock_descriptor = MagicMock() + mock_descriptor.serialize.return_value = "not_a_dict" + + mock_model = MagicMock() + mock_model.get_descriptor.return_value = mock_descriptor + + result = self.get_descriptor_type(mock_model) + self.assertEqual(result, "UNKNOWN") + + def test_unknown_model_structure(self): + """Test handling of unknown model structure.""" + mock_model = MagicMock(spec=[]) # No get_descriptor, no atomic_model + result = self.get_descriptor_type(mock_model) + self.assertEqual(result, "UNKNOWN") + + +class TestCountParameters(unittest.TestCase): + """Test count_parameters helper function.""" + + @staticmethod + def count_parameters(model): + """Replicate the logic from training.py for testing.""" + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + def test_all_trainable(self): + """Test counting when all parameters are trainable.""" + with torch.device("cpu"): + model = torch.nn.Linear(10, 5) # 10*5 + 5 = 55 parameters + result = self.count_parameters(model) + self.assertEqual(result, 55) + + def test_mixed_trainable(self): + """Test counting with some frozen parameters.""" + with torch.device("cpu"): + model = torch.nn.Sequential( + torch.nn.Linear(10, 5), # 55 params + torch.nn.Linear(5, 3), # 18 params + ) + # Freeze first layer + for param in model[0].parameters(): + param.requires_grad = False + + result = self.count_parameters(model) + self.assertEqual(result, 18) # Only second layer + + def test_all_frozen(self): + """Test counting when all parameters are frozen.""" + with torch.device("cpu"): + model = torch.nn.Linear(10, 5) + for param in model.parameters(): + param.requires_grad = False + + result = self.count_parameters(model) + self.assertEqual(result, 0) + + +if __name__ == "__main__": + unittest.main()