def parse_gpu_ids(
    gpu_ids: Union[str, List[int], List[str]]
) -> Union[List[str], None]:
    """
    Transforms the potential gpu_ids into a list of string id values.

    Args:
        gpu_ids: The config file, environment variable or keyword-argument
            value for `gpu_ids`. Either a comma-separated string (any
            character that is not alphanumeric, '-' or ',' is dropped, so
            surrounding brackets and spaces are ignored) or a list whose
            items are ints and/or strings.

    Returns:
        list[str]: The list of GPU ids declared by the user. Potentially empty.
        None: when `gpu_ids` is neither a string nor a list of ints/strings;
        a warning is logged in that case.
    """
    if isinstance(gpu_ids, str):
        # Allow '-' in id strings since UUIDs may include them.
        cleaned = "".join(c for c in gpu_ids if c.isalnum() or c in ("-", ","))
        return [gpu_id for gpu_id in cleaned.split(",") if gpu_id]

    # Lists may mix integer indexes and string ids (e.g. UUID prefixes).
    if isinstance(gpu_ids, list) and all(
        isinstance(gpu_id, (int, str)) for gpu_id in gpu_ids
    ):
        return [str(gpu_id) for gpu_id in gpu_ids]

    logger.warning(
        "Invalid gpu_ids format. Expected a string or a list of ints/strings."
    )
    # Explicit: unsupported input yields None rather than a list.
    return None
def _get_gpu_ids(self) -> Iterable[int]:
    """
    Get the Ids of the GPUs that we will monitor.

    When ``self.gpu_ids`` is None, every index in ``range(self.num_gpus)`` is
    returned. Otherwise each requested id is resolved to a GPU index:

    - an int, or a string of decimal digits, is used as an index and kept
      only when it lies in ``[0, self.num_gpus)``;
    - any other non-empty string is matched, after removing one leading
      ``MIG-``, as a prefix of the UUIDs reported by
      ``self.devices.get_gpu_static_info()``; the first matching UUID wins.

    Ids that resolve to nothing are logged as a warning and ignored.
    ``self.gpu_ids`` is replaced by the resolved indices.

    :return: sorted list of unique GPU indices to monitor
    """
    if self.gpu_ids is None:
        return list(range(self.num_gpus))

    uuids_to_ids = {
        info.get("uuid"): info.get("gpu_index")
        for info in self.devices.get_gpu_static_info()
    }
    mig_prefix = "MIG-"
    monitored_gpu_ids = set()

    for gpu_id in self.gpu_ids:
        found_gpu_id = False
        # Does it look like an index into the number of GPUs on the system?
        # isdecimal() (not isdigit()) so that only strings int() can parse
        # are treated as indexes.
        if isinstance(gpu_id, int) or (
            isinstance(gpu_id, str) and gpu_id.isdecimal()
        ):
            gpu_index = int(gpu_id)
            if 0 <= gpu_index < self.num_gpus:
                monitored_gpu_ids.add(gpu_index)
                found_gpu_id = True
        # Does it match a prefix of any UUID on the system after stripping any 'MIG-'
        # id prefix per https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-environment-variables ?
        elif isinstance(gpu_id, str):
            # Remove the literal prefix: str.lstrip("MIG-") would instead strip
            # any leading run of the characters 'M', 'I', 'G', '-' and turn
            # "GPU-abc" into "PU-abc".
            if gpu_id.startswith(mig_prefix):
                stripped_gpu_id_str = gpu_id[len(mig_prefix):]
            else:
                stripped_gpu_id_str = gpu_id
            # An empty prefix would match every UUID, so it never matches.
            if stripped_gpu_id_str:
                for uuid, gpu_index in uuids_to_ids.items():
                    if isinstance(uuid, str) and uuid.startswith(
                        stripped_gpu_id_str
                    ):
                        logger.debug(
                            f"Matching GPU ID {stripped_gpu_id_str} (originally {gpu_id}) against {uuid} for GPU index {gpu_index}"
                        )
                        monitored_gpu_ids.add(gpu_index)
                        found_gpu_id = True
                        break
        if not found_gpu_id:
            logger.warning(
                f"GPU with ID '{gpu_id}' not found or invalid. It will be ignored."
            )

    monitored_gpu_ids = sorted(monitored_gpu_ids)
    self.gpu_ids = monitored_gpu_ids
    return monitored_gpu_ids
+ ) return cls(gpu_ids=new_gpu_ids) diff --git a/docs/edit/parameters.rst b/docs/edit/parameters.rst index a1b4d53bf..56ceeb949 100644 --- a/docs/edit/parameters.rst +++ b/docs/edit/parameters.rst @@ -25,7 +25,9 @@ Input Parameters - | ``machine`` measure the power consumptions of the entire machine (defaults) | ``process`` try and isolate the tracked processes in isolation * - gpu_ids - - User-specified known gpu ids to track, defaults to ``None`` + - | Comma-separated list of GPU ids to track, defaults to ``None`` + | These can either be integer indexes of GPUs on the system, or prefixes + | to match against GPU identifiers as described `here <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-environment-variables>`_ * - log_level - | Global codecarbon log level (by order of verbosity): "debug", "info" (defaults), | "warning", "error", or "critical" diff --git a/docs/parameters.html b/docs/parameters.html index b8b72a0dc..d388a733f 100644 --- a/docs/parameters.html +++ b/docs/parameters.html @@ -131,7 +131,12 @@

Input Parameters

gpu_ids

-

User-specified known gpu ids to track, defaults to None

+
+

log_level

def test_parse_gpu_ids(self):
    """parse_gpu_ids turns id strings and int lists into lists of string ids."""
    cases = (
        # Comma-separated strings; brackets, parentheses and spaces are dropped.
        ("0,1,2", ["0", "1", "2"]),
        ("[0, 1, 2", ["0", "1", "2"]),
        ("(0, 1, 2)", ["0", "1", "2"]),
        ("[1]", ["1"]),
        ("1", ["1"]),
        ("0", ["0"]),
        # Dashes are kept in string ids.
        ("MIG-f1e", ["MIG-f1e"]),
        # Empty inputs give an empty list.
        ("", []),
        ([], []),
        # Lists of ints are converted item by item.
        ([1, 2, 3], ["1", "2", "3"]),
    )
    for raw_ids, expected in cases:
        parsed = parse_gpu_ids(raw_ids)
        self.assertEqual(parsed, expected)
def test_get_gpu_ids(self):
    """
    Check that GPU._get_gpu_ids resolves indexes and UUID prefixes to GPU indexes.
    """
    from codecarbon.external.hardware import GPU

    scenarios = (
        # Integer indexes; the fixture exposes two GPUs, so index 2 is dropped.
        ([0, 1], [0, 1]),
        ([0, 1, 2], [0, 1]),
        ([2], []),
        # Same indexes given as digit strings.
        (["0", "1"], [0, 1]),
        (["0", "1", "2"], [0, 1]),
        (["2"], []),
        # UUIDs are mapped to their GPU index; unknown UUIDs are dropped.
        (["uuid-1"], [0]),
        (["uuid-1", "uuid-2"], [0, 1]),
        (["uuid-3"], []),
        # A leading 'MIG-' is removed before matching the UUID.
        (["MIG-uuid-1"], [0]),
        (["MIG-uuid-3"], []),
    )
    for test_ids, expected_ids in scenarios:
        resolved = GPU(test_ids)._get_gpu_ids()
        assert resolved == expected_ids