Skip to content

Commit c7aca80

Browse files
author
benoit-cty
committed
Better RAPL
1 parent bcf998c commit c7aca80

2 files changed

Lines changed: 205 additions & 32 deletions

File tree

codecarbon/core/cpu.py

Lines changed: 151 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -288,42 +288,170 @@ def _fetch_rapl_files(self) -> None:
288288
"""
289289
Fetches RAPL files from the RAPL directory
290290
"""
291-
292-
# consider files like `intel-rapl:$i`
293-
files = list(filter(lambda x: ":" in x, os.listdir(self._lin_rapl_dir)))
291+
# We'll scan common powercap locations and look for domain directories
292+
# that expose an `energy_uj` file. We try to be tolerant to permission
293+
# errors and simply skip unreadable entries instead of failing the whole
294+
# tracker when one RAPL subtree is not accessible (e.g., intel-rapl-mmio).
295+
candidate_bases = [
296+
self._lin_rapl_dir,
297+
os.path.dirname(self._lin_rapl_dir),
298+
"/sys/class/powercap",
299+
"/sys/devices/virtual/powercap",
300+
]
301+
302+
# Deduplicate while preserving order and keep only existing paths
303+
seen = set()
304+
candidate_bases = [
305+
p
306+
for p in candidate_bases
307+
if p and not (p in seen or seen.add(p)) and os.path.exists(p)
308+
]
309+
310+
domain_dirs = []
311+
for base in candidate_bases:
312+
try:
313+
for entry in os.listdir(base):
314+
# Look for powercap provider directories like 'intel-rapl' or 'intel-rapl-mmio'
315+
if not entry.startswith("intel-rapl"):
316+
continue
317+
entry_path = os.path.join(base, entry)
318+
if not os.path.isdir(entry_path):
319+
continue
320+
# Look for domain directories under the provider that usually contain ':' in their name
321+
try:
322+
for sub in os.listdir(entry_path):
323+
sub_path = os.path.join(entry_path, sub)
324+
if ":" in sub and os.path.isdir(sub_path):
325+
# Only consider if energy file exists
326+
if os.path.exists(os.path.join(sub_path, "energy_uj")):
327+
domain_dirs.append(sub_path)
328+
except Exception as e:
329+
if isinstance(e, PermissionError):
330+
logger.warning(
331+
"Permission denied listing %s: %s", entry_path, e
332+
)
333+
else:
334+
logger.debug("Cannot list %s: %s", entry_path, e)
335+
except Exception as e:
336+
if isinstance(e, PermissionError):
337+
logger.warning(
338+
"Permission denied scanning %s for RAPL domains: %s", base, e
339+
)
340+
else:
341+
logger.debug("Cannot scan %s for RAPL domains: %s", base, e)
342+
343+
# Fallback: if none found and the configured path looks like it directly
344+
# contains domain entries, try listing it (preserves backward compatibility).
345+
if not domain_dirs:
346+
try:
347+
for item in os.listdir(self._lin_rapl_dir):
348+
if ":" in item:
349+
path = os.path.join(self._lin_rapl_dir, item)
350+
if os.path.isdir(path) and os.path.exists(
351+
os.path.join(path, "energy_uj")
352+
):
353+
domain_dirs.append(path)
354+
except Exception:
355+
# ignore: we'll handle the empty domain_dirs case below
356+
pass
357+
358+
# Remove duplicates
359+
domain_dirs = list(dict.fromkeys(domain_dirs))
294360

295361
i = 0
296-
for file in files:
297-
path = os.path.join(self._lin_rapl_dir, file, "name")
298-
with open(path) as f:
299-
name = f.read().strip()
300-
# Fake the name used by Power Gadget
301-
# We ignore "core" in name as it seems to be included in "package" for Intel CPU.
302-
# TODO: Use "dram" for memory power
362+
for domain_dir in domain_dirs:
363+
try:
364+
name_path = os.path.join(domain_dir, "name")
365+
name = None
366+
if os.path.exists(name_path):
367+
try:
368+
with open(name_path) as f:
369+
name = f.read().strip()
370+
except Exception as e:
371+
if isinstance(e, PermissionError):
372+
logger.warning(
373+
"Permission denied reading name file %s: %s",
374+
name_path,
375+
e,
376+
)
377+
else:
378+
logger.debug(
379+
"Unable to read name file %s: %s", name_path, e
380+
)
381+
if not name:
382+
# Use the domain directory basename as a fallback
383+
name = os.path.basename(domain_dir)
384+
303385
if "package" in name:
304386
name = f"Processor Energy Delta_{i}(kWh)"
305387
i += 1
306-
# RAPL file to take measurement from
307-
rapl_file = os.path.join(self._lin_rapl_dir, file, "energy_uj")
308-
# RAPL file containing maximum possible value of energy_uj above which it wraps
309-
rapl_file_max = os.path.join(
310-
self._lin_rapl_dir, file, "max_energy_range_uj"
311-
)
388+
389+
rapl_file = os.path.join(domain_dir, "energy_uj")
390+
rapl_file_max = os.path.join(domain_dir, "max_energy_range_uj")
391+
392+
# Quick sanity check: can we read the energy value? If not, either
393+
# fail (for main/package domains) or skip gracefully.
394+
is_required_main = ("package" in name.lower()) or os.path.basename(
395+
domain_dir
396+
).endswith(":0")
312397
try:
313-
# Try to read the file to be sure we can
314398
with open(rapl_file, "r") as f:
315399
_ = float(f.read())
400+
except PermissionError as e:
401+
msg = f"Permission denied reading RAPL file {rapl_file}."
402+
suggestion = "You can grant read permission with: sudo chmod -R a+r /sys/class/powercap/*"
403+
if is_required_main:
404+
# Fail early if the main package energy file is not readable
405+
raise PermissionError(msg + " " + suggestion) from e
406+
else:
407+
logger.warning("%s %s; skipping.", msg, suggestion)
408+
continue
409+
except Exception as e:
410+
if is_required_main:
411+
# If the main file is unreadable or non-numeric, fail early
412+
raise RuntimeError(
413+
f"Unable to read main RAPL file {rapl_file}: {e}"
414+
) from e
415+
else:
416+
logger.debug(
417+
"Skipping non-numeric or unreadable RAPL file %s: %s",
418+
rapl_file,
419+
e,
420+
)
421+
continue
422+
423+
try:
316424
self._rapl_files.append(
317425
RAPLFile(name=name, path=rapl_file, max_path=rapl_file_max)
318426
)
319427
logger.debug("We will read Intel RAPL files at %s", rapl_file)
320-
except PermissionError as e:
321-
raise PermissionError(
322-
"PermissionError : Unable to read Intel RAPL files for CPU power, we will use a constant for your CPU power."
323-
+ " Please view https://github.com/mlco2/codecarbon/issues/244"
324-
+ " for workarounds : %s",
428+
except Exception as e:
429+
if isinstance(e, PermissionError) and is_required_main:
430+
raise
431+
if isinstance(e, PermissionError):
432+
logger.warning(
433+
"Permission denied while initializing RAPL file %s: %s",
434+
rapl_file,
435+
e,
436+
)
437+
else:
438+
logger.debug(
439+
"Unable to initialize RAPLFile for %s: %s", rapl_file, e
440+
)
441+
continue
442+
except Exception as e:
443+
if isinstance(e, PermissionError):
444+
# If we get a permission error here and it's not handled above,
445+
# surface it as a warning unless it's the main domain which
446+
# should have failed earlier.
447+
logger.warning(
448+
"Permission error while processing RAPL domain %s: %s",
449+
domain_dir,
325450
e,
326-
) from e
451+
)
452+
else:
453+
logger.debug("Error processing RAPL domain %s: %s", domain_dir, e)
454+
continue
327455

328456
def get_cpu_details(self, duration: Time) -> Dict:
329457
"""

codecarbon/core/rapl.py

Lines changed: 54 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -22,21 +22,66 @@ class RAPLFile:
2222
max_energy_reading: Energy = field(default_factory=lambda: Energy(0))
2323

2424
def __post_init__(self):
25-
self.last_energy = self._get_value()
26-
with open(self.max_path, "r") as f:
27-
max_micro_joules = float(f.read())
25+
try:
26+
self.last_energy = self._get_value()
27+
except Exception as e:
28+
# Be tolerant to permission / IO errors during initialization.
29+
# If we cannot read the initial energy, default to 0 and continue;
30+
# the caller will skip or produce zeros for this RAPL file.
31+
if isinstance(e, PermissionError):
32+
logger.warning(
33+
"Unable to read initial RAPL energy from %s due to permission error: %s",
34+
self.path,
35+
e,
36+
)
37+
else:
38+
logger.debug(
39+
"Unable to read initial RAPL energy from %s: %s",
40+
self.path,
41+
e,
42+
)
43+
self.last_energy = Energy.from_ujoules(0)
2844

29-
self.max_energy_reading = Energy.from_ujoules(max_micro_joules)
45+
try:
46+
with open(self.max_path, "r") as f:
47+
max_micro_joules = float(f.read())
48+
self.max_energy_reading = Energy.from_ujoules(max_micro_joules)
49+
except Exception as e:
50+
# If we cannot read the max range, log and set to 0 so wrap detection
51+
# will be effectively disabled for this file.
52+
if isinstance(e, PermissionError):
53+
logger.warning(
54+
"Unable to read max_energy_range_uj from %s due to permission error: %s",
55+
self.max_path,
56+
e,
57+
)
58+
else:
59+
logger.debug(
60+
"Unable to read max_energy_range_uj from %s: %s",
61+
self.max_path,
62+
e,
63+
)
64+
self.max_energy_reading = Energy.from_ujoules(0)
3065

3166
def _get_value(self) -> Energy:
3267
"""
3368
Reads the value in the file at the path
3469
"""
35-
with open(self.path, "r") as f:
36-
micro_joules = float(f.read())
37-
38-
e = Energy.from_ujoules(micro_joules)
39-
return e
70+
try:
71+
with open(self.path, "r") as f:
72+
micro_joules = float(f.read())
73+
return Energy.from_ujoules(micro_joules)
74+
except Exception as e:
75+
# Be tolerant to transient IO / permission errors while reading energy.
76+
if isinstance(e, PermissionError):
77+
logger.warning(
78+
"Unable to read RAPL value from %s due to permission error: %s",
79+
self.path,
80+
e,
81+
)
82+
else:
83+
logger.debug("Unable to read RAPL value from %s: %s", self.path, e)
84+
return Energy.from_ujoules(0)
4085

4186
def start(self) -> None:
4287
self.last_energy = self._get_value()

0 commit comments

Comments
 (0)