Skip to content

Commit 0eee57c

Browse files
authored
Merge pull request #1094 from ArmaanjeetSandhu/fix/csv-gpu-columns-dropna
fix: prevent CSV column inconsistency for gpu_count and gpu_model fields (fixes #1092)
2 parents 01c2fc5 + 4ff4107 commit 0eee57c

4 files changed

Lines changed: 140 additions & 9 deletions

File tree

codecarbon/core/resource_tracker.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,8 @@ def set_GPU_tracking(self):
237237
self.gpu_tracker = "pynvml"
238238
else:
239239
logger.info("No GPU found.")
240+
self.tracker._conf.setdefault("gpu_count", 0)
241+
self.tracker._conf.setdefault("gpu_model", "")
240242

241243
def set_CPU_GPU_ram_tracking(self):
242244
"""

codecarbon/emissions_tracker.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -904,8 +904,8 @@ def _prepare_emissions_data(self) -> EmissionsData:
904904
os=self._conf.get("os"),
905905
python_version=self._conf.get("python_version"),
906906
codecarbon_version=self._conf.get("codecarbon_version"),
907-
gpu_count=self._conf.get("gpu_count"),
908-
gpu_model=self._conf.get("gpu_model"),
907+
gpu_count=self._conf.get("gpu_count", 0),
908+
gpu_model=self._conf.get("gpu_model", ""),
909909
cpu_count=self._conf.get("cpu_count"),
910910
cpu_model=self._conf.get("cpu_model"),
911911
longitude=self._conf.get("longitude"),

codecarbon/output_methods/file.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,8 @@ def out(self, total: EmissionsData, _):
102102
df = new_df
103103
elif self.on_csv_write == "append":
104104
df = pd.read_csv(self.save_file_path)
105-
# Filter out empty or all-NA columns, to avoid warnings from Pandas,
105+
# Drop empty or all-NA columns from new_df only (never from the existing CSV data, so its columns stay stable), to avoid warnings from Pandas,
106106
# see https://github.com/pandas-dev/pandas/issues/55928
107-
df = df.dropna(axis=1, how="all")
108107
new_df = new_df.dropna(axis=1, how="all")
109108
df = pd.concat([df, new_df])
110109
else:
@@ -134,19 +133,17 @@ def task_out(self, data: List[TaskEmissionsData], experiment_name: str):
134133
"""
135134
Save the emissions data from a single task in an experiment run to a CSV file.
136135
137-
Does not attempt to backup existing files or prevent ovewritting them.
136+
Does not attempt to backup existing files or prevent overwriting them.
138137
"""
139138
run_id = data[0].run_id
140139
save_task_file_path = os.path.join(
141140
self.output_dir, "emissions_" + experiment_name + "_" + run_id + ".csv"
142141
)
143-
df = pd.DataFrame(columns=data[0].values.keys())
144142
new_df = pd.DataFrame.from_records(
145143
[dict(data_point.values) for data_point in data]
146144
)
147-
# Filter out empty or all-NA columns, to avoid warnings from Pandas
145+
# Drop empty or all-NA columns from new_df before writing (originally done to avoid concat warnings from Pandas; nothing is concatenated here anymore),
148146
# see https://github.com/pandas-dev/pandas/issues/55928
149-
df = df.dropna(axis=1, how="all")
150147
new_df = new_df.dropna(axis=1, how="all")
151-
df = pd.concat([df, new_df], ignore_index=True)
148+
df = new_df
152149
df.to_csv(save_task_file_path, index=False)

tests/output_methods/test_file.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,138 @@ def test_file_output_out_update_empty_file_exists(self):
197197
df = pd.read_csv(os.path.join(self.temp_dir, "test.csv"))
198198
self.assertEqual(len(df), 1)
199199

200+
def test_file_output_out_append_no_gpu_consistent_columns(self):
201+
"""Regression test: successive appends with gpu_count=None/gpu_model=None must
202+
never trigger a format-change warning or produce a .bak backup file.
203+
204+
The bug: dropna(axis=1, how="all") was applied to the *existing* CSV DataFrame
205+
as well as to new_df. On a CPU-only machine both gpu_count and gpu_model are
206+
NaN in every row, so after the second write those columns were silently dropped.
207+
The third write then detected a schema mismatch and backed up the file.
208+
"""
209+
no_gpu_data = EmissionsData(
210+
timestamp="2023-01-01T00:00:00",
211+
project_name="test_project",
212+
run_id="test_run_id",
213+
experiment_id="test_experiment_id",
214+
duration=10,
215+
emissions=0.5,
216+
emissions_rate=0.05,
217+
cpu_power=20,
218+
gpu_power=0,
219+
ram_power=5,
220+
cpu_energy=200,
221+
gpu_energy=0,
222+
ram_energy=50,
223+
energy_consumed=250,
224+
water_consumed=0.1,
225+
country_name="Testland",
226+
country_iso_code="TS",
227+
region="Test Region",
228+
cloud_provider="",
229+
cloud_region="",
230+
os="TestOS",
231+
python_version="3.8",
232+
codecarbon_version="2.0",
233+
cpu_count=4,
234+
cpu_model="Test CPU",
235+
gpu_count=None,
236+
gpu_model=None,
237+
longitude=0,
238+
latitude=0,
239+
ram_total_size=16,
240+
tracking_mode="machine",
241+
)
242+
243+
file_output = FileOutput("test.csv", self.temp_dir, on_csv_write="append")
244+
245+
# Write four times — prior to the fix, the 3rd write triggered a backup.
246+
for _ in range(4):
247+
file_output.out(no_gpu_data, None)
248+
self.assertTrue(
249+
file_output.has_valid_headers(no_gpu_data),
250+
"CSV headers became invalid after an append (gpu_count/gpu_model "
251+
"columns were dropped by dropna).",
252+
)
253+
254+
# No .bak file should have been created.
255+
bak_path = file_output.save_file_path + ".bak"
256+
self.assertFalse(
257+
os.path.exists(bak_path),
258+
"A backup file was created even though the CSV schema did not change.",
259+
)
260+
261+
# All four rows must be present.
262+
df = pd.read_csv(file_output.save_file_path)
263+
self.assertEqual(len(df), 4)
264+
265+
# gpu_count and gpu_model columns must still be present (as NaN).
266+
self.assertIn("gpu_count", df.columns)
267+
self.assertIn("gpu_model", df.columns)
268+
269+
def test_file_output_out_append_no_gpu_zero_defaults(self):
270+
"""Test that gpu_count=0 and gpu_model="" (the new tracker defaults for
271+
CPU-only machines) produce consistent CSV columns across successive writes.
272+
"""
273+
no_gpu_data = EmissionsData(
274+
timestamp="2023-01-01T00:00:00",
275+
project_name="test_project",
276+
run_id="test_run_id",
277+
experiment_id="test_experiment_id",
278+
duration=10,
279+
emissions=0.5,
280+
emissions_rate=0.05,
281+
cpu_power=20,
282+
gpu_power=0,
283+
ram_power=5,
284+
cpu_energy=200,
285+
gpu_energy=0,
286+
ram_energy=50,
287+
energy_consumed=250,
288+
water_consumed=0.1,
289+
country_name="Testland",
290+
country_iso_code="TS",
291+
region="Test Region",
292+
cloud_provider="",
293+
cloud_region="",
294+
os="TestOS",
295+
python_version="3.8",
296+
codecarbon_version="2.0",
297+
cpu_count=4,
298+
cpu_model="Test CPU",
299+
gpu_count=0,
300+
gpu_model="",
301+
longitude=0,
302+
latitude=0,
303+
ram_total_size=16,
304+
tracking_mode="machine",
305+
)
306+
307+
file_output = FileOutput("test.csv", self.temp_dir, on_csv_write="append")
308+
309+
for _ in range(4):
310+
file_output.out(no_gpu_data, None)
311+
self.assertTrue(
312+
file_output.has_valid_headers(no_gpu_data),
313+
"CSV headers should remain consistent with gpu_count=0 / gpu_model=''.",
314+
)
315+
316+
bak_path = file_output.save_file_path + ".bak"
317+
self.assertFalse(
318+
os.path.exists(bak_path),
319+
"No backup should be created when columns are consistent.",
320+
)
321+
322+
df = pd.read_csv(file_output.save_file_path)
323+
self.assertEqual(len(df), 4)
324+
self.assertIn("gpu_count", df.columns)
325+
self.assertIn("gpu_model", df.columns)
326+
# With 0/"" defaults, gpu_count should be 0 (not NaN)
327+
self.assertTrue((df["gpu_count"] == 0).all())
328+
# gpu_model="" is read back as NaN by pandas (empty string in CSV),
329+
# but the column must still be present.
330+
self.assertIn("gpu_model", df.columns)
331+
200332
def test_file_output_task_out(self):
201333
task_emissions_data = [
202334
TaskEmissionsData(

0 commit comments

Comments
 (0)