Merge pull request #1094 from ArmaanjeetSandhu/fix/csv-gpu-columns-dropna

benoit-cty · web-flow · commit 0eee57c497cc · 2026-03-01T18:59:19.000+01:00
fix: prevent CSV column inconsistency for gpu_count and gpu_model fields (fixes #1092)
diff --git a/codecarbon/core/resource_tracker.py b/codecarbon/core/resource_tracker.py
@@ -237,6 +237,8 @@ def set_GPU_tracking(self):
             self.gpu_tracker = "pynvml"
         else:
             logger.info("No GPU found.")
+            self.tracker._conf.setdefault("gpu_count", 0)
+            self.tracker._conf.setdefault("gpu_model", "")
 
     def set_CPU_GPU_ram_tracking(self):
         """
diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py
@@ -904,8 +904,8 @@ def _prepare_emissions_data(self) -> EmissionsData:
             os=self._conf.get("os"),
             python_version=self._conf.get("python_version"),
             codecarbon_version=self._conf.get("codecarbon_version"),
-            gpu_count=self._conf.get("gpu_count"),
-            gpu_model=self._conf.get("gpu_model"),
+            gpu_count=self._conf.get("gpu_count", 0),
+            gpu_model=self._conf.get("gpu_model", ""),
             cpu_count=self._conf.get("cpu_count"),
             cpu_model=self._conf.get("cpu_model"),
             longitude=self._conf.get("longitude"),
diff --git a/codecarbon/output_methods/file.py b/codecarbon/output_methods/file.py
@@ -102,9 +102,8 @@ def out(self, total: EmissionsData, _):
             df = new_df
         elif self.on_csv_write == "append":
             df = pd.read_csv(self.save_file_path)
-            # Filter out empty or all-NA columns, to avoid warnings from Pandas,
+            # Filter out empty or all-NA columns only from new_df, to avoid warnings from Pandas,
             # see https://github.com/pandas-dev/pandas/issues/55928
-            df = df.dropna(axis=1, how="all")
             new_df = new_df.dropna(axis=1, how="all")
             df = pd.concat([df, new_df])
         else:
@@ -134,19 +133,17 @@ def task_out(self, data: List[TaskEmissionsData], experiment_name: str):
         """
         Save the emissions data from a single task in an experiment run to a CSV file.
 
-        Does not attempt to backup existing files or prevent ovewritting them.
+        Does not attempt to backup existing files or prevent overwriting them.
         """
         run_id = data[0].run_id
         save_task_file_path = os.path.join(
             self.output_dir, "emissions_" + experiment_name + "_" + run_id + ".csv"
         )
-        df = pd.DataFrame(columns=data[0].values.keys())
         new_df = pd.DataFrame.from_records(
             [dict(data_point.values) for data_point in data]
         )
-        # Filter out empty or all-NA columns, to avoid warnings from Pandas
+        # Drop all-NA columns from new_df; a holdover from the removed pd.concat, which warned on them,
         # see https://github.com/pandas-dev/pandas/issues/55928
-        df = df.dropna(axis=1, how="all")
         new_df = new_df.dropna(axis=1, how="all")
-        df = pd.concat([df, new_df], ignore_index=True)
+        df = new_df
         df.to_csv(save_task_file_path, index=False)
diff --git a/tests/output_methods/test_file.py b/tests/output_methods/test_file.py
@@ -197,6 +197,138 @@ def test_file_output_out_update_empty_file_exists(self):
         df = pd.read_csv(os.path.join(self.temp_dir, "test.csv"))
         self.assertEqual(len(df), 1)
 
+    def test_file_output_out_append_no_gpu_consistent_columns(self):
+        """Regression test: successive appends with gpu_count=None/gpu_model=None must
+        never trigger a format-change warning or produce a .bak backup file.
+
+        The bug: dropna(axis=1, how="all") was applied to the *existing* CSV DataFrame
+        as well as to new_df.  On a CPU-only machine both gpu_count and gpu_model are
+        NaN in every row, so after the second write those columns were silently dropped.
+        The third write then detected a schema mismatch and backed up the file.
+        """
+        no_gpu_data = EmissionsData(
+            timestamp="2023-01-01T00:00:00",
+            project_name="test_project",
+            run_id="test_run_id",
+            experiment_id="test_experiment_id",
+            duration=10,
+            emissions=0.5,
+            emissions_rate=0.05,
+            cpu_power=20,
+            gpu_power=0,
+            ram_power=5,
+            cpu_energy=200,
+            gpu_energy=0,
+            ram_energy=50,
+            energy_consumed=250,
+            water_consumed=0.1,
+            country_name="Testland",
+            country_iso_code="TS",
+            region="Test Region",
+            cloud_provider="",
+            cloud_region="",
+            os="TestOS",
+            python_version="3.8",
+            codecarbon_version="2.0",
+            cpu_count=4,
+            cpu_model="Test CPU",
+            gpu_count=None,
+            gpu_model=None,
+            longitude=0,
+            latitude=0,
+            ram_total_size=16,
+            tracking_mode="machine",
+        )
+
+        file_output = FileOutput("test.csv", self.temp_dir, on_csv_write="append")
+
+        # Write four times — prior to the fix, the 3rd write triggered a backup.
+        for _ in range(4):
+            file_output.out(no_gpu_data, None)
+            self.assertTrue(
+                file_output.has_valid_headers(no_gpu_data),
+                "CSV headers became invalid after an append (gpu_count/gpu_model "
+                "columns were dropped by dropna).",
+            )
+
+        # No .bak file should have been created.
+        bak_path = file_output.save_file_path + ".bak"
+        self.assertFalse(
+            os.path.exists(bak_path),
+            "A backup file was created even though the CSV schema did not change.",
+        )
+
+        # All four rows must be present.
+        df = pd.read_csv(file_output.save_file_path)
+        self.assertEqual(len(df), 4)
+
+        # gpu_count and gpu_model columns must still be present (as NaN).
+        self.assertIn("gpu_count", df.columns)
+        self.assertIn("gpu_model", df.columns)
+
+    def test_file_output_out_append_no_gpu_zero_defaults(self):
+        """Test that gpu_count=0 and gpu_model="" (the new tracker defaults for
+        CPU-only machines) produce consistent CSV columns across successive writes.
+        """
+        no_gpu_data = EmissionsData(
+            timestamp="2023-01-01T00:00:00",
+            project_name="test_project",
+            run_id="test_run_id",
+            experiment_id="test_experiment_id",
+            duration=10,
+            emissions=0.5,
+            emissions_rate=0.05,
+            cpu_power=20,
+            gpu_power=0,
+            ram_power=5,
+            cpu_energy=200,
+            gpu_energy=0,
+            ram_energy=50,
+            energy_consumed=250,
+            water_consumed=0.1,
+            country_name="Testland",
+            country_iso_code="TS",
+            region="Test Region",
+            cloud_provider="",
+            cloud_region="",
+            os="TestOS",
+            python_version="3.8",
+            codecarbon_version="2.0",
+            cpu_count=4,
+            cpu_model="Test CPU",
+            gpu_count=0,
+            gpu_model="",
+            longitude=0,
+            latitude=0,
+            ram_total_size=16,
+            tracking_mode="machine",
+        )
+
+        file_output = FileOutput("test.csv", self.temp_dir, on_csv_write="append")
+
+        for _ in range(4):
+            file_output.out(no_gpu_data, None)
+            self.assertTrue(
+                file_output.has_valid_headers(no_gpu_data),
+                "CSV headers should remain consistent with gpu_count=0 / gpu_model=''.",
+            )
+
+        bak_path = file_output.save_file_path + ".bak"
+        self.assertFalse(
+            os.path.exists(bak_path),
+            "No backup should be created when columns are consistent.",
+        )
+
+        df = pd.read_csv(file_output.save_file_path)
+        self.assertEqual(len(df), 4)
+        self.assertIn("gpu_count", df.columns)
+        self.assertIn("gpu_model", df.columns)
+        # With 0/"" defaults, gpu_count should be 0 (not NaN)
+        self.assertTrue((df["gpu_count"] == 0).all())
+        # gpu_model="" is read back as NaN by pandas (empty string in CSV),
+        # but the column must still be present.
+        self.assertIn("gpu_model", df.columns)
+
     def test_file_output_task_out(self):
         task_emissions_data = [
             TaskEmissionsData(