perf(file): read only header row instead of entire CSV in has_valid_headers (#1235)

vishali-mp · Vishali M P · web-flow · commit c82e8e2b3da2 · 2026-06-11T15:22:42.000+02:00
* perf(file): fast append and schema migration for CSV output

- has_valid_headers() reads only first row instead of entire CSV
- Append mode with matching headers: direct append via to_csv(mode='a')
- On schema mismatch: back up the old file and start a new one with the new schema
  (old rows are preserved in the .bak backup, not merged into the new file)
- dropna(axis=1, how='all') only in append path to preserve new file columns

* remove unintended changes

---------

Co-authored-by: Vishali M P <vishali@Mac.lan>
diff --git a/codecarbon/output_methods/file.py b/codecarbon/output_methods/file.py
@@ -62,14 +62,12 @@ def has_valid_headers(self, data: EmissionsData) -> bool:
             True if the file has valid headers, False otherwise.
         """
         with open(self.save_file_path) as csv_file:
-            csv_reader = csv.DictReader(csv_file)
-            csv_entries_list = list(csv_reader)
-            if len(csv_entries_list) == 0:
-                # No entries
+            reader = csv.reader(csv_file)
+            try:
+                headers = next(reader)
+            except StopIteration:
                 return True
-            dict_from_csv = dict(csv_entries_list[0])
-            list_of_column_names = sorted(dict_from_csv.keys())
-            return sorted(data.values.keys()) == list_of_column_names
+            return sorted(headers) == sorted(data.values.keys())
 
     def out(self, total: EmissionsData, _):
         """
@@ -78,9 +76,8 @@ def out(self, total: EmissionsData, _):
         * If the file does not exist, then create it.
         * If the file already exists but has invalid headers, then back it up and replace with new data.
         * If the file already exists and has valid headers:
-            * If it has no rows with a matching run ID, append the new data.
-            * If it has one row with a matching run ID, then replace that row with the new data.
-            * If it has > one row with a matching run ID, append the new data
+            * In "append" mode, append the new row directly.
+            * In "update" mode, deduplicate by run_id.
 
         Args:
             total: data to save.
@@ -93,19 +90,20 @@ def out(self, total: EmissionsData, _):
                 f"File {self.save_file_path} exists but is empty. Treating as new file."
             )
             file_exists = False
-        if file_exists and not self.has_valid_headers(total):
+
+        headers_match = file_exists and self.has_valid_headers(total)
+        if file_exists and not headers_match:
             logger.warning("The CSV format has changed, backing up old emission file.")
             backup(self.save_file_path)
             file_exists = False
+
         new_df = pd.DataFrame.from_records([dict(total.values)])
+
         if not file_exists:
-            df = new_df
+            new_df.to_csv(self.save_file_path, index=False)
         elif self.on_csv_write == "append":
-            df = pd.read_csv(self.save_file_path)
-            # Filter out empty or all-NA columns only from new_df, to avoid warnings from Pandas,
-            # see https://github.com/pandas-dev/pandas/issues/55928
             new_df = new_df.dropna(axis=1, how="all")
-            df = pd.concat([df, new_df])
+            new_df.to_csv(self.save_file_path, mode="a", header=False, index=False)
         else:
             df = pd.read_csv(self.save_file_path)
             df_run = df.loc[df.run_id == total.run_id]
@@ -121,13 +119,11 @@ def out(self, total: EmissionsData, _):
             else:
                 update_values = {}
                 for col, val in dict(total.values).items():
-                    # Explicitly cast new values to prevent warnings about incompatible dtypes.
                     update_values[col] = df[col].dtype.type(val)
                 df.loc[df.run_id == total.run_id, update_values.keys()] = (
                     update_values.values()
                 )
-
-        df.to_csv(self.save_file_path, index=False)
+            df.to_csv(self.save_file_path, index=False)
 
     def task_out(self, data: List[TaskEmissionsData], experiment_name: str):
         """
diff --git a/tests/output_methods/test_file.py b/tests/output_methods/test_file.py
@@ -375,3 +375,41 @@ def test_file_output_task_out(self):
         self.assertTrue(os.path.exists(expected_file))
         df = pd.read_csv(expected_file)
         self.assertEqual(len(df), 1)
+
+    def test_fast_append_with_matching_headers(self):
+        file_output = FileOutput("test.csv", self.temp_dir, on_csv_write="append")
+        file_output.out(self.emissions_data, None)
+        file_output.out(self.emissions_data, None)
+        file_output.out(self.emissions_data, None)
+
+        df = pd.read_csv(os.path.join(self.temp_dir, "test.csv"))
+        self.assertEqual(len(df), 3)
+
+    def test_schema_migration_creates_backup(self):
+        file_output = FileOutput("test.csv", self.temp_dir, on_csv_write="append")
+        file_output.out(self.emissions_data, None)
+
+        path = os.path.join(self.temp_dir, "test.csv")
+        df_old = pd.read_csv(path)
+        df_old.rename(columns={"cpu_model": "old_cpu_model"}, inplace=True)
+        df_old.to_csv(path, index=False)
+
+        self.assertFalse(file_output.has_valid_headers(self.emissions_data))
+
+        file_output.out(self.emissions_data, None)
+
+        df_bak = pd.read_csv(path + ".bak")
+        self.assertEqual(len(df_bak), 1)
+        self.assertIn("old_cpu_model", df_bak.columns)
+
+        df = pd.read_csv(path)
+        self.assertEqual(len(df), 1)
+        self.assertIn("cpu_model", df.columns)
+
+    def test_out_append_large_file_fast_path(self):
+        file_output = FileOutput("test.csv", self.temp_dir, on_csv_write="append")
+        file_output.out(self.emissions_data, None)
+        file_output.out(self.emissions_data, None)
+
+        df = pd.read_csv(os.path.join(self.temp_dir, "test.csv"))
+        self.assertEqual(len(df), 2)
diff --git a/uv.lock b/uv.lock