Skip to content

Commit dc59b8c

Browse files
committed
rows_to_csv
1 parent 7ad1be2 commit dc59b8c

File tree

9 files changed

+52
-142
lines changed

9 files changed

+52
-142
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 8 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -125,31 +125,6 @@ def parse_arguments():
125125
return args
126126

127127

128-
def initialize_data_file(file_path, headers):
129-
"""Initialize CSV file with headers if it doesn't exist."""
130-
if not os.path.isfile(file_path):
131-
with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
132-
writer = csv.DictWriter(
133-
file_obj, fieldnames=headers, dialect="unix"
134-
)
135-
writer.writeheader()
136-
137-
138-
def initialize_all_data_files(args):
139-
"""Initialize all data files used by this script.
140-
141-
Creates the data directory and initializes empty CSVs with headers.
142-
"""
143-
if not args.enable_save:
144-
return
145-
146-
os.makedirs(PATHS["data_1-fetch"], exist_ok=True)
147-
initialize_data_file(FILE_ARXIV_COUNT, HEADER_COUNT)
148-
initialize_data_file(FILE_ARXIV_CATEGORY_REPORT, HEADER_CATEGORY_REPORT)
149-
initialize_data_file(FILE_ARXIV_YEAR, HEADER_YEAR)
150-
initialize_data_file(FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET)
151-
152-
153128
def get_identifier_mapping():
154129
global IDENTIER_MAPPING
155130
LOGGER.info("Loading CC Legal Tool metadata for CC identifer mapping")
@@ -472,19 +447,6 @@ def query_arxiv(args, session):
472447
return data, cc_articles_found
473448

474449

475-
def rows_to_csv(args, fieldnames, rows, file_path):
476-
if not args.enable_save:
477-
return args
478-
479-
with open(file_path, "w", encoding="utf-8", newline="\n") as file_handle:
480-
writer = csv.DictWriter(
481-
file_handle, fieldnames=fieldnames, dialect="unix"
482-
)
483-
writer.writeheader()
484-
for row in rows:
485-
writer.writerow(row)
486-
487-
488450
def write_data(args, data):
489451
"""
490452
Write fetched data to CSV files.
@@ -508,7 +470,9 @@ def write_data(args, data):
508470
}
509471
)
510472
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "AUTHOR_BUCKET"))
511-
rows_to_csv(args, HEADER_AUTHOR_BUCKET, rows, FILE_ARXIV_AUTHOR_BUCKET)
473+
shared.rows_to_csv(
474+
args, FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET, rows
475+
)
512476

513477
# Save category report
514478
# fetched_data["category_counts"]: {identifer: {category_code: count}}
@@ -527,15 +491,17 @@ def write_data(args, data):
527491
}
528492
)
529493
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "CATEGORY_CODE"))
530-
rows_to_csv(args, HEADER_CATEGORY_REPORT, rows, FILE_ARXIV_CATEGORY_REPORT)
494+
shared.rows_to_csv(
495+
args, FILE_ARXIV_CATEGORY_REPORT, HEADER_CATEGORY_REPORT, rows
496+
)
531497

532498
# Save tool counts report
533499
# fetched_data["tool_counts"]: {identfier: count}
534500
rows = []
535501
for identifier, count in data["tool_counts"].items():
536502
rows.append({"TOOL_IDENTIFIER": identifier, "COUNT": count})
537503
rows.sort(key=itemgetter("TOOL_IDENTIFIER"))
538-
rows_to_csv(args, HEADER_COUNT, rows, FILE_ARXIV_COUNT)
504+
shared.rows_to_csv(args, FILE_ARXIV_COUNT, HEADER_COUNT, rows)
539505

540506
# Save year count report
541507
# fetched_data["year_counts"]: {identifer: {year: count}}
@@ -546,7 +512,7 @@ def write_data(args, data):
546512
{"TOOL_IDENTIFIER": identifier, "YEAR": year, "COUNT": count}
547513
)
548514
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "YEAR"))
549-
rows_to_csv(args, HEADER_YEAR, rows, FILE_ARXIV_YEAR)
515+
shared.rows_to_csv(args, FILE_ARXIV_YEAR, HEADER_YEAR, rows)
550516

551517

552518
def write_provence(args, cc_articles_found):
@@ -584,7 +550,6 @@ def main():
584550
args = parse_arguments()
585551
shared.paths_log(LOGGER, PATHS)
586552
shared.git_fetch_and_merge(args, PATHS["repo"])
587-
initialize_all_data_files(args)
588553
get_identifier_mapping()
589554
session = shared.get_session()
590555
query_category_mapping(args, session)

scripts/1-fetch/github_fetch.py

Lines changed: 7 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
LOGGER, PATHS = shared.setup(__file__)
2929

3030
# Constants
31-
FILE1_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
31+
FILE_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
3232
GH_TOKEN = os.getenv("GH_TOKEN")
3333
# Also see: https://en.wikipedia.org/wiki/Public-domain-equivalent_license
3434
GITHUB_TOOLS = [
@@ -40,7 +40,7 @@
4040
{"TOOL_IDENTIFIER": "Unlicense", "SPDX_IDENTIFIER": "Unlicense"},
4141
{"TOOL_IDENTIFIER": "Total public repositories", "SPDX_IDENTIFIER": "N/A"},
4242
]
43-
HEADER1_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
43+
HEADER_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
4444
QUARTER = os.path.basename(PATHS["data_quarter"])
4545

4646

@@ -68,7 +68,7 @@ def parse_arguments():
6868

6969
def check_for_completion():
7070
try:
71-
with open(FILE1_COUNT, "r", newline="") as file_obj:
71+
with open(FILE_COUNT, "r", newline="") as file_obj:
7272
reader = csv.DictReader(file_obj, dialect="unix")
7373
if len(list(reader)) == len(GITHUB_TOOLS):
7474
raise shared.QuantifyingException(
@@ -78,27 +78,6 @@ def check_for_completion():
7878
pass # File may not be found without --enable-save, etc.
7979

8080

81-
def write_data(args, tool_data):
82-
if not args.enable_save:
83-
return args
84-
85-
# Create data directory for this phase
86-
os.makedirs(PATHS["data_phase"], exist_ok=True)
87-
88-
if len(tool_data) < len(GITHUB_TOOLS):
89-
LOGGER.error("Unable to fetch all records. Aborting.")
90-
return args
91-
92-
with open(FILE1_COUNT, "w", encoding="utf-8", newline="\n") as file_obj:
93-
writer = csv.DictWriter(
94-
file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
95-
)
96-
writer.writeheader()
97-
for row in tool_data:
98-
writer.writerow(row)
99-
return args
100-
101-
10281
def query_github(args, session):
10382
tool_data = []
10483
for tool in GITHUB_TOOLS:
@@ -148,7 +127,10 @@ def main():
148127
session.headers.update({"authorization": f"Bearer {GH_TOKEN}"})
149128

150129
tool_data = query_github(args, session)
151-
args = write_data(args, tool_data)
130+
if len(tool_data) < len(GITHUB_TOOLS):
131+
LOGGER.error("Unable to fetch all records. Aborting.")
132+
return args
133+
shared.rows_to_csv(args, FILE_COUNT, HEADER_COUNT, tool_data)
152134
args = shared.git_add_and_commit(
153135
args,
154136
PATHS["repo"],

scripts/1-fetch/openverse_fetch.py

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313

1414
# Standard library
1515
import argparse
16-
import csv
1716
import os
1817
import sys
1918
import textwrap
@@ -192,27 +191,12 @@ def query_openverse(session):
192191
return aggregate
193192

194193

195-
def write_data(args, data):
196-
if not args.enable_save:
197-
return
198-
os.makedirs(PATHS["data_phase"], exist_ok=True)
199-
with open(FILE_PATH, "w", encoding="utf-8", newline="") as file_obj:
200-
writer = csv.DictWriter(
201-
file_obj,
202-
fieldnames=OPENVERSE_FIELDS,
203-
dialect="unix",
204-
)
205-
writer.writeheader()
206-
for row in data:
207-
writer.writerow(row)
208-
209-
210194
def main():
211195
args = parse_arguments()
212196
LOGGER.info("Starting Openverse Fetch Script...")
213197
session = shared.get_session(accept_header="application/json")
214198
records = query_openverse(session)
215-
write_data(args, records)
199+
shared.rows_to_csv(args, FILE_PATH, OPENVERSE_FIELDS, records)
216200
LOGGER.info(f"Fetched {len(records)} unique Openverse records.")
217201

218202

scripts/1-fetch/smithsonian_fetch.py

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -95,32 +95,6 @@ def check_for_completion():
9595
)
9696

9797

98-
def write_data(args, data_metrics, data_units):
99-
if not args.enable_save:
100-
return args
101-
102-
# Create data directory for this phase
103-
os.makedirs(PATHS["data_phase"], exist_ok=True)
104-
105-
with open(FILE_1_METRICS, "w", encoding="utf-8", newline="\n") as file_obj:
106-
writer = csv.DictWriter(
107-
file_obj, fieldnames=HEADER_1_METRICS, dialect="unix"
108-
)
109-
writer.writeheader()
110-
for row in data_metrics:
111-
writer.writerow(row)
112-
113-
with open(FILE_2_UNITS, "w", encoding="utf-8", newline="\n") as file_obj:
114-
writer = csv.DictWriter(
115-
file_obj, fieldnames=HEADER_2_UNITS, dialect="unix"
116-
)
117-
writer.writeheader()
118-
for row in data_units:
119-
writer.writerow(row)
120-
121-
return args
122-
123-
12498
def query_smithsonian(args, session):
12599
if not DATA_GOV_API_KEY:
126100
raise shared.QuantifyingException(
@@ -177,7 +151,8 @@ def main():
177151
check_for_completion()
178152
session = shared.get_session()
179153
data_metrics, data_units = query_smithsonian(args, session)
180-
args = write_data(args, data_metrics, data_units)
154+
shared.rows_to_csv(args, FILE_1_METRICS, HEADER_1_METRICS, data_metrics)
155+
shared.rows_to_csv(args, FILE_2_UNITS, HEADER_2_UNITS, data_units)
181156
args = shared.git_add_and_commit(
182157
args,
183158
PATHS["repo"],

scripts/1-fetch/wikipedia_fetch.py

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,9 @@ def parse_arguments():
6565

6666
def check_for_completion():
6767
try:
68-
with open(FILE_LANGUAGES, "r", newline="") as file_obj:
68+
with open(
69+
FILE_LANGUAGES, "r", encoding="utf-8", newline=""
70+
) as file_obj:
6971
reader = csv.DictReader(file_obj, dialect="unix")
7072
if len(list(reader)) > 300:
7173
raise shared.QuantifyingException(
@@ -75,22 +77,6 @@ def check_for_completion():
7577
pass # File may not be found without --enable-save, etc.
7678

7779

78-
def write_data(args, tool_data):
79-
if not args.enable_save:
80-
return args
81-
LOGGER.info("Saving fetched data")
82-
os.makedirs(PATHS["data_phase"], exist_ok=True)
83-
84-
with open(FILE_LANGUAGES, "w", encoding="utf-8", newline="\n") as file_obj:
85-
writer = csv.DictWriter(
86-
file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
87-
)
88-
writer.writeheader()
89-
for row in tool_data:
90-
writer.writerow(row)
91-
return args
92-
93-
9480
def query_wikipedia_languages(session):
9581
LOGGER.info("Fetching article counts from all language Wikipedias")
9682
tool_data = []
@@ -173,7 +159,7 @@ def main():
173159
shared.git_fetch_and_merge(args, PATHS["repo"])
174160
session = shared.get_session()
175161
tool_data = query_wikipedia_languages(session)
176-
args = write_data(args, tool_data)
162+
shared.rows_to_csv(args, FILE_LANGUAGES, HEADER_LANGUAGES, tool_data)
177163
args = shared.git_add_and_commit(
178164
args,
179165
PATHS["repo"],

scripts/2-process/gcs_process.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def process_product_totals(args, count_data):
121121
data.items(), columns=["CC legal tool product", "Count"]
122122
)
123123
file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv")
124-
shared.data_to_csv(args, data, file_path)
124+
shared.dataframe_to_csv(args, data, file_path)
125125

126126

127127
def process_latest_prior_retired_totals(args, count_data):
@@ -202,7 +202,7 @@ def process_latest_prior_retired_totals(args, count_data):
202202
file_path = shared.path_join(
203203
PATHS["data_phase"], f"gcs_status_{key}_totals.csv"
204204
)
205-
shared.data_to_csv(args, dataframe, file_path)
205+
shared.dataframe_to_csv(args, dataframe, file_path)
206206

207207

208208
def process_totals_by_free_cultural(args, count_data):
@@ -235,7 +235,7 @@ def process_totals_by_free_cultural(args, count_data):
235235
file_path = shared.path_join(
236236
PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
237237
)
238-
shared.data_to_csv(args, data, file_path)
238+
shared.dataframe_to_csv(args, data, file_path)
239239

240240

241241
def process_totals_by_restrictions(args, count_data):
@@ -269,7 +269,7 @@ def process_totals_by_restrictions(args, count_data):
269269
file_path = shared.path_join(
270270
PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
271271
)
272-
shared.data_to_csv(args, data, file_path)
272+
shared.dataframe_to_csv(args, data, file_path)
273273

274274

275275
def process_totals_by_language(args, data):
@@ -290,7 +290,7 @@ def process_totals_by_language(args, data):
290290
file_path = shared.path_join(
291291
PATHS["data_phase"], "gcs_totals_by_language.csv"
292292
)
293-
shared.data_to_csv(args, data, file_path)
293+
shared.dataframe_to_csv(args, data, file_path)
294294

295295

296296
def process_totals_by_country(args, data):
@@ -311,7 +311,7 @@ def process_totals_by_country(args, data):
311311
file_path = shared.path_join(
312312
PATHS["data_phase"], "gcs_totals_by_country.csv"
313313
)
314-
shared.data_to_csv(args, data, file_path)
314+
shared.dataframe_to_csv(args, data, file_path)
315315

316316

317317
def main():

scripts/2-process/github_process.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def process_totals_by_license(args, count_data):
9696
file_path = shared.path_join(
9797
PATHS["data_phase"], "github_totals_by_license.csv"
9898
)
99-
shared.data_to_csv(args, data, file_path)
99+
shared.dataframe_to_csv(args, data, file_path)
100100

101101

102102
def process_totals_by_restriction(args, count_data):
@@ -130,7 +130,7 @@ def process_totals_by_restriction(args, count_data):
130130
file_path = shared.path_join(
131131
PATHS["data_phase"], "github_totals_by_restriction.csv"
132132
)
133-
shared.data_to_csv(args, data, file_path)
133+
shared.dataframe_to_csv(args, data, file_path)
134134

135135

136136
def main():

scripts/2-process/wikipedia_process.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def process_highest_language_usage(args, count_data):
103103
file_path = shared.path_join(
104104
PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
105105
)
106-
shared.data_to_csv(args, top_10, file_path)
106+
shared.dataframe_to_csv(args, top_10, file_path)
107107

108108

109109
def process_least_language_usage(args, count_data):
@@ -126,7 +126,7 @@ def process_least_language_usage(args, count_data):
126126
file_path = shared.path_join(
127127
PATHS["data_phase"], "wikipedia_least_language_usage.csv"
128128
)
129-
shared.data_to_csv(args, bottom_10, file_path)
129+
shared.dataframe_to_csv(args, bottom_10, file_path)
130130

131131

132132
def process_language_representation(args, count_data):
@@ -152,7 +152,7 @@ def process_language_representation(args, count_data):
152152
file_path = shared.path_join(
153153
PATHS["data_phase"], "wikipedia_language_representation.csv"
154154
)
155-
shared.data_to_csv(args, language_counts, file_path)
155+
shared.dataframe_to_csv(args, language_counts, file_path)
156156

157157

158158
def main():

0 commit comments

Comments
 (0)