Skip to content

Commit dc59b8c

Browse files
committed
rows_to_csv
1 parent 7ad1be2 commit dc59b8c

File tree

9 files changed

+52
-142
lines changed

9 files changed

+52
-142
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 8 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -125,31 +125,6 @@ def parse_arguments():
125125
return args
126126

127127

128-
def initialize_data_file(file_path, headers):
129-
"""Initialize CSV file with headers if it doesn't exist."""
130-
if not os.path.isfile(file_path):
131-
with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
132-
writer = csv.DictWriter(
133-
file_obj, fieldnames=headers, dialect="unix"
134-
)
135-
writer.writeheader()
136-
137-
138-
def initialize_all_data_files(args):
139-
"""Initialize all data files used by this script.
140-
141-
Creates the data directory and initializes empty CSVs with headers.
142-
"""
143-
if not args.enable_save:
144-
return
145-
146-
os.makedirs(PATHS["data_1-fetch"], exist_ok=True)
147-
initialize_data_file(FILE_ARXIV_COUNT, HEADER_COUNT)
148-
initialize_data_file(FILE_ARXIV_CATEGORY_REPORT, HEADER_CATEGORY_REPORT)
149-
initialize_data_file(FILE_ARXIV_YEAR, HEADER_YEAR)
150-
initialize_data_file(FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET)
151-
152-
153128
def get_identifier_mapping():
154129
global IDENTIER_MAPPING
155130
LOGGER.info("Loading CC Legal Tool metadata for CC identifer mapping")
@@ -472,19 +447,6 @@ def query_arxiv(args, session):
472447
return data, cc_articles_found
473448

474449

475-
def rows_to_csv(args, fieldnames, rows, file_path):
476-
if not args.enable_save:
477-
return args
478-
479-
with open(file_path, "w", encoding="utf-8", newline="\n") as file_handle:
480-
writer = csv.DictWriter(
481-
file_handle, fieldnames=fieldnames, dialect="unix"
482-
)
483-
writer.writeheader()
484-
for row in rows:
485-
writer.writerow(row)
486-
487-
488450
def write_data(args, data):
489451
"""
490452
Write fetched data to CSV files.
@@ -508,7 +470,9 @@ def write_data(args, data):
508470
}
509471
)
510472
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "AUTHOR_BUCKET"))
511-
rows_to_csv(args, HEADER_AUTHOR_BUCKET, rows, FILE_ARXIV_AUTHOR_BUCKET)
473+
shared.rows_to_csv(
474+
args, FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET, rows
475+
)
512476

513477
# Save category report
514478
# fetched_data["category_counts"]: {identifer: {category_code: count}}
@@ -527,15 +491,17 @@ def write_data(args, data):
527491
}
528492
)
529493
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "CATEGORY_CODE"))
530-
rows_to_csv(args, HEADER_CATEGORY_REPORT, rows, FILE_ARXIV_CATEGORY_REPORT)
494+
shared.rows_to_csv(
495+
args, FILE_ARXIV_CATEGORY_REPORT, HEADER_CATEGORY_REPORT, rows
496+
)
531497

532498
# Save tool counts report
533499
# fetched_data["tool_counts"]: {identfier: count}
534500
rows = []
535501
for identifier, count in data["tool_counts"].items():
536502
rows.append({"TOOL_IDENTIFIER": identifier, "COUNT": count})
537503
rows.sort(key=itemgetter("TOOL_IDENTIFIER"))
538-
rows_to_csv(args, HEADER_COUNT, rows, FILE_ARXIV_COUNT)
504+
shared.rows_to_csv(args, FILE_ARXIV_COUNT, HEADER_COUNT, rows)
539505

540506
# Save year count report
541507
# fetched_data["year_counts"]: {identifer: {year: count}}
@@ -546,7 +512,7 @@ def write_data(args, data):
546512
{"TOOL_IDENTIFIER": identifier, "YEAR": year, "COUNT": count}
547513
)
548514
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "YEAR"))
549-
rows_to_csv(args, HEADER_YEAR, rows, FILE_ARXIV_YEAR)
515+
shared.rows_to_csv(args, FILE_ARXIV_YEAR, HEADER_YEAR, rows)
550516

551517

552518
def write_provence(args, cc_articles_found):
@@ -584,7 +550,6 @@ def main():
584550
args = parse_arguments()
585551
shared.paths_log(LOGGER, PATHS)
586552
shared.git_fetch_and_merge(args, PATHS["repo"])
587-
initialize_all_data_files(args)
588553
get_identifier_mapping()
589554
session = shared.get_session()
590555
query_category_mapping(args, session)

scripts/1-fetch/github_fetch.py

Lines changed: 7 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
LOGGER, PATHS = shared.setup(__file__)
2929

3030
# Constants
31-
FILE1_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
31+
FILE_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
3232
GH_TOKEN = os.getenv("GH_TOKEN")
3333
# Also see: https://en.wikipedia.org/wiki/Public-domain-equivalent_license
3434
GITHUB_TOOLS = [
@@ -40,7 +40,7 @@
4040
{"TOOL_IDENTIFIER": "Unlicense", "SPDX_IDENTIFIER": "Unlicense"},
4141
{"TOOL_IDENTIFIER": "Total public repositories", "SPDX_IDENTIFIER": "N/A"},
4242
]
43-
HEADER1_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
43+
HEADER_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
4444
QUARTER = os.path.basename(PATHS["data_quarter"])
4545

4646

@@ -68,7 +68,7 @@ def parse_arguments():
6868

6969
def check_for_completion():
7070
try:
71-
with open(FILE1_COUNT, "r", newline="") as file_obj:
71+
with open(FILE_COUNT, "r", newline="") as file_obj:
7272
reader = csv.DictReader(file_obj, dialect="unix")
7373
if len(list(reader)) == len(GITHUB_TOOLS):
7474
raise shared.QuantifyingException(
@@ -78,27 +78,6 @@ def check_for_completion():
7878
pass # File may not be found without --enable-save, etc.
7979

8080

81-
def write_data(args, tool_data):
82-
if not args.enable_save:
83-
return args
84-
85-
# Create data directory for this phase
86-
os.makedirs(PATHS["data_phase"], exist_ok=True)
87-
88-
if len(tool_data) < len(GITHUB_TOOLS):
89-
LOGGER.error("Unable to fetch all records. Aborting.")
90-
return args
91-
92-
with open(FILE1_COUNT, "w", encoding="utf-8", newline="\n") as file_obj:
93-
writer = csv.DictWriter(
94-
file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
95-
)
96-
writer.writeheader()
97-
for row in tool_data:
98-
writer.writerow(row)
99-
return args
100-
101-
10281
def query_github(args, session):
10382
tool_data = []
10483
for tool in GITHUB_TOOLS:
@@ -148,7 +127,10 @@ def main():
148127
session.headers.update({"authorization": f"Bearer {GH_TOKEN}"})
149128

150129
tool_data = query_github(args, session)
151-
args = write_data(args, tool_data)
130+
if len(tool_data) < len(GITHUB_TOOLS):
131+
LOGGER.error("Unable to fetch all records. Aborting.")
132+
return args
133+
shared.rows_to_csv(args, FILE_COUNT, HEADER_COUNT, tool_data)
152134
args = shared.git_add_and_commit(
153135
args,
154136
PATHS["repo"],

scripts/1-fetch/openverse_fetch.py

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313

1414
# Standard library
1515
import argparse
16-
import csv
1716
import os
1817
import sys
1918
import textwrap
@@ -192,27 +191,12 @@ def query_openverse(session):
192191
return aggregate
193192

194193

195-
def write_data(args, data):
196-
if not args.enable_save:
197-
return
198-
os.makedirs(PATHS["data_phase"], exist_ok=True)
199-
with open(FILE_PATH, "w", encoding="utf-8", newline="") as file_obj:
200-
writer = csv.DictWriter(
201-
file_obj,
202-
fieldnames=OPENVERSE_FIELDS,
203-
dialect="unix",
204-
)
205-
writer.writeheader()
206-
for row in data:
207-
writer.writerow(row)
208-
209-
210194
def main():
211195
args = parse_arguments()
212196
LOGGER.info("Starting Openverse Fetch Script...")
213197
session = shared.get_session(accept_header="application/json")
214198
records = query_openverse(session)
215-
write_data(args, records)
199+
shared.rows_to_csv(args, FILE_PATH, OPENVERSE_FIELDS, records)
216200
LOGGER.info(f"Fetched {len(records)} unique Openverse records.")
217201

218202

scripts/1-fetch/smithsonian_fetch.py

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -95,32 +95,6 @@ def check_for_completion():
9595
)
9696

9797

98-
def write_data(args, data_metrics, data_units):
99-
if not args.enable_save:
100-
return args
101-
102-
# Create data directory for this phase
103-
os.makedirs(PATHS["data_phase"], exist_ok=True)
104-
105-
with open(FILE_1_METRICS, "w", encoding="utf-8", newline="\n") as file_obj:
106-
writer = csv.DictWriter(
107-
file_obj, fieldnames=HEADER_1_METRICS, dialect="unix"
108-
)
109-
writer.writeheader()
110-
for row in data_metrics:
111-
writer.writerow(row)
112-
113-
with open(FILE_2_UNITS, "w", encoding="utf-8", newline="\n") as file_obj:
114-
writer = csv.DictWriter(
115-
file_obj, fieldnames=HEADER_2_UNITS, dialect="unix"
116-
)
117-
writer.writeheader()
118-
for row in data_units:
119-
writer.writerow(row)
120-
121-
return args
122-
123-
12498
def query_smithsonian(args, session):
12599
if not DATA_GOV_API_KEY:
126100
raise shared.QuantifyingException(
@@ -177,7 +151,8 @@ def main():
177151
check_for_completion()
178152
session = shared.get_session()
179153
data_metrics, data_units = query_smithsonian(args, session)
180-
args = write_data(args, data_metrics, data_units)
154+
shared.rows_to_csv(args, FILE_1_METRICS, HEADER_1_METRICS, data_metrics)
155+
shared.rows_to_csv(args, FILE_2_UNITS, HEADER_2_UNITS, data_units)
181156
args = shared.git_add_and_commit(
182157
args,
183158
PATHS["repo"],

scripts/1-fetch/wikipedia_fetch.py

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,9 @@ def parse_arguments():
6565

6666
def check_for_completion():
6767
try:
68-
with open(FILE_LANGUAGES, "r", newline="") as file_obj:
68+
with open(
69+
FILE_LANGUAGES, "r", encoding="utf-8", newline=""
70+
) as file_obj:
6971
reader = csv.DictReader(file_obj, dialect="unix")
7072
if len(list(reader)) > 300:
7173
raise shared.QuantifyingException(
@@ -75,22 +77,6 @@ def check_for_completion():
7577
pass # File may not be found without --enable-save, etc.
7678

7779

78-
def write_data(args, tool_data):
79-
if not args.enable_save:
80-
return args
81-
LOGGER.info("Saving fetched data")
82-
os.makedirs(PATHS["data_phase"], exist_ok=True)
83-
84-
with open(FILE_LANGUAGES, "w", encoding="utf-8", newline="\n") as file_obj:
85-
writer = csv.DictWriter(
86-
file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
87-
)
88-
writer.writeheader()
89-
for row in tool_data:
90-
writer.writerow(row)
91-
return args
92-
93-
9480
def query_wikipedia_languages(session):
9581
LOGGER.info("Fetching article counts from all language Wikipedias")
9682
tool_data = []
@@ -173,7 +159,7 @@ def main():
173159
shared.git_fetch_and_merge(args, PATHS["repo"])
174160
session = shared.get_session()
175161
tool_data = query_wikipedia_languages(session)
176-
args = write_data(args, tool_data)
162+
shared.rows_to_csv(args, FILE_LANGUAGES, HEADER_LANGUAGES, tool_data)
177163
args = shared.git_add_and_commit(
178164
args,
179165
PATHS["repo"],

scripts/2-process/gcs_process.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def process_product_totals(args, count_data):
121121
data.items(), columns=["CC legal tool product", "Count"]
122122
)
123123
file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv")
124-
shared.data_to_csv(args, data, file_path)
124+
shared.dataframe_to_csv(args, data, file_path)
125125

126126

127127
def process_latest_prior_retired_totals(args, count_data):
@@ -202,7 +202,7 @@ def process_latest_prior_retired_totals(args, count_data):
202202
file_path = shared.path_join(
203203
PATHS["data_phase"], f"gcs_status_{key}_totals.csv"
204204
)
205-
shared.data_to_csv(args, dataframe, file_path)
205+
shared.dataframe_to_csv(args, dataframe, file_path)
206206

207207

208208
def process_totals_by_free_cultural(args, count_data):
@@ -235,7 +235,7 @@ def process_totals_by_free_cultural(args, count_data):
235235
file_path = shared.path_join(
236236
PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
237237
)
238-
shared.data_to_csv(args, data, file_path)
238+
shared.dataframe_to_csv(args, data, file_path)
239239

240240

241241
def process_totals_by_restrictions(args, count_data):
@@ -269,7 +269,7 @@ def process_totals_by_restrictions(args, count_data):
269269
file_path = shared.path_join(
270270
PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
271271
)
272-
shared.data_to_csv(args, data, file_path)
272+
shared.dataframe_to_csv(args, data, file_path)
273273

274274

275275
def process_totals_by_language(args, data):
@@ -290,7 +290,7 @@ def process_totals_by_language(args, data):
290290
file_path = shared.path_join(
291291
PATHS["data_phase"], "gcs_totals_by_language.csv"
292292
)
293-
shared.data_to_csv(args, data, file_path)
293+
shared.dataframe_to_csv(args, data, file_path)
294294

295295

296296
def process_totals_by_country(args, data):
@@ -311,7 +311,7 @@ def process_totals_by_country(args, data):
311311
file_path = shared.path_join(
312312
PATHS["data_phase"], "gcs_totals_by_country.csv"
313313
)
314-
shared.data_to_csv(args, data, file_path)
314+
shared.dataframe_to_csv(args, data, file_path)
315315

316316

317317
def main():

scripts/2-process/github_process.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def process_totals_by_license(args, count_data):
9696
file_path = shared.path_join(
9797
PATHS["data_phase"], "github_totals_by_license.csv"
9898
)
99-
shared.data_to_csv(args, data, file_path)
99+
shared.dataframe_to_csv(args, data, file_path)
100100

101101

102102
def process_totals_by_restriction(args, count_data):
@@ -130,7 +130,7 @@ def process_totals_by_restriction(args, count_data):
130130
file_path = shared.path_join(
131131
PATHS["data_phase"], "github_totals_by_restriction.csv"
132132
)
133-
shared.data_to_csv(args, data, file_path)
133+
shared.dataframe_to_csv(args, data, file_path)
134134

135135

136136
def main():

scripts/2-process/wikipedia_process.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def process_highest_language_usage(args, count_data):
103103
file_path = shared.path_join(
104104
PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
105105
)
106-
shared.data_to_csv(args, top_10, file_path)
106+
shared.dataframe_to_csv(args, top_10, file_path)
107107

108108

109109
def process_least_language_usage(args, count_data):
@@ -126,7 +126,7 @@ def process_least_language_usage(args, count_data):
126126
file_path = shared.path_join(
127127
PATHS["data_phase"], "wikipedia_least_language_usage.csv"
128128
)
129-
shared.data_to_csv(args, bottom_10, file_path)
129+
shared.dataframe_to_csv(args, bottom_10, file_path)
130130

131131

132132
def process_language_representation(args, count_data):
@@ -152,7 +152,7 @@ def process_language_representation(args, count_data):
152152
file_path = shared.path_join(
153153
PATHS["data_phase"], "wikipedia_language_representation.csv"
154154
)
155-
shared.data_to_csv(args, language_counts, file_path)
155+
shared.dataframe_to_csv(args, language_counts, file_path)
156156

157157

158158
def main():

0 commit comments

Comments
 (0)