Skip to content

Commit 27cb530

Browse files
committed
clean commit for Openverse
1 parent ac22fb5 commit 27cb530

File tree

2 files changed

+697
-0
lines changed

2 files changed

+697
-0
lines changed
Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
#!/usr/bin/env python
"""
This file is dedicated to processing Openverse data
for analysis and comparison between quarters.
"""
# Standard library
import argparse
import csv
import os
import sys
import traceback
from collections import defaultdict

# Third-party
import pandas as pd

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared  # noqa: E402

# Setup: shared.setup() supplies the module logger and a dictionary of
# project paths; this file reads the keys "data_phase", "data_quarter",
# "data_1-fetch", and "repo".
LOGGER, PATHS = shared.setup(__file__)

# Constants
# Basename of the current data-quarter directory -- presumably in the
# YYYYQx format mentioned by the --quarter option; confirm against shared.
QUARTER = os.path.basename(PATHS["data_quarter"])
def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.
    """
    global PATHS
    LOGGER.info("Parsing command-line options")
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--quarter",
        default=QUARTER,
        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
    )
    cli.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results (default: False)",
    )
    cli.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions such as fetch, merge, add, commit, and push"
        " (default: False)",
    )
    options = cli.parse_args()
    # Committing without saving makes no sense; reject the combination.
    if options.enable_git and not options.enable_save:
        cli.error("--enable-git requires --enable-save")
    # Re-point the module-level PATHS when a different quarter is requested.
    if options.quarter != QUARTER:
        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, options.quarter)
    # Attach logger and paths so downstream helpers receive them via args.
    options.logger = LOGGER
    options.paths = PATHS
    return options
62+
63+
def check_for_data_file(file_path):
    """
    Raise QuantifyingException (exit code 0, i.e. benign) when the
    processed CSV already exists, so existing data is never overwritten.
    """
    if not os.path.exists(file_path):
        return
    raise shared.QuantifyingException(
        f"Processed data already exists for {QUARTER}", 0
    )
69+
70+
def data_to_csv(args, data, file_path):
    """
    Write the DataFrame to file_path as CSV, unless saving is disabled.
    """
    if not args.enable_save:
        return
    # Make sure the output directory for this phase exists.
    os.makedirs(PATHS["data_phase"], exist_ok=True)
    # Quote every field and force "\n" endings to emulate csv.unix_dialect.
    data.to_csv(
        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
    )
79+
80+
def process_totals_by_license(args, count_data):
    """
    Processing count data: totals by license
    """
    LOGGER.info(process_totals_by_license.__doc__.strip())
    # Sum media counts keyed by legal-tool identifier.
    totals = {}
    for record in count_data.itertuples(index=False):
        license_id = str(record.TOOL_IDENTIFIER)
        totals[license_id] = totals.get(license_id, 0) + int(record.MEDIA_COUNT)
    # Keys are unique, so sorting the items gives the same row order as
    # sorting the frame by the License column.
    frame = pd.DataFrame(sorted(totals.items()), columns=["License", "Count"])
    file_path = shared.path_join(
        PATHS["data_phase"], "openverse_totals_by_license.csv"
    )
    check_for_data_file(file_path)
    data_to_csv(args, frame, file_path)
101+
102+
def process_totals_by_media_type(args, count_data):
    """
    Processing count data: totals by media type
    """
    # https://creativecommons.org/public-domain/freeworks/
    LOGGER.info(process_totals_by_media_type.__doc__.strip())
    # Sum media counts keyed by media type.
    totals = defaultdict(int)
    for record in count_data.itertuples(index=False):
        totals[str(record.MEDIA_TYPE)] += int(record.MEDIA_COUNT)
    frame = pd.DataFrame(totals.items(), columns=["Media_type", "Count"])
    frame.sort_values("Media_type", ascending=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "openverse_totals_by_media_type.csv"
    )
    check_for_data_file(file_path)
    data_to_csv(args, frame, file_path)
123+
124+
def process_totals_by_source(args, count_data):
    """
    Processing count data: totals by source
    """
    LOGGER.info(process_totals_by_source.__doc__.strip())
    # Sum media counts keyed by source.
    totals = defaultdict(int)
    for record in count_data.itertuples(index=False):
        totals[str(record.SOURCE)] += int(record.MEDIA_COUNT)
    frame = pd.DataFrame(totals.items(), columns=["Source", "Count"])
    frame.sort_values("Source", ascending=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "openverse_totals_by_source.csv"
    )
    check_for_data_file(file_path)
    data_to_csv(args, frame, file_path)
143+
144+
def process_permissive_by_media_type(args, count_data):
    """
    Processing count data: permissive by media type
    """
    LOGGER.info(process_permissive_by_media_type.__doc__.strip())
    # Only these legal tools are treated as permissive here.
    permissive_tools = ("CC0", "CC BY", "CC BY-SA")
    totals = defaultdict(int)
    for record in count_data.itertuples(index=False):
        if str(record.TOOL_IDENTIFIER) in permissive_tools:
            totals[str(record.MEDIA_TYPE)] += int(record.MEDIA_COUNT)
    frame = pd.DataFrame(totals.items(), columns=["Media_type", "Count"])
    frame.sort_values("Media_type", ascending=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "openverse_permissive_by_media_type.csv"
    )
    check_for_data_file(file_path)
    data_to_csv(args, frame, file_path)
169+
170+
def process_permissive_by_source(args, count_data):
    """
    Processing count data: permissive content by source
    """
    LOGGER.info(process_permissive_by_source.__doc__.strip())
    # Only these legal tools are treated as permissive here.
    permissive_tools = ("CC0", "CC BY", "CC BY-SA")
    totals = defaultdict(int)
    for record in count_data.itertuples(index=False):
        if str(record.TOOL_IDENTIFIER) in permissive_tools:
            totals[str(record.SOURCE)] += int(record.MEDIA_COUNT)
    frame = pd.DataFrame(totals.items(), columns=["Source", "Count"])
    frame.sort_values("Source", ascending=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "openverse_permissive_by_source.csv"
    )
    check_for_data_file(file_path)
    data_to_csv(args, frame, file_path)
190+
191+
def process_totals_by_restriction(args, count_data):
    """
    Processing count data: totals by restriction
    """
    LOGGER.info(process_totals_by_restriction.__doc__.strip())
    # Map individual legal tools to restriction categories; any tool not
    # listed here falls into "Restricted".
    category_by_tool = {
        "CC0": "Public domain",
        "PDM": "Public domain",
        "CC BY": "Permissive",
        "CC BY-SA": "Copyleft",
    }
    totals = {
        "Copyleft": 0,
        "Permissive": 0,
        "Public domain": 0,
        "Restricted": 0,
    }
    for record in count_data.itertuples(index=False):
        category = category_by_tool.get(
            str(record.TOOL_IDENTIFIER), "Restricted"
        )
        totals[category] += int(record.MEDIA_COUNT)
    frame = pd.DataFrame(totals.items(), columns=["Category", "Count"])
    frame.sort_values("Category", ascending=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "openverse_totals_by_restriction.csv"
    )
    check_for_data_file(file_path)
    data_to_csv(args, frame, file_path)
231+
232+
def main():
    """
    Drive the Openverse processing phase: parse options, load the fetched
    count data, derive each aggregate CSV, and optionally commit/push the
    results via the shared git helpers.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])

    # Input produced by the fetch phase; only the columns the process_*
    # helpers read are loaded.
    file_count = shared.path_join(PATHS["data_1-fetch"], "openverse_fetch.csv")
    count_data = shared.open_data_file(
        LOGGER,
        file_count,
        usecols=["SOURCE", "MEDIA_TYPE", "TOOL_IDENTIFIER", "MEDIA_COUNT"],
    )
    process_totals_by_license(args, count_data)
    process_totals_by_media_type(args, count_data)
    process_totals_by_source(args, count_data)
    process_permissive_by_media_type(args, count_data)
    process_permissive_by_source(args, count_data)
    process_totals_by_restriction(args, count_data)
    # Push changes
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        # Fixed copy-paste error: this script processes Openverse data,
        # but the commit message previously said "GitHub data".
        f"Add and commit new Openverse data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])
258+
259+
if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        # exit_code 0 is the benign "data already exists" early-out raised
        # by check_for_data_file(); anything else is a real failure.
        if e.exit_code == 0:
            LOGGER.info(e.message)
        else:
            LOGGER.error(e.message)
        # Fixed: QuantifyingException exposes `exit_code` (read above);
        # `e.code` was inconsistent and would fail at exit time.
        sys.exit(e.exit_code)
    except SystemExit as e:
        LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        # Conventional 128 + SIGINT(2) exit status.
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
        sys.exit(1)

0 commit comments

Comments
 (0)