Skip to content

Commit 5fc6503

Browse files
committed
smithsonian and stacked bar plot
1 parent bb60a6b commit 5fc6503

File tree

3 files changed

+524
-0
lines changed

3 files changed

+524
-0
lines changed
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
#!/usr/bin/env python
2+
"""
3+
This file is dedicated to processing Smithsonian data
4+
for analysis and comparison between quarters.
5+
"""
6+
# Standard library
7+
import argparse
8+
import os
9+
import sys
10+
import traceback
11+
12+
# Third-party
13+
import pandas as pd
14+
15+
# Add parent directory so shared can be imported
16+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
17+
18+
# First-party/Local
19+
import shared # noqa: E402
20+
21+
# Setup
# Shared logger and canonical repository paths for this script.
LOGGER, PATHS = shared.setup(__file__)

# Constants
# Quarter label (e.g. "2024Q3") derived from the data-quarter directory name.
QUARTER = os.path.basename(PATHS["data_quarter"])
# Output CSVs this script produces; used for completion checks.
FILE_PATHS = [
    shared.path_join(PATHS["data_phase"], "smithsonian_totals_by_units.csv"),
    shared.path_join(PATHS["data_phase"], "smithsonian_totals_by_records.csv"),
]
30+
31+
32+
def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.
    """
    # All three module-level path/quarter values may be rebound below.
    global QUARTER, FILE_PATHS, PATHS
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--quarter",
        default=QUARTER,
        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
    )
    # The boolean flags all use the same store_true action.
    flag_options = (
        ("--enable-save", "Enable saving results (default: False)"),
        (
            "--enable-git",
            "Enable git actions such as fetch, merge, add, commit, and push"
            " (default: False)",
        ),
        ("--force", "Regenerate data even if processed files already exist"),
    )
    for flag, flag_help in flag_options:
        parser.add_argument(flag, action="store_true", help=flag_help)

    args = parser.parse_args()
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")
    if args.quarter != QUARTER:
        # Retarget the module-level paths at the requested quarter.
        FILE_PATHS = shared.paths_list_update(
            LOGGER, FILE_PATHS, QUARTER, args.quarter
        )
        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
        QUARTER = args.quarter
    # Attach shared state so downstream helpers receive it via args.
    args.logger = LOGGER
    args.paths = PATHS
    return args
74+
75+
76+
def process_totals_by_units(args, count_data):
    """
    Processing count data: totals by units
    """
    LOGGER.info(process_totals_by_units.__doc__.strip())
    # Map unit name -> object total. A duplicate unit keeps the last row
    # seen, matching plain dict assignment semantics.
    totals = {
        str(row.UNIT): int(row.TOTAL_OBJECTS)
        for row in count_data.itertuples(index=False)
    }

    frame = pd.DataFrame(totals.items(), columns=["Unit", "Count"])
    frame = frame.sort_values("Unit", ascending=True).reset_index(drop=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "smithsonian_totals_by_units.csv"
    )
    shared.data_to_csv(args, frame, file_path)
96+
97+
98+
def process_totals_by_records(args, count_data):
    """
    Processing count data: totals by records
    """
    LOGGER.info(process_totals_by_records.__doc__.strip())
    count_columns = [
        "CC0_RECORDS",
        "CC0_RECORDS_WITH_CC0_MEDIA",
        "TOTAL_OBJECTS",
    ]
    totals = {}

    for row in count_data.itertuples(index=False):
        cc0_records = int(row.CC0_RECORDS)
        cc0_media = int(row.CC0_RECORDS_WITH_CC0_MEDIA)
        # Units with no CC0 records at all are excluded from this report.
        if not cc0_records and not cc0_media:
            continue
        entry = totals.setdefault(
            str(row.UNIT), dict.fromkeys(count_columns, 0)
        )
        entry["CC0_RECORDS"] += cc0_records
        entry["CC0_RECORDS_WITH_CC0_MEDIA"] += cc0_media
        entry["TOTAL_OBJECTS"] += int(row.TOTAL_OBJECTS)

    frame = (
        pd.DataFrame.from_dict(totals, orient="index")
        .reset_index()
        .rename(columns={"index": "Unit"})
    )
    # Derive percentage columns, rounded to two decimal places.
    for column in ("CC0_RECORDS", "CC0_RECORDS_WITH_CC0_MEDIA"):
        frame[f"{column}_PERCENTAGE"] = (
            (frame[column] / frame["TOTAL_OBJECTS"]) * 100
        ).round(2)

    frame = frame.sort_values("Unit", ascending=True).reset_index(drop=True)

    file_path = shared.path_join(
        PATHS["data_phase"], "smithsonian_totals_by_records.csv"
    )
    shared.data_to_csv(args, frame, file_path)
145+
146+
147+
def main():
    """
    Process Smithsonian count data into per-unit and per-record totals,
    then optionally commit and push the resulting CSV files.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])
    # Presumably short-circuits when both output files already exist and
    # --force was not given — confirm against shared's implementation.
    shared.check_completion_file_exists(args, FILE_PATHS)
    file_count = shared.path_join(
        PATHS["data_1-fetch"], "smithsonian_2_units.csv"
    )
    count_data = shared.open_data_file(
        LOGGER,
        file_count,
        usecols=[
            "UNIT",
            "CC0_RECORDS",
            "CC0_RECORDS_WITH_CC0_MEDIA",
            "TOTAL_OBJECTS",
        ],
    )
    process_totals_by_units(args, count_data)
    process_totals_by_records(args, count_data)

    # Push changes
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        # Fixed copy-paste error: this script handles Smithsonian data,
        # not GitHub data.
        f"Add and commit new Smithsonian data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])
176+
177+
178+
if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        # Exit code 0 signals an expected early stop (not an error),
        # so log it at info level.
        if e.exit_code == 0:
            LOGGER.info(e.message)
        else:
            LOGGER.error(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        # Re-raise argparse/explicit exits after recording the code.
        LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        # 130 = 128 + SIGINT, the conventional exit status for Ctrl-C.
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Logger.exception already appends the active traceback, so
        # embedding traceback.format_exc() in the message duplicated it.
        LOGGER.exception("(1) Unhandled exception")
        sys.exit(1)

0 commit comments

Comments
 (0)