Skip to content

Commit 4153081

Browse files
committed
Added unit full name and updates
1 parent 5776eef commit 4153081

File tree

4 files changed

+122
-18
lines changed

4 files changed

+122
-18
lines changed

scripts/1-fetch/smithsonian_fetch.py

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,49 @@
4646
]
4747
QUARTER = os.path.basename(PATHS["data_quarter"])
4848

49+
# Smithsonian unit codes mapped to the full unit names used in place of
# the bare codes. Keys are the codes exactly as they are spelled in the
# API data (note the mixed case of "NMAfA").
unit_map = {
    "AAA": "Archives of American Art",
    "AAG": "Archives of American Gardens",
    "ACM": "Anacostia Community Museum",
    "ACMA": "Anacostia Community Museum Archives",
    "CFCHFOLKLIFE": "Ralph Rinzler Folklife Archives and Collections",
    "CHNDM": "Cooper Hewitt, Smithsonian Design Museum",
    "FBR": "Smithsonian Field Book Project",
    "FSG": "Freer Gallery of Art and Arthur M. Sackler Gallery",
    "HAC": "Smithsonian Gardens",
    "HMSG": "Hirshhorn Museum and Sculpture Garden",
    "HSFA": "Human Studies Film Archives",
    "NASM": "National Air and Space Museum",
    "NMAAHC": "National Museum of African American History and Culture",
    "NMAH": "National Museum of American History",
    "NMAI": "National Museum of the American Indian",
    "NMAfA": "National Museum of African Art",
    "NMNHANTHRO": "NMNH - Anthropology Dept.",
    "NMNHBIRDS": "NMNH - Vertebrate Zoology - Birds Division",
    "NMNHBOTANY": "NMNH - Botany Dept.",
    "NMNHEDUCATION": "NMNH - Education & Outreach",
    "NMNHENTO": "NMNH - Entomology Dept.",
    "NMNHFISHES": "NMNH - Vertebrate Zoology - Fishes Division",
    "NMNHHERPS": "NMNH - Vertebrate Zoology - Herpetology Division",
    "NMNHINV": "NMNH - Invertebrate Zoology Dept.",
    "NMNHMAMMALS": "NMNH - Vertebrate Zoology - Mammals Division",
    "NMNHMINSCI": "NMNH - Mineral Sciences Dept.",
    "NMNHPALEO": "NMNH - Paleobiology Dept.",
    "NPG": "National Portrait Gallery",
    "NPM": "National Postal Museum",
    "NZP": "Smithsonian's National Zoo & Conservation Biology Institute",
    "OCIO_DPO3D": "OCIO Digital Preservation & 3D Team",
    # Long names are split with implicit string concatenation to stay
    # within the line-length limit; the joined text is a single name.
    "OFEO-SG": "Office of Facilities Engineering &"
    " Operations – Smithsonian Gardens",
    "SAAM": "Smithsonian American Art Museum",
    "SIA": "Smithsonian Institution Archives",
    "SIL": "Smithsonian Libraries",
    "SILAF": "Smithsonian Institution Libraries, African Section",
    "SILNMAHTL": "Smithsonian Institution Libraries,"
    " National Museum of American History, Library",
    "SLA_SRO": "Smithsonian Libraries Archives, Special Research/Operations",
}
91+
4992

5093
def parse_arguments():
5194
"""
@@ -121,6 +164,33 @@ def write_data(args, data_metrics, data_units):
121164
return args
122165

123166

167+
def fetch_unit_codes(session):
    """
    Compare the unit codes reported by the Smithsonian API with unit_map.

    Requests the current unit_code terms and logs a warning listing any
    code the API returns that unit_map lacks, and any unit_map code the
    API no longer returns. Logs an info message when the two sets match.
    unit_map is not modified and nothing is returned.

    Args:
        session: object whose get(url, params=...) returns a response
            usable as a context manager (presumably a requests.Session
            from shared.get_session -- confirm against caller).

    Raises:
        shared.QuantifyingException: raised with code 1 when the request
            fails, the body is not valid JSON, or the payload does not
            contain an iterable at ["response"]["terms"].
    """
    LOGGER.info("Fetching current unit codes from Smithsonian API")
    url = "https://api.si.edu/openaccess/api/v1.0/terms/unit_code"
    params = {"api_key": DATA_GOV_API_KEY}

    # Network I/O and JSON decoding only; payload parsing is handled
    # separately below so each failure maps to one clear message.
    try:
        with session.get(url, params=params) as response:
            response.raise_for_status()
            payload = response.json()
    except requests.HTTPError as e:
        raise shared.QuantifyingException(f"HTTP Error: {e}", 1) from e
    except requests.RequestException as e:
        raise shared.QuantifyingException(
            f"Request Exception: {e}", 1
        ) from e
    except ValueError as e:
        # Older requests releases raise a plain ValueError (not a
        # RequestException subclass) when the body is not JSON.
        raise shared.QuantifyingException(
            f"Invalid JSON response: {e}", 1
        ) from e

    try:
        api_codes = set(payload["response"]["terms"])
    except KeyError as e:
        raise shared.QuantifyingException(f"KeyError: {e}", 1) from e
    except TypeError as e:
        # e.g. "terms" is null, or the payload is not a mapping
        raise shared.QuantifyingException(
            f"Unexpected response structure: {e}", 1
        ) from e

    map_codes = set(unit_map)
    new_codes = sorted(api_codes - map_codes)
    removed_codes = sorted(map_codes - api_codes)

    if new_codes:
        LOGGER.warning(f"New unit code(s) not in unit_map: {new_codes}")
    if removed_codes:
        LOGGER.warning(f"unit_map code(s) no longer in API: {removed_codes}")
    if not new_codes and not removed_codes:
        LOGGER.info("unit_map is up to date")
192+
193+
124194
def query_smithsonian(args, session):
125195
if not DATA_GOV_API_KEY:
126196
raise shared.QuantifyingException(
@@ -158,7 +228,7 @@ def query_smithsonian(args, session):
158228
continue
159229
data_units.append(
160230
{
161-
"UNIT": unit["unit"],
231+
"UNIT": unit_map.get(unit["unit"], unit["unit"]),
162232
"CC0_RECORDS": unit["metrics"]["CC0_records"],
163233
"CC0_RECORDS_WITH_CC0_MEDIA": unit["metrics"][
164234
"CC0_records_with_CC0_media"
@@ -176,6 +246,7 @@ def main():
176246
shared.paths_log(LOGGER, PATHS)
177247
check_for_completion()
178248
session = shared.get_session()
249+
fetch_unit_codes(session)
179250
data_metrics, data_units = query_smithsonian(args, session)
180251
args = write_data(args, data_metrics, data_units)
181252
args = shared.git_add_and_commit(

scripts/2-process/smithsonian_process.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,14 +127,23 @@ def process_totals_by_records(args, count_data):
127127
.reset_index()
128128
.rename(columns={"index": "Unit"})
129129
)
130-
data["CC0_RECORDS_PERCENTAGE"] = (
131-
(data["CC0_RECORDS"] / data["TOTAL_OBJECTS"]) * 100
130+
data["CC0_WITHOUT_MEDIA_PERCENTAGE"] = (
131+
(
132+
(data["CC0_RECORDS"] - data["CC0_RECORDS_WITH_CC0_MEDIA"])
133+
/ data["TOTAL_OBJECTS"]
134+
)
135+
* 100
132136
).round(2)
133137

134-
data["CC0_RECORDS_WITH_CC0_MEDIA_PERCENTAGE"] = (
138+
data["CC0_WITH_MEDIA_PERCENTAGE"] = (
135139
(data["CC0_RECORDS_WITH_CC0_MEDIA"] / data["TOTAL_OBJECTS"]) * 100
136140
).round(2)
137141

142+
data["OTHERS_PERCENTAGE"] = (
143+
((data["TOTAL_OBJECTS"] - data["CC0_RECORDS"]) / data["TOTAL_OBJECTS"])
144+
* 100
145+
).round(2)
146+
138147
data.sort_values("Unit", ascending=True, inplace=True)
139148
data.reset_index(drop=True, inplace=True)
140149

scripts/3-report/smithsonian_report.py

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -100,20 +100,35 @@ def smithsonian_intro(args):
100100
"""
101101
LOGGER.info(smithsonian_intro.__doc__.strip())
102102
file_path = shared.path_join(
103-
PATHS["data_1-fetch"],
104-
"smithsonian_1_metrics.csv",
103+
PATHS["data_2-process"],
104+
"smithsonian_totals_by_records.csv",
105105
)
106106
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
107-
# name_label = "UNIT"
108-
# data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
109-
# data.sort_values(name_label, ascending=True, inplace=True)
107+
data = shared.open_data_file(LOGGER, file_path)
108+
total_objects = data["TOTAL_OBJECTS"].sum()
109+
cc0_records = data["CC0_RECORDS"].sum()
110+
cc0_records_with_media = data["CC0_RECORDS_WITH_CC0_MEDIA"].sum()
111+
cc0_media_percentage = f"{data['CC0_WITH_MEDIA_PERCENTAGE'].mean():.2f}%"
112+
num_units = len(data)
113+
min_unit = data["TOTAL_OBJECTS"].min()
110114
shared.update_readme(
111115
args,
112116
SECTION_FILE,
113117
SECTION_TITLE,
114118
"Overview",
115119
None,
116120
None,
121+
"The Smithsonian data returns the overall "
122+
" statistics of CC0 legal tool records."
123+
" It serves as the main legal tool used by Smithsonian."
124+
"\n"
125+
f"The results indicate a total record of {total_objects} objects,"
126+
f" with a breakdown of {cc0_records} objects without CC0 Media and"
127+
f" {cc0_records_with_media} objects with CC0 Media, taking a"
128+
f" percentage of {cc0_media_percentage} in each unit."
129+
f" There are {num_units} unique units in the data"
130+
" representing museums, libraries, zoos and many other"
131+
f" with a minimum of {min_unit} objects.",
117132
)
118133

119134

@@ -124,13 +139,15 @@ def plot_totals_by_units(args):
124139
LOGGER.info(plot_totals_by_units.__doc__.strip())
125140
file_path = shared.path_join(
126141
PATHS["data_2-process"],
127-
"smithsonian_totals_by_units.csv",
142+
"smithsonian_totals_by_records.csv",
128143
)
129144
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
130145
name_label = "Unit"
131-
data_label = "Count"
146+
data_label = "TOTAL_OBJECTS"
132147
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
148+
data["TOTAL_OBJECTS"] = data["TOTAL_OBJECTS"].astype(int)
133149
data.sort_values(data_label, ascending=True, inplace=True)
150+
average_unit = data["TOTAL_OBJECTS"].mean()
134151
data = data.head(10)
135152
title = "Totals by Units"
136153
plt = plot.combined_plot(
@@ -157,7 +174,11 @@ def plot_totals_by_units(args):
157174
SECTION_TITLE,
158175
title,
159176
image_path,
160-
"Coming soon",
177+
"Plots showing totals by units.",
178+
"This shows the distribution of top 10"
179+
" units/ sub providers across smithsonian"
180+
f" with an average of {average_unit} objects"
181+
" across the sub providers.",
161182
)
162183

163184

@@ -173,11 +194,11 @@ def plot_totals_by_records(args):
173194
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
174195
name_label = "Unit"
175196
stack_labels = [
176-
"CC0_RECORDS_PERCENTAGE",
177-
"CC0_RECORDS_WITH_CC0_MEDIA_PERCENTAGE",
197+
"CC0_WITHOUT_MEDIA_PERCENTAGE",
198+
"CC0_WITH_MEDIA_PERCENTAGE",
199+
"OTHERS_PERCENTAGE",
178200
]
179201
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
180-
data.sort_values(stack_labels, ascending=False, inplace=True)
181202
data = data.head(10)
182203
title = "Totals by records"
183204
plt = plot.stacked_barh_plot(
@@ -202,7 +223,9 @@ def plot_totals_by_records(args):
202223
SECTION_TITLE,
203224
title,
204225
image_path,
205-
"Coming soon",
226+
"Plots showing totals by CC0 records.",
227+
"This is the breakdown of CC0 records"
228+
" without media and CC0 records with media.",
206229
)
207230

208231

scripts/plot.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def annotate_ylabels(ax, data, data_label, colors):
2626

2727
# annotate totals
2828
ax.annotate(
29-
f" {row[data_label]:>15,d}",
29+
f" {int(row[data_label]):>15,d}",
3030
(indent, i - 0.1),
3131
xycoords=("axes points", "data"),
3232
color=colors[c],
@@ -81,7 +81,7 @@ def combined_plot(
8181
# pad tick labels to make room for annotation
8282
tick_labels = []
8383
for index, row in data.iterrows():
84-
count = f"{row[data_label]:,d}"
84+
count = f"{int(row[data_label]):,d}"
8585
tick_labels.append(f"{index}\n{' ' * len(count)}")
8686
if bar_xscale == "log":
8787
log = True
@@ -185,6 +185,7 @@ def stacked_barh_plot(
185185
]
186186

187187
ax.set_xlabel("Number of works")
188+
# ax.set_xlim(0, 100)
188189
ax.xaxis.set_major_formatter(ticker.FuncFormatter(number_formatter))
189190

190191
if ylabel:

0 commit comments

Comments
 (0)