Skip to content

Commit eee780d

Browse files
committed
Made review changes
1 parent 15720d1 commit eee780d

File tree

4 files changed

+97
-48
lines changed

4 files changed

+97
-48
lines changed

scripts/1-fetch/smithsonian_fetch.py

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,17 @@
3939
"TOTAL_OBJECTS",
4040
]
4141
HEADER_2_UNITS = [
42-
"UNIT",
42+
"UNIT_CODE",
43+
"UNIT_NAME",
4344
"CC0_RECORDS",
4445
"CC0_RECORDS_WITH_CC0_MEDIA",
4546
"TOTAL_OBJECTS",
4647
]
4748
QUARTER = os.path.basename(PATHS["data_quarter"])
4849

49-
unit_map = {
50+
# Unit codes and names manually compiled from the URL
51+
# 'https://github.com/Smithsonian/OpenAccess'
52+
UNIT_MAP = {
5053
"AAA": "Archives of American Art",
5154
"AAG": "Archives of American Gardens",
5255
"ACM": "Anacostia Community Museum",
@@ -63,17 +66,35 @@
6366
"NMAH": "National Museum of American History",
6467
"NMAI": "National Museum of the American Indian",
6568
"NMAfA": "National Museum of African Art",
66-
"NMNHANTHRO": "NMNH - Anthropology Dept.",
67-
"NMNHBIRDS": "NMNH - Vertebrate Zoology - Birds Division",
68-
"NMNHBOTANY": "NMNH - Botany Dept.",
69-
"NMNHEDUCATION": "NMNH - Education & Outreach",
70-
"NMNHENTO": "NMNH - Entomology Dept.",
71-
"NMNHFISHES": "NMNH - Vertebrate Zoology - Fishes Division",
72-
"NMNHHERPS": "NMNH - Vertebrate Zoology - Herpetology Division",
73-
"NMNHINV": "NMNH - Invertebrate Zoology Dept.",
74-
"NMNHMAMMALS": "NMNH - Vertebrate Zoology - Mammals Division",
75-
"NMNHMINSCI": "NMNH - Mineral Sciences Dept.",
76-
"NMNHPALEO": "NMNH - Paleobiology Dept.",
69+
"NMNHANTHRO": ("National Musuem of Natural History - Anthropology Dept."),
70+
"NMNHBIRDS": (
71+
"National Musuem of Natural History"
72+
" - Vertebrate Zoology - Birds Division"
73+
),
74+
"NMNHBOTANY": ("National Musuem of Natural History - Botany Dept."),
75+
"NMNHEDUCATION": (
76+
"National Musuem of Natural History" " - Education & Outreach"
77+
),
78+
"NMNHENTO": ("National Musuem of Natural History - Entomology Dept."),
79+
"NMNHFISHES": (
80+
"National Musuem of Natural History"
81+
" - Vertebrate Zoology - Fishes Division"
82+
),
83+
"NMNHHERPS": (
84+
"National Musuem of Natural History"
85+
" - Vertebrate Zoology - Herpetology Division"
86+
),
87+
"NMNHINV": (
88+
"National Musuem of Natural History" " - Invertebrate Zoology Dept."
89+
),
90+
"NMNHMAMMALS": (
91+
"National Musuem of Natural History"
92+
" - Vertebrate Zoology - Mammals Division"
93+
),
94+
"NMNHMINSCI": (
95+
"National Musuem of Natural History" " - Mineral Sciences Dept."
96+
),
97+
"NMNHPALEO": ("National Musuem of Natural History - Paleobiology Dept."),
7798
"NPG": "National Portrait Gallery",
7899
"NPM": "National Postal Museum",
79100
"NZP": "Smithsonian's National Zoo & Conservation Biology Institute",
@@ -179,7 +200,7 @@ def fetch_unit_codes(session):
179200
except KeyError as e:
180201
raise shared.QuantifyingException(f"KeyError: {e}", 1)
181202

182-
map_codes = set(unit_map.keys())
203+
map_codes = set(UNIT_MAP.keys())
183204
new_codes = sorted(api_codes - map_codes)
184205
removed_codes = sorted(map_codes - api_codes)
185206

@@ -228,15 +249,16 @@ def query_smithsonian(args, session):
228249
continue
229250
data_units.append(
230251
{
231-
"UNIT": unit_map.get(unit["unit"], unit["unit"]),
252+
"UNIT_CODE": unit["unit"],
253+
"UNIT_NAME": UNIT_MAP.get(unit["unit"], unit["unit"]),
232254
"CC0_RECORDS": unit["metrics"]["CC0_records"],
233255
"CC0_RECORDS_WITH_CC0_MEDIA": unit["metrics"][
234256
"CC0_records_with_CC0_media"
235257
],
236258
"TOTAL_OBJECTS": unit["total_objects"],
237259
}
238260
)
239-
data_units = sorted(data_units, key=itemgetter("UNIT"))
261+
data_units = sorted(data_units, key=itemgetter("UNIT_CODE"))
240262
LOGGER.info(f"Fetched stats for {len(data_units)} units")
241263
return data_metrics, data_units
242264

scripts/2-process/smithsonian_process.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -82,13 +82,13 @@ def process_totals_by_units(args, count_data):
8282
data = {}
8383

8484
for row in count_data.itertuples(index=False):
85-
unit = str(row.UNIT)
85+
unit = str(row.UNIT_NAME)
8686
total_objects = int(row.TOTAL_OBJECTS)
8787

8888
data[unit] = total_objects
8989

90-
data = pd.DataFrame(data.items(), columns=["Unit", "Total_objects"])
91-
data.sort_values("Unit", ascending=True, inplace=True)
90+
data = pd.DataFrame(data.items(), columns=["Unit_name", "Total_objects"])
91+
data.sort_values("Unit_name", ascending=True, inplace=True)
9292
data.reset_index(drop=True, inplace=True)
9393
file_path = shared.path_join(
9494
PATHS["data_phase"], "smithsonian_totals_by_units.csv"
@@ -104,7 +104,7 @@ def process_totals_by_records(args, count_data):
104104
data = {}
105105

106106
for row in count_data.itertuples(index=False):
107-
unit = str(row.UNIT)
107+
unit = str(row.UNIT_NAME)
108108
CC0_records = int(row.CC0_RECORDS)
109109
CC0_records_with_CC0_media = int(row.CC0_RECORDS_WITH_CC0_MEDIA)
110110
total_objects = int(row.TOTAL_OBJECTS)
@@ -126,7 +126,7 @@ def process_totals_by_records(args, count_data):
126126
data = (
127127
pd.DataFrame.from_dict(data, orient="index")
128128
.reset_index()
129-
.rename(columns={"index": "Unit"})
129+
.rename(columns={"index": "Unit_name"})
130130
)
131131
data["CC0_without_media_percentage"] = (
132132
(
@@ -145,7 +145,7 @@ def process_totals_by_records(args, count_data):
145145
* 100
146146
).round(2)
147147

148-
data.sort_values("Unit", ascending=True, inplace=True)
148+
data.sort_values("Unit_name", ascending=True, inplace=True)
149149
data.reset_index(drop=True, inplace=True)
150150

151151
file_path = shared.path_join(
@@ -166,7 +166,8 @@ def main():
166166
LOGGER,
167167
file_count,
168168
usecols=[
169-
"UNIT",
169+
"UNIT_CODE",
170+
"UNIT_NAME",
170171
"CC0_RECORDS",
171172
"CC0_RECORDS_WITH_CC0_MEDIA",
172173
"TOTAL_OBJECTS",

scripts/3-report/smithsonian_report.py

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -111,25 +111,25 @@ def smithsonian_intro(args):
111111
CC0_records_with_media = data["CC0_records_with_CC0_media"].sum()
112112
CC0_media_percentage = f"{data['CC0_with_media_percentage'].mean():.2f}%"
113113
num_units = len(data)
114-
min_unit = data["Total_objects"].min()
114+
min_object = data["Total_objects"].min()
115115
shared.update_readme(
116116
args,
117117
SECTION_FILE,
118118
SECTION_TITLE,
119119
"Overview",
120120
None,
121121
None,
122-
"The Smithsonian data returns the overall "
122+
"The Smithsonian Institute data returns the overall"
123123
" statistics of CC0 legal tool records."
124-
" It serves as the main legal tool used by Smithsonian."
124+
" It serves as the main legal tool used by Smithsonian Institute."
125125
"\n"
126-
f"The results indicate a total record of {total_objects} objects,"
127-
f" with a breakdown of {CC0_records} objects without CC0 Media and"
128-
f" {CC0_records_with_media} objects with CC0 Media, taking a"
129-
f" percentage of {CC0_media_percentage} in each unit."
126+
f"The results indicate a total record of {total_objects:,} objects,"
127+
f" with a breakdown of {CC0_records:,} objects without CC0 Media and"
128+
f" {CC0_records_with_media:,} objects with CC0 Media, taking a"
129+
f" percentage of {CC0_media_percentage} in each institute member."
130130
f" There are {num_units} unique units in the data"
131-
" representing museums, libraries, zoos and many other"
132-
f" with a minimum of {min_unit} objects.",
131+
" representing museums, libraries, zoos and other institutions"
132+
f" with a minimum of {min_object} objects.",
133133
)
134134

135135

@@ -143,7 +143,7 @@ def plot_totals_by_top10_units(args):
143143
"smithsonian_totals_by_units.csv",
144144
)
145145
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
146-
name_label = "Unit"
146+
name_label = "Unit_name"
147147
data_label = "Total_objects"
148148
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
149149
data["Total_objects"] = data["Total_objects"].astype(int)
@@ -175,11 +175,11 @@ def plot_totals_by_top10_units(args):
175175
SECTION_TITLE,
176176
title,
177177
image_path,
178-
"Plots showing totals by units.",
179-
"This shows the distribution of top 10"
180-
" units/ sub providers across smithsonian"
181-
f" with an average of {average_unit} objects"
182-
" across the top 10 sub providers.",
178+
"Plots showing totals by units. This shows the"
179+
" distribution of top 10 institute member across"
180+
" Smithsonian Institute with an average of"
181+
f" {average_unit:,} objects across the top 10"
182+
"Institute members.",
183183
)
184184

185185

@@ -193,7 +193,7 @@ def plot_totals_by_lowest10_units(args):
193193
"smithsonian_totals_by_units.csv",
194194
)
195195
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
196-
name_label = "Unit"
196+
name_label = "Unit_name"
197197
data_label = "Total_objects"
198198
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
199199
data["Total_objects"] = data["Total_objects"].astype(int)
@@ -227,9 +227,9 @@ def plot_totals_by_lowest10_units(args):
227227
image_path,
228228
"Plots showing totals by units.",
229229
"This shows the distribution of lowest 10"
230-
" units/ sub providers across smithsonian"
230+
" institute member across Smithsonian Institute"
231231
f" with an average of {average_unit} objects"
232-
" across the lowest 10 sub providers.",
232+
" across the lowest 10 institute members.",
233233
)
234234

235235

@@ -243,7 +243,7 @@ def plot_totals_by_records(args):
243243
"smithsonian_totals_by_records.csv",
244244
)
245245
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
246-
name_label = "Unit"
246+
name_label = "Unit_name"
247247
stack_labels = [
248248
"CC0_without_media_percentage",
249249
"CC0_with_media_percentage",
@@ -274,8 +274,8 @@ def plot_totals_by_records(args):
274274
SECTION_TITLE,
275275
title,
276276
image_path,
277-
"Plots showing totals by CC0 records.",
278-
"This is the breakdown of CC0 records"
277+
"Plots showing totals by CC0 records. This is the"
278+
" breakdown of top 10 records with highest CC0 records"
279279
" without media, CC0 records with media and records"
280280
" that are not associated with CC0.",
281281
)

scripts/plot.py

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def annotate_ylabels(ax, data, data_label, colors):
2727
# annotate totals
2828
ax.annotate(
2929
f" {int(row[data_label]):>15,d}",
30-
(indent, i - 0.1),
30+
(indent, i - 0.22),
3131
xycoords=("axes points", "data"),
3232
color=colors[c],
3333
fontsize="x-small",
@@ -82,7 +82,7 @@ def combined_plot(
8282
tick_labels = []
8383
for index, row in data.iterrows():
8484
count = f"{int(row[data_label]):,d}"
85-
tick_labels.append(f"{index}\n{' ' * len(count)}")
85+
tick_labels.append(f"{wrap_label(index)}\n{' ' * len(count)}")
8686
if bar_xscale == "log":
8787
log = True
8888
else:
@@ -144,13 +144,37 @@ def number_formatter(x, pos):
144144
return f"{x:,.0f}"
145145

146146

147+
def wrap_label(label):
    """
    Split a tick label onto two lines at the space nearest its midpoint.

    Labels that are not strings (for example numeric index values) and
    strings that contain no space are returned unchanged. When the nearest
    space on the left and on the right are equally far from the midpoint,
    the left one is used. The space at the split point is replaced by a
    newline.
    """
    # Non-string labels cannot be tested for a space; return them untouched
    # instead of raising TypeError on the membership test
    if not isinstance(label, str) or " " not in label:
        return label

    midpoint = len(label) // 2
    # find nearest space to midpoint: left searches [0, midpoint) and right
    # searches [midpoint, end), so together they cover the whole label and
    # at least one of them is found
    left = label.rfind(" ", 0, midpoint)
    right = label.find(" ", midpoint)

    if left == -1:
        split_index = right
    elif right == -1:
        split_index = left
    elif midpoint - left <= right - midpoint:
        split_index = left
    else:
        split_index = right

    return f"{label[:split_index]}\n{label[split_index + 1:]}"
169+
170+
147171
def stacked_barh_plot(
148172
args,
149173
data,
150174
title,
151175
name_label,
152176
stack_labels,
153-
xscale=None,
177+
xscale="linear",
154178
ylabel=None,
155179
):
156180
"""
@@ -185,8 +209,9 @@ def stacked_barh_plot(
185209
]
186210

187211
ax.set_xlabel("Number of works")
188-
# ax.set_xlim(0, 100)
189212
ax.xaxis.set_major_formatter(ticker.FuncFormatter(number_formatter))
213+
ax.set_yticks(range(len(data.index)))
214+
ax.set_yticklabels([wrap_label(label) for label in data.index])
190215

191216
if ylabel:
192217
ax.set_ylabel(ylabel)
@@ -198,6 +223,7 @@ def stacked_barh_plot(
198223
fontsize="x-small",
199224
title_fontsize="x-small",
200225
loc="upper right",
226+
bbox_to_anchor=(1.02, 1),
201227
)
202228

203229
plt.suptitle(title)

0 commit comments

Comments
 (0)