Skip to content

Commit 5868ca8

Browse files
committed
Merge branch 'process_report' of https://github.com/creativecommons/quantifying into process_report
2 parents b937dfb + 198a8f4 commit 5868ca8

File tree

4 files changed

+165
-62
lines changed

4 files changed

+165
-62
lines changed

scripts/1-fetch/smithsonian_fetch.py

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,17 @@
3939
"TOTAL_OBJECTS",
4040
]
4141
HEADER_2_UNITS = [
42-
"UNIT",
42+
"UNIT_CODE",
43+
"DATA_SOURCE",
4344
"CC0_RECORDS",
4445
"CC0_RECORDS_WITH_CC0_MEDIA",
4546
"TOTAL_OBJECTS",
4647
]
4748
QUARTER = os.path.basename(PATHS["data_quarter"])
4849

49-
unit_map = {
50+
# Manually compiled mapping of unit codes to unit names, taken from:
51+
# 'https://github.com/Smithsonian/OpenAccess'
52+
UNIT_MAP = {
5053
"AAA": "Archives of American Art",
5154
"AAG": "Archives of American Gardens",
5255
"ACM": "Anacostia Community Museum",
@@ -63,17 +66,35 @@
6366
"NMAH": "National Museum of American History",
6467
"NMAI": "National Museum of the American Indian",
6568
"NMAfA": "National Museum of African Art",
66-
"NMNHANTHRO": "NMNH - Anthropology Dept.",
67-
"NMNHBIRDS": "NMNH - Vertebrate Zoology - Birds Division",
68-
"NMNHBOTANY": "NMNH - Botany Dept.",
69-
"NMNHEDUCATION": "NMNH - Education & Outreach",
70-
"NMNHENTO": "NMNH - Entomology Dept.",
71-
"NMNHFISHES": "NMNH - Vertebrate Zoology - Fishes Division",
72-
"NMNHHERPS": "NMNH - Vertebrate Zoology - Herpetology Division",
73-
"NMNHINV": "NMNH - Invertebrate Zoology Dept.",
74-
"NMNHMAMMALS": "NMNH - Vertebrate Zoology - Mammals Division",
75-
"NMNHMINSCI": "NMNH - Mineral Sciences Dept.",
76-
"NMNHPALEO": "NMNH - Paleobiology Dept.",
69+
"NMNHANTHRO": ("National Museum of Natural History - Anthropology Dept."),
70+
"NMNHBIRDS": (
71+
"National Museum of Natural History"
72+
" - Vertebrate Zoology - Birds Division"
73+
),
74+
"NMNHBOTANY": ("National Museum of Natural History - Botany Dept."),
75+
"NMNHEDUCATION": (
76+
"National Museum of Natural History" " - Education & Outreach"
77+
),
78+
"NMNHENTO": ("National Museum of Natural History - Entomology Dept."),
79+
"NMNHFISHES": (
80+
"National Museum of Natural History"
81+
" - Vertebrate Zoology - Fishes Division"
82+
),
83+
"NMNHHERPS": (
84+
"National Museum of Natural History"
85+
" - Vertebrate Zoology - Herpetology Division"
86+
),
87+
"NMNHINV": (
88+
"National Museum of Natural History" " - Invertebrate Zoology Dept."
89+
),
90+
"NMNHMAMMALS": (
91+
"National Museum of Natural History"
92+
" - Vertebrate Zoology - Mammals Division"
93+
),
94+
"NMNHMINSCI": (
95+
"National Museum of Natural History" " - Mineral Sciences Dept."
96+
),
97+
"NMNHPALEO": ("National Museum of Natural History - Paleobiology Dept."),
7798
"NPG": "National Portrait Gallery",
7899
"NPM": "National Postal Museum",
79100
"NZP": "Smithsonian's National Zoo & Conservation Biology Institute",
@@ -145,7 +166,7 @@ def query_smithsonian(args, session):
145166
" API key is set in .env",
146167
1,
147168
)
148-
LOGGER.info("Fetch CC0 metrics and units from units from Smithsonain")
169+
LOGGER.info("Fetch CC0 metrics and units from units from Smithsonian")
149170
url = "https://api.si.edu/openaccess/api/v1.0/stats"
150171
params = {"api_key": DATA_GOV_API_KEY}
151172
try:
@@ -175,15 +196,16 @@ def query_smithsonian(args, session):
175196
continue
176197
data_units.append(
177198
{
178-
"UNIT": unit_map.get(unit["unit"], unit["unit"]),
199+
"UNIT_CODE": unit["unit"],
200+
"DATA_SOURCE": UNIT_MAP.get(unit["unit"], unit["unit"]),
179201
"CC0_RECORDS": unit["metrics"]["CC0_records"],
180202
"CC0_RECORDS_WITH_CC0_MEDIA": unit["metrics"][
181203
"CC0_records_with_CC0_media"
182204
],
183205
"TOTAL_OBJECTS": unit["total_objects"],
184206
}
185207
)
186-
data_units = sorted(data_units, key=itemgetter("UNIT"))
208+
data_units = sorted(data_units, key=itemgetter("UNIT_CODE"))
187209
LOGGER.info(f"Fetched stats for {len(data_units)} units")
188210
return data_metrics, data_units
189211

scripts/2-process/smithsonian_process.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -82,13 +82,13 @@ def process_totals_by_units(args, count_data):
8282
data = {}
8383

8484
for row in count_data.itertuples(index=False):
85-
unit = str(row.UNIT)
85+
unit = str(row.DATA_SOURCE)
8686
total_objects = int(row.TOTAL_OBJECTS)
8787

8888
data[unit] = total_objects
8989

90-
data = pd.DataFrame(data.items(), columns=["Unit", "Total_objects"])
91-
data.sort_values("Unit", ascending=True, inplace=True)
90+
data = pd.DataFrame(data.items(), columns=["Data_source", "Total_objects"])
91+
data.sort_values("Data_source", ascending=True, inplace=True)
9292
data.reset_index(drop=True, inplace=True)
9393
file_path = shared.path_join(
9494
PATHS["data_phase"], "smithsonian_totals_by_units.csv"
@@ -104,14 +104,11 @@ def process_totals_by_records(args, count_data):
104104
data = {}
105105

106106
for row in count_data.itertuples(index=False):
107-
unit = str(row.UNIT)
107+
unit = str(row.DATA_SOURCE)
108108
CC0_records = int(row.CC0_RECORDS)
109109
CC0_records_with_CC0_media = int(row.CC0_RECORDS_WITH_CC0_MEDIA)
110110
total_objects = int(row.TOTAL_OBJECTS)
111111

112-
if CC0_records == 0 and CC0_records_with_CC0_media == 0:
113-
continue
114-
115112
if unit not in data:
116113
data[unit] = {
117114
"CC0_records": 0,
@@ -126,7 +123,7 @@ def process_totals_by_records(args, count_data):
126123
data = (
127124
pd.DataFrame.from_dict(data, orient="index")
128125
.reset_index()
129-
.rename(columns={"index": "Unit"})
126+
.rename(columns={"index": "Data_source"})
130127
)
131128
data["CC0_without_media_percentage"] = (
132129
(
@@ -145,7 +142,7 @@ def process_totals_by_records(args, count_data):
145142
* 100
146143
).round(2)
147144

148-
data.sort_values("Unit", ascending=True, inplace=True)
145+
data.sort_values("Data_source", ascending=True, inplace=True)
149146
data.reset_index(drop=True, inplace=True)
150147

151148
file_path = shared.path_join(
@@ -166,7 +163,8 @@ def main():
166163
LOGGER,
167164
file_count,
168165
usecols=[
169-
"UNIT",
166+
"UNIT_CODE",
167+
"DATA_SOURCE",
170168
"CC0_RECORDS",
171169
"CC0_RECORDS_WITH_CC0_MEDIA",
172170
"TOTAL_OBJECTS",

scripts/3-report/smithsonian_report.py

Lines changed: 85 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -111,25 +111,25 @@ def smithsonian_intro(args):
111111
CC0_records_with_media = data["CC0_records_with_CC0_media"].sum()
112112
CC0_media_percentage = f"{data['CC0_with_media_percentage'].mean():.2f}%"
113113
num_units = len(data)
114-
min_unit = data["Total_objects"].min()
114+
min_object = data["Total_objects"].min()
115115
shared.update_readme(
116116
args,
117117
SECTION_FILE,
118118
SECTION_TITLE,
119119
"Overview",
120120
None,
121121
None,
122-
"The Smithsonian data returns the overall "
122+
"The Smithsonian Institute data returns the overall"
123123
" statistics of CC0 legal tool records."
124-
" It serves as the main legal tool used by Smithsonian."
124+
" It serves as the main legal tool used by Smithsonian Institute."
125125
"\n"
126-
f"The results indicate a total record of {total_objects} objects,"
127-
f" with a breakdown of {CC0_records} objects without CC0 Media and"
128-
f" {CC0_records_with_media} objects with CC0 Media, taking a"
129-
f" percentage of {CC0_media_percentage} in each unit."
126+
f"The results indicate a total record of {total_objects:,} objects,"
127+
f" with a breakdown of {CC0_records:,} objects without CC0 Media and"
128+
f" {CC0_records_with_media:,} objects with CC0 Media, taking a"
129+
f" percentage of {CC0_media_percentage} in each institute member."
130130
f" There are {num_units} unique units in the data"
131-
" representing museums, libraries, zoos and many other"
132-
f" with a minimum of {min_unit} objects.",
131+
" representing museums, libraries, zoos and other institutions"
132+
f" with a minimum of {min_object} objects.",
133133
)
134134

135135

@@ -143,20 +143,21 @@ def plot_totals_by_top10_units(args):
143143
"smithsonian_totals_by_units.csv",
144144
)
145145
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
146-
name_label = "Unit"
146+
name_label = "Data_source"
147147
data_label = "Total_objects"
148148
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
149149
data["Total_objects"] = data["Total_objects"].astype(int)
150150
data.sort_values(data_label, ascending=True, inplace=True)
151151
data = data.tail(10)
152152
average_unit = data["Total_objects"].mean()
153-
title = "Top 10 Units"
153+
title = "Totals by 10 Units"
154154
plt = plot.combined_plot(
155155
args=args,
156156
data=data,
157157
title=title,
158158
name_label=name_label,
159159
data_label=data_label,
160+
bar_ylabel="Data Sources",
160161
)
161162

162163
image_path = shared.path_join(
@@ -175,11 +176,11 @@ def plot_totals_by_top10_units(args):
175176
SECTION_TITLE,
176177
title,
177178
image_path,
178-
"Plots showing totals by units.",
179-
"This shows the distribution of top 10"
180-
" units/ sub providers across smithsonian"
181-
f" with an average of {average_unit} objects"
182-
" across the top 10 sub providers.",
179+
"Plots showing totals by units. This shows the"
180+
" distribution of top 10 institute member across"
181+
" Smithsonian Institute with an average of"
182+
f" {average_unit:,} objects across the top 10"
183+
" Institute members.",
183184
)
184185

185186

@@ -193,7 +194,7 @@ def plot_totals_by_lowest10_units(args):
193194
"smithsonian_totals_by_units.csv",
194195
)
195196
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
196-
name_label = "Unit"
197+
name_label = "Data_source"
197198
data_label = "Total_objects"
198199
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
199200
data["Total_objects"] = data["Total_objects"].astype(int)
@@ -207,6 +208,7 @@ def plot_totals_by_lowest10_units(args):
207208
title=title,
208209
name_label=name_label,
209210
data_label=data_label,
211+
bar_ylabel="Data Sources",
210212
)
211213

212214
image_path = shared.path_join(
@@ -227,40 +229,94 @@ def plot_totals_by_lowest10_units(args):
227229
image_path,
228230
"Plots showing totals by units.",
229231
"This shows the distribution of lowest 10"
230-
" units/ sub providers across smithsonian"
232+
" institute member across Smithsonian Institute"
231233
f" with an average of {average_unit} objects"
232-
" across the lowest 10 sub providers.",
234+
" across the lowest 10 institute members.",
233235
)
234236

235237

236-
def plot_totals_by_records(args):
238+
def plot_totals_by_top10_unit_records(args):
237239
"""
238-
Create plots showing totals by records
240+
Create plots showing breakdown of CC0 records by top 10 units
239241
"""
240-
LOGGER.info(plot_totals_by_records.__doc__.strip())
242+
LOGGER.info(plot_totals_by_top10_unit_records.__doc__.strip())
241243
file_path = shared.path_join(
242244
PATHS["data_2-process"],
243245
"smithsonian_totals_by_records.csv",
244246
)
245247
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
246-
name_label = "Unit"
248+
name_label = "Data_source"
249+
data_label = "Total_objects"
247250
stack_labels = [
248251
"CC0_without_media_percentage",
249252
"CC0_with_media_percentage",
250253
"Others_percentage",
251254
]
252255
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
256+
data.sort_values(data_label, ascending=True, inplace=True)
257+
data = data.tail(10)
258+
title = "Breakdown of CC0 records by top 10 units"
259+
plt = plot.stacked_barh_plot(
260+
args=args,
261+
data=data,
262+
title=title,
263+
name_label=name_label,
264+
stack_labels=stack_labels,
265+
ylabel="Data Sources",
266+
)
267+
image_path = shared.path_join(
268+
PATHS["data_phase"], "smithsonian_by_top10_unit_records.png"
269+
)
270+
LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
271+
if args.enable_save:
272+
# Create the directory if it does not exist
273+
os.makedirs(PATHS["data_phase"], exist_ok=True)
274+
plt.savefig(image_path)
275+
276+
shared.update_readme(
277+
args,
278+
SECTION_FILE,
279+
SECTION_TITLE,
280+
title,
281+
image_path,
282+
"Plots showing totals by CC0 records. This is the"
283+
" top 10 units with a breakdown of CC0 records"
284+
" without media, CC0 records with media and records"
285+
" that are not associated with CC0.",
286+
)
287+
288+
289+
def plot_totals_by_lowest10_unit_records(args):
290+
"""
291+
Create plots showing breakdown of CC0 records by lowest 10 units
292+
"""
293+
LOGGER.info(plot_totals_by_lowest10_unit_records.__doc__.strip())
294+
file_path = shared.path_join(
295+
PATHS["data_2-process"],
296+
"smithsonian_totals_by_records.csv",
297+
)
298+
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
299+
name_label = "Data_source"
300+
data_label = "Total_objects"
301+
stack_labels = [
302+
"CC0_without_media_percentage",
303+
"CC0_with_media_percentage",
304+
"Others_percentage",
305+
]
306+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
307+
data.sort_values(data_label, ascending=True, inplace=True)
253308
data = data.head(10)
254-
title = "Totals by records"
309+
title = "Breakdown of CC0 records by lowest 10 units"
255310
plt = plot.stacked_barh_plot(
256311
args=args,
257312
data=data,
258313
title=title,
259314
name_label=name_label,
260315
stack_labels=stack_labels,
316+
ylabel="Data Sources",
261317
)
262318
image_path = shared.path_join(
263-
PATHS["data_phase"], "smithsonian_by_records.png"
319+
PATHS["data_phase"], "smithsonian_by_lowest10_unit_records.png"
264320
)
265321
LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
266322
if args.enable_save:
@@ -274,8 +330,8 @@ def plot_totals_by_records(args):
274330
SECTION_TITLE,
275331
title,
276332
image_path,
277-
"Plots showing totals by CC0 records.",
278-
"This is the breakdown of CC0 records"
333+
"Plots showing totals by CC0 records. This is the"
334+
" lowest 10 units with a breakdown of CC0 records"
279335
" without media, CC0 records with media and records"
280336
" that are not associated with CC0.",
281337
)
@@ -292,7 +348,8 @@ def main():
292348
smithsonian_intro(args)
293349
plot_totals_by_top10_units(args)
294350
plot_totals_by_lowest10_units(args)
295-
plot_totals_by_records(args)
351+
plot_totals_by_top10_unit_records(args)
352+
plot_totals_by_lowest10_unit_records(args)
296353

297354
# Add and commit changes
298355
args = shared.git_add_and_commit(

0 commit comments

Comments
 (0)