Made review changes

oree-xx · oree-xx · commit eee780ddd798 · 2026-02-16T10:12:55.000+01:00
diff --git a/scripts/1-fetch/smithsonian_fetch.py b/scripts/1-fetch/smithsonian_fetch.py
@@ -39,14 +39,17 @@
     "TOTAL_OBJECTS",
 ]
 HEADER_2_UNITS = [
-    "UNIT",
+    "UNIT_CODE",
+    "UNIT_NAME",
     "CC0_RECORDS",
     "CC0_RECORDS_WITH_CC0_MEDIA",
     "TOTAL_OBJECTS",
 ]
 QUARTER = os.path.basename(PATHS["data_quarter"])
 
-unit_map = {
+# Manually compiled unit code and name from URL
+# 'https://github.com/Smithsonian/OpenAccess'
+UNIT_MAP = {
     "AAA": "Archives of American Art",
     "AAG": "Archives of American Gardens",
     "ACM": "Anacostia Community Museum",
@@ -63,17 +66,35 @@
     "NMAH": "National Museum of American History",
     "NMAI": "National Museum of the American Indian",
     "NMAfA": "National Museum of African Art",
-    "NMNHANTHRO": "NMNH - Anthropology Dept.",
-    "NMNHBIRDS": "NMNH - Vertebrate Zoology - Birds Division",
-    "NMNHBOTANY": "NMNH - Botany Dept.",
-    "NMNHEDUCATION": "NMNH - Education & Outreach",
-    "NMNHENTO": "NMNH - Entomology Dept.",
-    "NMNHFISHES": "NMNH - Vertebrate Zoology - Fishes Division",
-    "NMNHHERPS": "NMNH - Vertebrate Zoology - Herpetology Division",
-    "NMNHINV": "NMNH - Invertebrate Zoology Dept.",
-    "NMNHMAMMALS": "NMNH - Vertebrate Zoology - Mammals Division",
-    "NMNHMINSCI": "NMNH - Mineral Sciences Dept.",
-    "NMNHPALEO": "NMNH - Paleobiology Dept.",
+    "NMNHANTHRO": ("National Museum of Natural History - Anthropology Dept."),
+    "NMNHBIRDS": (
+        "National Museum of Natural History"
+        " - Vertebrate Zoology - Birds Division"
+    ),
+    "NMNHBOTANY": ("National Museum of Natural History - Botany Dept."),
+    "NMNHEDUCATION": (
+        "National Museum of Natural History" " - Education & Outreach"
+    ),
+    "NMNHENTO": ("National Museum of Natural History - Entomology Dept."),
+    "NMNHFISHES": (
+        "National Museum of Natural History"
+        " - Vertebrate Zoology - Fishes Division"
+    ),
+    "NMNHHERPS": (
+        "National Museum of Natural History"
+        " - Vertebrate Zoology - Herpetology Division"
+    ),
+    "NMNHINV": (
+        "National Museum of Natural History" " - Invertebrate Zoology Dept."
+    ),
+    "NMNHMAMMALS": (
+        "National Museum of Natural History"
+        " - Vertebrate Zoology - Mammals Division"
+    ),
+    "NMNHMINSCI": (
+        "National Museum of Natural History" " - Mineral Sciences Dept."
+    ),
+    "NMNHPALEO": ("National Museum of Natural History - Paleobiology Dept."),
     "NPG": "National Portrait Gallery",
     "NPM": "National Postal Museum",
     "NZP": "Smithsonian's National Zoo & Conservation Biology Institute",
@@ -179,7 +200,7 @@ def fetch_unit_codes(session):
     except KeyError as e:
         raise shared.QuantifyingException(f"KeyError: {e}", 1)
 
-    map_codes = set(unit_map.keys())
+    map_codes = set(UNIT_MAP.keys())
     new_codes = sorted(api_codes - map_codes)
     removed_codes = sorted(map_codes - api_codes)
 
@@ -228,15 +249,16 @@ def query_smithsonian(args, session):
             continue
         data_units.append(
             {
-                "UNIT": unit_map.get(unit["unit"], unit["unit"]),
+                "UNIT_CODE": unit["unit"],
+                "UNIT_NAME": UNIT_MAP.get(unit["unit"], unit["unit"]),
                 "CC0_RECORDS": unit["metrics"]["CC0_records"],
                 "CC0_RECORDS_WITH_CC0_MEDIA": unit["metrics"][
                     "CC0_records_with_CC0_media"
                 ],
                 "TOTAL_OBJECTS": unit["total_objects"],
             }
         )
-    data_units = sorted(data_units, key=itemgetter("UNIT"))
+    data_units = sorted(data_units, key=itemgetter("UNIT_CODE"))
     LOGGER.info(f"Fetched stats for {len(data_units)} units")
     return data_metrics, data_units
 
diff --git a/scripts/2-process/smithsonian_process.py b/scripts/2-process/smithsonian_process.py
@@ -82,13 +82,13 @@ def process_totals_by_units(args, count_data):
     data = {}
 
     for row in count_data.itertuples(index=False):
-        unit = str(row.UNIT)
+        unit = str(row.UNIT_NAME)
         total_objects = int(row.TOTAL_OBJECTS)
 
         data[unit] = total_objects
 
-    data = pd.DataFrame(data.items(), columns=["Unit", "Total_objects"])
-    data.sort_values("Unit", ascending=True, inplace=True)
+    data = pd.DataFrame(data.items(), columns=["Unit_name", "Total_objects"])
+    data.sort_values("Unit_name", ascending=True, inplace=True)
     data.reset_index(drop=True, inplace=True)
     file_path = shared.path_join(
         PATHS["data_phase"], "smithsonian_totals_by_units.csv"
@@ -104,7 +104,7 @@ def process_totals_by_records(args, count_data):
     data = {}
 
     for row in count_data.itertuples(index=False):
-        unit = str(row.UNIT)
+        unit = str(row.UNIT_NAME)
         CC0_records = int(row.CC0_RECORDS)
         CC0_records_with_CC0_media = int(row.CC0_RECORDS_WITH_CC0_MEDIA)
         total_objects = int(row.TOTAL_OBJECTS)
@@ -126,7 +126,7 @@ def process_totals_by_records(args, count_data):
     data = (
         pd.DataFrame.from_dict(data, orient="index")
         .reset_index()
-        .rename(columns={"index": "Unit"})
+        .rename(columns={"index": "Unit_name"})
     )
     data["CC0_without_media_percentage"] = (
         (
@@ -145,7 +145,7 @@ def process_totals_by_records(args, count_data):
         * 100
     ).round(2)
 
-    data.sort_values("Unit", ascending=True, inplace=True)
+    data.sort_values("Unit_name", ascending=True, inplace=True)
     data.reset_index(drop=True, inplace=True)
 
     file_path = shared.path_join(
@@ -166,7 +166,8 @@ def main():
         LOGGER,
         file_count,
         usecols=[
-            "UNIT",
+            "UNIT_CODE",
+            "UNIT_NAME",
             "CC0_RECORDS",
             "CC0_RECORDS_WITH_CC0_MEDIA",
             "TOTAL_OBJECTS",
diff --git a/scripts/3-report/smithsonian_report.py b/scripts/3-report/smithsonian_report.py
@@ -111,25 +111,25 @@ def smithsonian_intro(args):
     CC0_records_with_media = data["CC0_records_with_CC0_media"].sum()
     CC0_media_percentage = f"{data['CC0_with_media_percentage'].mean():.2f}%"
     num_units = len(data)
-    min_unit = data["Total_objects"].min()
+    min_object = data["Total_objects"].min()
     shared.update_readme(
         args,
         SECTION_FILE,
         SECTION_TITLE,
         "Overview",
         None,
         None,
-        "The Smithsonian data returns the overall "
+        "The Smithsonian Institute data returns the overall"
         " statistics of CC0 legal tool records."
-        " It serves as the main legal tool used by Smithsonian."
+        " It serves as the main legal tool used by Smithsonian Institute."
         "\n"
-        f"The results indicate a total record of {total_objects} objects,"
-        f" with a breakdown of {CC0_records} objects without CC0 Media and"
-        f" {CC0_records_with_media} objects with CC0 Media, taking a"
-        f" percentage of {CC0_media_percentage} in each unit."
+        f"The results indicate a total record of {total_objects:,} objects,"
+        f" with a breakdown of {CC0_records:,} objects without CC0 Media and"
+        f" {CC0_records_with_media:,} objects with CC0 Media, taking a"
+        f" percentage of {CC0_media_percentage} in each institute member."
         f" There are {num_units} unique units in the data"
-        " representing museums, libraries, zoos and many other"
-        f" with a minimum of {min_unit} objects.",
+        " representing museums, libraries, zoos and other institutions"
+        f" with a minimum of {min_object} objects.",
     )
 
 
@@ -143,7 +143,7 @@ def plot_totals_by_top10_units(args):
         "smithsonian_totals_by_units.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
-    name_label = "Unit"
+    name_label = "Unit_name"
     data_label = "Total_objects"
     data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data["Total_objects"] = data["Total_objects"].astype(int)
@@ -175,11 +175,11 @@ def plot_totals_by_top10_units(args):
         SECTION_TITLE,
         title,
         image_path,
-        "Plots showing totals by units.",
-        "This shows the distribution of top 10"
-        " units/ sub providers across smithsonian"
-        f" with an average of {average_unit} objects"
-        " across the top 10 sub providers.",
+        "Plots showing totals by units. This shows the"
+        " distribution of top 10 institute members across"
+        " Smithsonian Institute with an average of"
+        f" {average_unit:,} objects across the top 10"
+        " Institute members.",
     )
 
 
@@ -193,7 +193,7 @@ def plot_totals_by_lowest10_units(args):
         "smithsonian_totals_by_units.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
-    name_label = "Unit"
+    name_label = "Unit_name"
     data_label = "Total_objects"
     data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data["Total_objects"] = data["Total_objects"].astype(int)
@@ -227,9 +227,9 @@ def plot_totals_by_lowest10_units(args):
         image_path,
         "Plots showing totals by units.",
         "This shows the distribution of lowest 10"
-        " units/ sub providers across smithsonian"
+        " institute members across Smithsonian Institute"
         f" with an average of {average_unit} objects"
-        " across the lowest 10 sub providers.",
+        " across the lowest 10 institute members.",
     )
 
 
@@ -243,7 +243,7 @@ def plot_totals_by_records(args):
         "smithsonian_totals_by_records.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
-    name_label = "Unit"
+    name_label = "Unit_name"
     stack_labels = [
         "CC0_without_media_percentage",
         "CC0_with_media_percentage",
@@ -274,8 +274,8 @@ def plot_totals_by_records(args):
         SECTION_TITLE,
         title,
         image_path,
-        "Plots showing totals by CC0 records.",
-        "This is the breakdown of CC0 records"
+        "Plots showing totals by CC0 records. This is the"
+        " breakdown of top 10 records with highest CC0 records"
         " without media, CC0 records with media and records"
         " that are not associated with CC0.",
     )
diff --git a/scripts/plot.py b/scripts/plot.py
@@ -27,7 +27,7 @@ def annotate_ylabels(ax, data, data_label, colors):
         # annotate totals
         ax.annotate(
             f"    {int(row[data_label]):>15,d}",
-            (indent, i - 0.1),
+            (indent, i - 0.22),
             xycoords=("axes points", "data"),
             color=colors[c],
             fontsize="x-small",
@@ -82,7 +82,7 @@ def combined_plot(
     tick_labels = []
     for index, row in data.iterrows():
         count = f"{int(row[data_label]):,d}"
-        tick_labels.append(f"{index}\n{' ' * len(count)}")
+        tick_labels.append(f"{wrap_label(index)}\n{' ' * len(count)}")
     if bar_xscale == "log":
         log = True
     else:
@@ -144,13 +144,37 @@ def number_formatter(x, pos):
         return f"{x:,.0f}"
 
 
+def wrap_label(label):
+    if " " not in label:
+        return label
+
+    midpoint = len(label) // 2
+    # find nearest space to midpoint
+    left = label.rfind(" ", 0, midpoint)
+    right = label.find(" ", midpoint)
+
+    if left == -1:
+        split_index = right
+    elif right == -1:
+        split_index = left
+    else:
+        if midpoint - left <= right - midpoint:
+            split_index = left
+        else:
+            split_index = right
+    if split_index == -1:
+        return label
+
+    return f"{label[:split_index]}\n{label[split_index + 1:]}"
+
+
 def stacked_barh_plot(
     args,
     data,
     title,
     name_label,
     stack_labels,
-    xscale=None,
+    xscale="linear",
     ylabel=None,
 ):
     """
@@ -185,8 +209,9 @@ def stacked_barh_plot(
         ]
 
     ax.set_xlabel("Number of works")
-    # ax.set_xlim(0, 100)
     ax.xaxis.set_major_formatter(ticker.FuncFormatter(number_formatter))
+    ax.set_yticks(range(len(data.index)))
+    ax.set_yticklabels([wrap_label(label) for label in data.index])
 
     if ylabel:
         ax.set_ylabel(ylabel)
@@ -198,6 +223,7 @@ def stacked_barh_plot(
         fontsize="x-small",
         title_fontsize="x-small",
         loc="upper right",
+        bbox_to_anchor=(1.02, 1),
     )
 
     plt.suptitle(title)