Added full unit names and a unit-code sync check; updated process and report steps

oree-xx · oree-xx · commit 41530811469c · 2026-02-06T11:34:53.000+01:00
diff --git a/scripts/1-fetch/smithsonian_fetch.py b/scripts/1-fetch/smithsonian_fetch.py
@@ -46,6 +46,49 @@
 ]
 QUARTER = os.path.basename(PATHS["data_quarter"])
 
+unit_map = {  # API unit code -> full unit name written to the UNIT field
+    "AAA": "Archives of American Art",
+    "AAG": "Archives of American Gardens",
+    "ACM": "Anacostia Community Museum",
+    "ACMA": "Anacostia Community Museum Archives",
+    "CFCHFOLKLIFE": "Ralph Rinzler Folklife Archives and Collections",
+    "CHNDM": "Cooper Hewitt, Smithsonian Design Museum",
+    "FBR": "Smithsonian Field Book Project",
+    "FSG": "Freer Gallery of Art and Arthur M. Sackler Gallery",
+    "HAC": "Smithsonian Gardens",
+    "HMSG": "Hirshhorn Museum and Sculpture Garden",
+    "HSFA": "Human Studies Film Archives",
+    "NASM": "National Air and Space Museum",
+    "NMAAHC": "National Museum of African American History and Culture",
+    "NMAH": "National Museum of American History",
+    "NMAI": "National Museum of the American Indian",
+    "NMAfA": "National Museum of African Art",
+    "NMNHANTHRO": "NMNH - Anthropology Dept.",
+    "NMNHBIRDS": "NMNH - Vertebrate Zoology - Birds Division",
+    "NMNHBOTANY": "NMNH - Botany Dept.",
+    "NMNHEDUCATION": "NMNH - Education & Outreach",
+    "NMNHENTO": "NMNH - Entomology Dept.",
+    "NMNHFISHES": "NMNH - Vertebrate Zoology - Fishes Division",
+    "NMNHHERPS": "NMNH - Vertebrate Zoology - Herpetology Division",
+    "NMNHINV": "NMNH - Invertebrate Zoology Dept.",
+    "NMNHMAMMALS": "NMNH - Vertebrate Zoology - Mammals Division",
+    "NMNHMINSCI": "NMNH - Mineral Sciences Dept.",
+    "NMNHPALEO": "NMNH - Paleobiology Dept.",
+    "NPG": "National Portrait Gallery",
+    "NPM": "National Postal Museum",
+    "NZP": "Smithsonian's National Zoo & Conservation Biology Institute",
+    "OCIO_DPO3D": "OCIO Digital Preservation & 3D Team",
+    "OFEO-SG": "Office of Facilities Engineering &"
+    " Operations – Smithsonian Gardens",
+    "SAAM": "Smithsonian American Art Museum",
+    "SIA": "Smithsonian Institution Archives",
+    "SIL": "Smithsonian Libraries",
+    "SILAF": "Smithsonian Institution Libraries, African Section",
+    "SILNMAHTL": "Smithsonian Institution Libraries,"
+    " National Museum of American History, Library",
+    "SLA_SRO": "Smithsonian Libraries Archives, Special Research/Operations",
+}
+
 
 def parse_arguments():
     """
@@ -121,6 +164,33 @@ def write_data(args, data_metrics, data_units):
     return args
 
 
+def fetch_unit_codes(session):
+    """Warn if unit_map is out of sync with the API's current unit codes."""
+    LOGGER.info("Fetching current unit codes from Smithsonian API")
+    url = "https://api.si.edu/openaccess/api/v1.0/terms/unit_code"
+    params = {"api_key": DATA_GOV_API_KEY}
+    try:
+        with session.get(url, params=params) as response:
+            response.raise_for_status()
+            api_codes = set(response.json()["response"]["terms"])
+    except requests.HTTPError as e:
+        raise shared.QuantifyingException(f"HTTP Error: {e}", 1) from e
+    except requests.RequestException as e:
+        raise shared.QuantifyingException(f"Request Exception: {e}", 1) from e
+    except KeyError as e:  # JSON lacks "response" or "terms"
+        raise shared.QuantifyingException(f"KeyError: {e}", 1) from e
+
+    new_codes = sorted(api_codes.difference(unit_map))
+    removed_codes = sorted(set(unit_map).difference(api_codes))
+
+    if new_codes:
+        LOGGER.warning(f"New unit code(s) not in unit_map: {new_codes}")
+    if removed_codes:
+        LOGGER.warning(f"unit_map code(s) no longer in API: {removed_codes}")
+    if not new_codes and not removed_codes:
+        LOGGER.info("unit_map is up to date")
+
+
 def query_smithsonian(args, session):
     if not DATA_GOV_API_KEY:
         raise shared.QuantifyingException(
@@ -158,7 +228,7 @@ def query_smithsonian(args, session):
             continue
         data_units.append(
             {
-                "UNIT": unit["unit"],
+                "UNIT": unit_map.get(unit["unit"], unit["unit"]),
                 "CC0_RECORDS": unit["metrics"]["CC0_records"],
                 "CC0_RECORDS_WITH_CC0_MEDIA": unit["metrics"][
                     "CC0_records_with_CC0_media"
@@ -176,6 +246,7 @@ def main():
     shared.paths_log(LOGGER, PATHS)
     check_for_completion()
     session = shared.get_session()
+    fetch_unit_codes(session)
     data_metrics, data_units = query_smithsonian(args, session)
     args = write_data(args, data_metrics, data_units)
     args = shared.git_add_and_commit(
diff --git a/scripts/2-process/smithsonian_process.py b/scripts/2-process/smithsonian_process.py
@@ -127,14 +127,23 @@ def process_totals_by_records(args, count_data):
         .reset_index()
         .rename(columns={"index": "Unit"})
     )
-    data["CC0_RECORDS_PERCENTAGE"] = (
-        (data["CC0_RECORDS"] / data["TOTAL_OBJECTS"]) * 100
+    data["CC0_WITHOUT_MEDIA_PERCENTAGE"] = (
+        (
+            (data["CC0_RECORDS"] - data["CC0_RECORDS_WITH_CC0_MEDIA"])
+            / data["TOTAL_OBJECTS"]
+        )
+        * 100
     ).round(2)
 
-    data["CC0_RECORDS_WITH_CC0_MEDIA_PERCENTAGE"] = (
+    data["CC0_WITH_MEDIA_PERCENTAGE"] = (
         (data["CC0_RECORDS_WITH_CC0_MEDIA"] / data["TOTAL_OBJECTS"]) * 100
     ).round(2)
 
+    data["OTHERS_PERCENTAGE"] = (
+        ((data["TOTAL_OBJECTS"] - data["CC0_RECORDS"]) / data["TOTAL_OBJECTS"])
+        * 100
+    ).round(2)
+
     data.sort_values("Unit", ascending=True, inplace=True)
     data.reset_index(drop=True, inplace=True)
 
diff --git a/scripts/3-report/smithsonian_report.py b/scripts/3-report/smithsonian_report.py
@@ -100,20 +100,35 @@ def smithsonian_intro(args):
     """
     LOGGER.info(smithsonian_intro.__doc__.strip())
     file_path = shared.path_join(
-        PATHS["data_1-fetch"],
-        "smithsonian_1_metrics.csv",
+        PATHS["data_2-process"],
+        "smithsonian_totals_by_records.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
-    # name_label = "UNIT"
-    # data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
-    # data.sort_values(name_label, ascending=True, inplace=True)
+    data = shared.open_data_file(LOGGER, file_path)
+    total_objects = data["TOTAL_OBJECTS"].sum()
+    cc0_records = data["CC0_RECORDS"].sum()
+    cc0_records_with_media = data["CC0_RECORDS_WITH_CC0_MEDIA"].sum()
+    cc0_media_percentage = f"{data['CC0_WITH_MEDIA_PERCENTAGE'].mean():.2f}%"
+    num_units = len(data)
+    min_unit = data["TOTAL_OBJECTS"].min()
     shared.update_readme(
         args,
         SECTION_FILE,
         SECTION_TITLE,
         "Overview",
         None,
         None,
+        "The Smithsonian data returns the overall "
+        " statistics of CC0 legal tool records."
+        " It serves as the main legal tool used by Smithsonian."
+        "\n"
+        f"The results indicate a total record of {total_objects} objects,"
+        f" with a breakdown of {cc0_records} objects without CC0 Media and"
+        f" {cc0_records_with_media} objects with CC0 Media, taking a"
+        f" percentage of {cc0_media_percentage} in each unit."
+        f" There are {num_units} unique units in the data"
+        " representing museums, libraries, zoos and many other"
+        f" with a minimum of {min_unit} objects.",
     )
 
 
@@ -124,13 +139,15 @@ def plot_totals_by_units(args):
     LOGGER.info(plot_totals_by_units.__doc__.strip())
     file_path = shared.path_join(
         PATHS["data_2-process"],
-        "smithsonian_totals_by_units.csv",
+        "smithsonian_totals_by_records.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Unit"
-    data_label = "Count"
+    data_label = "TOTAL_OBJECTS"
     data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
+    data["TOTAL_OBJECTS"] = data["TOTAL_OBJECTS"].astype(int)
     data.sort_values(data_label, ascending=True, inplace=True)
+    average_unit = data["TOTAL_OBJECTS"].mean()
     data = data.head(10)
     title = "Totals by Units"
     plt = plot.combined_plot(
@@ -157,7 +174,11 @@ def plot_totals_by_units(args):
         SECTION_TITLE,
         title,
         image_path,
-        "Coming soon",
+        "Plots showing totals by units.",
+        "This shows the distribution of top 10"
+        " units/ sub providers across smithsonian"
+        f" with an average of {average_unit} objects"
+        " across the sub providers.",
     )
 
 
@@ -173,11 +194,11 @@ def plot_totals_by_records(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Unit"
     stack_labels = [
-        "CC0_RECORDS_PERCENTAGE",
-        "CC0_RECORDS_WITH_CC0_MEDIA_PERCENTAGE",
+        "CC0_WITHOUT_MEDIA_PERCENTAGE",
+        "CC0_WITH_MEDIA_PERCENTAGE",
+        "OTHERS_PERCENTAGE",
     ]
     data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
-    data.sort_values(stack_labels, ascending=False, inplace=True)
     data = data.head(10)
     title = "Totals by records"
     plt = plot.stacked_barh_plot(
@@ -202,7 +223,9 @@ def plot_totals_by_records(args):
         SECTION_TITLE,
         title,
         image_path,
-        "Coming soon",
+        "Plots showing totals by CC0 records.",
+        "This is the breakdown of CC0 records"
+        " without media and CC0 records with media.",
     )
 
 
diff --git a/scripts/plot.py b/scripts/plot.py
@@ -26,7 +26,7 @@ def annotate_ylabels(ax, data, data_label, colors):
 
         # annotate totals
         ax.annotate(
-            f"    {row[data_label]:>15,d}",
+            f"    {int(row[data_label]):>15,d}",
             (indent, i - 0.1),
             xycoords=("axes points", "data"),
             color=colors[c],
@@ -81,7 +81,7 @@ def combined_plot(
     # pad tick labels to make room for annotation
     tick_labels = []
     for index, row in data.iterrows():
-        count = f"{row[data_label]:,d}"
+        count = f"{int(row[data_label]):,d}"
         tick_labels.append(f"{index}\n{' ' * len(count)}")
     if bar_xscale == "log":
         log = True
@@ -185,6 +185,7 @@ def stacked_barh_plot(
         ]
 
     ax.set_xlabel("Number of works")
+    # ax.set_xlim(0, 100)
     ax.xaxis.set_major_formatter(ticker.FuncFormatter(number_formatter))
 
     if ylabel: