Rename Smithsonian output columns and local variables; split the units plot into top-10 and lowest-10 plots

oree-xx · oree-xx · commit 646cb2452eb1 · 2026-02-06T12:36:14.000+01:00
diff --git a/scripts/2-process/smithsonian_process.py b/scripts/2-process/smithsonian_process.py
@@ -86,7 +86,7 @@ def process_totals_by_units(args, count_data):
 
         data[unit] = total_objects
 
-    data = pd.DataFrame(data.items(), columns=["Unit", "Count"])
+    data = pd.DataFrame(data.items(), columns=["Unit", "Total_objects"])
     data.sort_values("Unit", ascending=True, inplace=True)
     data.reset_index(drop=True, inplace=True)
     file_path = shared.path_join(
@@ -104,43 +104,43 @@ def process_totals_by_records(args, count_data):
 
     for row in count_data.itertuples(index=False):
         unit = str(row.UNIT)
-        cc0_records = int(row.CC0_RECORDS)
-        cc0_records_with_cc0_media = int(row.CC0_RECORDS_WITH_CC0_MEDIA)
+        CC0_records = int(row.CC0_RECORDS)
+        CC0_records_with_CC0_media = int(row.CC0_RECORDS_WITH_CC0_MEDIA)
         total_objects = int(row.TOTAL_OBJECTS)
 
-        if cc0_records == 0 and cc0_records_with_cc0_media == 0:
+        if CC0_records == 0 and CC0_records_with_CC0_media == 0:
             continue
 
         if unit not in data:
             data[unit] = {
-                "CC0_RECORDS": 0,
-                "CC0_RECORDS_WITH_CC0_MEDIA": 0,
-                "TOTAL_OBJECTS": 0,
+                "CC0_records": 0,
+                "CC0_records_with_CC0_media": 0,
+                "Total_objects": 0,
             }
 
-        data[unit]["CC0_RECORDS"] += cc0_records
-        data[unit]["CC0_RECORDS_WITH_CC0_MEDIA"] += cc0_records_with_cc0_media
-        data[unit]["TOTAL_OBJECTS"] += total_objects
+        data[unit]["CC0_records"] += CC0_records
+        data[unit]["CC0_records_with_CC0_media"] += CC0_records_with_CC0_media
+        data[unit]["Total_objects"] += total_objects
 
     data = (
         pd.DataFrame.from_dict(data, orient="index")
         .reset_index()
         .rename(columns={"index": "Unit"})
     )
-    data["CC0_WITHOUT_MEDIA_PERCENTAGE"] = (
+    data["CC0_without_media_percentage"] = (
         (
-            (data["CC0_RECORDS"] - data["CC0_RECORDS_WITH_CC0_MEDIA"])
-            / data["TOTAL_OBJECTS"]
+            (data["CC0_records"] - data["CC0_records_with_CC0_media"])
+            / data["Total_objects"]
         )
         * 100
     ).round(2)
 
-    data["CC0_WITH_MEDIA_PERCENTAGE"] = (
-        (data["CC0_RECORDS_WITH_CC0_MEDIA"] / data["TOTAL_OBJECTS"]) * 100
+    data["CC0_with_media_percentage"] = (
+        (data["CC0_records_with_CC0_media"] / data["Total_objects"]) * 100
     ).round(2)
 
-    data["OTHERS_PERCENTAGE"] = (
-        ((data["TOTAL_OBJECTS"] - data["CC0_RECORDS"]) / data["TOTAL_OBJECTS"])
+    data["Others_percentage"] = (
+        ((data["Total_objects"] - data["CC0_records"]) / data["Total_objects"])
         * 100
     ).round(2)
 
diff --git a/scripts/3-report/smithsonian_report.py b/scripts/3-report/smithsonian_report.py
@@ -105,12 +105,12 @@ def smithsonian_intro(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     data = shared.open_data_file(LOGGER, file_path)
-    total_objects = data["TOTAL_OBJECTS"].sum()
-    cc0_records = data["CC0_RECORDS"].sum()
-    cc0_records_with_media = data["CC0_RECORDS_WITH_CC0_MEDIA"].sum()
-    cc0_media_percentage = f"{data['CC0_WITH_MEDIA_PERCENTAGE'].mean():.2f}%"
+    total_objects = data["Total_objects"].sum()
+    CC0_records = data["CC0_records"].sum()
+    CC0_records_with_media = data["CC0_records_with_CC0_media"].sum()
+    CC0_media_percentage = f"{data['CC0_with_media_percentage'].mean():.2f}%"
     num_units = len(data)
-    min_unit = data["TOTAL_OBJECTS"].min()
+    min_unit = data["Total_objects"].min()
     shared.update_readme(
         args,
         SECTION_FILE,
@@ -123,33 +123,33 @@ def smithsonian_intro(args):
         " It serves as the main legal tool used by Smithsonian."
         "\n"
         f"The results indicate a total record of {total_objects} objects,"
-        f" with a breakdown of {cc0_records} objects without CC0 Media and"
-        f" {cc0_records_with_media} objects with CC0 Media, taking a"
-        f" percentage of {cc0_media_percentage} in each unit."
+        f" with a breakdown of {CC0_records} objects without CC0 Media and"
+        f" {CC0_records_with_media} objects with CC0 Media, taking a"
+        f" percentage of {CC0_media_percentage} in each unit."
         f" There are {num_units} unique units in the data"
         " representing museums, libraries, zoos and many other"
         f" with a minimum of {min_unit} objects.",
     )
 
 
-def plot_totals_by_units(args):
+def plot_totals_by_top10_units(args):
     """
-    Create plots showing totals by units
+    Create plots showing totals by top 10 units
     """
-    LOGGER.info(plot_totals_by_units.__doc__.strip())
+    LOGGER.info(plot_totals_by_top10_units.__doc__.strip())
     file_path = shared.path_join(
         PATHS["data_2-process"],
-        "smithsonian_totals_by_records.csv",
+        "smithsonian_totals_by_units.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Unit"
-    data_label = "TOTAL_OBJECTS"
+    data_label = "Total_objects"
     data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
-    data["TOTAL_OBJECTS"] = data["TOTAL_OBJECTS"].astype(int)
+    data["Total_objects"] = data["Total_objects"].astype(int)
     data.sort_values(data_label, ascending=True, inplace=True)
-    average_unit = data["TOTAL_OBJECTS"].mean()
     data = data.head(10)
-    title = "Totals by Units"
+    average_unit = data["Total_objects"].mean()
+    title = "Top 10 Units"
     plt = plot.combined_plot(
         args=args,
         data=data,
@@ -178,7 +178,57 @@ def plot_totals_by_units(args):
         "This shows the distribution of top 10"
         " units/ sub providers across smithsonian"
         f" with an average of {average_unit} objects"
-        " across the sub providers.",
+        " across the top 10 sub providers.",
+    )
+
+
+def plot_totals_by_lowest10_units(args):
+    """
+    Create plots showing totals by lowest 10 units
+    """
+    LOGGER.info(plot_totals_by_lowest10_units.__doc__.strip())
+    file_path = shared.path_join(
+        PATHS["data_2-process"],
+        "smithsonian_totals_by_units.csv",
+    )
+    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
+    name_label = "Unit"
+    data_label = "Total_objects"
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
+    data["Total_objects"] = data["Total_objects"].astype(int)
+    data.sort_values(data_label, ascending=True, inplace=True)
+    data = data.tail(10)
+    average_unit = data["Total_objects"].mean()
+    title = "Totals by Units"
+    plt = plot.combined_plot(
+        args=args,
+        data=data,
+        title=title,
+        name_label=name_label,
+        data_label=data_label,
+    )
+
+    image_path = shared.path_join(
+        PATHS["data_phase"], "smithsonian_totals_by_unit.png"
+    )
+    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
+
+    if args.enable_save:
+        # Create the directory if it does not exist
+        os.makedirs(PATHS["data_phase"], exist_ok=True)
+        plt.savefig(image_path)
+
+    shared.update_readme(
+        args,
+        SECTION_FILE,
+        SECTION_TITLE,
+        title,
+        image_path,
+        "Plots showing totals by units.",
+        "This shows the distribution of lowest 10"
+        " units/ sub providers across smithsonian"
+        f" with an average of {average_unit} objects"
+        " across the lowest 10 sub providers.",
     )
 
 
@@ -194,9 +244,9 @@ def plot_totals_by_records(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Unit"
     stack_labels = [
-        "CC0_WITHOUT_MEDIA_PERCENTAGE",
-        "CC0_WITH_MEDIA_PERCENTAGE",
-        "OTHERS_PERCENTAGE",
+        "CC0_without_media_percentage",
+        "CC0_with_media_percentage",
+        "Others_percentage",
     ]
     data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data = data.head(10)
@@ -225,7 +275,8 @@ def plot_totals_by_records(args):
         image_path,
         "Plots showing totals by CC0 records.",
         "This is the breakdown of CC0 records"
-        " without media and CC0 records with media.",
+        " without media, CC0 records with media and records"
+        " that are not associated with CC0.",
     )
 
 
@@ -238,7 +289,8 @@ def main():
     )
     shared.check_completion_file_exists(args, last_entry)
     smithsonian_intro(args)
-    plot_totals_by_units(args)
+    plot_totals_by_top10_units(args)
+    plot_totals_by_lowest10_units(args)
     plot_totals_by_records(args)
 
     # Add and commit changes