26 commits
84c0ba6  [WIP] Image generation benchmark (BenjaminBossan, Mar 3, 2026)
250c3be  Some clean up (BenjaminBossan, Mar 4, 2026)
674f544  Some adjustments (BenjaminBossan, Mar 5, 2026)
45f1963  More fixes (BenjaminBossan, Mar 5, 2026)
40e768b  Go back to 512x512 with train batch size 2 (BenjaminBossan, Mar 5, 2026)
ebe0d59  Correctly deal with seeds (BenjaminBossan, Mar 5, 2026)
9ce4828  Add TODO comment, remove obsolete debug line (BenjaminBossan, Mar 5, 2026)
84c5478  Take average of 10 generations for test similarity (BenjaminBossan, Mar 6, 2026)
66ed7a9  Add drift metric (BenjaminBossan, Mar 9, 2026)
3b81cda  Switch to bigger dataset (BenjaminBossan, Mar 9, 2026)
9ab2ef3  Increase default max steps, reduce lr (BenjaminBossan, Mar 10, 2026)
805c757  Add more experiments, minor fixes, update docs (BenjaminBossan, Mar 12, 2026)
10794ab  Simplify generate_sample_images, document more (BenjaminBossan, Mar 13, 2026)
81f80e0  Reviewer comments: (BenjaminBossan, Mar 16, 2026)
bd9f4d4  Reviewer feedback: Add min Diffusers version (BenjaminBossan, Mar 16, 2026)
b964b1d  Merge branch 'main' into feat-add-image-gen-benchmark (BenjaminBossan, Mar 23, 2026)
4bd05f2  Add more progress bars for slow steps (BenjaminBossan, Mar 23, 2026)
3bd307a  Add more experiments (BenjaminBossan, Mar 23, 2026)
566385e  Update app.py to include image benchmark (BenjaminBossan, Mar 23, 2026)
9f28c81  Update README (BenjaminBossan, Mar 23, 2026)
07ef6f9  Merge branch 'main' into feat-add-image-gen-benchmark (BenjaminBossan, Apr 22, 2026)
46f2355  Rearrange config for better structure (BenjaminBossan, Apr 22, 2026)
9c6ca45  add gradient checkpoing option (BenjaminBossan, Apr 22, 2026)
f4250bf  Upload checkpoints and sample images to HF bucket (BenjaminBossan, Apr 27, 2026)
b757b62  Apply copilot reviewer feedback (BenjaminBossan, Apr 27, 2026)
a3517b8  Report max memory excluding eval memory (BenjaminBossan, Apr 27, 2026)
2 changes: 1 addition & 1 deletion method_comparison/MetaMathQA/run.py
@@ -101,7 +101,7 @@ def evaluate(model, tokenizer, ds, batch_size, generate_kwargs, use_tqdm: bool =
return predictions, responses


@torch.inference_mode # type: ignore
@torch.inference_mode()
def calculate_mean_per_token_loss(model, tokenizer, rows: list[str], batch_size: int, max_length: int) -> float:
"""Calculate the mean loss per token on the given dataset.

144 changes: 102 additions & 42 deletions method_comparison/app.py
@@ -18,25 +18,46 @@
import tempfile

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from processing import load_df
from sanitizer import parse_and_filter


metric_preferences = {
_COMMON_METRIC_PREFERENCES = {
"accelerator_memory_reserved_avg": "lower",
"accelerator_memory_max": "lower",
"accelerator_memory_reserved_99th": "lower",
"total_time": "lower",
"train_time": "lower",
"file_size": "lower",
"test_accuracy": "higher",
"train_loss": "lower",
"num_trainable_params": "lower",
"forgetting*": "lower",
}

_TASK_METRIC_PREFERENCES = {
"MetaMathQA": {
"test_accuracy": "higher",
"forgetting*": "lower",
},
"image-gen": {
"test_dino_similarity": "higher",
"drift*": "lower",
},
}

_TASK_PARETO_DEFAULTS = {
"MetaMathQA": ("accelerator_memory_max", "test_accuracy"),
"image-gen": ("accelerator_memory_max", "test_dino_similarity"),
}


def get_metric_preferences(task_name):
prefs = dict(_COMMON_METRIC_PREFERENCES)
prefs.update(_TASK_METRIC_PREFERENCES.get(task_name, {}))
return prefs


def get_model_ids(task_name, df):
filtered = df[df["task_name"] == task_name]
@@ -49,7 +70,7 @@ def filter_data(task_name, model_id, df):


# Compute the Pareto frontier for two selected metrics.
def compute_pareto_frontier(df, metric_x, metric_y):
def compute_pareto_frontier(df, metric_x, metric_y, metric_preferences):
if df.empty:
return df

@@ -87,12 +108,12 @@ def dominates(a, b, metric_x, metric_y):
return pareto_df


def generate_pareto_plot(df, metric_x, metric_y):
def generate_pareto_plot(df, metric_x, metric_y, metric_preferences):
if df.empty:
return {}

# Compute Pareto frontier and non-frontier points.
pareto_df = compute_pareto_frontier(df, metric_x, metric_y)
pareto_df = compute_pareto_frontier(df, metric_x, metric_y, metric_preferences)
non_pareto_df = df.drop(pareto_df.index)

# Create an empty figure.
@@ -188,6 +209,11 @@ def format_df(df):


def build_app(df):
task_names = sorted(df["task_name"].unique())
initial_task = "MetaMathQA" if "MetaMathQA" in task_names else task_names[0]
initial_prefs = get_metric_preferences(initial_task)
initial_x, initial_y = _TASK_PARETO_DEFAULTS.get(initial_task, (list(initial_prefs)[0], list(initial_prefs)[1]))

with gr.Blocks() as demo:
gr.Markdown("# PEFT method comparison")
gr.Markdown(
@@ -201,22 +227,21 @@ def build_app(df):
with gr.Row():
task_dropdown = gr.Dropdown(
label="Select Task",
choices=sorted(df["task_name"].unique()),
value=sorted(df["task_name"].unique())[0],
)
model_dropdown = gr.Dropdown(
label="Select Model ID", choices=get_model_ids(sorted(df["task_name"].unique())[0], df)
choices=task_names,
value=initial_task,
)
model_dropdown = gr.Dropdown(label="Select Model ID", choices=get_model_ids(initial_task, df))

# Make dataframe columns all equal in width so that they are good enough for numbers but don't
# get hugely extended by columns like `train_config`.
column_widths = ["150px" for _ in df.columns]
column2index = dict(zip(df.columns, range(len(df.columns))))
column_widths[column2index['experiment_name']] = '300px'
initial_filtered = filter_data(initial_task, get_model_ids(initial_task, df)[0], df)
column_widths = ["150px" for _ in initial_filtered.columns]
column2index = dict(zip(initial_filtered.columns, range(len(initial_filtered.columns))))
column_widths[column2index["experiment_name"]] = "300px"

data_table = gr.DataFrame(
label="Results",
value=format_df(df),
value=format_df(initial_filtered),
interactive=False,
max_chars=100,
wrap=False,
@@ -232,9 +257,8 @@
apply_filter_button = gr.Button("Apply Filter")
reset_filter_button = gr.Button("Reset Filter")

gr.Markdown(
"*forgetting: This is the reduction in CE loss on a sample of Wikipedia data and reflects how much the "
"model 'forgot' during training. The lower the number, the better."
metric_explanation = gr.Markdown(
_get_metric_explanation(initial_task),
)

gr.Markdown("## Pareto plot")
@@ -245,23 +269,15 @@
)

with gr.Row():
x_default = (
"accelerator_memory_max"
if "accelerator_memory_max" in metric_preferences
else list(metric_preferences.keys())[0]
)
y_default = (
"test_accuracy" if "test_accuracy" in metric_preferences else list(metric_preferences.keys())[1]
)
metric_x_dropdown = gr.Dropdown(
label="1st metric for Pareto plot",
choices=list(metric_preferences.keys()),
value=x_default,
choices=list(initial_prefs.keys()),
value=initial_x,
)
metric_y_dropdown = gr.Dropdown(
label="2nd metric for Pareto plot",
choices=list(metric_preferences.keys()),
value=y_default,
choices=list(initial_prefs.keys()),
value=initial_y,
)

pareto_plot = gr.Plot(label="Pareto Frontier Plot")
Expand All @@ -280,10 +296,24 @@ def update_on_task(task_name, current_filter):
except Exception:
# invalid filter query
pass
return gr.update(choices=new_models, value=new_models[0] if new_models else None), format_df(filtered)

prefs = get_metric_preferences(task_name)
x_default, y_default = _TASK_PARETO_DEFAULTS.get(task_name, (list(prefs)[0], list(prefs)[1]))
metric_choices = list(prefs.keys())
explanation = _get_metric_explanation(task_name)

return (
gr.update(choices=new_models, value=new_models[0] if new_models else None),
format_df(filtered),
gr.update(choices=metric_choices, value=x_default),
gr.update(choices=metric_choices, value=y_default),
explanation,
)

task_dropdown.change(
fn=update_on_task, inputs=[task_dropdown, filter_state], outputs=[model_dropdown, data_table]
fn=update_on_task,
inputs=[task_dropdown, filter_state],
outputs=[model_dropdown, data_table, metric_x_dropdown, metric_y_dropdown, metric_explanation],
)

def update_on_model(task_name, model_id, current_filter):
@@ -301,16 +331,17 @@ def update_on_model(task_name, model_id, current_filter):
)

def update_pareto_plot_and_summary(task_name, model_id, metric_x, metric_y, current_filter):
prefs = get_metric_preferences(task_name)
filtered = filter_data(task_name, model_id, df)
if current_filter.strip():
try:
mask = parse_and_filter(filtered, current_filter)
filtered = filtered[mask]
except Exception as e:
return generate_pareto_plot(filtered, metric_x, metric_y), f"Filter error: {e}"
return generate_pareto_plot(filtered, metric_x, metric_y, prefs), f"Filter error: {e}"

pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y)
fig = generate_pareto_plot(filtered, metric_x, metric_y)
pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y, prefs)
fig = generate_pareto_plot(filtered, metric_x, metric_y, prefs)
summary = compute_pareto_summary(filtered, pareto_df, metric_x, metric_y)
return fig, summary

@@ -322,6 +353,7 @@ def update_pareto_plot_and_summary(task_name, model_id, metric_x, metric_y, curr
)

def apply_filter(filter_query, task_name, model_id, metric_x, metric_y):
prefs = get_metric_preferences(task_name)
filtered = filter_data(task_name, model_id, df)
if filter_query.strip():
try:
@@ -332,12 +364,12 @@ def apply_filter(filter_query, task_name, model_id, metric_x, metric_y):
return (
filter_query,
filtered,
Copilot AI (Apr 27, 2026): In the filter-error path, the second return value is a raw pandas DataFrame, but the DataFrame component is otherwise fed a styled dataframe via format_df(...). Returning the raw DF here can break rendering or make the table formatting inconsistent; return the same formatted value as the non-error path.

Suggested change:
-                filtered,
+                format_df(filtered),
generate_pareto_plot(filtered, metric_x, metric_y),
generate_pareto_plot(filtered, metric_x, metric_y, prefs),
f"Filter error: {e}",
)

pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y)
fig = generate_pareto_plot(filtered, metric_x, metric_y)
pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y, prefs)
fig = generate_pareto_plot(filtered, metric_x, metric_y, prefs)
summary = compute_pareto_summary(filtered, pareto_df, metric_x, metric_y)
return filter_query, format_df(filtered), fig, summary

@@ -348,9 +380,10 @@ def apply_filter(filter_query, task_name, model_id, metric_x, metric_y):
)

def reset_filter(task_name, model_id, metric_x, metric_y):
prefs = get_metric_preferences(task_name)
filtered = filter_data(task_name, model_id, df)
pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y)
fig = generate_pareto_plot(filtered, metric_x, metric_y)
pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y, prefs)
fig = generate_pareto_plot(filtered, metric_x, metric_y, prefs)
summary = compute_pareto_summary(filtered, pareto_df, metric_x, metric_y)
# Return empty strings to clear the filter state and textbox.
return "", "", format_df(filtered), fig, summary
@@ -379,7 +412,34 @@ def reset_filter(task_name, model_id, metric_x, metric_y):
return demo


path = os.path.join(os.path.dirname(__file__), "MetaMathQA", "results")
df = load_df(path, task_name="MetaMathQA")
_METRIC_EXPLANATIONS = {
"MetaMathQA": (
"*forgetting: This is the reduction in CE loss on a sample of Wikipedia data and reflects how much the "
"model 'forgot' during training. The lower the number, the better."
),
"image-gen": (
"*drift: This measures how much the generated images drift from the base model's outputs on unrelated "
"prompts, reflecting how much the model 'forgot' during training. The lower the number, the better."
),
}


def _get_metric_explanation(task_name):
return _METRIC_EXPLANATIONS.get(task_name, "")


base_dir = os.path.dirname(__file__)
_TASK_CONFIGS = {
"MetaMathQA": os.path.join(base_dir, "MetaMathQA", "results"),
"image-gen": os.path.join(base_dir, "image-gen", "results"),
}

dfs = []
for task_name, path in _TASK_CONFIGS.items():
if os.path.isdir(path):
task_df = load_df(path, task_name=task_name)
if not task_df.empty:
dfs.append(task_df)
df = pd.concat(dfs, ignore_index=True)
demo = build_app(df)
demo.launch(theme=gr.themes.Soft())
96 changes: 96 additions & 0 deletions method_comparison/image-gen/Makefile
Member Author: Same as for MetaMath benchmark.
@@ -0,0 +1,96 @@
# Makefile for listing and running the image generation experiments.

# --- Configuration ---
PYTHON := python
Member: Honestly, a Makefile for launching a couple of experiments seems more complicated than it needs to be. But I am sure I am missing out on something. What's the advantage of using Makefiles for this?

Member Author: First, it's the same as MetaMath, so we keep it for consistency. Second, it checks which experiments have already run and only runs the missing ones. I'd say that's pretty much the main use case for make.

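A minimal usage sketch of that incremental behavior (the experiment path below is hypothetical; the actual targets depend on the configs discovered under experiments/):

$ make                                                   # first run: executes every discovered experiment
$ make                                                   # nothing changed since the last run
make: Nothing to be done for 'all'.
$ touch experiments/lora/sd15-rank8/adapter_config.json
$ make                                                   # re-runs only the experiment whose config was touched
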
RUN_SCRIPT := run.py
EXPERIMENTS_DIR := experiments
RESULTS_DIR := results

OPTIONAL_FLAGS =

ifdef UPLOAD_BUCKET
OPTIONAL_FLAGS += --bucket_name "${UPLOAD_BUCKET}"
endif

# --- Automatic Experiment and Result Discovery ---

# 1. Find all experiment directories by looking for adapter_config.json files.
# This gives us a list like: experiments/lora/llama-3.2-3B-rank32 ...
EXPERIMENT_PATHS := $(shell find $(EXPERIMENTS_DIR) \
-name "adapter_config.json" -or \
-name "training_params.json" | xargs dirname | sort -u)

# 2. Define a function to replace all occurrences of a character in a string.
# This is needed to replicate the result naming logic from run.py (e.g., "lora/foo" -> "lora-foo").
# Usage: $(call replace-all, string, char_to_replace, replacement_char)
replace-all = $(if $(findstring $(2),$(1)),$(call replace-all,$(subst $(2),$(3),$(1)),$(2),$(3)),$(1))

# 3. Define a function to convert an experiment path to its flat result file path.
# e.g., "experiments/lora/llama-3.2-3B-rank32" -> "results/lora-llama-3.2-3B-rank32.json"
exp_to_res = $(RESULTS_DIR)/$(call replace-all,$(patsubst $(EXPERIMENTS_DIR)/%,%,$(1)),/,--).json

# 4. Generate the list of all target result files we want to build.
RESULT_FILES := $(foreach exp,$(EXPERIMENT_PATHS),$(call exp_to_res,$(exp)))


# --- Main Rules ---

# The default 'all' target depends on all possible result files.
# Running `make` or `make all` will check and run any outdated or missing experiments.
all: $(RESULT_FILES)


# --- Dynamic Rule Generation ---

# This is the core logic. We dynamically generate a specific Makefile rule for each experiment found.
# This avoids a complex pattern rule and makes the logic clearer.
define EXPERIMENT_template
# Input $1: The full experiment path (e.g., experiments/lora/llama-3.2-3B-rank32)

# Define the rule:
# The target is the result file (e.g., results/lora-llama-3.2-3B-rank32.json).
# The dependencies are its config files; code changes need to be audited manually since they can
# vary in degree of importance. Note that we explicitly ignore failures of the script so that
# the other experiments still have a chance to run.
$(call exp_to_res,$(1)): $(wildcard $(1)/adapter_config.json) $(wildcard $(1)/training_params.json)
@echo "---"
@echo "Running experiment: $(1)"
-$(PYTHON) $(RUN_SCRIPT) $(OPTIONAL_FLAGS) -v $(1)
@echo "Finished: $$@"
@echo "---"

endef

# This command iterates through every found experiment path and evaluates the template,
# effectively stamping out a unique, explicit rule for each one.
$(foreach exp_path,$(EXPERIMENT_PATHS),$(eval $(call EXPERIMENT_template,$(exp_path))))


# --- Utility Rules ---

.PHONY: all clean list dump_rules

# The 'clean' rule removes all generated results.
clean:
@echo "Cleaning results directory..."
@([ -n "$(wildcard $(RESULTS_DIR)/*.json)" ] && rm $(RESULTS_DIR)/*.json) || exit 0

# The 'list' rule is for debugging. It shows the discovered experiments
# and the result files the Makefile expects to create for them.
list:
@echo "Discovered experiment configurations:"
@$(foreach exp,$(EXPERIMENT_PATHS),echo " - $(exp)/adapter_config.json";)
@echo "\nTarget result files:"
@$(foreach res,$(RESULT_FILES),echo " - $(res)";)

# The 'dump_rules' rule is for debugging. It dumps all dynamically defined rules.
define newline


endef
define DUMPED_RULES
$(foreach exp_path,$(EXPERIMENT_PATHS),$(call EXPERIMENT_template,$(exp_path)))
endef

dump_rules:
@echo -e "$(subst $(newline),\n,${DUMPED_RULES})"