26 commits
84c0ba6  [WIP] Image generation benchmark (BenjaminBossan, Mar 3, 2026)
250c3be  Some clean up (BenjaminBossan, Mar 4, 2026)
674f544  Some adjustments (BenjaminBossan, Mar 5, 2026)
45f1963  More fixes (BenjaminBossan, Mar 5, 2026)
40e768b  Go back to 512x512 with train batch size 2 (BenjaminBossan, Mar 5, 2026)
ebe0d59  Correctly deal with seeds (BenjaminBossan, Mar 5, 2026)
9ce4828  Add TODO comment, remove obsolete debug line (BenjaminBossan, Mar 5, 2026)
84c5478  Take average of 10 generations for test similarity (BenjaminBossan, Mar 6, 2026)
66ed7a9  Add drift metric (BenjaminBossan, Mar 9, 2026)
3b81cda  Switch to bigger dataset (BenjaminBossan, Mar 9, 2026)
9ab2ef3  Increase default max steps, reduce lr (BenjaminBossan, Mar 10, 2026)
805c757  Add more experiments, minor fixes, update docs (BenjaminBossan, Mar 12, 2026)
10794ab  Simplify generate_sample_images, document more (BenjaminBossan, Mar 13, 2026)
81f80e0  Reviewer comments: (BenjaminBossan, Mar 16, 2026)
bd9f4d4  Reviewer feedback: Add min Diffusers version (BenjaminBossan, Mar 16, 2026)
b964b1d  Merge branch 'main' into feat-add-image-gen-benchmark (BenjaminBossan, Mar 23, 2026)
4bd05f2  Add more progress bars for slow steps (BenjaminBossan, Mar 23, 2026)
3bd307a  Add more experiments (BenjaminBossan, Mar 23, 2026)
566385e  Update app.py to include image benchmark (BenjaminBossan, Mar 23, 2026)
9f28c81  Update README (BenjaminBossan, Mar 23, 2026)
07ef6f9  Merge branch 'main' into feat-add-image-gen-benchmark (BenjaminBossan, Apr 22, 2026)
46f2355  Rearrange config for better structure (BenjaminBossan, Apr 22, 2026)
9c6ca45  add gradient checkpoing option (BenjaminBossan, Apr 22, 2026)
f4250bf  Upload checkpoints and sample images to HF bucket (BenjaminBossan, Apr 27, 2026)
b757b62  Apply copilot reviewer feedback (BenjaminBossan, Apr 27, 2026)
a3517b8  Report max memory excluding eval memory (BenjaminBossan, Apr 27, 2026)
2 changes: 1 addition & 1 deletion method_comparison/MetaMathQA/run.py
@@ -101,7 +101,7 @@ def evaluate(model, tokenizer, ds, batch_size, generate_kwargs, use_tqdm: bool =
return predictions, responses


@torch.inference_mode # type: ignore
@torch.inference_mode()
def calculate_mean_per_token_loss(model, tokenizer, rows: list[str], batch_size: int, max_length: int) -> float:
"""Calculate the mean loss per token on the given dataset.

144 changes: 102 additions & 42 deletions method_comparison/app.py
@@ -18,25 +18,46 @@
import tempfile

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from processing import load_df
from sanitizer import parse_and_filter


metric_preferences = {
_COMMON_METRIC_PREFERENCES = {
"accelerator_memory_reserved_avg": "lower",
"accelerator_memory_max": "lower",
"accelerator_memory_reserved_99th": "lower",
"total_time": "lower",
"train_time": "lower",
"file_size": "lower",
"test_accuracy": "higher",
"train_loss": "lower",
"num_trainable_params": "lower",
"forgetting*": "lower",
}

_TASK_METRIC_PREFERENCES = {
"MetaMathQA": {
"test_accuracy": "higher",
"forgetting*": "lower",
},
"image-gen": {
"test_dino_similarity": "higher",
"drift*": "lower",
},
}

_TASK_PARETO_DEFAULTS = {
"MetaMathQA": ("accelerator_memory_max", "test_accuracy"),
"image-gen": ("accelerator_memory_max", "test_dino_similarity"),
}


def get_metric_preferences(task_name):
prefs = dict(_COMMON_METRIC_PREFERENCES)
prefs.update(_TASK_METRIC_PREFERENCES.get(task_name, {}))
return prefs


def get_model_ids(task_name, df):
filtered = df[df["task_name"] == task_name]
@@ -49,7 +70,7 @@ def filter_data(task_name, model_id, df):


# Compute the Pareto frontier for two selected metrics.
def compute_pareto_frontier(df, metric_x, metric_y):
def compute_pareto_frontier(df, metric_x, metric_y, metric_preferences):
if df.empty:
return df

@@ -87,12 +108,12 @@ def dominates(a, b, metric_x, metric_y):
return pareto_df


def generate_pareto_plot(df, metric_x, metric_y):
def generate_pareto_plot(df, metric_x, metric_y, metric_preferences):
if df.empty:
return {}

# Compute Pareto frontier and non-frontier points.
pareto_df = compute_pareto_frontier(df, metric_x, metric_y)
pareto_df = compute_pareto_frontier(df, metric_x, metric_y, metric_preferences)
non_pareto_df = df.drop(pareto_df.index)

# Create an empty figure.
@@ -188,6 +209,11 @@ def format_df(df):


def build_app(df):
task_names = sorted(df["task_name"].unique())
initial_task = "MetaMathQA" if "MetaMathQA" in task_names else task_names[0]
initial_prefs = get_metric_preferences(initial_task)
initial_x, initial_y = _TASK_PARETO_DEFAULTS.get(initial_task, (list(initial_prefs)[0], list(initial_prefs)[1]))

with gr.Blocks() as demo:
gr.Markdown("# PEFT method comparison")
gr.Markdown(
@@ -201,22 +227,21 @@ def build_app(df):
with gr.Row():
task_dropdown = gr.Dropdown(
label="Select Task",
choices=sorted(df["task_name"].unique()),
value=sorted(df["task_name"].unique())[0],
)
model_dropdown = gr.Dropdown(
label="Select Model ID", choices=get_model_ids(sorted(df["task_name"].unique())[0], df)
choices=task_names,
value=initial_task,
)
model_dropdown = gr.Dropdown(label="Select Model ID", choices=get_model_ids(initial_task, df))

# Make dataframe columns all equal in width so that they are good enough for numbers but don't
# get hugely extended by columns like `train_config`.
column_widths = ["150px" for _ in df.columns]
column2index = dict(zip(df.columns, range(len(df.columns))))
column_widths[column2index['experiment_name']] = '300px'
initial_filtered = filter_data(initial_task, get_model_ids(initial_task, df)[0], df)
column_widths = ["150px" for _ in initial_filtered.columns]
column2index = dict(zip(initial_filtered.columns, range(len(initial_filtered.columns))))
column_widths[column2index["experiment_name"]] = "300px"

data_table = gr.DataFrame(
label="Results",
value=format_df(df),
value=format_df(initial_filtered),
interactive=False,
max_chars=100,
wrap=False,
@@ -232,9 +257,8 @@
apply_filter_button = gr.Button("Apply Filter")
reset_filter_button = gr.Button("Reset Filter")

gr.Markdown(
"*forgetting: This is the reduction in CE loss on a sample of Wikipedia data and reflects how much the "
"model 'forgot' during training. The lower the number, the better."
metric_explanation = gr.Markdown(
_get_metric_explanation(initial_task),
)

gr.Markdown("## Pareto plot")
@@ -245,23 +269,15 @@
)

with gr.Row():
x_default = (
"accelerator_memory_max"
if "accelerator_memory_max" in metric_preferences
else list(metric_preferences.keys())[0]
)
y_default = (
"test_accuracy" if "test_accuracy" in metric_preferences else list(metric_preferences.keys())[1]
)
metric_x_dropdown = gr.Dropdown(
label="1st metric for Pareto plot",
choices=list(metric_preferences.keys()),
value=x_default,
choices=list(initial_prefs.keys()),
value=initial_x,
)
metric_y_dropdown = gr.Dropdown(
label="2nd metric for Pareto plot",
choices=list(metric_preferences.keys()),
value=y_default,
choices=list(initial_prefs.keys()),
value=initial_y,
)

pareto_plot = gr.Plot(label="Pareto Frontier Plot")
Expand All @@ -280,10 +296,24 @@ def update_on_task(task_name, current_filter):
except Exception:
# invalid filter query
pass
return gr.update(choices=new_models, value=new_models[0] if new_models else None), format_df(filtered)

prefs = get_metric_preferences(task_name)
x_default, y_default = _TASK_PARETO_DEFAULTS.get(task_name, (list(prefs)[0], list(prefs)[1]))
metric_choices = list(prefs.keys())
explanation = _get_metric_explanation(task_name)

return (
gr.update(choices=new_models, value=new_models[0] if new_models else None),
format_df(filtered),
gr.update(choices=metric_choices, value=x_default),
gr.update(choices=metric_choices, value=y_default),
explanation,
)

task_dropdown.change(
fn=update_on_task, inputs=[task_dropdown, filter_state], outputs=[model_dropdown, data_table]
fn=update_on_task,
inputs=[task_dropdown, filter_state],
outputs=[model_dropdown, data_table, metric_x_dropdown, metric_y_dropdown, metric_explanation],
)

def update_on_model(task_name, model_id, current_filter):
@@ -301,16 +331,17 @@ def update_on_model(task_name, model_id, current_filter):
)

def update_pareto_plot_and_summary(task_name, model_id, metric_x, metric_y, current_filter):
prefs = get_metric_preferences(task_name)
filtered = filter_data(task_name, model_id, df)
if current_filter.strip():
try:
mask = parse_and_filter(filtered, current_filter)
filtered = filtered[mask]
except Exception as e:
return generate_pareto_plot(filtered, metric_x, metric_y), f"Filter error: {e}"
return generate_pareto_plot(filtered, metric_x, metric_y, prefs), f"Filter error: {e}"

pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y)
fig = generate_pareto_plot(filtered, metric_x, metric_y)
pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y, prefs)
fig = generate_pareto_plot(filtered, metric_x, metric_y, prefs)
summary = compute_pareto_summary(filtered, pareto_df, metric_x, metric_y)
return fig, summary

@@ -322,6 +353,7 @@ def update_pareto_plot_and_summary(task_name, model_id, metric_x, metric_y, curr
)

def apply_filter(filter_query, task_name, model_id, metric_x, metric_y):
prefs = get_metric_preferences(task_name)
filtered = filter_data(task_name, model_id, df)
if filter_query.strip():
try:
@@ -332,12 +364,12 @@ def apply_filter(filter_query, task_name, model_id, metric_x, metric_y):
return (
filter_query,
filtered,
Copilot AI (Apr 27, 2026): In the filter-error path, the second return value is a raw pandas DataFrame, but the DataFrame component is otherwise fed a styled dataframe via format_df(...). Returning the raw DF here can break rendering or make the table formatting inconsistent; return the same formatted value as the non-error path.

Suggested change:
-                filtered,
+                format_df(filtered),
generate_pareto_plot(filtered, metric_x, metric_y),
generate_pareto_plot(filtered, metric_x, metric_y, prefs),
f"Filter error: {e}",
)

pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y)
fig = generate_pareto_plot(filtered, metric_x, metric_y)
pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y, prefs)
fig = generate_pareto_plot(filtered, metric_x, metric_y, prefs)
summary = compute_pareto_summary(filtered, pareto_df, metric_x, metric_y)
return filter_query, format_df(filtered), fig, summary

@@ -348,9 +380,10 @@ def apply_filter(filter_query, task_name, model_id, metric_x, metric_y):
)

def reset_filter(task_name, model_id, metric_x, metric_y):
prefs = get_metric_preferences(task_name)
filtered = filter_data(task_name, model_id, df)
pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y)
fig = generate_pareto_plot(filtered, metric_x, metric_y)
pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y, prefs)
fig = generate_pareto_plot(filtered, metric_x, metric_y, prefs)
summary = compute_pareto_summary(filtered, pareto_df, metric_x, metric_y)
# Return empty strings to clear the filter state and textbox.
return "", "", format_df(filtered), fig, summary
@@ -379,7 +412,34 @@ def reset_filter(task_name, model_id, metric_x, metric_y):
return demo


path = os.path.join(os.path.dirname(__file__), "MetaMathQA", "results")
df = load_df(path, task_name="MetaMathQA")
_METRIC_EXPLANATIONS = {
"MetaMathQA": (
"*forgetting: This is the reduction in CE loss on a sample of Wikipedia data and reflects how much the "
"model 'forgot' during training. The lower the number, the better."
),
"image-gen": (
"*drift: This measures how much the generated images drift from the base model's outputs on unrelated "
"prompts, reflecting how much the model 'forgot' during training. The lower the number, the better."
),
}


def _get_metric_explanation(task_name):
return _METRIC_EXPLANATIONS.get(task_name, "")


base_dir = os.path.dirname(__file__)
_TASK_CONFIGS = {
"MetaMathQA": os.path.join(base_dir, "MetaMathQA", "results"),
"image-gen": os.path.join(base_dir, "image-gen", "results"),
}

dfs = []
for task_name, path in _TASK_CONFIGS.items():
if os.path.isdir(path):
task_df = load_df(path, task_name=task_name)
if not task_df.empty:
dfs.append(task_df)
df = pd.concat(dfs, ignore_index=True)
demo = build_app(df)
demo.launch(theme=gr.themes.Soft())
96 changes: 96 additions & 0 deletions method_comparison/image-gen/Makefile
Member Author: Same as for MetaMath benchmark.
@@ -0,0 +1,96 @@
# Makefile for listing and running the image generation experiments.

# --- Configuration ---
PYTHON := python
Member: Honestly, a Makefile for launching a couple of experiments seems more complicated than it needs to be. But I am sure I am missing out on something. What's the advantage of using Makefiles for this?

Member Author: First, it's the same as MetaMath, so we keep it for consistency. Second, it checks which experiments have already run and only runs the missing ones. I'd say that's pretty much the main use case for make.

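A minimal usage sketch of that incremental behavior (the experiment path below is hypothetical; the actual targets depend on the configs discovered under experiments/):

$ make                                                   # first run: executes every discovered experiment
$ make                                                   # nothing changed since the last run
make: Nothing to be done for 'all'.
$ touch experiments/lora/sd15-rank8/adapter_config.json
$ make                                                   # re-runs only the experiment whose config was touched
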
RUN_SCRIPT := run.py
EXPERIMENTS_DIR := experiments
RESULTS_DIR := results

OPTIONAL_FLAGS =

ifdef UPLOAD_BUCKET
OPTIONAL_FLAGS += --bucket_name "${UPLOAD_BUCKET}"
endif

# --- Automatic Experiment and Result Discovery ---

# 1. Find all experiment directories by looking for adapter_config.json files.
# This gives us a list like: experiments/lora/llama-3.2-3B-rank32 ...
EXPERIMENT_PATHS := $(shell find $(EXPERIMENTS_DIR) \
-name "adapter_config.json" -or \
-name "training_params.json" | xargs dirname | sort -u)

# 2. Define a function to replace all occurrences of a character in a string.
# This is needed to replicate the result naming logic from run.py (e.g., "lora/foo" -> "lora-foo").
# Usage: $(call replace-all, string, char_to_replace, replacement_char)
replace-all = $(if $(findstring $(2),$(1)),$(call replace-all,$(subst $(2),$(3),$(1)),$(2),$(3)),$(1))

# 3. Define a function to convert an experiment path to its flat result file path.
# e.g., "experiments/lora/llama-3.2-3B-rank32" -> "results/lora-llama-3.2-3B-rank32.json"
exp_to_res = $(RESULTS_DIR)/$(call replace-all,$(patsubst $(EXPERIMENTS_DIR)/%,%,$(1)),/,--).json

# 4. Generate the list of all target result files we want to build.
RESULT_FILES := $(foreach exp,$(EXPERIMENT_PATHS),$(call exp_to_res,$(exp)))


# --- Main Rules ---

# The default 'all' target depends on all possible result files.
# Running `make` or `make all` will check and run any outdated or missing experiments.
all: $(RESULT_FILES)


# --- Dynamic Rule Generation ---

# This is the core logic. We dynamically generate a specific Makefile rule for each experiment found.
# This avoids a complex pattern rule and makes the logic clearer.
define EXPERIMENT_template
# Input $1: The full experiment path (e.g., experiments/lora/llama-3.2-3B-rank32)

# Define the rule:
# The target is the result file (e.g., results/lora-llama-3.2-3B-rank32.json).
# The dependencies are its config files; code changes need to be audited manually since they can
# vary in degree of importance. Note that we explicitly ignore failures of the script so that
# the other experiments still have a chance to run.
$(call exp_to_res,$(1)): $(wildcard $(1)/adapter_config.json) $(wildcard $(1)/training_params.json)
@echo "---"
@echo "Running experiment: $(1)"
-$(PYTHON) $(RUN_SCRIPT) $(OPTIONAL_FLAGS) -v $(1)
@echo "Finished: $$@"
@echo "---"

endef

# This command iterates through every found experiment path and evaluates the template,
# effectively stamping out a unique, explicit rule for each one.
$(foreach exp_path,$(EXPERIMENT_PATHS),$(eval $(call EXPERIMENT_template,$(exp_path))))


# --- Utility Rules ---

.PHONY: all clean list dump_rules

# The 'clean' rule removes all generated results.
clean:
@echo "Cleaning results directory..."
@([ -n "$(wildcard $(RESULTS_DIR)/*.json)" ] && rm $(RESULTS_DIR)/*.json) || exit 0

# The 'list' rule is for debugging. It shows the discovered experiments
# and the result files the Makefile expects to create for them.
list:
@echo "Discovered experiment configurations:"
@$(foreach exp,$(EXPERIMENT_PATHS),echo " - $(exp)/adapter_config.json";)
@echo "\nTarget result files:"
@$(foreach res,$(RESULT_FILES),echo " - $(res)";)

# The 'dump_rules' rule is for debugging. It dumps all dynamically defined rules.
define newline


endef
define DUMPED_RULES
$(foreach exp_path,$(EXPERIMENT_PATHS),$(call EXPERIMENT_template,$(exp_path)))
endef

dump_rules:
@echo -e "$(subst $(newline),\n,${DUMPED_RULES})"