import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import rcParams
6+
# Set font aesthetics for every figure created after this point.
rcParams.update({
    'font.family': 'DejaVu Sans',
    'font.size': 16,
})
10+
# Experiment identifiers, in the order the per-environment value lists below
# are written.  Every environment reports the same experiments, so the list is
# defined once and copied into each dataset.
EXPERIMENT_IDS = [
    'INTERPRET', 'Ours', 'Ours-no-geo', 'Ours-no-invent',
    'Ours-no-subselection', 'Ours-no-visual', 'Ours-vlm-subselection',
    'VILA-pure', 'VILA-with-fewshot',
]

# Define the data for each environment.  'NUM_SOLVED' and 'NUM_SOLVED_STDDEV'
# are positionally aligned with EXPERIMENT_IDS.
data_combo_burger = {
    'EXPERIMENT_ID': list(EXPERIMENT_IDS),
    'NUM_SOLVED': [0.00, 8.20, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 3.80],
    'NUM_SOLVED_STDDEV': [0.00, 1.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.40],
}

data_fatter_burger = {
    'EXPERIMENT_ID': list(EXPERIMENT_IDS),
    'NUM_SOLVED': [0.00, 9.60, 1.20, 0.00, 0.00, 1.20, 3.00, 0.80, 3.80],
    'NUM_SOLVED_STDDEV': [0.00, 0.80, 2.40, 0.00, 0.00, 2.40, 1.41, 0.40, 0.40],
}

data_more_stacks = {
    'EXPERIMENT_ID': list(EXPERIMENT_IDS),
    'NUM_SOLVED': [0.00, 9.40, 0.00, 0.00, 0.00, 0.00, 3.60, 0.80, 3.80],
    'NUM_SOLVED_STDDEV': [0.00, 0.80, 0.00, 0.00, 0.00, 0.00, 2.24, 1.17, 0.40],
}

data_kitchen_boil_kettle = {
    'EXPERIMENT_ID': list(EXPERIMENT_IDS),
    'NUM_SOLVED': [0.00, 9.80, 9.80, 0.00, 0.00, 9.80, 1.00, 6.60, 10.00],
    'NUM_SOLVED_STDDEV': [0.00, 0.40, 0.40, 0.00, 0.00, 0.40, 2.00, 1.02, 0.00],
}

# Convert each dataset to a DataFrame
df_combo_burger = pd.DataFrame(data_combo_burger)
df_fatter_burger = pd.DataFrame(data_fatter_burger)
df_more_stacks = pd.DataFrame(data_more_stacks)
df_kitchen_boil_kettle = pd.DataFrame(data_kitchen_boil_kettle)

# Row order used for display: our method and its ablations first, then the
# remaining methods.
custom_order = [
    'Ours', 'Ours-no-geo', 'Ours-no-invent', 'Ours-no-subselection',
    'Ours-no-visual', 'Ours-vlm-subselection', 'INTERPRET', 'VILA-pure',
    'VILA-with-fewshot',
]

# Factor that turns NUM_SOLVED into a percentage (the raw values top out at
# 10.00, which becomes 100).
PERCENT_SCALE = 10
# Sample size n in the standard-error formula stddev / sqrt(n).
# NOTE(review): presumably the number of runs behind each mean -- confirm.
NUM_SAMPLES = 5

for df in (df_combo_burger, df_fatter_burger, df_more_stacks, df_kitchen_boil_kettle):
    # Make the ID column an ordered categorical so sorting follows
    # custom_order instead of alphabetical order.  The sort is done in place
    # because the module-level df_* names must keep pointing at the result.
    df['EXPERIMENT_ID'] = pd.Categorical(
        df['EXPERIMENT_ID'], categories=custom_order, ordered=True
    )
    df.sort_values('EXPERIMENT_ID', inplace=True)

    # Convert 'NUM_SOLVED' to percentages and derive the standard error on
    # the same percentage scale.
    df['NUM_SOLVED'] = df['NUM_SOLVED'] * PERCENT_SCALE
    df['NUM_SOLVED_SE'] = df['NUM_SOLVED_STDDEV'] / np.sqrt(NUM_SAMPLES) * PERCENT_SCALE
74+
# Initialize subplots: one panel per environment, all sharing the y axis so
# the experiment labels appear only once.
fig, axes = plt.subplots(1, 4, figsize=(18, 6), sharey=True)

# One pastel color per row of the frame, so that each bar has a unique color.
unique_palette = sns.color_palette("pastel", n_colors=len(df_combo_burger))

# Panel order, left to right: 'Boil Kettle', 'More Stacks', 'Bigger Burger',
# then 'Combo Burger'.
environments = [df_kitchen_boil_kettle, df_more_stacks, df_fatter_burger, df_combo_burger]
titles = ["Kitchen Boil Kettle", "More Burger Stacks", "Bigger Burger", "Combo Burger"]

for ax, df, title in zip(axes, environments, titles):
    # Color bars through `hue`: passing `palette` without `hue` is deprecated
    # in seaborn 0.13.  `dodge=False` keeps each bar at full width on its own
    # category row.
    sns.barplot(
        data=df, y='EXPERIMENT_ID', x='NUM_SOLVED', ax=ax,
        hue='EXPERIMENT_ID', dodge=False, palette=unique_palette, capsize=0.1,
    )
    # Some seaborn versions add a legend whenever `hue` is set; it would only
    # repeat the y tick labels, so drop it if present.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    # Draw the precomputed standard error as horizontal error bars on top of
    # the bars.
    ax.errorbar(
        df['NUM_SOLVED'], df['EXPERIMENT_ID'],
        xerr=df['NUM_SOLVED_SE'], fmt='none', c='black', capsize=5, capthick=1,
    )
    ax.set_title(title, fontsize=20)  # Larger title font
    ax.set_xlabel('')  # Clear individual x-labels; a shared one is added below
    ax.set_ylabel('', fontsize=16)  # Clear the y-label; tick labels name the rows
    ax.tick_params(axis='both', labelsize=14)  # Larger tick labels
    ax.grid(True, linestyle='--', alpha=0.6)  # Add gridlines for clarity

# Set shared x-label, centered under all four panels
fig.text(0.5, 0.01, '% Evaluation Tasks Solved', ha='center', fontsize=18)

# Fit the panels into the figure, keeping a strip at the bottom (and a small
# left margin) free so the shared x-label is not overlapped.
plt.tight_layout(rect=[0.02, 0.05, 1, 1])
plt.show()