Skip to content

Commit 88cfe88

Browse files
committed
Remove debug files
1 parent 642ac86 commit 88cfe88

12 files changed

Lines changed: 4072 additions & 8594 deletions

analyze_assertion_discrepancies.py

Lines changed: 0 additions & 58 deletions
This file was deleted.

debug_advanced.py

Lines changed: 0 additions & 98 deletions
This file was deleted.

debug_chart.py

Lines changed: 0 additions & 18 deletions
This file was deleted.

debug_template.py

Lines changed: 0 additions & 39 deletions
This file was deleted.

find_discrepancies.py

Lines changed: 0 additions & 57 deletions
This file was deleted.

report/charts_and_tables.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1973,3 +1973,116 @@ def create_pass_fail_by_evaluation_type_chart(data):
19731973
fig3.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
19741974
fig3.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
19751975
return fig3
1976+
1977+
1978+
def create_combined_assertions_bar_chart(data: Dict[str, pd.DataFrame]) -> go.Figure:
    """
    Build a single grouped bar chart of assertion outcomes for every dataset.

    Each entry of ``data`` becomes one x-axis category with three bars: the
    column sums of ``number_of_assertions_passed``,
    ``number_of_assertions_failed`` and ``number_of_assertions_missed``.

    The category label is derived from the dictionary key:

    * Keys containing ``_singleshot_`` or ``_multishot_`` produce
      ``<prefix>_<rag|norag>_<detailed|simple>_<singleshot|multishot>``,
      where ``<prefix>`` is the text before the first underscore.
    * Any other key produces ``<prefix>_<singleshot|multishot>``, where
      ``<prefix>`` is the text before ``_multi_shot`` and the entry counts as
      multishot only when the key contains
      ``multi_shot_build_error_enabled_True``.

    Args:
        data: Mapping of dataset key to its results DataFrame.

    Returns:
        Plotly Figure holding the three grouped bar traces.
    """
    labels = []
    passed_totals = []
    failed_totals = []
    missed_totals = []

    for key, frame in data.items():
        if '_singleshot_' in key or '_multishot_' in key:
            # Advanced-results keys encode the whole configuration in the name.
            prefix = key.split('_')[0]
            uses_rag = any(
                marker in key for marker in ('_rag_detailed', '_rag_simple')
            )
            rag_part = 'rag' if uses_rag else 'norag'
            instruction_part = 'detailed' if '_detailed' in key else 'simple'
            shot_part = 'multishot' if '_multishot_' in key else 'singleshot'
            label = f"{prefix}_{rag_part}_{instruction_part}_{shot_part}"
        else:
            # Exercism-results keys: everything before '_multi_shot' is the model.
            prefix = key.split('_multi_shot')[0]
            if 'multi_shot_build_error_enabled_True' in key:
                shot_part = 'multishot'
            else:
                shot_part = 'singleshot'
            label = f"{prefix}_{shot_part}"

        labels.append(label)
        passed_totals.append(frame['number_of_assertions_passed'].sum())
        failed_totals.append(frame['number_of_assertions_failed'].sum())
        missed_totals.append(frame['number_of_assertions_missed'].sum())

    # One row per trace: (legend name, values, bar colour, hover template).
    trace_specs = (
        (
            'Assertions Passed',
            passed_totals,
            '#28a745',  # Green
            '<b>%{x}</b><br>Assertions Passed: %{y}<extra></extra>',
        ),
        (
            'Assertions Failed',
            failed_totals,
            '#dc3545',  # Red
            '<b>%{x}</b><br>Assertions Failed: %{y}<extra></extra>',
        ),
        (
            'Assertions Missed',
            missed_totals,
            '#ffc107',  # Yellow/Orange
            '<b>%{x}</b><br>Assertions Missed: %{y}<br><i>(Due to build errors, etc.)</i><extra></extra>',
        ),
    )

    fig = go.Figure()
    for trace_name, values, colour, hover in trace_specs:
        fig.add_trace(go.Bar(
            name=trace_name,
            x=labels,
            y=values,
            marker_color=colour,
            opacity=0.8,
            text=values,
            textposition='outside',
            hovertemplate=hover
        ))

    title_settings = {
        'text': 'Assertions Performance by Model and Configuration',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 18}
    }
    # Horizontal legend placed just above the plot area, right-aligned.
    legend_settings = {
        'orientation': "h",
        'yanchor': "bottom",
        'y': 1.02,
        'xanchor': "right",
        'x': 1
    }
    # Bottom margin of 150 alongside the 45-degree tick labels set below.
    margin_settings = {'b': 150, 'l': 80, 'r': 80, 't': 100}

    fig.update_layout(
        title=title_settings,
        xaxis_title='Models and Configurations',
        yaxis_title='Number of Assertions',
        barmode='group',
        width=1200,
        height=700,
        template='plotly_white',
        legend=legend_settings,
        margin=margin_settings
    )

    fig.update_xaxes(tickangle=45, tickfont={'size': 10})
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

    return fig
2088+

0 commit comments

Comments
 (0)