Skip to content

Commit 8441acc

Browse files
committed
Added qualitative analysis pipeline
1 parent 2354987 commit 8441acc

13 files changed

Lines changed: 26173 additions & 0 deletions

experiments/kdd 2026/.DS_Store

8 KB
Binary file not shown.

experiments/kdd 2026/bayesian_bootstrap_results/qualitative/bayesian_boostrapping_results.json

Lines changed: 1898 additions & 0 deletions
Large diffs are not rendered by default.

experiments/kdd 2026/qualitative_analysis/qualitative_data_analysis.ipynb

Lines changed: 3789 additions & 0 deletions
Large diffs are not rendered by default.

experiments/kdd 2026/qualitative_analysis/qualitative_data_extraction.ipynb

Lines changed: 2304 additions & 0 deletions
Large diffs are not rendered by default.

experiments/kdd 2026/qualitative_analysis/test_suites/box_bench.json

Lines changed: 2246 additions & 0 deletions
Large diffs are not rendered by default.

experiments/kdd 2026/qualitative_analysis/test_suites/calendar_bench.json

Lines changed: 6079 additions & 0 deletions
Large diffs are not rendered by default.

experiments/kdd 2026/qualitative_analysis/test_suites/linear_bench.json

Lines changed: 3388 additions & 0 deletions
Large diffs are not rendered by default.

experiments/kdd 2026/qualitative_analysis/test_suites/slack_bench_v2.json

Lines changed: 4065 additions & 0 deletions
Large diffs are not rendered by default.

experiments/kdd 2026/qualitative_analysis/utils/bayes_bootstrap.py

Lines changed: 595 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 180 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,180 @@
1+
"""
2+
Clean merged results by removing runs affected by server errors.
3+
"""
4+
5+
import json
6+
import os
7+
from datetime import datetime
8+
from collections import defaultdict
9+
from tqdm import tqdm
10+
from typing import Optional
11+
12+
# Import the unified error classifier
13+
import sys
14+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
15+
from unified_error_classifier import classify_response
16+
17+
18+
def has_server_error(run: dict) -> bool:
    """
    Report whether any step of a run is classified as a server error.

    Each step's observation output is passed to ``classify_response``
    together with the run's service name; the run is flagged as soon as one
    classification has ``error_type == "server_error"``.

    Args:
        run: Raw run dict from merged results. Reads the ``service`` and
            ``trace`` keys; ``trace`` is only inspected when it is a dict.

    Returns:
        True if at least one step is classified as a server error,
        False otherwise (including when there are no steps to inspect).
    """
    svc_name = run.get('service', '')
    trace_data = run.get('trace', {})

    # A non-dict trace carries no steps to inspect.
    if not isinstance(trace_data, dict):
        return False

    for entry in trace_data.get('steps', []):
        obs = entry.get('observation', {})

        # Dict observations expose their output under 'stdout'; any other
        # truthy value is stringified, and falsy values become ''.
        if isinstance(obs, dict):
            text = obs.get('stdout', '')
        elif obs:
            text = str(obs)
        else:
            text = ''

        try:
            verdict = classify_response(text, svc_name)
            flagged = verdict.get("error_type") == "server_error"
        except ValueError:
            # Treated as "unknown service": ignore this step and move on.
            continue

        if flagged:
            return True

    return False
51+
52+
53+
def clean_merged_results(
    merged_results_path: str,
    output_folder: Optional[str] = None,
    output_filename: Optional[str] = None,
    verbose: bool = True
) -> tuple[list, str]:
    """
    Clean merged results by removing runs with server errors.

    Loads the merged results JSON, drops every run for which
    ``has_server_error`` returns True, and writes the remaining runs to a
    new JSON file (indented by 2 spaces).

    Args:
        merged_results_path: Path to the merged results JSON file
        output_folder: Folder to save cleaned results. Defaults to same folder
            as input. Created if it does not exist yet.
        output_filename: Custom output filename. Defaults to
            'cleaned_<original_name>', where <original_name> is the input
            file's base name including its extension.
        verbose: Whether to print progress and statistics

    Returns:
        Tuple of (cleaned_runs list, output_filepath)
    """
    # Load merged results (JSON is UTF-8; do not rely on the locale default)
    with open(merged_results_path, 'r', encoding='utf-8') as f:
        runs = json.load(f)

    if verbose:
        print(f"Loaded {len(runs)} runs from {merged_results_path}")

    # Filter out runs with server errors
    cleaned_runs = []
    for run in tqdm(runs, desc="Checking for server errors", disable=not verbose):
        if not has_server_error(run):
            cleaned_runs.append(run)
    removed_count = len(runs) - len(cleaned_runs)

    if verbose:
        # Guard the percentage against an empty input file.
        pct_removed_total = 100 * removed_count / len(runs) if runs else 0.0
        print(f"\nRemoved {removed_count} runs with server errors ({pct_removed_total:.1f}%)")
        print(f"Cleaned runs: {len(cleaned_runs)}")

        # Show breakdown by service
        print("\n--- Breakdown by Service ---")
        service_counts = defaultdict(lambda: {"original": 0, "cleaned": 0})

        for run in runs:
            service_counts[run.get("service", "unknown")]["original"] += 1

        for run in cleaned_runs:
            service_counts[run.get("service", "unknown")]["cleaned"] += 1

        for svc, counts in sorted(service_counts.items()):
            removed = counts["original"] - counts["cleaned"]
            pct_removed = 100 * removed / counts["original"] if counts["original"] > 0 else 0
            print(f"  {svc.upper()}: {counts['original']} -> {counts['cleaned']} (removed {removed}, {pct_removed:.1f}%)")

    # Determine output path
    if output_folder is None:
        output_folder = os.path.dirname(merged_results_path)

    if output_filename is None:
        base_name = os.path.basename(merged_results_path)
        output_filename = f"cleaned_{base_name}"

    output_filepath = os.path.join(output_folder, output_filename)

    # Make sure the destination exists. An empty folder string means the
    # current directory, which needs no creation (and makedirs('') would fail).
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Save cleaned results
    with open(output_filepath, 'w', encoding='utf-8') as f:
        json.dump(cleaned_runs, f, indent=2)

    if verbose:
        print(f"\nSaved cleaned dataset to: {output_filepath}")
        # Sizes are those of the compact in-memory serialization, so both
        # numbers are comparable regardless of how the files are formatted.
        print(f"Original size: {len(json.dumps(runs)) / 1024 / 1024:.2f} MB")
        print(f"Cleaned size: {len(json.dumps(cleaned_runs)) / 1024 / 1024:.2f} MB")

    return cleaned_runs, output_filepath
129+
130+
131+
def get_or_create_cleaned_results(
    merged_results_path: str,
    output_folder: Optional[str] = None,
    output_filename: Optional[str] = None,
    force_recreate: bool = False,
    verbose: bool = True
) -> tuple[list, str]:
    """
    Get cleaned results from cache or create them if they don't exist.

    The cache is the cleaned JSON file itself: if it already exists at the
    expected output path (and ``force_recreate`` is False) it is loaded and
    returned without reading the merged results file. Otherwise the work is
    delegated to ``clean_merged_results``.

    Args:
        merged_results_path: Path to the merged results JSON file
        output_folder: Folder to save/load cleaned results. Defaults to the
            folder of ``merged_results_path``.
        output_filename: Custom output filename. Defaults to
            'cleaned_<original_name>'.
        force_recreate: If True, recreate even if cached file exists
        verbose: Whether to print status messages; also forwarded to
            ``clean_merged_results`` when the results are (re)created.

    Returns:
        Tuple of (cleaned_runs list, output_filepath)
    """
    # Determine expected output path (same defaults as clean_merged_results)
    if output_folder is None:
        output_folder = os.path.dirname(merged_results_path)

    if output_filename is None:
        base_name = os.path.basename(merged_results_path)
        output_filename = f"cleaned_{base_name}"

    output_filepath = os.path.join(output_folder, output_filename)

    # Check if cached file exists
    if not force_recreate and os.path.exists(output_filepath):
        if verbose:
            print(f"Loading existing cleaned results from: {output_filepath}")
        with open(output_filepath, 'r', encoding='utf-8') as f:
            cleaned_runs = json.load(f)
        if verbose:
            print(f"Loaded {len(cleaned_runs)} cleaned runs")
        return cleaned_runs, output_filepath

    # Create cleaned results
    return clean_merged_results(
        merged_results_path=merged_results_path,
        output_folder=output_folder,
        output_filename=output_filename,
        verbose=verbose
    )
173+
174+
175+
if __name__ == "__main__":
    # Example usage: the merged results file may be given as the first
    # command-line argument; without one, the example filename below is used.
    DEFAULT_MERGED_RESULTS_FILE = "merged_results_20260204_221118.json"
    MERGED_RESULTS_FILE = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_MERGED_RESULTS_FILE

    cleaned_runs, output_path = get_or_create_cleaned_results(MERGED_RESULTS_FILE)
    print(f"\nDone! {len(cleaned_runs)} runs saved to {output_path}")

0 commit comments

Comments
 (0)