Skip to content

Commit f1ddce6

Browse files
committed
eval: add 30 conversational queries per dataset + benchmark results
Each dataset now has a new "*_conversational.json" file with 30 complex / natural-language queries covering:
- Paraphrase (직접 매칭 불가, i.e. no direct match possible)
- Multi-hop (cross-table chaining)
- Conditional aggregation (WHERE + GROUP BY)
- Filter + comparison
- Recommendation / opinion
- Time-range temporal
- Absence / negation

Registered as KRRA/assort/X2BEE Conv in run_all.py. The agent benchmark loop now picks up datasets with "Hard" OR "Conv" in the name.

Agent results (GPT-4o-mini, 5 turns, LLM-judge enabled):
- KRRA Conv: 15/30 (50%)
- assort Conv: 19/24 (79%)
- X2BEE Conv: 20/27 (74%)

Single-shot results stay low (0.15-0.26 MRR) — conversational queries by design need multi-turn exploration, which is why the agent scores are so much higher.

KRRA conversational GT is auto-generated from keyword matches against concept/decision/rule/artifact nodes (documents are stored under those kinds, not NodeKind.DOCUMENT in KRRA).
1 parent 0dd30e9 commit f1ddce6

5 files changed

Lines changed: 1001 additions & 7 deletions

File tree

eval/baselines/qa_latest.json

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@
33
"mrr": 0.9667,
44
"p_at_k": 0.5032,
55
"r_at_k": 0.8934,
6-
"ndcg": 0.9012,
6+
"ndcg": 0.9008,
77
"corpus_size": 20
88
},
99
"KRRA Hard": {
1010
"mrr": 1.0,
11-
"p_at_k": 0.5683,
12-
"r_at_k": 0.2901,
13-
"ndcg": 0.6239,
11+
"p_at_k": 0.6099,
12+
"r_at_k": 0.2957,
13+
"ndcg": 0.6312,
1414
"corpus_size": 15
1515
},
1616
"assort Easy": {
@@ -41,15 +41,36 @@
4141
"ndcg": 0.2616,
4242
"corpus_size": 20
4343
},
44+
"KRRA Conv": {
45+
"mrr": 0.167,
46+
"p_at_k": 0.0564,
47+
"r_at_k": 0.0867,
48+
"ndcg": 0.0855,
49+
"corpus_size": 30
50+
},
51+
"assort Conv": {
52+
"mrr": 0.2563,
53+
"p_at_k": 0.0625,
54+
"r_at_k": 0.3438,
55+
"ndcg": 0.2645,
56+
"corpus_size": 30
57+
},
58+
"X2BEE Conv": {
59+
"mrr": 0.1498,
60+
"p_at_k": 0.0593,
61+
"r_at_k": 0.2407,
62+
"ndcg": 0.1716,
63+
"corpus_size": 30
64+
},
4465
"KRRA Hard (agent)": {
45-
"mrr": 0.6667,
66+
"mrr": 0.5333,
4667
"p_at_k": 0,
4768
"r_at_k": 0,
4869
"ndcg": 0,
4970
"corpus_size": 15
5071
},
5172
"assort Hard (agent)": {
52-
"mrr": 0.8,
73+
"mrr": 0.6667,
5374
"p_at_k": 0,
5475
"r_at_k": 0,
5576
"ndcg": 0,
@@ -61,5 +82,26 @@
6182
"r_at_k": 0,
6283
"ndcg": 0,
6384
"corpus_size": 19
85+
},
86+
"KRRA Conv (agent)": {
87+
"mrr": 0.5,
88+
"p_at_k": 0,
89+
"r_at_k": 0,
90+
"ndcg": 0,
91+
"corpus_size": 30
92+
},
93+
"assort Conv (agent)": {
94+
"mrr": 0.7917,
95+
"p_at_k": 0,
96+
"r_at_k": 0,
97+
"ndcg": 0,
98+
"corpus_size": 24
99+
},
100+
"X2BEE Conv (agent)": {
101+
"mrr": 0.7407,
102+
"p_at_k": 0,
103+
"r_at_k": 0,
104+
"ndcg": 0,
105+
"corpus_size": 27
64106
}
65107
}
Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
{
2+
"dataset": "assort_conversational",
3+
"description": "assort 패션 이커머스 데이터에 대한 복합·대화형 질문 30개 — 리뷰 감성, 사이즈/핏, 시즌 조합, 쇼핑 추천",
4+
"id_field": "node_title",
5+
"queries": [
6+
{
7+
"qid": "c001",
8+
"level": "L5",
9+
"type": "aggregation",
10+
"query": "이번 시즌(25SS)에 몇 벌이나 나왔어?",
11+
"description": "season = 25SS 카운트 (29개)",
12+
"relevant_docs": [
13+
"products:12800000", "products:12800002", "products:12800008", "products:12800010"
14+
]
15+
},
16+
{
17+
"qid": "c002",
18+
"level": "L4",
19+
"type": "filter",
20+
"query": "10만원 이하 티셔츠 추천해줘",
21+
"description": "price <= 100000 + 티셔츠",
22+
"relevant_docs": ["products:12800002"]
23+
},
24+
{
25+
"qid": "c003",
26+
"level": "L6",
27+
"type": "multi_hop",
28+
"query": "가장 판매량 많은 상품의 평균 리뷰 평점은?",
29+
"description": "cumulative_sales TOP + review 평균",
30+
"relevant_docs": ["products:12800000"]
31+
},
32+
{
33+
"qid": "c004",
34+
"level": "L6",
35+
"type": "review_sentiment",
36+
"query": "사이즈가 크다는 리뷰가 많은 상품 있어?",
37+
"description": "size_satisfaction=큼 + product 집계",
38+
"relevant_docs": []
39+
},
40+
{
41+
"qid": "c005",
42+
"level": "L7",
43+
"type": "conversational",
44+
"query": "봄에 입기 좋은 니트 뭐가 있을까?",
45+
"description": "25SS + 니트 필터",
46+
"relevant_docs": ["products:12800000", "products:12800004", "products:12800011"]
47+
},
48+
{
49+
"qid": "c006",
50+
"level": "L7",
51+
"type": "conversational",
52+
"query": "결혼식 하객으로 입기 좋은 원피스?",
53+
"description": "원피스 + 고가 제품 추천",
54+
"relevant_docs": ["products:12800016"]
55+
},
56+
{
57+
"qid": "c007",
58+
"level": "L6",
59+
"type": "multi_hop",
60+
"query": "할인율 20% 이상 상품 중 누적 판매량 3000개 넘는 것",
61+
"description": "discount_rate + cumulative_sales 복합 필터",
62+
"relevant_docs": ["products:12800000", "products:12800004"]
63+
},
64+
{
65+
"qid": "c008",
66+
"level": "L5",
67+
"type": "aggregation",
68+
"query": "색상별로 몇 개의 상품 variant이 있어?",
69+
"description": "product_variants GROUP BY color_id",
70+
"relevant_docs": ["colors:1", "colors:2", "colors:3"]
71+
},
72+
{
73+
"qid": "c009",
74+
"level": "L3",
75+
"type": "cross_table",
76+
"query": "실크블렌드 가디건의 모든 사이즈 버전",
77+
"description": "products → product_variants 조인",
78+
"relevant_docs": ["products:12800000"]
79+
},
80+
{
81+
"qid": "c010",
82+
"level": "L7",
83+
"type": "conversational",
84+
"query": "55사이즈로 예쁜 거 뭐 있어?",
85+
"description": "size=55 variant + product 조인",
86+
"relevant_docs": ["sizes:2"]
87+
},
88+
{
89+
"qid": "c011",
90+
"level": "L6",
91+
"type": "review_sentiment",
92+
"query": "\"촉감이 좋다\"는 리뷰가 있는 상품은?",
93+
"description": "review_content contains 촉감",
94+
"relevant_docs": []
95+
},
96+
{
97+
"qid": "c012",
98+
"level": "L5",
99+
"type": "aggregation",
100+
"query": "어느 시즌 상품이 가장 많이 팔려?",
101+
"description": "season GROUP BY + SUM cumulative_sales",
102+
"relevant_docs": ["products:12800000", "products:12800001"]
103+
},
104+
{
105+
"qid": "c013",
106+
"level": "L4",
107+
"type": "filter",
108+
"query": "5만원 미만 상품 전부 보여줘",
109+
"description": "selling_price < 50000",
110+
"relevant_docs": ["products:12800002", "products:12800003", "products:12800010"]
111+
},
112+
{
113+
"qid": "c014",
114+
"level": "L6",
115+
"type": "multi_hop",
116+
"query": "방송 판매된 상품 중 가장 누적 매출이 큰 것은?",
117+
"description": "broadcasts 조인 + cumulative_amount 최대",
118+
"relevant_docs": ["products:12800000"]
119+
},
120+
{
121+
"qid": "c015",
122+
"level": "L7",
123+
"type": "conversational",
124+
"query": "엄마 선물로 괜찮은 블라우스 있을까?",
125+
"description": "블라우스 + 고가(품질) 추천",
126+
"relevant_docs": ["products:12800005", "products:12800013", "products:12800015"]
127+
},
128+
{
129+
"qid": "c016",
130+
"level": "L5",
131+
"type": "aggregation",
132+
"query": "상품당 평균 리뷰 개수는?",
133+
"description": "reviews GROUP BY product_code + avg",
134+
"relevant_docs": []
135+
},
136+
{
137+
"qid": "c017",
138+
"level": "L6",
139+
"type": "multi_hop",
140+
"query": "핑크 색상 variant이 있는 상품은?",
141+
"description": "colors:핑크 → variant → product",
142+
"relevant_docs": ["products:12800000"]
143+
},
144+
{
145+
"qid": "c018",
146+
"level": "L6",
147+
"type": "review_sentiment",
148+
"query": "재구매 후기가 있는 상품은?",
149+
"description": "review_content contains 재구매",
150+
"relevant_docs": []
151+
},
152+
{
153+
"qid": "c019",
154+
"level": "L4",
155+
"type": "filter",
156+
"query": "24SS 시즌 상품 중에 가장 비싼 것",
157+
"description": "season=24SS + max price",
158+
"relevant_docs": ["products:12800001", "products:12800009"]
159+
},
160+
{
161+
"qid": "c020",
162+
"level": "L7",
163+
"type": "conversational",
164+
"query": "이번주 홈쇼핑 방송 예정 상품 있어?",
165+
"description": "broadcasts 최신 + 상품",
166+
"relevant_docs": ["broadcasts:1"]
167+
},
168+
{
169+
"qid": "c021",
170+
"level": "L5",
171+
"type": "aggregation",
172+
"query": "평균 할인율이 얼마야?",
173+
"description": "discount_rate 평균",
174+
"relevant_docs": []
175+
},
176+
{
177+
"qid": "c022",
178+
"level": "L6",
179+
"type": "multi_hop",
180+
"query": "30% 이상 할인 중인 상품 중 25SS 신상은?",
181+
"description": "discount + season 복합 필터",
182+
"relevant_docs": ["products:12800000"]
183+
},
184+
{
185+
"qid": "c023",
186+
"level": "L3",
187+
"type": "cross_table",
188+
"query": "12800000 상품에 대해 알려진 모든 정보",
189+
"description": "product + variants + reviews + broadcasts + sales 통합",
190+
"relevant_docs": ["products:12800000"]
191+
},
192+
{
193+
"qid": "c024",
194+
"level": "L7",
195+
"type": "conversational",
196+
"query": "스커트 종류 뭐가 있어?",
197+
"description": "product_name contains 스커트",
198+
"relevant_docs": ["products:12800001", "products:12800003", "products:12800006"]
199+
},
200+
{
201+
"qid": "c025",
202+
"level": "L5",
203+
"type": "aggregation",
204+
"query": "블랙 컬러가 들어간 variant 개수는?",
205+
"description": "colors:블랙 → variant 카운트",
206+
"relevant_docs": ["colors:5"]
207+
},
208+
{
209+
"qid": "c026",
210+
"level": "L6",
211+
"type": "review_sentiment",
212+
"query": "엄마 선물로 샀다는 리뷰가 달린 상품 목록",
213+
"description": "review_content contains 엄마",
214+
"relevant_docs": []
215+
},
216+
{
217+
"qid": "c027",
218+
"level": "L7",
219+
"type": "conversational",
220+
"query": "가성비 좋은 걸로 3개만 추천해줘",
221+
"description": "가격 저렴 + 판매량 많음 조합",
222+
"relevant_docs": ["products:12800002", "products:12800003"]
223+
},
224+
{
225+
"qid": "c028",
226+
"level": "L6",
227+
"type": "multi_hop",
228+
"query": "FREE 사이즈 상품은 몇 가지 색상이 있어?",
229+
"description": "sizes:FREE → variant → color 집계",
230+
"relevant_docs": ["sizes:6"]
231+
},
232+
{
233+
"qid": "c029",
234+
"level": "L4",
235+
"type": "filter",
236+
"query": "24FW 가을/겨울 신상품 목록",
237+
"description": "season = 24FW",
238+
"relevant_docs": ["products:12800005", "products:12800006", "products:12800016"]
239+
},
240+
{
241+
"qid": "c030",
242+
"level": "L7",
243+
"type": "conversational",
244+
"query": "오늘 뭘 사면 좋을지 추천해줄래?",
245+
"description": "cumulative_sales 상위 + 할인 조합",
246+
"relevant_docs": ["products:12800000", "products:12800004"]
247+
}
248+
]
249+
}

0 commit comments

Comments
 (0)