Skip to content

Commit 2f67395

Browse files
committed
docs: cleaner GT answer format in Excel export
- Strip the redundant 'table_name:' prefix from structured answers.
- Use a numbered list (1. 2. 3.) for multi-answer queries.
- Cap the content snippet at 150 chars, followed by an ellipsis.
- Format document and structured nodes differently (documents show title ▸ content; structured nodes show content only).

Before: [pr_goods_base:G00007] pr_goods_base:G00007 ▸ pr_goods_base: iPhone 15 Pro...
After: 1. iPhone 15 Pro 512GB | 아이폰 15 프로 | G00007 | ...
1 parent ffd58b6 commit 2f67395

2 files changed

Lines changed: 32 additions & 9 deletions

File tree

eval/data/gt_datasets.xlsx

1.77 KB
Binary file not shown.

eval/scripts/export_gt_to_excel.py

Lines changed: 32 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -61,18 +61,41 @@ def _load_resolver(graph_path: Path) -> dict[str, str]:
6161
).fetchall()
6262
for r in rows:
6363
title = r["title"] or ""
64-
content = (r["content"] or "")[:120].replace("\n", " ").strip()
65-
summary = f"{title}{content}" if content else title
64+
raw_content = (r["content"] or "").replace("\n", " ").strip()
65+
66+
# Strip redundant "table_name: " prefix from structured content
67+
# so the answer column is readable.
68+
if ":" in title:
69+
tbl = title.split(":", 1)[0]
70+
prefix = f"{tbl}: "
71+
if raw_content.startswith(prefix):
72+
raw_content = raw_content[len(prefix):]
73+
74+
# Cap content length for readability in Excel cells
75+
content_snippet = raw_content[:150]
76+
if len(raw_content) > 150:
77+
content_snippet += "…"
78+
79+
# For structured nodes: title already shows "table:pk", so just
80+
# show the content (which starts with the name). For document
81+
# nodes: title is the doc/chunk title; prepend it.
82+
try:
83+
props = json.loads(r["properties_json"] or "{}")
84+
except json.JSONDecodeError:
85+
props = {}
86+
87+
is_structured = bool(props.get("_table_name"))
88+
if is_structured:
89+
summary = content_snippet or title
90+
else:
91+
# Document node — show title + content preview
92+
summary = f"{title}{content_snippet}" if content_snippet else title
6693

6794
# Key by title (structured: "products:12800000")
6895
if title:
6996
resolver[title] = summary
7097

7198
# Key by properties.doc_id (documents: "0346542e...")
72-
try:
73-
props = json.loads(r["properties_json"] or "{}")
74-
except json.JSONDecodeError:
75-
props = {}
7699
did = props.get("doc_id", "")
77100
if did:
78101
resolver[str(did)] = summary
@@ -101,17 +124,17 @@ def _flatten_query(q: dict, resolver: dict[str, str] | None = None) -> dict:
101124
# Resolve GT IDs to readable answers (title + content snippet)
102125
resolved_lines: list[str] = []
103126
if resolver is not None:
104-
for rid in relevant[:20]: # cap at 20 for readability
127+
for i, rid in enumerate(relevant[:20], start=1): # cap at 20 for readability
105128
key = str(rid)
106129
answer = resolver.get(key, "")
107130
if not answer:
108131
# Try stripping chunk suffix ("#1", "#2") for doc lookup
109132
base = key.rsplit(" #", 1)[0]
110133
answer = resolver.get(base, "")
111134
if answer:
112-
resolved_lines.append(f"[{key}] {answer}")
135+
resolved_lines.append(f"{i}. {answer}")
113136
else:
114-
resolved_lines.append(f"[{key}] (not found in graph)")
137+
resolved_lines.append(f"{i}. [{key}] (not found)")
115138
if len(relevant) > 20:
116139
resolved_lines.append(f"... +{len(relevant) - 20} more")
117140

0 commit comments

Comments
 (0)