Skip to content

Commit c6a0f7c

Browse files
committed
update dataset
1 parent 12736b1 commit c6a0f7c

6 files changed

Lines changed: 2381 additions & 1046 deletions

File tree

examples/vision_food_reasoning_dataset/data/vision_food_reasoning_full.jsonl

Lines changed: 1000 additions & 1000 deletions
Large diffs are not rendered by default.

examples/vision_food_reasoning_dataset/data/vision_food_reasoning_raw_full.jsonl

Lines changed: 1000 additions & 0 deletions
Large diffs are not rendered by default.

examples/vision_food_reasoning_dataset/data/vision_food_reasoning_sample.jsonl

Lines changed: 120 additions & 8 deletions
Large diffs are not rendered by default.

examples/vision_food_reasoning_dataset/requirements.txt

Whitespace-only changes.
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Utility script to convert the raw Fireworks vision-food-reasoning dataset into native
Eval Protocol EvaluationRow JSONL files so that the default dataset adapter can be used.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import argparse
10+
import json
11+
import re
12+
from pathlib import Path
13+
from typing import Any, Iterable
14+
15+
DATASET_SOURCE_ID = "fireworks-ai/vision-food-reasoning-dataset"
16+
17+
_BOLD_LABEL_PATTERN = re.compile(r"\*\*(?P<label>[^*]+)\*\*")
18+
_APPEARS_PATTERN = re.compile(r"appears to be\s+(?P<label>[A-Za-z0-9_\- ]+)", re.IGNORECASE)
19+
_IS_PATTERN = re.compile(r"is\s+(?:a|an|the)?\s*(?P<label>[A-Za-z0-9_\- ]+)", re.IGNORECASE)
20+
_SECTION_HEADINGS = {
21+
"visual characteristics",
22+
"texture and shape",
23+
"texture",
24+
"shape",
25+
"cooking method or preparation style",
26+
"cooking method",
27+
"preparation style",
28+
"cultural context or typical presentation",
29+
"cultural context",
30+
"presentation",
31+
"distinguishing features",
32+
"ingredients",
33+
"aroma",
34+
"flavor profile",
35+
}
36+
37+
38+
def _normalize_label(label: str | None) -> str:
39+
if not label:
40+
return ""
41+
cleaned = re.sub(r"[^a-z0-9]+", "_", label.lower())
42+
cleaned = re.sub(r"_+", "_", cleaned).strip("_")
43+
return cleaned
44+
45+
46+
def _content_to_text(content: Any) -> str:
47+
if content is None:
48+
return ""
49+
if isinstance(content, str):
50+
return content
51+
if isinstance(content, Iterable):
52+
parts: list[str] = []
53+
for part in content:
54+
if isinstance(part, dict) and part.get("type") == "text":
55+
text_val = part.get("text")
56+
if isinstance(text_val, str):
57+
parts.append(text_val)
58+
return "\n".join(parts)
59+
return ""
60+
61+
62+
def _extract_label_from_text(text: str) -> str | None:
    """Heuristically pull the food label out of a reference answer.

    Strategies are tried in order, and the first one that produces a
    non-empty label wins:

    1. The last Markdown-bold span that is not a known section heading.
    2. The phrase following "appears to be", then the phrase following "is",
       accepted only when it is at most five words long.
    3. The last word-like run in the final sentence of the text.

    Args:
        text: Plain-text reference answer.

    Returns:
        The raw (un-normalized) label, or ``None`` when nothing usable is found.
    """
    if not text:
        return None

    # Bold spans, scanned last-to-first.
    for candidate in reversed(_BOLD_LABEL_PATTERN.findall(text)):
        stripped = candidate.strip()
        # Headings are commonly written "**Texture:**"; drop the trailing colon
        # before the lookup so they are still recognised as headings.
        heading_key = stripped.rstrip(":").strip().lower()
        # An empty key means the span held only whitespace/colons - not a label.
        if heading_key and heading_key not in _SECTION_HEADINGS:
            return stripped

    for pattern in (_APPEARS_PATTERN, _IS_PATTERN):
        match = pattern.search(text)
        if match:
            label = match.group("label").strip()
            # The capture class allows spaces, so it can be blank after
            # stripping; fall through to the next strategy in that case.
            if label and len(label.split()) <= 5:
                return label

    # Last resort: the final word-like run of the last non-empty sentence.
    sentences = [segment.strip() for segment in re.split(r"[.!?\n]+", text) if segment.strip()]
    if sentences:
        tokens = re.findall(r"[A-Za-z][A-Za-z0-9_\- ]+", sentences[-1])
        if tokens:
            return tokens[-1].strip()
    return None
84+
85+
86+
def _build_evaluation_row(raw: Any, idx: int) -> dict[str, Any] | None:
    """Convert one raw record into an output row, or return ``None`` to skip it.

    A record is skipped when it is not a dict, its ``"messages"`` value is not
    a list of at least two items, its final message is not a dict, none of the
    preceding messages is a system/user dict, or no label can be extracted
    from the final message.
    """
    if not isinstance(raw, dict):
        return None

    messages_payload = raw.get("messages")
    if not isinstance(messages_payload, list) or len(messages_payload) < 2:
        return None

    # The final message is taken as the reference answer; everything before
    # it is the prompt.
    assistant_reference = messages_payload[-1]
    if not isinstance(assistant_reference, dict):
        return None

    prompt_messages = [
        message
        for message in messages_payload[:-1]
        if isinstance(message, dict) and message.get("role") in {"system", "user"}
    ]
    if not prompt_messages:
        return None

    reference_text = _content_to_text(assistant_reference.get("content"))
    raw_label = _extract_label_from_text(reference_text)
    normalized_label = _normalize_label(raw_label)
    if not normalized_label:
        return None

    # Only a missing or empty id falls back to the positional id; a plain
    # truthiness test would also discard a legitimate id of 0.
    raw_id = raw.get("id")
    if raw_id is None or raw_id == "":
        row_id = f"vision_food_reasoning_{idx}"
    else:
        row_id = str(raw_id)

    return {
        "messages": prompt_messages,
        "ground_truth": {
            "label": normalized_label,
            "raw_label": raw_label or "",
            "reference_answer": reference_text,
        },
        "input_metadata": {
            "row_id": row_id,
            "dataset_info": {
                "source": DATASET_SOURCE_ID,
                "normalized_label": normalized_label,
            },
        },
    }


def convert_dataset(input_path: Path, output_path: Path) -> None:
    """Convert a raw JSONL dataset into a JSONL file of converted rows.

    Blank input lines are ignored. Every other line is parsed as JSON and
    converted; records that cannot be converted are counted as skipped. A
    one-line summary is printed when done.

    Args:
        input_path: Path of the raw JSONL file to read.
        output_path: Path of the JSONL file to write (overwritten if present).

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        OSError: If either file cannot be opened.
    """
    converted_rows: list[dict[str, Any]] = []
    skipped = 0

    # UTF-8 is explicit on both ends: the output is written with
    # ensure_ascii=False, so the platform default encoding must not be relied on.
    with input_path.open(encoding="utf-8") as infile:
        stripped_lines = (line.strip() for line in infile)
        records = (json.loads(line) for line in stripped_lines if line)
        # idx counts non-blank records only, which is what fallback row ids use.
        for idx, raw in enumerate(records):
            row = _build_evaluation_row(raw, idx)
            if row is None:
                skipped += 1
            else:
                converted_rows.append(row)

    # The output is opened only after the input has been fully read, so a
    # parse error leaves any existing output file untouched.
    with output_path.open("w", encoding="utf-8") as outfile:
        outfile.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in converted_rows)

    print(f"Converted {len(converted_rows)} rows (skipped {skipped}) from {input_path} -> {output_path}")
145+
146+
147+
def main(argv: list[str] | None = None) -> None:
    """Parse command-line arguments and run the dataset conversion.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Both ``--input`` and ``--output`` are required; argparse exits the process
    with an error message when either is missing.
    """
    parser = argparse.ArgumentParser(description="Convert raw vision food reasoning dataset.")
    parser.add_argument("--input", required=True, type=Path, help="Path to the raw JSONL dataset.")
    parser.add_argument(
        "--output",
        required=True,
        type=Path,
        help="Destination JSONL path for the converted EvaluationRow dataset.",
    )
    args = parser.parse_args(argv)
    convert_dataset(args.input, args.output)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)