Skip to content

Commit 87a1f86

Browse files
authored
[feat] support messages column as JSON string in iterable datasets (#147)
Add automatic JSON string decoding for the `messages` column in the iterable dataset loaders. This allows Parquet files to store chat messages as JSON strings instead of nested Arrow structs, avoiding schema-inference issues with deeply nested message formats.

Co-authored-by: mwxely <mwxely@users.noreply.github.com>
1 parent 749eeef commit 87a1f86

2 files changed

Lines changed: 10 additions & 0 deletions

File tree

src/lmms_engine/datasets/iterable/qwen3_vl_iterable_dataset.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import os
23
from typing import Dict, Tuple
34

@@ -22,6 +23,9 @@ def load_from_json(self, data, data_folder=None) -> Dict[str, torch.Tensor]:
2223
videos = []
2324
kwargs = {}
2425
messages = data["messages"]
26+
# Support messages stored as JSON string (common in parquet files)
27+
if isinstance(messages, str):
28+
messages = json.loads(messages)
2529
for message in messages:
2630
for content in message["content"]:
2731
if content["type"] == "image_url":

src/lmms_engine/datasets/iterable/vision_iterable_dataset.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import os
23
from typing import Dict
34

@@ -63,6 +64,9 @@ def load_from_json(self, data, data_folder=None) -> Dict[str, torch.Tensor]:
6364
videos = []
6465
kwargs = {}
6566
messages = data["messages"]
67+
# Support messages stored as JSON string (common in parquet files)
68+
if isinstance(messages, str):
69+
messages = json.loads(messages)
6670
for message in messages:
6771
for content in message["content"]:
6872
if content["type"] == "image_url":
@@ -92,6 +96,8 @@ def load_from_json(self, data, data_folder=None) -> Dict[str, torch.Tensor]:
9296

9397
def load_from_hf(self, data) -> Dict[str, torch.Tensor]:
9498
messages = data["messages"]
99+
if isinstance(messages, str):
100+
messages = json.loads(messages)
95101
hf_messages = TrainUtilities.convert_open_to_hf(messages)
96102
if isinstance(data["image"], list):
97103
images = data["image"]

0 commit comments

Comments
 (0)