Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions nemo_curator/stages/text/io/writer/jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def write_data(self, task: DocumentBatch, file_path: str) -> None:
write_kwargs = {
"lines": True,
"orient": "records",
"force_ascii": False,
}

# Add any additional kwargs, allowing them to override defaults
Expand Down
31 changes: 31 additions & 0 deletions tests/stages/text/io/writer/test_jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,37 @@ def test_jsonl_writer_with_columns_subset(self, pandas_document_batch: DocumentB
expected = pandas_document_batch.to_pandas()[["text", "score"]]
pd.testing.assert_frame_equal(df, expected)


def test_jsonl_writer_defaults_to_utf8_output(self, tmpdir: str):
    """Check that a writer built without write_kwargs emits non-ASCII text verbatim."""
    sample_text = "你好, 世界"
    frame = pd.DataFrame({"text": [sample_text], "id": [1]})

    writer = JsonlWriter(path=os.path.join(tmpdir, "jsonl_utf8"))
    batch = DocumentBatch(data=frame, task_id="utf8", dataset_name="test")

    writer.setup()
    written = writer.process(batch)

    # The first entry of the returned task's data is used as the output file path.
    with open(written.data[0], encoding="utf-8") as handle:
        contents = handle.read()

    # The raw characters must be present, and the \uXXXX escape of the
    # first character (U+4F60) must not have been produced.
    assert sample_text in contents
    assert "\\u4f60" not in contents

def test_jsonl_writer_allows_force_ascii_override(self, tmpdir: str):
    """Check that an explicit force_ascii=True in write_kwargs yields escaped output."""
    sample_text = "你好, 世界"
    frame = pd.DataFrame({"text": [sample_text], "id": [1]})

    writer = JsonlWriter(
        path=os.path.join(tmpdir, "jsonl_ascii"),
        write_kwargs={"force_ascii": True},
    )
    batch = DocumentBatch(data=frame, task_id="ascii", dataset_name="test")

    writer.setup()
    written = writer.process(batch)

    # The first entry of the returned task's data is used as the output file path.
    with open(written.data[0], encoding="utf-8") as handle:
        contents = handle.read()

    # With the override, the raw characters must be absent and the
    # \uXXXX escape of the first character (U+4F60) must appear instead.
    assert sample_text not in contents
    assert "\\u4f60" in contents

def test_jsonl_writer_with_custom_options(self, pandas_document_batch: DocumentBatch, tmpdir: str):
"""Test JsonlWriter with custom formatting options."""
output_dir = os.path.join(tmpdir, "jsonl_custom")
Expand Down
Loading