Default JsonlWriter output to force_ascii=False so non-ASCII text is written as
literal UTF-8 instead of \uXXXX escapes; a caller-supplied force_ascii in
write_kwargs still takes precedence. Adds one test for each behavior.

diff --git a/nemo_curator/stages/text/io/writer/jsonl.py b/nemo_curator/stages/text/io/writer/jsonl.py
index 7eff540a23..9b568d7673 100644
--- a/nemo_curator/stages/text/io/writer/jsonl.py
+++ b/nemo_curator/stages/text/io/writer/jsonl.py
@@ -40,6 +40,7 @@ def write_data(self, task: DocumentBatch, file_path: str) -> None:
         write_kwargs = {
             "lines": True,
             "orient": "records",
+            "force_ascii": False,
         }
 
         # Add any additional kwargs, allowing them to override defaults
diff --git a/tests/stages/text/io/writer/test_jsonl.py b/tests/stages/text/io/writer/test_jsonl.py
index c6286e9d66..66f51c8a2a 100644
--- a/tests/stages/text/io/writer/test_jsonl.py
+++ b/tests/stages/text/io/writer/test_jsonl.py
@@ -107,6 +107,37 @@ def test_jsonl_writer_with_columns_subset(self, pandas_document_batch: DocumentB
         expected = pandas_document_batch.to_pandas()[["text", "score"]]
         pd.testing.assert_frame_equal(df, expected)
 
+
+    def test_jsonl_writer_defaults_to_utf8_output(self, tmpdir: str):
+        """JsonlWriter should preserve non-ASCII characters by default."""
+        output_dir = os.path.join(tmpdir, "jsonl_utf8")
+        writer = JsonlWriter(path=output_dir)
+        batch = DocumentBatch(data=pd.DataFrame({"text": ["你好, 世界"], "id": [1]}), task_id="utf8", dataset_name="test")
+
+        writer.setup()
+        result = writer.process(batch)
+
+        file_path = result.data[0]
+        with open(file_path, encoding="utf-8") as f:
+            payload = f.read()
+        assert "你好, 世界" in payload
+        assert "\\u4f60" not in payload
+
+    def test_jsonl_writer_allows_force_ascii_override(self, tmpdir: str):
+        """JsonlWriter should still honor user-provided force_ascii=True."""
+        output_dir = os.path.join(tmpdir, "jsonl_ascii")
+        writer = JsonlWriter(path=output_dir, write_kwargs={"force_ascii": True})
+        batch = DocumentBatch(data=pd.DataFrame({"text": ["你好, 世界"], "id": [1]}), task_id="ascii", dataset_name="test")
+
+        writer.setup()
+        result = writer.process(batch)
+
+        file_path = result.data[0]
+        with open(file_path, encoding="utf-8") as f:
+            payload = f.read()
+        assert "你好, 世界" not in payload
+        assert "\\u4f60" in payload
+
     def test_jsonl_writer_with_custom_options(self, pandas_document_batch: DocumentBatch, tmpdir: str):
         """Test JsonlWriter with custom formatting options."""
         output_dir = os.path.join(tmpdir, "jsonl_custom")