Skip to content

Commit 406928d

Browse files
fix: escape special characters in SchemaTransformProcessor JSON templates (#250)
Fixes GitHub issue #227 where SchemaTransformProcessor fails with JSONDecodeError when LLM-generated content contains quotes, backslashes, newlines, or other special characters that break JSON parsing. The fix properly escapes all string values before template rendering using json.dumps to handle all JSON-special characters.
1 parent 3d86a38 commit 406928d

2 files changed

Lines changed: 87 additions & 7 deletions

File tree

packages/data-designer-engine/src/data_designer/engine/processing/processors/schema_transform.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import json
77
import logging
8-
from typing import TYPE_CHECKING
8+
from typing import TYPE_CHECKING, Any
99

1010
from data_designer.config.processors import SchemaTransformProcessorConfig
1111
from data_designer.engine.dataset_builders.artifact_storage import BatchStage
@@ -20,17 +20,39 @@
2020
logger = logging.getLogger(__name__)
2121

2222

23+
def _json_escape_record(record: dict[str, Any]) -> dict[str, Any]:
24+
"""Escape record values for safe insertion into a JSON template."""
25+
26+
def escape_for_json_string(s: str) -> str:
27+
"""Use json.dumps to escape, then strip the surrounding quotes."""
28+
return json.dumps(s)[1:-1]
29+
30+
escaped = {}
31+
for key, value in record.items():
32+
if isinstance(value, str):
33+
escaped[key] = escape_for_json_string(value)
34+
elif isinstance(value, (dict, list)):
35+
escaped[key] = escape_for_json_string(json.dumps(value))
36+
elif value is None:
37+
escaped[key] = "null"
38+
else:
39+
escaped[key] = str(value)
40+
return escaped
41+
42+
2343
class SchemaTransformProcessor(WithJinja2UserTemplateRendering, Processor[SchemaTransformProcessorConfig]):
2444
@property
2545
def template_as_str(self) -> str:
2646
return json.dumps(self.config.template)
2747

2848
def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None) -> pd.DataFrame:
2949
self.prepare_jinja2_template_renderer(self.template_as_str, data.columns.to_list())
30-
formatted_records = [
31-
json.loads(self.render_template(deserialize_json_values(record)).replace("\n", "\\n"))
32-
for record in data.to_dict(orient="records")
33-
]
50+
formatted_records = []
51+
for record in data.to_dict(orient="records"):
52+
deserialized = deserialize_json_values(record)
53+
escaped = _json_escape_record(deserialized)
54+
rendered = self.render_template(escaped)
55+
formatted_records.append(json.loads(rendered))
3456
formatted_data = pd.DataFrame(formatted_records)
3557
if current_batch_number is not None:
3658
self.artifact_storage.write_batch_to_parquet_file(

packages/data-designer-engine/tests/engine/processing/processors/test_schema_transform.py

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,65 @@ def test_process_with_json_serialized_values(stub_processor: SchemaTransformProc
129129
assert written_dataframe is not None
130130
assert len(written_dataframe) == 2
131131

132-
# Verify that nested JSON values are properly deserialized in template rendering
132+
# Verify that nested JSON values are properly serialized as JSON strings in template rendering
133133
first_output = written_dataframe.iloc[0].to_dict()
134134
assert first_output["text"] == "hello"
135-
assert first_output["value"] == "{'nested': 'value1'}"
135+
# Nested JSON should be properly serialized as JSON string (not Python repr)
136+
assert first_output["value"] == '{"nested": "value1"}'
137+
138+
139+
def test_process_with_special_characters_in_llm_output(stub_processor: SchemaTransformProcessor) -> None:
    """Ensure LLM outputs containing JSON-special characters survive processing.

    Regression test for GitHub issue #227, where quotes, backslashes, and
    newlines in LLM-generated content caused a JSONDecodeError.
    """
    tricky_values = [
        'He said "Hello"',
        "Line1\nLine2",
        "Path: C:\\Users\\test",
        "Tab\there",
    ]
    frame = pd.DataFrame({"col1": tricky_values, "col2": [1, 2, 3, 4]})

    # Processing must not raise JSONDecodeError.
    stub_processor.process(frame, current_batch_number=0)
    result: pd.DataFrame = stub_processor.artifact_storage.write_batch_to_parquet_file.call_args.kwargs[
        "dataframe"
    ]

    # All rows were processed successfully.
    assert result is not None
    assert len(result) == 4

    # Every special character must round-trip unchanged into the output.
    for row, expected in zip(result.to_dict(orient="records"), tricky_values):
        assert row["text"] == expected
173+
174+
175+
def test_process_with_mixed_special_characters(stub_processor: SchemaTransformProcessor) -> None:
    """Exercise a single LLM output mixing quotes, apostrophes, and escape sequences."""
    tricky = 'She replied: "I\'m not sure about that\\nLet me think..."'
    frame = pd.DataFrame({"col1": [tricky], "col2": [42]})

    stub_processor.process(frame, current_batch_number=0)
    result: pd.DataFrame = stub_processor.artifact_storage.write_batch_to_parquet_file.call_args.kwargs[
        "dataframe"
    ]

    # Exactly one row, with the mixed-character text preserved verbatim.
    assert len(result) == 1
    assert result.iloc[0].to_dict()["text"] == tricky

0 commit comments

Comments (0)