Skip to content

Commit 406928d

Browse files
fix: escape special characters in SchemaTransformProcessor JSON templates (#250)
Fixes GitHub issue #227 where SchemaTransformProcessor fails with JSONDecodeError when LLM-generated content contains quotes, backslashes, newlines, or other special characters that break JSON parsing. The fix properly escapes all string values before template rendering using json.dumps to handle all JSON-special characters.
1 parent 3d86a38 commit 406928d

2 files changed

Lines changed: 87 additions & 7 deletions

File tree

packages/data-designer-engine/src/data_designer/engine/processing/processors/schema_transform.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import json
77
import logging
8-
from typing import TYPE_CHECKING
8+
from typing import TYPE_CHECKING, Any
99

1010
from data_designer.config.processors import SchemaTransformProcessorConfig
1111
from data_designer.engine.dataset_builders.artifact_storage import BatchStage
@@ -20,17 +20,39 @@
2020
logger = logging.getLogger(__name__)
2121

2222

23+
def _json_escape_record(record: dict[str, Any]) -> dict[str, Any]:
24+
"""Escape record values for safe insertion into a JSON template."""
25+
26+
def escape_for_json_string(s: str) -> str:
27+
"""Use json.dumps to escape, then strip the surrounding quotes."""
28+
return json.dumps(s)[1:-1]
29+
30+
escaped = {}
31+
for key, value in record.items():
32+
if isinstance(value, str):
33+
escaped[key] = escape_for_json_string(value)
34+
elif isinstance(value, (dict, list)):
35+
escaped[key] = escape_for_json_string(json.dumps(value))
36+
elif value is None:
37+
escaped[key] = "null"
38+
else:
39+
escaped[key] = str(value)
40+
return escaped
41+
42+
2343
class SchemaTransformProcessor(WithJinja2UserTemplateRendering, Processor[SchemaTransformProcessorConfig]):
2444
@property
2545
def template_as_str(self) -> str:
2646
return json.dumps(self.config.template)
2747

2848
def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None) -> pd.DataFrame:
2949
self.prepare_jinja2_template_renderer(self.template_as_str, data.columns.to_list())
30-
formatted_records = [
31-
json.loads(self.render_template(deserialize_json_values(record)).replace("\n", "\\n"))
32-
for record in data.to_dict(orient="records")
33-
]
50+
formatted_records = []
51+
for record in data.to_dict(orient="records"):
52+
deserialized = deserialize_json_values(record)
53+
escaped = _json_escape_record(deserialized)
54+
rendered = self.render_template(escaped)
55+
formatted_records.append(json.loads(rendered))
3456
formatted_data = pd.DataFrame(formatted_records)
3557
if current_batch_number is not None:
3658
self.artifact_storage.write_batch_to_parquet_file(

packages/data-designer-engine/tests/engine/processing/processors/test_schema_transform.py

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,65 @@ def test_process_with_json_serialized_values(stub_processor: SchemaTransformProc
129129
assert written_dataframe is not None
130130
assert len(written_dataframe) == 2
131131

132-
# Verify that nested JSON values are properly deserialized in template rendering
132+
# Verify that nested JSON values are properly serialized as JSON strings in template rendering
133133
first_output = written_dataframe.iloc[0].to_dict()
134134
assert first_output["text"] == "hello"
135-
assert first_output["value"] == "{'nested': 'value1'}"
135+
# Nested JSON should be properly serialized as JSON string (not Python repr)
136+
assert first_output["value"] == '{"nested": "value1"}'
137+
138+
139+
def test_process_with_special_characters_in_llm_output(stub_processor: SchemaTransformProcessor) -> None:
    """Ensure LLM outputs containing JSON-special characters survive processing.

    Regression test for GitHub issue #227, where quotes, backslashes, and
    newlines in LLM-generated content caused a JSONDecodeError.
    """
    tricky_values = [
        'He said "Hello"',
        "Line1\nLine2",
        "Path: C:\\Users\\test",
        "Tab\there",
    ]
    frame = pd.DataFrame({"col1": tricky_values, "col2": [1, 2, 3, 4]})

    # Processing must not raise JSONDecodeError.
    stub_processor.process(frame, current_batch_number=0)
    result: pd.DataFrame = stub_processor.artifact_storage.write_batch_to_parquet_file.call_args.kwargs[
        "dataframe"
    ]

    # All rows were processed successfully.
    assert result is not None
    assert len(result) == 4

    # Every special character must round-trip unchanged into the output.
    for row, expected in zip(result.to_dict(orient="records"), tricky_values):
        assert row["text"] == expected
173+
174+
175+
def test_process_with_mixed_special_characters(stub_processor: SchemaTransformProcessor) -> None:
    """Exercise a single LLM output mixing quotes, apostrophes, and escape sequences."""
    tricky = 'She replied: "I\'m not sure about that\\nLet me think..."'
    frame = pd.DataFrame({"col1": [tricky], "col2": [42]})

    stub_processor.process(frame, current_batch_number=0)
    result: pd.DataFrame = stub_processor.artifact_storage.write_batch_to_parquet_file.call_args.kwargs[
        "dataframe"
    ]

    # Exactly one row, with the mixed-character text preserved verbatim.
    assert len(result) == 1
    assert result.iloc[0].to_dict()["text"] == tricky

0 commit comments

Comments (0)