# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
33
4- from abc import ABC
4+ from abc import ABC , abstractmethod
55from typing import Annotated , Literal
66
77from pydantic import BaseModel , Discriminator , Field , model_validator
1313from data_designer .config .sampler_params import SamplerParamsT , SamplerType
1414from data_designer .config .utils .code_lang import CodeLang
1515from data_designer .config .utils .constants import REASONING_TRACE_COLUMN_POSTFIX
16- from data_designer .config .utils .misc import assert_valid_jinja2_template , get_prompt_template_keywords
16+ from data_designer .config .utils .misc import assert_valid_jinja2_template , extract_keywords_from_jinja2_template
1717from data_designer .config .validator_params import ValidatorParamsT , ValidatorType
1818
1919
@@ -35,17 +35,22 @@ class SingleColumnConfig(ConfigBase, ABC):
3535 drop : bool = False
3636 column_type : str
3737
@staticmethod
def get_column_emoji() -> str:
    """Return the default emoji string for a column configuration.

    Returns:
        The fixed emoji "🎨".
    """
    return "🎨"
41+
@property
@abstractmethod
def required_columns(self) -> list[str]:
    """Returns a list of column names that must exist before this column can be generated.

    This property is abstract: it has no default implementation, so every
    concrete subclass must define it.

    Returns:
        List of column names that this column depends on. Empty list indicates
        no dependencies.
    """
4751
@property
@abstractmethod
def side_effect_columns(self) -> list[str]:
    """Returns a list of additional columns that this column will create as a side effect.

    This property is abstract: it has no default implementation, so every
    concrete subclass must define it.

    Returns:
        List of column names that this column will create as a side effect. Empty list
        indicates no side effect columns.
    """
6064
6165
6266class SamplerColumnConfig (SingleColumnConfig ):
@@ -94,6 +98,18 @@ class SamplerColumnConfig(SingleColumnConfig):
9498 convert_to : str | None = None
9599 column_type : Literal ["sampler" ] = "sampler"
96100
@staticmethod
def get_column_emoji() -> str:
    """Return the emoji string for sampler columns.

    Returns:
        The fixed emoji "🎲".
    """
    return "🎲"
104+
@property
def required_columns(self) -> list[str]:
    """Return the columns this sampler column depends on.

    Returns:
        Always an empty list: no required columns are declared.
    """
    return []
108+
@property
def side_effect_columns(self) -> list[str]:
    """Return the extra columns this sampler column creates as a side effect.

    Returns:
        Always an empty list: no side effect columns are declared.
    """
    return []
112+
97113 @model_validator (mode = "before" )
98114 @classmethod
99115 def inject_sampler_type_into_params (cls , data : dict ) -> dict :
@@ -150,16 +166,20 @@ class LLMTextColumnConfig(SingleColumnConfig):
150166 multi_modal_context : list [ImageContext ] | None = None
151167 column_type : Literal ["llm-text" ] = "llm-text"
152168
@staticmethod
def get_column_emoji() -> str:
    """Return the emoji string for LLM text columns.

    Returns:
        The fixed emoji "📝".
    """
    return "📝"
172+
@property
def required_columns(self) -> list[str]:
    """Get columns referenced in the prompt and system_prompt templates.

    Keywords are extracted from ``self.prompt`` and, when it is truthy, from
    ``self.system_prompt``. Duplicates are removed through a set, so the
    order of the returned list is unspecified.

    Returns:
        List of unique column names referenced in Jinja2 templates.
    """
    keywords = set(extract_keywords_from_jinja2_template(self.prompt))
    if self.system_prompt:
        keywords.update(extract_keywords_from_jinja2_template(self.system_prompt))
    return list(keywords)
164184
165185 @property
@@ -207,6 +227,10 @@ class LLMCodeColumnConfig(LLMTextColumnConfig):
207227 code_lang : CodeLang
208228 column_type : Literal ["llm-code" ] = "llm-code"
209229
@staticmethod
def get_column_emoji() -> str:
    """Return the emoji string for LLM code columns.

    Returns:
        The fixed emoji "💻".
    """
    return "💻"
233+
210234
211235class LLMStructuredColumnConfig (LLMTextColumnConfig ):
212236 """Configuration for structured JSON generation columns using Large Language Models.
@@ -225,6 +249,10 @@ class LLMStructuredColumnConfig(LLMTextColumnConfig):
225249 output_format : dict | type [BaseModel ]
226250 column_type : Literal ["llm-structured" ] = "llm-structured"
227251
@staticmethod
def get_column_emoji() -> str:
    """Return the emoji string for LLM structured-output columns.

    Returns:
        The fixed emoji "🗂️".
    """
    return "🗂️"
255+
228256 @model_validator (mode = "after" )
229257 def validate_output_format (self ) -> Self :
230258 """Convert Pydantic model to JSON schema if needed.
@@ -275,6 +303,10 @@ class LLMJudgeColumnConfig(LLMTextColumnConfig):
275303 scores : list [Score ] = Field (..., min_length = 1 )
276304 column_type : Literal ["llm-judge" ] = "llm-judge"
277305
@staticmethod
def get_column_emoji() -> str:
    """Return the emoji string for LLM judge columns.

    Returns:
        The fixed emoji "⚖️".
    """
    return "⚖️"
309+
278310
279311class ExpressionColumnConfig (SingleColumnConfig ):
280312 """Configuration for derived columns using Jinja2 expressions.
@@ -297,10 +329,18 @@ class ExpressionColumnConfig(SingleColumnConfig):
297329 dtype : Literal ["int" , "float" , "str" , "bool" ] = "str"
298330 column_type : Literal ["expression" ] = "expression"
299331
@staticmethod
def get_column_emoji() -> str:
    """Return the emoji string for expression columns.

    Returns:
        The fixed emoji "🧩".
    """
    return "🧩"
335+
@property
def required_columns(self) -> list[str]:
    """Returns the columns referenced in the expression template.

    Returns:
        List built from the keywords extracted from ``self.expr``.
    """
    return list(extract_keywords_from_jinja2_template(self.expr))
340+
@property
def side_effect_columns(self) -> list[str]:
    """Return the extra columns this expression column creates as a side effect.

    Returns:
        Always an empty list: no side effect columns are declared.
    """
    return []
304344
305345 @model_validator (mode = "after" )
306346 def assert_expression_valid_jinja (self ) -> Self :
@@ -359,11 +399,19 @@ class ValidationColumnConfig(SingleColumnConfig):
359399 batch_size : int = Field (default = 10 , ge = 1 , description = "Number of records to process in each batch" )
360400 column_type : Literal ["validation" ] = "validation"
361401
@staticmethod
def get_column_emoji() -> str:
    """Return the emoji string for validation columns.

    Returns:
        The fixed emoji "🔍".
    """
    return "🔍"
405+
@property
def required_columns(self) -> list[str]:
    """Returns the columns that need to be validated.

    Returns:
        A new list holding the entries of ``self.target_columns``. A copy is
        returned so that callers mutating the result cannot alter the
        configuration's own ``target_columns``.
    """
    return list(self.target_columns)
366410
@property
def side_effect_columns(self) -> list[str]:
    """Return the extra columns this validation column creates as a side effect.

    Returns:
        Always an empty list: no side effect columns are declared.
    """
    return []
414+
367415
368416class SeedDatasetColumnConfig (SingleColumnConfig ):
369417 """Configuration for columns sourced from seed datasets.
@@ -378,6 +426,18 @@ class SeedDatasetColumnConfig(SingleColumnConfig):
378426
379427 column_type : Literal ["seed-dataset" ] = "seed-dataset"
380428
@staticmethod
def get_column_emoji() -> str:
    """Return the emoji string for seed dataset columns.

    Returns:
        The fixed emoji "🌱".
    """
    return "🌱"
432+
@property
def required_columns(self) -> list[str]:
    """Return the columns this seed dataset column depends on.

    Returns:
        Always an empty list: no required columns are declared.
    """
    return []
436+
@property
def side_effect_columns(self) -> list[str]:
    """Return the extra columns this seed dataset column creates as a side effect.

    Returns:
        Always an empty list: no side effect columns are declared.
    """
    return []
440+
381441
382442class EmbeddingColumnConfig (SingleColumnConfig ):
383443 """Configuration for embedding generation columns.
@@ -395,6 +455,14 @@ class EmbeddingColumnConfig(SingleColumnConfig):
395455 model_alias : str
396456 column_type : Literal ["embedding" ] = "embedding"
397457
@staticmethod
def get_column_emoji() -> str:
    """Return the emoji string for embedding columns.

    Returns:
        The fixed emoji "🧬".
    """
    return "🧬"
461+
@property
def required_columns(self) -> list[str]:
    """Return the column this embedding column depends on.

    Returns:
        A single-element list containing ``self.target_column``.
    """
    return [self.target_column]
465+
@property
def side_effect_columns(self) -> list[str]:
    """Return the extra columns this embedding column creates as a side effect.

    Returns:
        Always an empty list: no side effect columns are declared.
    """
    return []
0 commit comments