@@ -83,7 +83,11 @@ def _format_to_prefilter_regex(spark_format: str) -> str:
8383 while idx < len (spark_format ):
8484 if spark_format [idx ] == "'" :
8585 end_idx = spark_format .find ("'" , idx + 1 )
86- literal = spark_format [idx + 1 :] if end_idx == - 1 else spark_format [idx + 1 : end_idx ]
86+ literal = (
87+ spark_format [idx + 1 :]
88+ if end_idx == - 1
89+ else spark_format [idx + 1 : end_idx ]
90+ )
8791 parts .append (re .escape (literal ))
8892 idx = len (spark_format ) if end_idx == - 1 else end_idx + 1
8993 continue
@@ -128,7 +132,9 @@ def safe_to_timestamp(
128132 if spark is not None :
129133 from tablespec .session import get_capabilities
130134
131- can_use_try_with_format = get_capabilities (spark )["try_to_timestamp_with_format" ]
135+ can_use_try_with_format = get_capabilities (spark )[
136+ "try_to_timestamp_with_format"
137+ ]
132138
133139 if can_use_try_with_format :
134140 return F .try_to_timestamp (column , F .lit (spark_format )) # type: ignore[attr-defined]
@@ -146,7 +152,9 @@ def safe_to_date(
146152 spark : object | None = None ,
147153) -> Column :
148154 """Compatibility wrapper that delegates to ``safe_to_timestamp`` then casts to date."""
149- return safe_to_timestamp (column , spark_format = spark_format , spark = spark ).cast ("date" )
155+ return safe_to_timestamp (column , spark_format = spark_format , spark = spark ).cast (
156+ "date"
157+ )
150158
151159
152160def build_flexible_formats (
@@ -174,7 +182,9 @@ def build_flexible_formats(
174182 return []
175183
176184 # Get formats from SUPPORTED_DATE_FORMATS (explicit UMF formats)
177- supported = [fmt .umf_format for fmt in SUPPORTED_DATE_FORMATS if fmt .format_type in allowed ]
185+ supported = [
186+ fmt .umf_format for fmt in SUPPORTED_DATE_FORMATS if fmt .format_type in allowed
187+ ]
178188
179189 seen : set [str ] = set ()
180190 ordered : list [str ] = []
@@ -311,6 +321,77 @@ def convert_umf_format_to_spark(umf_format: str) -> str:
311321 return result
312322
313323
324+ def cast_column_sql (
325+ column : str ,
326+ target_type : str ,
327+ format : str | None = None ,
328+ * ,
329+ precision : int | None = None ,
330+ scale : int | None = None ,
331+ ) -> str :
332+ """Return a Spark SQL expression that casts *column* to *target_type*.
333+
334+ SQL counterpart of :func:`cast_column_with_format`: it emits the same casting
335+ logic as a plain Spark SQL string so it can be embedded in a committed,
336+ independently-runnable ingest artifact -- no PySpark at runtime. Both functions
337+ share :func:`convert_umf_format_to_spark`, so the date/timestamp formats are
338+ guaranteed identical.
339+
340+ Args:
341+ ----
342+ column: Raw column reference (assumed to be a valid SQL identifier).
343+ target_type: UMF/Spark target type (DATE, TIMESTAMP, INTEGER, DECIMAL, ...).
344+ format: Optional UMF date/timestamp format (e.g. "YYYYMMDD").
345+ precision: DECIMAL precision (defaults to 10, matching the runtime caster).
346+ scale: DECIMAL scale (defaults to 2, matching the runtime caster).
347+
348+ Returns:
349+ -------
350+ A Spark SQL expression string,
351+ e.g. ``cast(try_to_timestamp(d, 'yyyyMMdd') as date)``.
352+
353+ Examples:
354+ --------
355+ >>> cast_column_sql("birth_date", "DATE", "MM/DD/YYYY")
356+ "cast(try_to_timestamp(birth_date, 'MM/dd/yyyy') as date)"
357+ >>> cast_column_sql("age", "INTEGER")
358+ "cast(nullif(trim(regexp_replace(age, '^\\ \\ $', '')), '') as INT)"
359+
360+ """
361+ t = target_type .upper ()
362+
363+ # String types: raw landing data is already a string -- passthrough.
364+ if t in ("STRING" , "VARCHAR" , "TEXT" , "CHAR" ):
365+ return column
366+
367+ # Numerics: strip a leading "$", trim, and treat empty/whitespace strings as
368+ # NULL (Spark's cast fails on "") before casting -- mirrors cast_column_with_format.
369+ if t in ("INTEGER" , "DECIMAL" , "DOUBLE" , "FLOAT" ):
370+ cleaned = f"nullif(trim(regexp_replace({ column } , '^\\ \\ $', '')), '')"
371+ if t == "INTEGER" :
372+ sql_type = "INT"
373+ elif t == "DECIMAL" :
374+ sql_type = f"DECIMAL({ precision or 10 } ,{ scale if scale is not None else 2 } )"
375+ else : # DOUBLE, FLOAT -> double (runtime maps FLOAT to DoubleType)
376+ sql_type = "DOUBLE"
377+ return f"cast({ cleaned } as { sql_type } )"
378+
379+ # Date/time: try_to_timestamp yields graceful NULL-on-failure (Spark 4.0+).
380+ if t in ("DATE" , "DATETIME" , "TIMESTAMP" ):
381+ if format :
382+ spark_format = convert_umf_format_to_spark (format )
383+ expr = f"try_to_timestamp({ column } , '{ spark_format } ')"
384+ else :
385+ expr = f"try_to_timestamp({ column } )"
386+ return f"cast({ expr } as date)" if t == "DATE" else expr
387+
388+ if t == "BOOLEAN" :
389+ return f"cast({ column } as boolean)"
390+
391+ msg = f"Unsupported target_type for SQL cast: { target_type } "
392+ raise ValueError (msg )
393+
394+
314395def cast_column_with_format (
315396 column : Column ,
316397 target_type : str ,
@@ -371,7 +452,9 @@ def cast_column_with_format(
371452 # Strip leading currency symbol ($) and trim whitespace
372453 column = F .regexp_replace (F .trim (column ), r"^\$" , "" )
373454 # Convert empty/whitespace-only strings to NULL (Spark cast fails on empty strings)
374- column = F .when (F .trim (column ) == "" , F .lit (None ).cast (StringType ())).otherwise (column )
455+ column = F .when (F .trim (column ) == "" , F .lit (None ).cast (StringType ())).otherwise (
456+ column
457+ )
375458
376459 # STRING type - no casting needed, already string
377460 if target_type_upper == "STRING" :
@@ -819,7 +902,9 @@ def cast_timestamp_with_flexible_fallback(
819902 # Detect if value looks like epoch milliseconds (check before normalization affects it)
820903 scientific_pattern = r"^[0-9]+\.?[0-9]*[Ee][+\-]?[0-9]+$"
821904 large_number_pattern = r"^[0-9]{12,}$"
822- is_epoch = trimmed_col .rlike (scientific_pattern ) | trimmed_col .rlike (large_number_pattern )
905+ is_epoch = trimmed_col .rlike (scientific_pattern ) | trimmed_col .rlike (
906+ large_number_pattern
907+ )
823908
824909 # Convert epoch ms to timestamp
825910 epoch_seconds = trimmed_col .cast ("double" ) / 1000
0 commit comments