From 4806819a477f5de24ef74565f9b6835544233195 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Wed, 10 Jun 2026 17:04:17 -0600 Subject: [PATCH 01/10] fix(spark): parse month, day without leading zeros --- sqlglot/dialects/spark.py | 8 ++++++++ sqlglot/generators/spark.py | 35 ++++++++++++++++++++++----------- sqlglot/generators/spark2.py | 4 ++-- tests/dialects/test_dialect.py | 4 ++-- tests/dialects/test_teradata.py | 4 ++-- tests/dialects/test_tsql.py | 2 +- 6 files changed, 39 insertions(+), 18 deletions(-) diff --git a/sqlglot/dialects/spark.py b/sqlglot/dialects/spark.py index 0cc9ef7d07..5ea11f6f84 100644 --- a/sqlglot/dialects/spark.py +++ b/sqlglot/dialects/spark.py @@ -6,6 +6,7 @@ from sqlglot.generators.spark import SparkGenerator from sqlglot.parsers.spark import SparkParser from sqlglot.tokens import TokenType +from sqlglot.trie import new_trie from sqlglot.typing.spark import EXPRESSION_METADATA @@ -16,6 +17,13 @@ class Spark(Spark2): ARRAY_FUNCS_PROPAGATES_NULLS = True EXPRESSION_METADATA = EXPRESSION_METADATA.copy() + LENIENT_INVERSE_TIME_MAPPING = {v: k for k, v in Spark2.TIME_MAPPING.items()} | { + # Parse zero-padded months and days, as per strptime() behavior. + "%m": "M", + "%d": "d", + } + LENIENT_INVERSE_TIME_TRIE = new_trie(LENIENT_INVERSE_TIME_MAPPING) + class Tokenizer(Spark2.Tokenizer): STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False diff --git a/sqlglot/generators/spark.py b/sqlglot/generators/spark.py index 206eced7b9..135b9fb498 100644 --- a/sqlglot/generators/spark.py +++ b/sqlglot/generators/spark.py @@ -11,6 +11,7 @@ date_delta_to_binary_interval_op, groupconcat_sql, ) +from sqlglot.generators.hive import HIVE_DATE_FORMAT from sqlglot.generators.spark2 import Spark2Generator, temporary_storage_provider from sqlglot.helper import seq_get from sqlglot.transforms import ( @@ -21,6 +22,17 @@ ) +def _groupconcat_sql(self: SparkGenerator, expression: exp.GroupConcat) -> str: + if self.dialect.version < (4,): + expr = exp.ArrayToString( + this=exp.ArrayAgg(this=expression.this), + expression=expression.args.get("separator") or exp.Literal.string(""), + ) + return self.sql(expr) + + return groupconcat_sql(self, expression) + + def _normalize_partition(e: exp.Expr) -> exp.Expr: """Normalize the expressions in PARTITION BY (, , ...)""" if isinstance(e, str): @@ -30,6 +42,17 @@ def _normalize_partition(e: exp.Expr) -> exp.Expr: return e +def _str_to_date_sql(self: SparkGenerator, expression: exp.StrToDate) -> str: + time_format = self.format_time( + expression, + self.dialect.LENIENT_INVERSE_TIME_MAPPING, + self.dialect.LENIENT_INVERSE_TIME_TRIE, + ) + if time_format == HIVE_DATE_FORMAT: + return self.func("TO_DATE", expression.this) + return self.func("TO_DATE", expression.this, time_format) + + def _dateadd_sql(self: SparkGenerator, expression: exp.TsOrDsAdd | exp.TimestampAdd) -> str: if not expression.unit or ( isinstance(expression, exp.TsOrDsAdd) and expression.text("unit").upper() == "DAY" @@ -54,17 +77,6 @@ def _dateadd_sql(self: SparkGenerator, expression: exp.TsOrDsAdd | exp.Timestamp return this -def _groupconcat_sql(self: SparkGenerator, expression: exp.GroupConcat) -> str: - if self.dialect.version < (4,): - expr = exp.ArrayToString( - this=exp.ArrayAgg(this=expression.this), - expression=expression.args.get("separator") or exp.Literal.string(""), - ) - return self.sql(expr) - - return groupconcat_sql(self, expression) - - class SparkGenerator(Spark2Generator): SUPPORTS_TO_NUMBER = True PAD_FILL_PATTERN_IS_REQUIRED = False @@ -129,6 +141,7 @@ class SparkGenerator(Spark2Generator): exp.SafeMultiply: rename_func("TRY_MULTIPLY"), exp.SafeSubtract: rename_func("TRY_SUBTRACT"), exp.StartsWith: rename_func("STARTSWITH"), + exp.StrToDate: _str_to_date_sql, exp.TimeAdd: date_delta_to_binary_interval_op(cast=False), exp.TimeSub: date_delta_to_binary_interval_op(cast=False), exp.TsOrDsAdd: _dateadd_sql, diff --git a/sqlglot/generators/spark2.py b/sqlglot/generators/spark2.py index 4ba6e6c92a..7e5e989b57 100644 --- a/sqlglot/generators/spark2.py +++ b/sqlglot/generators/spark2.py @@ -27,7 +27,7 @@ def _map_sql(self: Spark2Generator, expression: exp.Map) -> str: return self.func("MAP_FROM_ARRAYS", keys, values) -def _str_to_date(self: Spark2Generator, expression: exp.StrToDate) -> str: +def _str_to_date_sql(self: Spark2Generator, expression: exp.StrToDate) -> str: time_format = self.format_time(expression) if time_format == HIVE_DATE_FORMAT: return self.func("TO_DATE", expression.this) @@ -184,7 +184,7 @@ class Spark2Generator(HiveGenerator): exp.SHA2Digest: lambda self, e: self.func( "SHA2", e.this, e.args.get("length") or exp.Literal.number(256) ), - exp.StrToDate: _str_to_date, + exp.StrToDate: _str_to_date_sql, exp.StrToTime: lambda self, e: self.func("TO_TIMESTAMP", e.this, self.format_time(e)), exp.TimestampTrunc: lambda self, e: self.func("DATE_TRUNC", unit_to_str(e), e.this), exp.UnixToTime: _unix_to_time_sql, diff --git a/tests/dialects/test_dialect.py b/tests/dialects/test_dialect.py index 6a0abcae06..1f78097a23 100644 --- a/tests/dialects/test_dialect.py +++ b/tests/dialects/test_dialect.py @@ -1219,7 +1219,7 @@ def test_time(self): "starrocks": "STR_TO_DATE(x, '%Y-%m-%dT%T')", "hive": "CAST(FROM_UNIXTIME(UNIX_TIMESTAMP(x, 'yyyy-MM-ddTHH:mm:ss')) AS DATE)", "presto": "CAST(DATE_PARSE(x, '%Y-%m-%dT%T') AS DATE)", - "spark": "TO_DATE(x, 'yyyy-MM-ddTHH:mm:ss')", + "spark": "TO_DATE(x, 'yyyy-M-dTHH:mm:ss')", "doris": "STR_TO_DATE(x, '%Y-%m-%dT%T')", }, ) @@ -1231,7 +1231,7 @@ def test_time(self): "starrocks": "STR_TO_DATE(x, '%Y-%m-%d')", "hive": "CAST(x AS DATE)", "presto": "CAST(DATE_PARSE(x, '%Y-%m-%d') AS DATE)", - "spark": "TO_DATE(x)", + "spark": "TO_DATE(x, 'yyyy-M-d')", "doris": "STR_TO_DATE(x, '%Y-%m-%d')", }, ) diff --git a/tests/dialects/test_teradata.py b/tests/dialects/test_teradata.py index cf0e32478c..2cf8b1d0d8 100644 --- a/tests/dialects/test_teradata.py +++ b/tests/dialects/test_teradata.py @@ -233,9 +233,9 @@ def test_cast(self): write={ "teradata": "CAST('1992-01' AS DATE FORMAT 'YYYY-DD')", "bigquery": "PARSE_DATE('%Y-%d', '1992-01')", - "databricks": "TO_DATE('1992-01', 'yyyy-dd')", + "databricks": "TO_DATE('1992-01', 'yyyy-d')", "mysql": "STR_TO_DATE('1992-01', '%Y-%d')", - "spark": "TO_DATE('1992-01', 'yyyy-dd')", + "spark": "TO_DATE('1992-01', 'yyyy-d')", "": "STR_TO_DATE('1992-01', '%Y-%d')", }, ) diff --git a/tests/dialects/test_tsql.py b/tests/dialects/test_tsql.py index d6040dac30..9a25008ec5 100644 --- a/tests/dialects/test_tsql.py +++ b/tests/dialects/test_tsql.py @@ -1749,7 +1749,7 @@ def test_convert(self): self.validate_all( "CONVERT(DATE, x, 121)", write={ - "spark": "TO_DATE(x, 'yyyy-MM-dd HH:mm:ss.SSSSSS')", + "spark": "TO_DATE(x, 'yyyy-M-d HH:mm:ss.SSSSSS')", "tsql": "CONVERT(DATE, x, 121)", }, ) From 4c2e16b7e02b7598281989d9804117a411a75167 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Thu, 11 Jun 2026 00:31:56 -0600 Subject: [PATCH 02/10] fix(spark): also fix generation of `exp.StrToTime` --- sqlglot/generators/spark.py | 23 ++++++++++++----------- tests/dialects/test_dialect.py | 4 ++-- tests/dialects/test_presto.py | 8 ++++---- tests/dialects/test_snowflake.py | 4 ++-- tests/dialects/test_spark.py | 10 +++++----- tests/dialects/test_tsql.py | 4 ++-- 6 files changed, 27 insertions(+), 26 deletions(-) diff --git a/sqlglot/generators/spark.py b/sqlglot/generators/spark.py index 135b9fb498..53ac66d5fe 100644 --- a/sqlglot/generators/spark.py +++ b/sqlglot/generators/spark.py @@ -11,7 +11,6 @@ date_delta_to_binary_interval_op, groupconcat_sql, ) -from sqlglot.generators.hive import HIVE_DATE_FORMAT from sqlglot.generators.spark2 import Spark2Generator, temporary_storage_provider from sqlglot.helper import seq_get from sqlglot.transforms import ( @@ -42,15 +41,16 @@ def _normalize_partition(e: exp.Expr) -> exp.Expr: return e -def _str_to_date_sql(self: SparkGenerator, expression: exp.StrToDate) -> str: - time_format = self.format_time( - expression, - self.dialect.LENIENT_INVERSE_TIME_MAPPING, - self.dialect.LENIENT_INVERSE_TIME_TRIE, - ) - if time_format == HIVE_DATE_FORMAT: - return self.func("TO_DATE", expression.this) - return self.func("TO_DATE", expression.this, time_format) +def _str_to_datetime_sql(self: SparkGenerator, expression: exp.StrToDate | exp.StrToTime) -> str: + return self.func( + f"TO_{'DATE' if isinstance(expression, exp.StrToDate) else 'TIMESTAMP'}", + expression.this, + self.format_time( + expression, + self.dialect.LENIENT_INVERSE_TIME_MAPPING, + self.dialect.LENIENT_INVERSE_TIME_TRIE, + ), + ) def _dateadd_sql(self: SparkGenerator, expression: exp.TsOrDsAdd | exp.TimestampAdd) -> str: @@ -141,7 +141,8 @@ class SparkGenerator(Spark2Generator): exp.SafeMultiply: rename_func("TRY_MULTIPLY"), exp.SafeSubtract: rename_func("TRY_SUBTRACT"), exp.StartsWith: rename_func("STARTSWITH"), - exp.StrToDate: _str_to_date_sql, + exp.StrToDate: _str_to_datetime_sql, + exp.StrToTime: _str_to_datetime_sql, exp.TimeAdd: date_delta_to_binary_interval_op(cast=False), exp.TimeSub: date_delta_to_binary_interval_op(cast=False), exp.TsOrDsAdd: _dateadd_sql, diff --git a/tests/dialects/test_dialect.py b/tests/dialects/test_dialect.py index 1f78097a23..90b5e80fa7 100644 --- a/tests/dialects/test_dialect.py +++ b/tests/dialects/test_dialect.py @@ -763,7 +763,7 @@ def test_time(self): "presto": "DATE_PARSE(x, '%Y-%m-%dT%T')", "drill": "TO_TIMESTAMP(x, 'yyyy-MM-dd''T''HH:mm:ss')", "redshift": "TO_TIMESTAMP(x, 'YYYY-MM-DDTHH24:MI:SS')", - "spark": "TO_TIMESTAMP(x, 'yyyy-MM-ddTHH:mm:ss')", + "spark": "TO_TIMESTAMP(x, 'yyyy-M-dTHH:mm:ss')", }, ) self.validate_all( @@ -776,7 +776,7 @@ def test_time(self): "postgres": "TO_TIMESTAMP('2020-01-01', 'YYYY-MM-DD')", "presto": "DATE_PARSE('2020-01-01', '%Y-%m-%d')", "redshift": "TO_TIMESTAMP('2020-01-01', 'YYYY-MM-DD')", - "spark": "TO_TIMESTAMP('2020-01-01', 'yyyy-MM-dd')", + "spark": "TO_TIMESTAMP('2020-01-01', 'yyyy-M-d')", }, ) self.validate_all( diff --git a/tests/dialects/test_presto.py b/tests/dialects/test_presto.py index 3aa75f2e82..152c569103 100644 --- a/tests/dialects/test_presto.py +++ b/tests/dialects/test_presto.py @@ -306,7 +306,7 @@ def test_time(self): "duckdb": "STRPTIME(x, '%Y-%m-%d %H:%M:%S')", "presto": "DATE_PARSE(x, '%Y-%m-%d %T')", "hive": "CAST(x AS TIMESTAMP)", - "spark": "TO_TIMESTAMP(x, 'yyyy-MM-dd HH:mm:ss')", + "spark": "TO_TIMESTAMP(x, 'yyyy-M-d HH:mm:ss')", }, ) self.validate_all( @@ -315,7 +315,7 @@ def test_time(self): "duckdb": "STRPTIME(x, '%Y-%m-%d')", "presto": "DATE_PARSE(x, '%Y-%m-%d')", "hive": "CAST(x AS TIMESTAMP)", - "spark": "TO_TIMESTAMP(x, 'yyyy-MM-dd')", + "spark": "TO_TIMESTAMP(x, 'yyyy-M-d')", }, ) self.validate_all( @@ -330,7 +330,7 @@ def test_time(self): "duckdb": "STRPTIME(SUBSTRING(x, 1, 10), '%Y-%m-%d')", "presto": "DATE_PARSE(SUBSTR(x, 1, 10), '%Y-%m-%d')", "hive": "CAST(SUBSTRING(x, 1, 10) AS TIMESTAMP)", - "spark": "TO_TIMESTAMP(SUBSTRING(x, 1, 10), 'yyyy-MM-dd')", + "spark": "TO_TIMESTAMP(SUBSTRING(x, 1, 10), 'yyyy-M-d')", }, ) self.validate_all( @@ -339,7 +339,7 @@ def test_time(self): "duckdb": "STRPTIME(SUBSTRING(x, 1, 10), '%Y-%m-%d')", "presto": "DATE_PARSE(SUBSTR(x, 1, 10), '%Y-%m-%d')", "hive": "CAST(SUBSTRING(x, 1, 10) AS TIMESTAMP)", - "spark": "TO_TIMESTAMP(SUBSTRING(x, 1, 10), 'yyyy-MM-dd')", + "spark": "TO_TIMESTAMP(SUBSTRING(x, 1, 10), 'yyyy-M-d')", }, ) self.validate_all( diff --git a/tests/dialects/test_snowflake.py b/tests/dialects/test_snowflake.py index 8eccf066cf..c33b832049 100644 --- a/tests/dialects/test_snowflake.py +++ b/tests/dialects/test_snowflake.py @@ -1839,7 +1839,7 @@ def test_snowflake(self): "bigquery": "SELECT PARSE_TIMESTAMP('%d-%m-%Y %I:%M:%S', col) FROM t", "duckdb": "SELECT STRPTIME(col, '%d-%m-%Y %I:%M:%S') FROM t", "snowflake": "SELECT TO_TIMESTAMP(col, 'DD-mm-yyyy hh12:mi:ss') FROM t", - "spark": "SELECT TO_TIMESTAMP(col, 'dd-MM-yyyy hh:mm:ss') FROM t", + "spark": "SELECT TO_TIMESTAMP(col, 'd-M-yyyy hh:mm:ss') FROM t", }, ) self.validate_all( @@ -1904,7 +1904,7 @@ def test_snowflake(self): write={ "bigquery": "SELECT PARSE_TIMESTAMP('%m/%d/%Y %T', '04/05/2013 01:02:03')", "snowflake": "SELECT TO_TIMESTAMP('04/05/2013 01:02:03', 'mm/DD/yyyy hh24:mi:ss')", - "spark": "SELECT TO_TIMESTAMP('04/05/2013 01:02:03', 'MM/dd/yyyy HH:mm:ss')", + "spark": "SELECT TO_TIMESTAMP('04/05/2013 01:02:03', 'M/d/yyyy HH:mm:ss')", }, ) self.validate_all( diff --git a/tests/dialects/test_spark.py b/tests/dialects/test_spark.py index 8d0b890211..e91a9f9a57 100644 --- a/tests/dialects/test_spark.py +++ b/tests/dialects/test_spark.py @@ -659,14 +659,14 @@ def test_spark(self): }, ) self.validate_all( - "SELECT TO_TIMESTAMP('2016-12-31', 'yyyy-MM-dd')", + "SELECT TO_TIMESTAMP('2016-1-1', 'yyyy-M-d')", read={ - "duckdb": "SELECT STRPTIME('2016-12-31', '%Y-%m-%d')", + "duckdb": "SELECT STRPTIME('2016-1-1', '%Y-%m-%d')", }, write={ - "": "SELECT STR_TO_TIME('2016-12-31', '%Y-%m-%d')", - "duckdb": "SELECT STRPTIME('2016-12-31', '%Y-%m-%d')", - "spark": "SELECT TO_TIMESTAMP('2016-12-31', 'yyyy-MM-dd')", + "": "SELECT STR_TO_TIME('2016-1-1', '%Y-%-m-%-d')", + "duckdb": "SELECT STRPTIME('2016-1-1', '%Y-%-m-%-d')", + "spark": "SELECT TO_TIMESTAMP('2016-1-1', 'yyyy-M-d')", }, ) self.validate_all( diff --git a/tests/dialects/test_tsql.py b/tests/dialects/test_tsql.py index 9a25008ec5..ee0ad38f87 100644 --- a/tests/dialects/test_tsql.py +++ b/tests/dialects/test_tsql.py @@ -1756,14 +1756,14 @@ def test_convert(self): self.validate_all( "CONVERT(DATETIME, x, 121)", write={ - "spark": "TO_TIMESTAMP(x, 'yyyy-MM-dd HH:mm:ss.SSSSSS')", + "spark": "TO_TIMESTAMP(x, 'yyyy-M-d HH:mm:ss.SSSSSS')", "tsql": "CONVERT(DATETIME, x, 121)", }, ) self.validate_all( "CONVERT(DATETIME2, x, 121)", write={ - "spark": "TO_TIMESTAMP(x, 'yyyy-MM-dd HH:mm:ss.SSSSSS')", + "spark": "TO_TIMESTAMP(x, 'yyyy-M-d HH:mm:ss.SSSSSS')", "tsql": "CONVERT(DATETIME2, x, 121)", }, ) From af89ba3b50ee8d4798afa804a98fd463d655b3a4 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Thu, 11 Jun 2026 09:28:40 -0600 Subject: [PATCH 03/10] chore(spark): narrow type to fix mypy attr-defined --- sqlglot/generators/spark.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sqlglot/generators/spark.py b/sqlglot/generators/spark.py index 53ac66d5fe..991736de14 100644 --- a/sqlglot/generators/spark.py +++ b/sqlglot/generators/spark.py @@ -42,6 +42,9 @@ def _normalize_partition(e: exp.Expr) -> exp.Expr: def _str_to_datetime_sql(self: SparkGenerator, expression: exp.StrToDate | exp.StrToTime) -> str: + from sqlglot.dialects.spark import Spark + + assert isinstance(self.dialect, Spark) return self.func( f"TO_{'DATE' if isinstance(expression, exp.StrToDate) else 'TIMESTAMP'}", expression.this, From bc598cd27e5fd84179b8d8f3eced439d08b54fa5 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Sat, 20 Jun 2026 11:09:40 -0600 Subject: [PATCH 04/10] fix(spark): parse strict month/day via distinct canonical format Spark 3+ parses MM/dd strictly (single-digit months/days don't parse), unlike the lax %m/%d most dialects produce. Map Spark's MM/dd to a distinct canonical token (%mstrict/%dstrict) only when parsing, so the strict format roundtrips while lax %m/%d still becomes the lenient M/d. Formatting keeps the padded %m/%d -> MM/dd. The internal tokens degrade to %m/%d for every other dialect via the metaclass inverse fallback and the generic strtotime_sql. Assisted-by: Claude Opus 4.8 --- sqlglot/dialects/dialect.py | 44 ++++++++++++++++++++++---- sqlglot/dialects/spark.py | 15 +++++++-- sqlglot/generator.py | 10 ++++-- sqlglot/generators/spark.py | 59 ++++++++++++++++++----------------- sqlglot/generators/spark2.py | 4 +-- sqlglot/time.py | 7 +++++ tests/dialects/test_exasol.py | 4 +-- tests/dialects/test_spark.py | 13 ++++++++ 8 files changed, 114 insertions(+), 42 deletions(-) diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py index 3ab40d673c..4f8946ba7b 100644 --- a/sqlglot/dialects/dialect.py +++ b/sqlglot/dialects/dialect.py @@ -26,7 +26,7 @@ from sqlglot.jsonpath import ALL_JSON_PATH_PARTS, JSONPathTokenizer, parse as parse_json_path from sqlglot.parser import Parser from sqlglot.parsers.base import BaseParser -from sqlglot.time import TIMEZONES, format_time, subsecond_precision +from sqlglot.time import STRICT_TIME_FORMATS, TIMEZONES, format_time, subsecond_precision from sqlglot.tokens import Token, Tokenizer, TokenType from sqlglot.trie import new_trie from sqlglot.typing import EXPRESSION_METADATA @@ -232,14 +232,22 @@ def __new__(cls, clsname, bases, attrs): cls._classes[enum.value if enum is not None else clsname.lower()] = klass klass.TIME_TRIE = new_trie(klass.TIME_MAPPING) + klass.STRICT_TIME_TRIE = new_trie(klass.STRICT_TIME_MAPPING) klass.FORMAT_TRIE = ( new_trie(klass.FORMAT_MAPPING) if klass.FORMAT_MAPPING else klass.TIME_TRIE ) # Merge class-defined INVERSE_TIME_MAPPING with auto-generated mappings # This allows dialects to define custom inverse mappings for roundtrip correctness - klass.INVERSE_TIME_MAPPING = {v: k for k, v in klass.TIME_MAPPING.items()} | ( + inverse_time_mapping = {v: k for k, v in klass.TIME_MAPPING.items()} | ( klass.__dict__.get("INVERSE_TIME_MAPPING") or {} ) + # Dialects that define a "strict" format (e.g. Spark) keep their own mapping; + # everyone else degrades it to the lax counterpart's mapping. + for strict_format, lax_format in STRICT_TIME_FORMATS.items(): + inverse_time_mapping.setdefault( + strict_format, inverse_time_mapping.get(lax_format, lax_format) + ) + klass.INVERSE_TIME_MAPPING = inverse_time_mapping klass.INVERSE_TIME_TRIE = new_trie(klass.INVERSE_TIME_MAPPING) klass.INVERSE_FORMAT_MAPPING = {v: k for k, v in klass.FORMAT_MAPPING.items()} klass.INVERSE_FORMAT_TRIE = new_trie(klass.INVERSE_FORMAT_MAPPING) @@ -412,6 +420,13 @@ class Dialect(metaclass=_Dialect): TIME_MAPPING: dict[str, str] = {} """Associates this dialect's time formats with their equivalent Python `strftime` formats.""" + STRICT_TIME_MAPPING: dict[str, str] = {} + """ + Variant of `TIME_MAPPING` used when *parsing* a string with a format (e.g. `StrToTime`). + Lets dialects with strict parsing (e.g. Spark 3+'s zero-padded `MM`/`dd`) map those to a + distinct canonical format, preserving the roundtrip. Empty means `TIME_MAPPING` is used. + """ + # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_model_rules_date_time # https://docs.teradata.com/r/Teradata-Database-SQL-Functions-Operators-Exprs-and-Predicates/March-2017/Data-Type-Conversions/Character-to-DATE-Conversion/Forcing-a-FORMAT-on-CAST-for-Converting-Character-to-DATE FORMAT_MAPPING: dict[str, str] = {} @@ -770,6 +785,7 @@ class Dialect(metaclass=_Dialect): # A trie of the time_mapping keys TIME_TRIE: dict = {} + STRICT_TIME_TRIE: dict = {} FORMAT_TRIE: dict = {} INVERSE_TIME_MAPPING: dict[str, str] = {} @@ -966,16 +982,23 @@ def get_or_raise(cls, dialect: DialectType) -> Dialect: raise ValueError(f"Invalid dialect type for '{dialect}': '{type(dialect)}'.") @classmethod - def format_time(cls, expression: str | exp.Expr | None) -> exp.Expr | None: + def format_time( + cls, expression: str | exp.Expr | None, strict: bool = False + ) -> exp.Expr | None: """Converts a time format in this dialect to its equivalent Python `strftime` format.""" + if strict and cls.STRICT_TIME_MAPPING: + mapping, trie = cls.STRICT_TIME_MAPPING, cls.STRICT_TIME_TRIE + else: + mapping, trie = cls.TIME_MAPPING, cls.TIME_TRIE + if isinstance(expression, str): return exp.Literal.string( # the time formats are quoted - format_time(expression[1:-1], cls.TIME_MAPPING, cls.TIME_TRIE) + format_time(expression[1:-1], mapping, trie) ) if expression and expression.is_string: - return exp.Literal.string(format_time(expression.this, cls.TIME_MAPPING, cls.TIME_TRIE)) + return exp.Literal.string(format_time(expression.this, mapping, trie)) return expression @@ -1544,6 +1567,12 @@ def months_between_sql(self: Generator, expression: exp.MonthsBetween) -> str: return self.sql(result) +# Expressions that parse a string with a format. Dialects with strict parsing +# semantics (STRICT_TIME_MAPPING) use it for these, but not for formatting. Must stay in +# sync with the generator's lenient set (e.g. SparkGenerator.LENIENT_TIME_EXPRESSIONS). +STRICT_PARSE_TIME_EXPRESSIONS = (exp.StrToTime, exp.StrToDate, exp.TsOrDsToDate) + + def build_formatted_time( exp_class: Type[E], dialect_override: str | None = None, default: bool | str | None = None ) -> t.Callable[[BuilderArgs, Dialect], E]: @@ -1569,7 +1598,10 @@ def _builder(args: BuilderArgs, dialect: Dialect) -> E: if not fmt: fmt = target_dialect.TIME_FORMAT if default is True else default or None - return exp_class(this=seq_get(args, 0), format=target_dialect.format_time(fmt)) + strict = exp_class in STRICT_PARSE_TIME_EXPRESSIONS + return exp_class( + this=seq_get(args, 0), format=target_dialect.format_time(fmt, strict=strict) + ) return _builder diff --git a/sqlglot/dialects/spark.py b/sqlglot/dialects/spark.py index 5ea11f6f84..322d78db43 100644 --- a/sqlglot/dialects/spark.py +++ b/sqlglot/dialects/spark.py @@ -17,8 +17,19 @@ class Spark(Spark2): ARRAY_FUNCS_PROPAGATES_NULLS = True EXPRESSION_METADATA = EXPRESSION_METADATA.copy() - LENIENT_INVERSE_TIME_MAPPING = {v: k for k, v in Spark2.TIME_MAPPING.items()} | { - # Parse zero-padded months and days, as per strptime() behavior. + # Spark 3+ parses MM/dd strictly (single-digit months/days don't parse), unlike the + # lax %m/%d other dialects produce. When *parsing* (StrToTime/StrToDate/...), MM/dd + # map to a distinct canonical token so the strict roundtrip is preserved; formatting + # keeps the regular padded %m/%d -> MM/dd (TIME_MAPPING is unchanged). + STRICT_TIME_MAPPING = { + **Spark2.TIME_MAPPING, + "MM": "%mstrict", + "dd": "%dstrict", + } + # Generating a parse format is lenient: %m/%d -> M/d (matching strptime), while the + # strict tokens map back to MM/dd. Used by the generator's format_time override. + LENIENT_INVERSE_TIME_MAPPING = { + **{v: k for k, v in STRICT_TIME_MAPPING.items()}, "%m": "M", "%d": "d", } diff --git a/sqlglot/generator.py b/sqlglot/generator.py index 80deeedffc..dec0f60669 100644 --- a/sqlglot/generator.py +++ b/sqlglot/generator.py @@ -11,7 +11,7 @@ from sqlglot.expressions import apply_index_offset from sqlglot.helper import csv, name_sequence, seq_get from sqlglot.jsonpath import ALL_JSON_PATH_PARTS, JSON_PATH_PART_TRANSFORMS -from sqlglot.time import format_time +from sqlglot.time import STRICT_TIME_FORMATS, STRICT_TIME_TRIE, format_time from sqlglot.tokens import TokenType if t.TYPE_CHECKING: @@ -4052,7 +4052,13 @@ def cast_sql(self, expression: exp.Cast, safe_prefix: str | None = None) -> str: # Base implementation that excludes safe, zone, and target_type metadata args def strtotime_sql(self, expression: exp.StrToTime) -> str: - return self.func("STR_TO_TIME", expression.this, expression.args.get("format")) + # Normalize internal "strict" canonical formats (e.g. Spark's %mstrict) to + # standard strftime, since this generic fallback emits the format verbatim. + return self.func( + "STR_TO_TIME", + expression.this, + self.format_time(expression, STRICT_TIME_FORMATS, STRICT_TIME_TRIE), + ) def currentdate_sql(self, expression: exp.CurrentDate) -> str: zone = self.sql(expression, "this") diff --git a/sqlglot/generators/spark.py b/sqlglot/generators/spark.py index 991736de14..33f315eacd 100644 --- a/sqlglot/generators/spark.py +++ b/sqlglot/generators/spark.py @@ -21,17 +21,6 @@ ) -def _groupconcat_sql(self: SparkGenerator, expression: exp.GroupConcat) -> str: - if self.dialect.version < (4,): - expr = exp.ArrayToString( - this=exp.ArrayAgg(this=expression.this), - expression=expression.args.get("separator") or exp.Literal.string(""), - ) - return self.sql(expr) - - return groupconcat_sql(self, expression) - - def _normalize_partition(e: exp.Expr) -> exp.Expr: """Normalize the expressions in PARTITION BY (, , ...)""" if isinstance(e, str): @@ -41,21 +30,6 @@ def _normalize_partition(e: exp.Expr) -> exp.Expr: return e -def _str_to_datetime_sql(self: SparkGenerator, expression: exp.StrToDate | exp.StrToTime) -> str: - from sqlglot.dialects.spark import Spark - - assert isinstance(self.dialect, Spark) - return self.func( - f"TO_{'DATE' if isinstance(expression, exp.StrToDate) else 'TIMESTAMP'}", - expression.this, - self.format_time( - expression, - self.dialect.LENIENT_INVERSE_TIME_MAPPING, - self.dialect.LENIENT_INVERSE_TIME_TRIE, - ), - ) - - def _dateadd_sql(self: SparkGenerator, expression: exp.TsOrDsAdd | exp.TimestampAdd) -> str: if not expression.unit or ( isinstance(expression, exp.TsOrDsAdd) and expression.text("unit").upper() == "DAY" @@ -80,6 +54,17 @@ def _dateadd_sql(self: SparkGenerator, expression: exp.TsOrDsAdd | exp.Timestamp return this +def _groupconcat_sql(self: SparkGenerator, expression: exp.GroupConcat) -> str: + if self.dialect.version < (4,): + expr = exp.ArrayToString( + this=exp.ArrayAgg(this=expression.this), + expression=expression.args.get("separator") or exp.Literal.string(""), + ) + return self.sql(expr) + + return groupconcat_sql(self, expression) + + class SparkGenerator(Spark2Generator): SUPPORTS_TO_NUMBER = True PAD_FILL_PATTERN_IS_REQUIRED = False @@ -104,6 +89,26 @@ class SparkGenerator(Spark2Generator): exp.DType.SMALLMONEY: ((6, 4), ()), } + # Expressions that parse a string with a format; Spark 3+ parses these leniently, + # so emit M/d (not the padded MM/dd used for formatting) for the canonical %m/%d. Must + # stay in sync with dialect.STRICT_PARSE_TIME_EXPRESSIONS (the parse-side counterpart). + LENIENT_TIME_EXPRESSIONS = (exp.StrToDate, exp.StrToTime, exp.TsOrDsToDate) + + def format_time( + self, + expression: exp.Expr, + inverse_time_mapping: dict[str, str] | None = None, + inverse_time_trie: dict | None = None, + ) -> str | None: + if inverse_time_mapping is None and isinstance(expression, self.LENIENT_TIME_EXPRESSIONS): + from sqlglot.dialects.spark import Spark + + assert isinstance(self.dialect, Spark) + inverse_time_mapping = self.dialect.LENIENT_INVERSE_TIME_MAPPING + inverse_time_trie = self.dialect.LENIENT_INVERSE_TIME_TRIE + + return super().format_time(expression, inverse_time_mapping, inverse_time_trie) + TRANSFORMS = { k: v for k, v in { @@ -144,8 +149,6 @@ class SparkGenerator(Spark2Generator): exp.SafeMultiply: rename_func("TRY_MULTIPLY"), exp.SafeSubtract: rename_func("TRY_SUBTRACT"), exp.StartsWith: rename_func("STARTSWITH"), - exp.StrToDate: _str_to_datetime_sql, - exp.StrToTime: _str_to_datetime_sql, exp.TimeAdd: date_delta_to_binary_interval_op(cast=False), exp.TimeSub: date_delta_to_binary_interval_op(cast=False), exp.TsOrDsAdd: _dateadd_sql, diff --git a/sqlglot/generators/spark2.py b/sqlglot/generators/spark2.py index 7e5e989b57..4ba6e6c92a 100644 --- a/sqlglot/generators/spark2.py +++ b/sqlglot/generators/spark2.py @@ -27,7 +27,7 @@ def _map_sql(self: Spark2Generator, expression: exp.Map) -> str: return self.func("MAP_FROM_ARRAYS", keys, values) -def _str_to_date_sql(self: Spark2Generator, expression: exp.StrToDate) -> str: +def _str_to_date(self: Spark2Generator, expression: exp.StrToDate) -> str: time_format = self.format_time(expression) if time_format == HIVE_DATE_FORMAT: return self.func("TO_DATE", expression.this) @@ -184,7 +184,7 @@ class Spark2Generator(HiveGenerator): exp.SHA2Digest: lambda self, e: self.func( "SHA2", e.this, e.args.get("length") or exp.Literal.number(256) ), - exp.StrToDate: _str_to_date_sql, + exp.StrToDate: _str_to_date, exp.StrToTime: lambda self, e: self.func("TO_TIMESTAMP", e.this, self.format_time(e)), exp.TimestampTrunc: lambda self, e: self.func("DATE_TRUNC", unit_to_str(e), e.this), exp.UnixToTime: _unix_to_time_sql, diff --git a/sqlglot/time.py b/sqlglot/time.py index 520734ff16..3c1da05759 100644 --- a/sqlglot/time.py +++ b/sqlglot/time.py @@ -6,6 +6,13 @@ # https://docs.python.org/3/library/time.html#time.strftime from sqlglot.trie import TrieResult, in_trie, new_trie +# "Strict" canonical time formats round-trip in dialects that define them (e.g. +# Spark 3+'s zero-padded MM/dd, which don't parse single-digit values) and degrade +# to their lax counterpart elsewhere. These are sqlglot-internal tokens, not valid +# strftime directives, so they must be normalized away when emitting generic SQL. +STRICT_TIME_FORMATS = {"%mstrict": "%m", "%dstrict": "%d"} +STRICT_TIME_TRIE = new_trie(STRICT_TIME_FORMATS) + def format_time( string: str, mapping: dict[str, str], trie: dict[t.Any, t.Any] | None = None diff --git a/tests/dialects/test_exasol.py b/tests/dialects/test_exasol.py index 1404e1a3ca..aaf6db0b85 100644 --- a/tests/dialects/test_exasol.py +++ b/tests/dialects/test_exasol.py @@ -480,9 +480,9 @@ def test_datetime_functions(self): "duckdb": "CAST(x AS DATE)", "hive": "TO_DATE(x)", "presto": "CAST(CAST(x AS TIMESTAMP) AS DATE)", - "spark": "TO_DATE(x)", + "spark": "TO_DATE(x, 'yyyy-M-d')", "snowflake": "TO_DATE(x, 'yyyy-mm-DD')", - "databricks": "TO_DATE(x)", + "databricks": "TO_DATE(x, 'yyyy-M-d')", }, ) self.validate_all( diff --git a/tests/dialects/test_spark.py b/tests/dialects/test_spark.py index e91a9f9a57..fad53a053c 100644 --- a/tests/dialects/test_spark.py +++ b/tests/dialects/test_spark.py @@ -669,6 +669,19 @@ def test_spark(self): "spark": "SELECT TO_TIMESTAMP('2016-1-1', 'yyyy-M-d')", }, ) + # Spark 3+ parses MM/dd strictly, so the strict parse format roundtrips, but + # widens to the lax %m/%d for dialects that parse leniently (e.g. duckdb). + self.validate_all( + "SELECT TO_TIMESTAMP('2016-12-31', 'yyyy-MM-dd')", + write={ + "": "SELECT STR_TO_TIME('2016-12-31', '%Y-%m-%d')", + "duckdb": "SELECT STRPTIME('2016-12-31', '%Y-%m-%d')", + "spark": "SELECT TO_TIMESTAMP('2016-12-31', 'yyyy-MM-dd')", + "databricks": "SELECT TO_TIMESTAMP('2016-12-31', 'yyyy-MM-dd')", + }, + ) + # Formatting keeps zero-padded MM/dd, unlike the lenient parsing above. + self.validate_identity("SELECT DATE_FORMAT(x, 'yyyy-MM-dd')") self.validate_all( "SELECT RLIKE('John Doe', 'John.*')", write={ From 79d0daf97e095e954a6f2ff4fe7b1fa5641d32ce Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Sat, 20 Jun 2026 11:11:01 -0600 Subject: [PATCH 05/10] refactor(spark): narrow generator dialect type via TYPE_CHECKING Replace the per-call local import and `assert isinstance(self.dialect, Spark)` in SparkGenerator.format_time with a TYPE_CHECKING-guarded `dialect: Spark` class annotation, removing the runtime overhead while keeping mypy happy. Assisted-by: Claude Opus 4.8 --- sqlglot/generators/spark.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sqlglot/generators/spark.py b/sqlglot/generators/spark.py index 33f315eacd..d7bd073813 100644 --- a/sqlglot/generators/spark.py +++ b/sqlglot/generators/spark.py @@ -1,5 +1,6 @@ from __future__ import annotations +import typing as t from sqlglot import exp from sqlglot import generator @@ -20,6 +21,9 @@ move_partitioned_by_to_schema_columns, ) +if t.TYPE_CHECKING: + from sqlglot.dialects.spark import Spark + def _normalize_partition(e: exp.Expr) -> exp.Expr: """Normalize the expressions in PARTITION BY (, , ...)""" @@ -89,6 +93,8 @@ class SparkGenerator(Spark2Generator): exp.DType.SMALLMONEY: ((6, 4), ()), } + dialect: Spark + # Expressions that parse a string with a format; Spark 3+ parses these leniently, # so emit M/d (not the padded MM/dd used for formatting) for the canonical %m/%d. Must # stay in sync with dialect.STRICT_PARSE_TIME_EXPRESSIONS (the parse-side counterpart). @@ -101,9 +107,6 @@ def format_time( inverse_time_trie: dict | None = None, ) -> str | None: if inverse_time_mapping is None and isinstance(expression, self.LENIENT_TIME_EXPRESSIONS): - from sqlglot.dialects.spark import Spark - - assert isinstance(self.dialect, Spark) inverse_time_mapping = self.dialect.LENIENT_INVERSE_TIME_MAPPING inverse_time_trie = self.dialect.LENIENT_INVERSE_TIME_TRIE From a69203ab7a590eeb2b3b88cc4cd5b1de99139a48 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Sat, 20 Jun 2026 14:54:25 -0600 Subject: [PATCH 06/10] fix(bigquery): degrade strict time token in FORMAT clause The strict->lax canonical fallback was applied only to INVERSE_TIME_MAPPING, so BigQuery's `FORMAT '...'` clause (which uses INVERSE_FORMAT_MAPPING) leaked the internal token, e.g. Spark `TO_DATE(x, 'MM/dd/yyyy')` -> BigQuery `... FORMAT 'MMstrict/DDstrict/YYYY'`. Apply the same fallback to INVERSE_FORMAT_MAPPING via a shared helper so it degrades to 'MM/DD/YYYY'. Assisted-by: Claude Opus 4.8 --- sqlglot/dialects/dialect.py | 26 ++++++++++++++++---------- tests/dialects/test_spark.py | 12 ++++++++++++ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py index 4f8946ba7b..051c4c2d45 100644 --- a/sqlglot/dialects/dialect.py +++ b/sqlglot/dialects/dialect.py @@ -134,6 +134,16 @@ class NormalizationStrategy(str, AutoName): """Always case-insensitive (uppercase), regardless of quotes.""" +def _with_strict_time_fallback(inverse_mapping: dict[str, str]) -> dict[str, str]: + # Dialects that define a "strict" format (e.g. Spark) keep their own mapping; + # everyone else degrades it to the lax counterpart's mapping, so the internal + # token never leaks into generated SQL. + for strict_format, lax_format in STRICT_TIME_FORMATS.items(): + inverse_mapping.setdefault(strict_format, inverse_mapping.get(lax_format, lax_format)) + + return inverse_mapping + + class _Dialect(type): _classes: dict[str, Type[Dialect]] = {} @@ -238,18 +248,14 @@ def __new__(cls, clsname, bases, attrs): ) # Merge class-defined INVERSE_TIME_MAPPING with auto-generated mappings # This allows dialects to define custom inverse mappings for roundtrip correctness - inverse_time_mapping = {v: k for k, v in klass.TIME_MAPPING.items()} | ( - klass.__dict__.get("INVERSE_TIME_MAPPING") or {} + klass.INVERSE_TIME_MAPPING = _with_strict_time_fallback( + {v: k for k, v in klass.TIME_MAPPING.items()} + | (klass.__dict__.get("INVERSE_TIME_MAPPING") or {}) ) - # Dialects that define a "strict" format (e.g. Spark) keep their own mapping; - # everyone else degrades it to the lax counterpart's mapping. - for strict_format, lax_format in STRICT_TIME_FORMATS.items(): - inverse_time_mapping.setdefault( - strict_format, inverse_time_mapping.get(lax_format, lax_format) - ) - klass.INVERSE_TIME_MAPPING = inverse_time_mapping klass.INVERSE_TIME_TRIE = new_trie(klass.INVERSE_TIME_MAPPING) - klass.INVERSE_FORMAT_MAPPING = {v: k for k, v in klass.FORMAT_MAPPING.items()} + klass.INVERSE_FORMAT_MAPPING = _with_strict_time_fallback( + {v: k for k, v in klass.FORMAT_MAPPING.items()} + ) klass.INVERSE_FORMAT_TRIE = new_trie(klass.INVERSE_FORMAT_MAPPING) klass.INVERSE_CREATABLE_KIND_MAPPING = { diff --git a/tests/dialects/test_spark.py b/tests/dialects/test_spark.py index fad53a053c..e029768f82 100644 --- a/tests/dialects/test_spark.py +++ b/tests/dialects/test_spark.py @@ -682,6 +682,18 @@ def test_spark(self): ) # Formatting keeps zero-padded MM/dd, unlike the lenient parsing above. self.validate_identity("SELECT DATE_FORMAT(x, 'yyyy-MM-dd')") + # The strict canonical token must degrade in BigQuery's FORMAT clause too, + # not just INVERSE_TIME_MAPPING (it previously leaked as 'MMstrict/DDstrict'). + self.validate_all( + "SELECT TO_DATE(x, 'MM/dd/yyyy')", + write={ + "": "SELECT CAST(STR_TO_TIME(x, '%m/%d/%Y') AS DATE)", + "duckdb": "SELECT CAST(CAST(TRY_STRPTIME(x, '%m/%d/%Y') AS TIMESTAMP) AS DATE)", + "bigquery": "SELECT CAST(SAFE_CAST(x AS TIMESTAMP FORMAT 'MM/DD/YYYY') AS DATE)", + "spark": "SELECT TO_DATE(x, 'MM/dd/yyyy')", + "databricks": "SELECT TO_DATE(x, 'MM/dd/yyyy')", + }, + ) self.validate_all( "SELECT RLIKE('John Doe', 'John.*')", write={ From fd8324e9548ece94e8a476272d6186d9d6c89a9f Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Sat, 20 Jun 2026 17:46:16 -0600 Subject: [PATCH 07/10] refactor(spark): share parse-time expression set, clarify strtotime CLAUDE Reuse dialect.STRICT_PARSE_TIME_EXPRESSIONS in SparkGenerator.format_time instead of a duplicate LENIENT_TIME_EXPRESSIONS tuple, removing the cross-module constant that had to be kept in sync by hand. Also document why the base strtotime_sql degrades only the strict tokens rather than routing the whole format through self.format_time(). Assisted-by: Claude Opus 4.8 --- sqlglot/dialects/dialect.py | 7 ++++--- sqlglot/generator.py | 6 ++++-- sqlglot/generators/spark.py | 11 +++++------ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py index 051c4c2d45..2a387ca7d0 100644 --- a/sqlglot/dialects/dialect.py +++ b/sqlglot/dialects/dialect.py @@ -1573,9 +1573,10 @@ def months_between_sql(self: Generator, expression: exp.MonthsBetween) -> str: return self.sql(result) -# Expressions that parse a string with a format. Dialects with strict parsing -# semantics (STRICT_TIME_MAPPING) use it for these, but not for formatting. Must stay in -# sync with the generator's lenient set (e.g. SparkGenerator.LENIENT_TIME_EXPRESSIONS). +# Expressions that parse a string with a format (vs. formatting one, like TimeToStr). +# Dialects with strict parsing semantics (STRICT_TIME_MAPPING) use it for these on the +# parser side, and the corresponding generator (e.g. SparkGenerator.format_time) reuses +# this same set to emit the lenient inverse, which is what preserves the roundtrip. STRICT_PARSE_TIME_EXPRESSIONS = (exp.StrToTime, exp.StrToDate, exp.TsOrDsToDate) diff --git a/sqlglot/generator.py b/sqlglot/generator.py index dec0f60669..128c268591 100644 --- a/sqlglot/generator.py +++ b/sqlglot/generator.py @@ -4052,8 +4052,10 @@ def cast_sql(self, expression: exp.Cast, safe_prefix: str | None = None) -> str: # Base implementation that excludes safe, zone, and target_type metadata args def strtotime_sql(self, expression: exp.StrToTime) -> str: - # Normalize internal "strict" canonical formats (e.g. Spark's %mstrict) to - # standard strftime, since this generic fallback emits the format verbatim. + # STR_TO_TIME is sqlglot's canonical form, so the format must stay canonical + # strftime - we only strip the internal "strict" tokens (e.g. Spark's %mstrict) + # rather than routing through self.format_time(), which would also rewrite every + # other specifier into the dialect's INVERSE_TIME_MAPPING. return self.func( "STR_TO_TIME", expression.this, diff --git a/sqlglot/generators/spark.py b/sqlglot/generators/spark.py index d7bd073813..4298022e7f 100644 --- a/sqlglot/generators/spark.py +++ b/sqlglot/generators/spark.py @@ -5,6 +5,7 @@ from sqlglot import exp from sqlglot import generator from sqlglot.dialects.dialect import ( + STRICT_PARSE_TIME_EXPRESSIONS, array_append_sql, rename_func, unit_to_var, @@ -95,18 +96,16 @@ class SparkGenerator(Spark2Generator): dialect: Spark - # Expressions that parse a string with a format; Spark 3+ parses these leniently, - # so emit M/d (not the padded MM/dd used for formatting) for the canonical %m/%d. Must - # stay in sync with dialect.STRICT_PARSE_TIME_EXPRESSIONS (the parse-side counterpart). - LENIENT_TIME_EXPRESSIONS = (exp.StrToDate, exp.StrToTime, exp.TsOrDsToDate) - def format_time( self, expression: exp.Expr, inverse_time_mapping: dict[str, str] | None = None, inverse_time_trie: dict | None = None, ) -> str | None: - if inverse_time_mapping is None and isinstance(expression, self.LENIENT_TIME_EXPRESSIONS): + # Spark 3+ parses these leniently, so emit M/d (not the padded MM/dd used for + # formatting) for the canonical %m/%d. The expression set is shared with the parser + # (STRICT_PARSE_TIME_EXPRESSIONS), which is what guarantees the strict roundtrip. + if inverse_time_mapping is None and isinstance(expression, STRICT_PARSE_TIME_EXPRESSIONS): inverse_time_mapping = self.dialect.LENIENT_INVERSE_TIME_MAPPING inverse_time_trie = self.dialect.LENIENT_INVERSE_TIME_TRIE From 2f82830f9fc8ca1e0e1e25fc8b46edf1b998c918 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 22 Jun 2026 18:25:13 -0600 Subject: [PATCH 08/10] refactor(spark): host lenient time mapping on the base Dialect CLAUDE Declare LENIENT_INVERSE_TIME_MAPPING/_TRIE on the base Dialect (mirroring STRICT_TIME_MAPPING and INVERSE_TIME_MAPPING) and build the trie in the metaclass. This drops the Spark-only attribute that forced a `dialect: Spark` type-narrowing annotation in SparkGenerator plus its TYPE_CHECKING import, and lets spark.py stop hand-building the trie. No behavior change. Assisted-by: Claude Opus 4.8 --- sqlglot/dialects/dialect.py | 9 +++++++++ sqlglot/dialects/spark.py | 5 ++--- sqlglot/generators/spark.py | 7 ------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py index 2a387ca7d0..3721bcad52 100644 --- a/sqlglot/dialects/dialect.py +++ b/sqlglot/dialects/dialect.py @@ -243,6 +243,7 @@ def __new__(cls, clsname, bases, attrs): klass.TIME_TRIE = new_trie(klass.TIME_MAPPING) klass.STRICT_TIME_TRIE = new_trie(klass.STRICT_TIME_MAPPING) + klass.LENIENT_INVERSE_TIME_TRIE = new_trie(klass.LENIENT_INVERSE_TIME_MAPPING) klass.FORMAT_TRIE = ( new_trie(klass.FORMAT_MAPPING) if klass.FORMAT_MAPPING else klass.TIME_TRIE ) @@ -433,6 +434,13 @@ class Dialect(metaclass=_Dialect): distinct canonical format, preserving the roundtrip. Empty means `TIME_MAPPING` is used. """ + LENIENT_INVERSE_TIME_MAPPING: dict[str, str] = {} + """ + Inverse mapping used when *generating* a parse format (e.g. `StrToTime`) for dialects that + parse leniently. Lets e.g. Spark emit the lenient single-letter specifiers from the + canonical formats while mapping the strict tokens back to the padded forms. + """ + # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_model_rules_date_time # https://docs.teradata.com/r/Teradata-Database-SQL-Functions-Operators-Exprs-and-Predicates/March-2017/Data-Type-Conversions/Character-to-DATE-Conversion/Forcing-a-FORMAT-on-CAST-for-Converting-Character-to-DATE FORMAT_MAPPING: dict[str, str] = {} @@ -796,6 +804,7 @@ class Dialect(metaclass=_Dialect): INVERSE_TIME_MAPPING: dict[str, str] = {} INVERSE_TIME_TRIE: dict = {} + LENIENT_INVERSE_TIME_TRIE: dict = {} INVERSE_FORMAT_MAPPING: dict[str, str] = {} INVERSE_FORMAT_TRIE: dict = {} diff --git a/sqlglot/dialects/spark.py b/sqlglot/dialects/spark.py index 322d78db43..6098ead689 100644 --- a/sqlglot/dialects/spark.py +++ b/sqlglot/dialects/spark.py @@ -6,7 +6,6 @@ from sqlglot.generators.spark import SparkGenerator from sqlglot.parsers.spark import SparkParser from sqlglot.tokens import TokenType -from sqlglot.trie import new_trie from sqlglot.typing.spark import EXPRESSION_METADATA @@ -27,13 +26,13 @@ class Spark(Spark2): "dd": "%dstrict", } # Generating a parse format is lenient: %m/%d -> M/d (matching strptime), while the - # strict tokens map back to MM/dd. Used by the generator's format_time override. + # strict tokens map back to MM/dd. Used by the generator's format_time override (the + # matching trie is built by the metaclass). LENIENT_INVERSE_TIME_MAPPING = { **{v: k for k, v in STRICT_TIME_MAPPING.items()}, "%m": "M", "%d": "d", } - LENIENT_INVERSE_TIME_TRIE = new_trie(LENIENT_INVERSE_TIME_MAPPING) class Tokenizer(Spark2.Tokenizer): STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False diff --git a/sqlglot/generators/spark.py b/sqlglot/generators/spark.py index 4298022e7f..889d730278 100644 --- a/sqlglot/generators/spark.py +++ b/sqlglot/generators/spark.py @@ -1,7 +1,5 @@ from __future__ import annotations -import typing as t - from sqlglot import exp from sqlglot import generator from sqlglot.dialects.dialect import ( @@ -22,9 +20,6 @@ move_partitioned_by_to_schema_columns, ) -if t.TYPE_CHECKING: - from sqlglot.dialects.spark import Spark - def _normalize_partition(e: exp.Expr) -> exp.Expr: """Normalize the expressions in PARTITION BY (, , ...)""" @@ -94,8 +89,6 @@ class SparkGenerator(Spark2Generator): exp.DType.SMALLMONEY: ((6, 4), ()), } - dialect: Spark - def format_time( self, expression: exp.Expr, From beb59a4273169db9effdd0f6b5509fcdbb5498dd Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 22 Jun 2026 18:30:36 -0600 Subject: [PATCH 09/10] refactor(spark): use `or` fallback in format_time, matching the base CLAUDE Replace the `inverse_time_mapping is None and ...` guard with the same `inverse_time_mapping or ...` fallback idiom the base Generator.format_time uses. Same semantics (an explicitly-passed mapping still wins), but consistent with the base and without the extra clause. No behavior change. Assisted-by: Claude Opus 4.8 --- sqlglot/generators/spark.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sqlglot/generators/spark.py b/sqlglot/generators/spark.py index 889d730278..52b4a5e5ca 100644 --- a/sqlglot/generators/spark.py +++ b/sqlglot/generators/spark.py @@ -98,9 +98,9 @@ def format_time( # Spark 3+ parses these leniently, so emit M/d (not the padded MM/dd used for # formatting) for the canonical %m/%d. The expression set is shared with the parser # (STRICT_PARSE_TIME_EXPRESSIONS), which is what guarantees the strict roundtrip. - if inverse_time_mapping is None and isinstance(expression, STRICT_PARSE_TIME_EXPRESSIONS): - inverse_time_mapping = self.dialect.LENIENT_INVERSE_TIME_MAPPING - inverse_time_trie = self.dialect.LENIENT_INVERSE_TIME_TRIE + if isinstance(expression, STRICT_PARSE_TIME_EXPRESSIONS): + inverse_time_mapping = inverse_time_mapping or self.dialect.LENIENT_INVERSE_TIME_MAPPING + inverse_time_trie = inverse_time_trie or self.dialect.LENIENT_INVERSE_TIME_TRIE return super().format_time(expression, inverse_time_mapping, inverse_time_trie) From 60b878dd93c02cab9051eed28c9e9d2228bc8f3a Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 22 Jun 2026 18:47:01 -0600 Subject: [PATCH 10/10] docs(spark): trim over-explaining comments on lenient time mapping CLAUDE Drop the "used by the generator's format_time override / trie built by the metaclass" note on Spark.LENIENT_INVERSE_TIME_MAPPING (no other mapping documents where it's consumed or that the metaclass builds its trie), and reword the base docstring's awkward "Lets e.g. Spark" to the parenthetical "(e.g. Spark)" form already used by STRICT_TIME_MAPPING. Assisted-by: Claude Opus 4.8 --- sqlglot/dialects/dialect.py | 4 ++-- sqlglot/dialects/spark.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py index 3721bcad52..74fb8802c8 100644 --- a/sqlglot/dialects/dialect.py +++ b/sqlglot/dialects/dialect.py @@ -437,8 +437,8 @@ class Dialect(metaclass=_Dialect): LENIENT_INVERSE_TIME_MAPPING: dict[str, str] = {} """ Inverse mapping used when *generating* a parse format (e.g. `StrToTime`) for dialects that - parse leniently. Lets e.g. Spark emit the lenient single-letter specifiers from the - canonical formats while mapping the strict tokens back to the padded forms. + parse leniently (e.g. Spark). Maps the canonical specifiers to their lenient single-letter + forms, and the strict tokens back to the padded forms. """ # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_model_rules_date_time diff --git a/sqlglot/dialects/spark.py b/sqlglot/dialects/spark.py index 6098ead689..0c591c650b 100644 --- a/sqlglot/dialects/spark.py +++ b/sqlglot/dialects/spark.py @@ -26,8 +26,7 @@ class Spark(Spark2): "dd": "%dstrict", } # Generating a parse format is lenient: %m/%d -> M/d (matching strptime), while the - # strict tokens map back to MM/dd. Used by the generator's format_time override (the - # matching trie is built by the metaclass). + # strict tokens map back to MM/dd. LENIENT_INVERSE_TIME_MAPPING = { **{v: k for k, v in STRICT_TIME_MAPPING.items()}, "%m": "M",