Skip to content

Commit 3c1a44a

Browse files
authored
Fix!: Change how partitioned_by is parsed so that partition expressions with specialized AST nodes are captured (#4224)
1 parent af974f8 commit 3c1a44a

7 files changed

Lines changed: 331 additions & 4 deletions

File tree

sqlmesh/core/dialect.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -610,6 +610,12 @@ def parse(self: Parser) -> t.Optional[exp.Expression]:
610610
value = self.expression(ModelKind, this=kind.value, expressions=props)
611611
elif key == "expression":
612612
value = self._parse_conjunction()
613+
elif key == "partitioned_by":
614+
partitioned_by = self._parse_partitioned_by()
615+
if isinstance(partitioned_by.this, exp.Schema):
616+
value = exp.tuple_(*partitioned_by.this.expressions)
617+
else:
618+
value = partitioned_by.this
613619
else:
614620
value = self._parse_bracket(self._parse_field(any_token=True))
615621

sqlmesh/core/model/meta.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from typing_extensions import Self
66

77
from pydantic import Field
8-
from sqlglot import Dialect, exp
8+
from sqlglot import Dialect, exp, parse_one
99
from sqlglot.helper import ensure_collection, ensure_list
1010
from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
1111

@@ -39,6 +39,7 @@
3939
field_validator,
4040
list_of_fields_validator,
4141
model_validator,
42+
get_dialect,
4243
)
4344

4445
if t.TYPE_CHECKING:
@@ -182,6 +183,22 @@ def _gateway_validator(cls, v: t.Any) -> t.Optional[str]:
182183
def _partition_and_cluster_validator(
183184
cls, v: t.Any, info: ValidationInfo
184185
) -> t.List[exp.Expression]:
186+
if (
187+
isinstance(v, list)
188+
and all(isinstance(i, str) for i in v)
189+
and info.field_name == "partitioned_by_"
190+
):
191+
# this branch gets hit when we are deserializing from json because `partitioned_by` is stored as a List[str]
192+
# however, we should only invoke this if the list contains strings because this validator is also
193+
# called by Python models which might pass a List[exp.Expression]
194+
string_to_parse = (
195+
f"({','.join(v)})" # recreate the (a, b, c) part of "partitioned_by (a, b, c)"
196+
)
197+
parsed = parse_one(
198+
string_to_parse, into=exp.PartitionedByProperty, dialect=get_dialect(info)
199+
)
200+
v = parsed.this.expressions if isinstance(parsed.this, exp.Schema) else v
201+
185202
expressions = list_of_fields_validator(v, info.data)
186203

187204
for expression in expressions:
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
"""Remove superfluous exp.Paren references from partitioned_by"""
2+
3+
import json
4+
5+
import pandas as pd
6+
from sqlglot import exp
7+
8+
from sqlmesh.utils.migration import index_text_type
9+
from sqlmesh.utils.migration import blob_text_type
10+
11+
12+
def migrate(state_sync, **kwargs): # type: ignore
13+
engine_adapter = state_sync.engine_adapter
14+
schema = state_sync.schema
15+
snapshots_table = "_snapshots"
16+
index_type = index_text_type(engine_adapter.dialect)
17+
if schema:
18+
snapshots_table = f"{schema}.{snapshots_table}"
19+
20+
new_snapshots = []
21+
updated = False
22+
23+
for (
24+
name,
25+
identifier,
26+
version,
27+
snapshot,
28+
kind_name,
29+
updated_ts,
30+
unpaused_ts,
31+
ttl_ms,
32+
unrestorable,
33+
) in engine_adapter.fetchall(
34+
exp.select(
35+
"name",
36+
"identifier",
37+
"version",
38+
"snapshot",
39+
"kind_name",
40+
"updated_ts",
41+
"unpaused_ts",
42+
"ttl_ms",
43+
"unrestorable",
44+
).from_(snapshots_table),
45+
quote_identifiers=True,
46+
):
47+
parsed_snapshot = json.loads(snapshot)
48+
49+
if partitioned_by := parsed_snapshot["node"].get("partitioned_by"):
50+
new_partitioned_by = []
51+
for item in partitioned_by:
52+
# rewrite '(foo)' to 'foo'
53+
if item.startswith("(") and item.endswith(")"):
54+
item = item[1:-1]
55+
updated = True
56+
new_partitioned_by.append(item)
57+
parsed_snapshot["node"]["partitioned_by"] = new_partitioned_by
58+
59+
new_snapshots.append(
60+
{
61+
"name": name,
62+
"identifier": identifier,
63+
"version": version,
64+
"snapshot": json.dumps(parsed_snapshot),
65+
"kind_name": kind_name,
66+
"updated_ts": updated_ts,
67+
"unpaused_ts": unpaused_ts,
68+
"ttl_ms": ttl_ms,
69+
"unrestorable": unrestorable,
70+
}
71+
)
72+
73+
if new_snapshots and updated:
74+
engine_adapter.delete_from(snapshots_table, "TRUE")
75+
blob_type = blob_text_type(engine_adapter.dialect)
76+
77+
engine_adapter.insert_append(
78+
snapshots_table,
79+
pd.DataFrame(new_snapshots),
80+
columns_to_types={
81+
"name": exp.DataType.build(index_type),
82+
"identifier": exp.DataType.build(index_type),
83+
"version": exp.DataType.build(index_type),
84+
"snapshot": exp.DataType.build(blob_type),
85+
"kind_name": exp.DataType.build(index_type),
86+
"updated_ts": exp.DataType.build("bigint"),
87+
"unpaused_ts": exp.DataType.build("bigint"),
88+
"ttl_ms": exp.DataType.build("bigint"),
89+
"unrestorable": exp.DataType.build("boolean"),
90+
},
91+
)

tests/core/engine_adapter/test_athena.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,3 +435,51 @@ def test_drop_partitions_from_metastore_uses_batches(
435435
# third call 50-62
436436
assert calls[2][1]["PartitionsToDelete"][0]["Values"][0] == "50"
437437
assert calls[2][1]["PartitionsToDelete"][-1]["Values"][0] == "62"
438+
439+
440+
def test_iceberg_partition_transforms(adapter: AthenaEngineAdapter):
441+
expressions = d.parse(
442+
"""
443+
MODEL (
444+
name test_table,
445+
kind FULL,
446+
table_format iceberg,
447+
partitioned_by (month(business_date), bucket(4, colb), colc)
448+
);
449+
450+
SELECT 1::timestamp AS business_date, 2::varchar as colb, 'foo' as colc;
451+
"""
452+
)
453+
model: SqlModel = t.cast(SqlModel, load_sql_based_model(expressions))
454+
455+
assert model.partitioned_by == [
456+
exp.Month(this=exp.column("business_date", quoted=True)),
457+
exp.PartitionedByBucket(
458+
this=exp.column("colb", quoted=True), expression=exp.Literal.number(4)
459+
),
460+
exp.column("colc", quoted=True),
461+
]
462+
463+
adapter.s3_warehouse_location = "s3://bucket/prefix/"
464+
465+
adapter.create_table(
466+
table_name=model.name,
467+
columns_to_types=model.columns_to_types_or_raise,
468+
partitioned_by=model.partitioned_by,
469+
table_format=model.table_format,
470+
)
471+
472+
adapter.ctas(
473+
table_name=model.name,
474+
columns_to_types=model.columns_to_types_or_raise,
475+
partitioned_by=model.partitioned_by,
476+
query_or_df=model.ctas_query(),
477+
table_format=model.table_format,
478+
)
479+
480+
assert to_sql_calls(adapter) == [
481+
# Hive syntax - create table
482+
"""CREATE TABLE IF NOT EXISTS `test_table` (`business_date` TIMESTAMP, `colb` STRING, `colc` STRING) PARTITIONED BY (MONTH(`business_date`), BUCKET(4, `colb`), `colc`) LOCATION 's3://bucket/prefix/test_table/' TBLPROPERTIES ('table_type'='iceberg')""",
483+
# Trino syntax - CTAS
484+
"""CREATE TABLE IF NOT EXISTS "test_table" WITH (table_type='iceberg', partitioning=ARRAY['MONTH(business_date)', 'BUCKET(colb, 4)', 'colc'], location='s3://bucket/prefix/test_table/', is_external=false) AS SELECT CAST("business_date" AS TIMESTAMP) AS "business_date", CAST("colb" AS VARCHAR) AS "colb", CAST("colc" AS VARCHAR) AS "colc" FROM (SELECT CAST(1 AS TIMESTAMP) AS "business_date", CAST(2 AS VARCHAR) AS "colb", 'foo' AS "colc" LIMIT 0) AS "_subquery\"""",
485+
]

tests/core/test_context.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1978,19 +1978,22 @@ def test_plan_audit_intervals(tmp_path: pathlib.Path, capsys, caplog):
19781978
)
19791979
)
19801980

1981-
ctx.plan(
1981+
plan = ctx.plan(
19821982
environment="dev", auto_apply=True, no_prompts=True, start="2025-02-01", end="2025-02-01"
19831983
)
19841984

1985+
date_snapshot = next(s for s in plan.new_snapshots if "date_example" in s.name)
1986+
timestamp_snapshot = next(s for s in plan.new_snapshots if "timestamp_example" in s.name)
1987+
19851988
# Case 1: The timestamp audit should be in the inclusive range ['2025-02-01 00:00:00', '2025-02-01 23:59:59.999999']
19861989
assert (
1987-
"""SELECT COUNT(*) FROM (SELECT ("timestamp_id") AS "timestamp_id" FROM (SELECT * FROM "sqlmesh__sqlmesh_audit"."sqlmesh_audit__timestamp_example__2797548448" AS "sqlmesh_audit__timestamp_example__2797548448" WHERE "timestamp_id" BETWEEN CAST('2025-02-01 00:00:00' AS TIMESTAMP) AND CAST('2025-02-01 23:59:59.999999' AS TIMESTAMP)) AS "_q_0" WHERE TRUE GROUP BY ("timestamp_id") HAVING COUNT(*) > 1) AS "audit\""""
1990+
f"""SELECT COUNT(*) FROM (SELECT ("timestamp_id") AS "timestamp_id" FROM (SELECT * FROM "sqlmesh__sqlmesh_audit"."sqlmesh_audit__timestamp_example__{timestamp_snapshot.version}" AS "sqlmesh_audit__timestamp_example__{timestamp_snapshot.version}" WHERE "timestamp_id" BETWEEN CAST('2025-02-01 00:00:00' AS TIMESTAMP) AND CAST('2025-02-01 23:59:59.999999' AS TIMESTAMP)) AS "_q_0" WHERE TRUE GROUP BY ("timestamp_id") HAVING COUNT(*) > 1) AS "audit\""""
19881991
in caplog.text
19891992
)
19901993

19911994
# Case 2: The date audit should be in the inclusive range ['2025-02-01', '2025-02-01']
19921995
assert (
1993-
"""SELECT COUNT(*) FROM (SELECT ("date_id") AS "date_id" FROM (SELECT * FROM "sqlmesh__sqlmesh_audit"."sqlmesh_audit__date_example__4100277424" AS "sqlmesh_audit__date_example__4100277424" WHERE "date_id" BETWEEN CAST('2025-02-01' AS DATE) AND CAST('2025-02-01' AS DATE)) AS "_q_0" WHERE TRUE GROUP BY ("date_id") HAVING COUNT(*) > 1) AS "audit\""""
1996+
f"""SELECT COUNT(*) FROM (SELECT ("date_id") AS "date_id" FROM (SELECT * FROM "sqlmesh__sqlmesh_audit"."sqlmesh_audit__date_example__{date_snapshot.version}" AS "sqlmesh_audit__date_example__{date_snapshot.version}" WHERE "date_id" BETWEEN CAST('2025-02-01' AS DATE) AND CAST('2025-02-01' AS DATE)) AS "_q_0" WHERE TRUE GROUP BY ("date_id") HAVING COUNT(*) > 1) AS "audit\""""
19941997
in caplog.text
19951998
)
19961999

tests/core/test_model.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1514,6 +1514,134 @@ def test_render_definition_with_defaults():
15141514
) == d.format_model_expressions(expected_expressions)
15151515

15161516

1517+
def test_render_definition_partitioned_by():
1518+
# no parenthesis in definition, no parenthesis when rendered
1519+
model = load_sql_based_model(
1520+
d.parse(
1521+
f"""
1522+
MODEL (
1523+
name db.table,
1524+
kind FULL,
1525+
partitioned_by a
1526+
);
1527+
1528+
select 1 as a;
1529+
"""
1530+
)
1531+
)
1532+
1533+
assert model.partitioned_by == [exp.column("a", quoted=True)]
1534+
assert (
1535+
model.render_definition()[0].sql(pretty=True)
1536+
== """MODEL (
1537+
name db.table,
1538+
kind FULL,
1539+
partitioned_by "a"
1540+
)"""
1541+
)
1542+
1543+
    # single column wrapped in parenthesis in definition, no parenthesis in rendered
1544+
model = load_sql_based_model(
1545+
d.parse(
1546+
f"""
1547+
MODEL (
1548+
name db.table,
1549+
kind FULL,
1550+
partitioned_by (a)
1551+
);
1552+
1553+
select 1 as a;
1554+
"""
1555+
)
1556+
)
1557+
1558+
assert model.partitioned_by == [exp.column("a", quoted=True)]
1559+
assert (
1560+
model.render_definition()[0].sql(pretty=True)
1561+
== """MODEL (
1562+
name db.table,
1563+
kind FULL,
1564+
partitioned_by "a"
1565+
)"""
1566+
)
1567+
1568+
# multiple columns wrapped in parenthesis in definition, parenthesis in rendered
1569+
model = load_sql_based_model(
1570+
d.parse(
1571+
f"""
1572+
MODEL (
1573+
name db.table,
1574+
kind FULL,
1575+
partitioned_by (a, b)
1576+
);
1577+
1578+
select 1 as a, 2 as b;
1579+
"""
1580+
)
1581+
)
1582+
1583+
assert model.partitioned_by == [exp.column("a", quoted=True), exp.column("b", quoted=True)]
1584+
assert (
1585+
model.render_definition()[0].sql(pretty=True)
1586+
== """MODEL (
1587+
name db.table,
1588+
kind FULL,
1589+
partitioned_by ("a", "b")
1590+
)"""
1591+
)
1592+
1593+
# multiple columns not wrapped in parenthesis in the definition is an error
1594+
with pytest.raises(ParseError, match=r"keyword: 'value' missing"):
1595+
load_sql_based_model(
1596+
d.parse(
1597+
f"""
1598+
MODEL (
1599+
name db.table,
1600+
kind FULL,
1601+
partitioned_by a, b
1602+
);
1603+
1604+
select 1 as a, 2 as b;
1605+
"""
1606+
)
1607+
)
1608+
1609+
# Iceberg transforms / functions
1610+
model = load_sql_based_model(
1611+
d.parse(
1612+
f"""
1613+
MODEL (
1614+
name db.table,
1615+
kind FULL,
1616+
partitioned_by (day(a), truncate(b, 4), bucket(c, 3))
1617+
);
1618+
1619+
select 1 as a, 2 as b, 3 as c;
1620+
"""
1621+
),
1622+
dialect="trino",
1623+
)
1624+
1625+
assert model.partitioned_by == [
1626+
exp.Day(this=exp.column("a", quoted=True)),
1627+
exp.PartitionByTruncate(
1628+
this=exp.column("b", quoted=True), expression=exp.Literal.number(4)
1629+
),
1630+
exp.PartitionedByBucket(
1631+
this=exp.column("c", quoted=True), expression=exp.Literal.number(3)
1632+
),
1633+
]
1634+
assert (
1635+
model.render_definition()[0].sql(pretty=True)
1636+
== """MODEL (
1637+
name db.table,
1638+
dialect trino,
1639+
kind FULL,
1640+
partitioned_by (DAY("a"), TRUNCATE("b", 4), BUCKET("c", 3))
1641+
)"""
1642+
)
1643+
1644+
15171645
def test_cron():
15181646
daily = _Node(name="x", cron="@daily")
15191647
assert to_datetime(daily.cron_prev("2020-01-01")) == to_datetime("2019-12-31")

tests/core/test_snapshot.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2938,3 +2938,37 @@ def check_types(batch, env: str, sql: list[SQL], table: exp.Table, default: int
29382938
)
29392939
snapshot_a = make_snapshot(sql_model)
29402940
assert snapshot_a.check_ready_intervals([(0, 1)], mocker.Mock()) == [(0, 1)]
2941+
2942+
2943+
def test_partitioned_by_roundtrip(make_snapshot: t.Callable):
2944+
sql_model = load_sql_based_model(
2945+
parse("""
2946+
MODEL (
2947+
name test_schema.test_model,
2948+
kind full,
2949+
partitioned_by (a, bucket(4, b), truncate(3, c), month(d))
2950+
);
2951+
SELECT a, b, c, d FROM tbl;
2952+
""")
2953+
)
2954+
snapshot = make_snapshot(sql_model)
2955+
assert isinstance(snapshot, Snapshot)
2956+
assert isinstance(snapshot.node, SqlModel)
2957+
2958+
assert snapshot.node.partitioned_by == [
2959+
exp.column("a", quoted=True),
2960+
exp.PartitionedByBucket(
2961+
this=exp.column("b", quoted=True), expression=exp.Literal.number(4)
2962+
),
2963+
exp.PartitionByTruncate(
2964+
this=exp.column("c", quoted=True), expression=exp.Literal.number(3)
2965+
),
2966+
exp.Month(this=exp.column("d", quoted=True)),
2967+
]
2968+
2969+
# roundtrip through json and ensure we get correct AST nodes on the other end
2970+
serialized = snapshot.json()
2971+
deserialized = snapshot.parse_raw(serialized)
2972+
2973+
assert isinstance(deserialized.node, SqlModel)
2974+
assert deserialized.node.partitioned_by == snapshot.node.partitioned_by

0 commit comments

Comments
 (0)