Skip to content

Commit 33b0844

Browse files
authored
Feat: cluster by closes #570 (#1046)
1 parent 74fdcb6 commit 33b0844

18 files changed

Lines changed: 143 additions & 78 deletions

File tree

docs/concepts/models/overview.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,9 @@ Name is ***required*** and must be ***unique***.
9595
### partitioned_by
9696
- Partitioned by is an optional property for engines such as Spark or Hive that support partitioning. Use this to add additional columns to the time column partition key.
9797

98+
### clustered_by
99+
- Clustered by is an optional property for engines such as BigQuery that support clustering.
100+
98101
### tags
99102
- Tags are one or more labels used to organize your models.
100103

sqlmesh/core/engine_adapter/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -857,7 +857,9 @@ def _create_table_properties(
857857
storage_format: t.Optional[str] = None,
858858
partitioned_by: t.Optional[t.List[exp.Expression]] = None,
859859
partition_interval_unit: t.Optional[IntervalUnit] = None,
860+
clustered_by: t.Optional[t.List[str]] = None,
860861
) -> t.Optional[exp.Properties]:
862+
"""Creates a SQLGlot table properties expression for ddl."""
861863
return None
862864

863865
def _to_sql(self, e: exp.Expression, **kwargs: t.Any) -> str:

sqlmesh/core/engine_adapter/bigquery.py

Lines changed: 36 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -337,39 +337,45 @@ def _create_table_properties(
337337
storage_format: t.Optional[str] = None,
338338
partitioned_by: t.Optional[t.List[exp.Expression]] = None,
339339
partition_interval_unit: t.Optional[IntervalUnit] = None,
340+
clustered_by: t.Optional[t.List[str]] = None,
340341
) -> t.Optional[exp.Properties]:
341-
if not partitioned_by:
342-
return None
343-
if partition_interval_unit is None:
344-
raise SQLMeshError("partition_interval_unit is required when partitioning a table")
345-
if len(partitioned_by) > 1:
346-
raise SQLMeshError("BigQuery only supports partitioning by a single column")
347-
348-
this: exp.Expression
349-
if isinstance(partitioned_by[0], exp.Column):
350-
if partition_interval_unit == IntervalUnit.MINUTE:
351-
raise SQLMeshError("BigQuery does not support partitioning by minute")
352-
353-
trunc_func: t.Optional[str] = None
354-
if partition_interval_unit == IntervalUnit.HOUR:
355-
trunc_func = "TIMESTAMP_TRUNC"
356-
elif partition_interval_unit in (IntervalUnit.MONTH, IntervalUnit.YEAR):
357-
trunc_func = "DATE_TRUNC"
358-
359-
if trunc_func:
360-
this = exp.func(
361-
trunc_func,
362-
partitioned_by[0],
363-
exp.var(partition_interval_unit.value.upper()),
364-
dialect=self.dialect,
365-
)
366-
else:
367-
this = partitioned_by[0]
368-
else:
342+
properties: t.List[exp.Expression] = []
343+
344+
if partitioned_by:
345+
if partition_interval_unit is None:
346+
raise SQLMeshError("partition_interval_unit is required when partitioning a table")
347+
if len(partitioned_by) > 1:
348+
raise SQLMeshError("BigQuery only supports partitioning by a single column")
349+
369350
this = partitioned_by[0]
370351

371-
partition_columns_property = exp.PartitionedByProperty(this=this)
372-
return exp.Properties(expressions=[partition_columns_property])
352+
if isinstance(this, exp.Column):
353+
if partition_interval_unit == IntervalUnit.MINUTE:
354+
raise SQLMeshError("BigQuery does not support partitioning by minute")
355+
356+
if partition_interval_unit == IntervalUnit.HOUR:
357+
trunc_func = "TIMESTAMP_TRUNC"
358+
elif partition_interval_unit in (IntervalUnit.MONTH, IntervalUnit.YEAR):
359+
trunc_func = "DATE_TRUNC"
360+
else:
361+
trunc_func = ""
362+
363+
if trunc_func:
364+
this = exp.func(
365+
trunc_func,
366+
this,
367+
exp.var(partition_interval_unit.value.upper()),
368+
dialect=self.dialect,
369+
)
370+
371+
properties.append(exp.PartitionedByProperty(this=this))
372+
373+
if clustered_by:
374+
properties.append(exp.Cluster(expressions=[exp.column(col) for col in clustered_by]))
375+
376+
if properties:
377+
return exp.Properties(expressions=properties)
378+
return None
373379

374380
def create_state_table(
375381
self,

sqlmesh/core/engine_adapter/spark.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -239,27 +239,26 @@ def _create_table_properties(
239239
storage_format: t.Optional[str] = None,
240240
partitioned_by: t.Optional[t.List[exp.Expression]] = None,
241241
partition_interval_unit: t.Optional[IntervalUnit] = None,
242+
clustered_by: t.Optional[t.List[str]] = None,
242243
) -> t.Optional[exp.Properties]:
243-
format_property = None
244-
partition_columns_property = None
244+
properties: t.List[exp.Expression] = []
245+
245246
if storage_format:
246-
format_property = exp.FileFormatProperty(this=exp.Var(this=storage_format))
247+
properties.append(exp.FileFormatProperty(this=exp.Var(this=storage_format)))
247248
if partitioned_by:
248249
for expr in partitioned_by:
249250
if not isinstance(expr, exp.Column):
250251
raise SQLMeshError(
251252
f"PARTITIONED BY contains non-column value '{expr.sql(dialect='spark')}'."
252253
)
253-
partition_columns_property = exp.PartitionedByProperty(
254-
this=exp.Schema(expressions=partitioned_by),
254+
properties.append(
255+
exp.PartitionedByProperty(
256+
this=exp.Schema(expressions=partitioned_by),
257+
)
255258
)
256-
return exp.Properties(
257-
expressions=[
258-
table_property
259-
for table_property in [format_property, partition_columns_property]
260-
if table_property
261-
]
262-
)
259+
if properties:
260+
return exp.Properties(expressions=properties)
261+
return None
263262

264263
def supports_transactions(self, transaction_type: TransactionType) -> bool:
265264
return False

sqlmesh/core/model/definition.py

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ class _Model(ModelMeta, frozen=True):
101101
storage_format: The storage format used to store the physical table, only applicable in certain engines.
102102
(eg. 'parquet')
103103
partitioned_by: The partition columns or engine specific expressions, only applicable in certain engines. (eg. (ds, hour))
104+
clustered_by: The cluster columns, only applicable in certain engines. (eg. (ds, hour))
104105
python_env: Dictionary containing all global variables needed to render the model's macros.
105106
mapping_schema: The schema of table names to column and types.
106107
"""
@@ -556,29 +557,38 @@ def validate_definition(self) -> None:
556557
Raises:
557558
ConfigError
558559
"""
559-
if self.partitioned_by:
560-
unique_partition_keys = {
561-
col.name.strip().lower()
562-
for expr in self.partitioned_by
563-
for col in expr.find_all(exp.Column)
564-
}
565-
if len(self.partitioned_by) != len(unique_partition_keys):
566-
raise_config_error(
567-
"All partition keys must be unique in the model definition",
568-
self._path,
569-
)
570560

571-
columns_to_types = self.columns_to_types
572-
if columns_to_types is not None:
573-
column_names = {c.lower() for c in columns_to_types}
574-
missing_keys = unique_partition_keys - column_names
575-
if missing_keys:
576-
missing_keys_str = ", ".join(f"'{k}'" for k in sorted(missing_keys))
561+
for field in ("partitioned_by", "clustered_by"):
562+
values = getattr(self, field)
563+
564+
if values:
565+
values = [
566+
col.name.lower()
567+
for expr in values
568+
for col in t.cast(
569+
exp.Expression, exp.maybe_parse(expr, dialect=self.dialect)
570+
).find_all(exp.Column)
571+
]
572+
573+
unique_keys = set(values)
574+
575+
if len(values) != len(unique_keys):
577576
raise_config_error(
578-
f"Partition keys [{missing_keys_str}] are missing in the model definition",
577+
"All keys in '{field}' must be unique in the model definition",
579578
self._path,
580579
)
581580

581+
columns_to_types = self.columns_to_types
582+
if columns_to_types is not None:
583+
column_names = {c.lower() for c in columns_to_types}
584+
missing_keys = unique_keys - column_names
585+
if missing_keys:
586+
missing_keys_str = ", ".join(f"'{k}'" for k in sorted(missing_keys))
587+
raise_config_error(
588+
f"{field} keys [{missing_keys_str}] are missing in the model definition",
589+
self._path,
590+
)
591+
582592
if self.kind.is_incremental_by_time_range and not self.time_column:
583593
raise_config_error(
584594
"Incremental by time range models must have a time_column field.",
@@ -1658,6 +1668,7 @@ def _single_expr_or_tuple(values: t.Sequence[exp.Expression]) -> exp.Expression
16581668
"cron": lambda value: exp.Literal.string(value),
16591669
"batch_size": lambda value: exp.Literal.number(value),
16601670
"partitioned_by_": _single_expr_or_tuple,
1671+
"clustered_by": _single_value_or_tuple,
16611672
"depends_on_": lambda value: exp.Tuple(expressions=value),
16621673
"pre": _list_of_calls_to_exp,
16631674
"post": _list_of_calls_to_exp,

sqlmesh/core/model/meta.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ class ModelMeta(PydanticModel):
5757
retention: t.Optional[int] # not implemented yet
5858
storage_format: t.Optional[str]
5959
partitioned_by_: t.List[exp.Expression] = Field(default=[], alias="partitioned_by")
60+
clustered_by: t.List[str] = []
6061
depends_on_: t.Optional[t.Set[str]] = Field(default=None, alias="depends_on")
6162
columns_to_types_: t.Optional[t.Dict[str, exp.DataType]] = Field(default=None, alias="columns")
6263
column_descriptions_: t.Optional[t.Dict[str, str]]
@@ -115,7 +116,7 @@ def extract(v: exp.Expression) -> t.Tuple[str, t.Dict[str, str]]:
115116
]
116117
return v
117118

118-
@validator("tags", "grain", pre=True)
119+
@validator("clustered_by", "tags", "grain", pre=True)
119120
def _value_or_tuple_validator(cls, v: t.Any) -> t.Any:
120121
if isinstance(v, (exp.Tuple, exp.Array)):
121122
return [e.name for e in v.expressions]
@@ -215,8 +216,9 @@ def _date_validator(cls, v: t.Any) -> t.Optional[TimeLike]:
215216
def _kind_validator(cls, values: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]:
216217
kind = values.get("kind")
217218
if kind:
218-
if values.get("partitioned_by_") and not kind.is_materialized:
219-
raise ValueError(f"partitioned_by field cannot be set for {kind} models")
219+
for field in ("partitioned_by_", "clustered_by"):
220+
if values.get(field) and not kind.is_materialized:
221+
raise ValueError(f"{field} field cannot be set for {kind} models")
220222

221223
return values
222224

sqlmesh/core/snapshot/definition.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,7 @@ def _model_data_hash(model: Model) -> str:
915915
model.storage_format,
916916
str(model.lookback),
917917
*(expr.sql() for expr in (model.partitioned_by or [])),
918+
*(model.clustered_by or []),
918919
model.stamp,
919920
]
920921

sqlmesh/core/snapshot/evaluator.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,7 @@ def create(
646646
storage_format=model.storage_format,
647647
partitioned_by=model.partitioned_by,
648648
partition_interval_unit=model.interval_unit(),
649+
clustered_by=model.clustered_by,
649650
)
650651
else:
651652
self.adapter.ctas(
@@ -655,6 +656,7 @@ def create(
655656
storage_format=model.storage_format,
656657
partitioned_by=model.partitioned_by,
657658
partition_interval_unit=model.interval_unit(),
659+
clustered_by=model.clustered_by,
658660
)
659661

660662
def migrate(self, target_table_name: str, source_table_name: str) -> None:

sqlmesh/dbt/model.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -219,12 +219,13 @@ def to_sqlmesh(self, context: DbtContext) -> Model:
219219
optional_kwargs["partitioned_by"] = [exp.to_column(val) for val in self.partition_by]
220220
elif self.partition_by and isinstance(self.partition_by, dict):
221221
optional_kwargs["partitioned_by"] = [
222-
d.parse_one(
223-
f"TIMESTAMP_TRUNC(`{self.partition_by['field']}`, {self.partition_by['granularity']})",
224-
dialect=dialect,
222+
exp.TimestampTrunc(
223+
this=exp.to_column(self.partition_by["field"]),
224+
unit=exp.var(self.partition_by["granularity"]),
225225
)
226226
]
227-
227+
if self.cluster_by:
228+
optional_kwargs["clustered_by"] = self.cluster_by
228229
for field in ["cron"]:
229230
field_val = getattr(self, field, None) or self.meta.get(field, None)
230231
if field_val:

tests/core/engine_adapter/test_bigquery.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ def test_create_table_date_partition(
169169
{"a": "int", "b": "int"},
170170
partitioned_by=partition_by_cols,
171171
partition_interval_unit=IntervalUnit.DAY,
172+
clustered_by=["b"],
172173
)
173174

174175
sql_calls = [
@@ -179,7 +180,7 @@ def test_create_table_date_partition(
179180
for call in execute_mock.call_args_list
180181
]
181182
assert sql_calls == [
182-
f"CREATE TABLE IF NOT EXISTS `test_table` (`a` int, `b` int) PARTITION BY {partition_by_statement}"
183+
f"CREATE TABLE IF NOT EXISTS `test_table` (`a` int, `b` int) PARTITION BY {partition_by_statement} CLUSTER BY `b`"
183184
]
184185

185186

0 commit comments

Comments
 (0)