Skip to content

Commit dfaeffd

Browse files
authored
feat: support when matched expression for merge (#1569)
1 parent 074a928 commit dfaeffd

10 files changed

Lines changed: 211 additions & 16 deletions

File tree

docs/concepts/models/model_kinds.md

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ If a key is missing in the model's table, the new data row is inserted; otherwis
115115
* There is at most one record associated with each unique key.
116116
* It is appropriate to upsert records, so existing records can be overridden by new arrivals when their keys match.
117117

118-
A [Slowly Changing Dimension](../glossary.md#slowly-changing-dimension-scd) (SCD) is one approach that fits this description well.
118+
A [Slowly Changing Dimension](../glossary.md#slowly-changing-dimension-scd) (SCD) is one approach that fits this description well. See the [SCD Type 2](#scd-type-2) model kind for a specific model kind for SCD Type 2 models.
119119

120120
The name of the unique key column must be provided as part of the `MODEL` DDL, as in this example:
121121
```sql linenums="1" hl_lines="3-5"
@@ -156,6 +156,43 @@ WHERE
156156

157157
**Note:** Models of the `INCREMENTAL_BY_UNIQUE_KEY` kind are inherently [non-idempotent](../glossary.md#idempotency), which should be taken into consideration during data [restatement](../plans.md#restatement-plans).
158158

159+
### Unique Key Expressions
160+
161+
The `unique_key` values can either be column names or SQL expressions. For example, if you wanted to create a key based on coalescing a value with a default, you could do the following:
162+
163+
```sql linenums="1" hl_lines="4"
164+
MODEL (
165+
name db.employees,
166+
kind INCREMENTAL_BY_UNIQUE_KEY (
167+
unique_key (COALESCE("ds", ''))
168+
)
169+
);
170+
```
171+
172+
### When Matched Expression
173+
174+
By default, when a match occurs (the source and target match on the given keys), all columns are updated. This can be overridden with custom logic like below:
175+
176+
```sql linenums="1" hl_lines="4"
177+
MODEL (
178+
name db.employees,
179+
kind INCREMENTAL_BY_UNIQUE_KEY (
180+
unique_key name,
181+
when_matched WHEN MATCHED THEN UPDATE SET target.salary = COALESCE(source.salary, target.salary)
182+
)
183+
);
184+
```
185+
186+
The `source` and `target` aliases are required when using the `when_matched` expression in order to distinguish between the source and target columns.
187+
188+
**Note**: `when_matched` is only available on engines that support the `MERGE` statement. Currently supported engines include:
189+
190+
* BigQuery
191+
* Databricks
192+
* Postgres
193+
* Snowflake
194+
* Spark
195+
159196
### Materialization strategy
160197
Depending on the target engine, models of the `INCREMENTAL_BY_UNIQUE_KEY` kind are materialized using the following strategies:
161198

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
"requests",
4747
"rich[jupyter]",
4848
"ruamel.yaml",
49-
"sqlglot~=18.12.0",
49+
"sqlglot~=18.13.0",
5050
],
5151
extras_require={
5252
"bigquery": [

sqlmesh/core/dialect.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -294,15 +294,17 @@ def _parse_props(self: Parser) -> t.Optional[exp.Expression]:
294294
if not key:
295295
return None
296296

297-
if self._match(TokenType.L_PAREN):
298-
value: t.Optional[exp.Expression] = self.expression(
297+
name = key.name.lower()
298+
if name == "when_matched":
299+
value: t.Optional[exp.Expression] = self._parse_when_matched()[0]
300+
elif self._match(TokenType.L_PAREN):
301+
value = self.expression(
299302
exp.Tuple, expressions=self._parse_csv(lambda: _parse_prop_value(self))
300303
)
301304
self._match_r_paren()
302305
else:
303306
value = self._parse_bracket(self._parse_field(any_token=True))
304307

305-
name = key.name.lower()
306308
if name == "path" and value:
307309
# Make sure if we get a windows path that it is converted to posix
308310
value = exp.Literal.string(value.this.replace("\\", "/"))

sqlmesh/core/engine_adapter/base.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1159,6 +1159,7 @@ def merge(
11591159
source_table: QueryOrDF,
11601160
columns_to_types: t.Optional[t.Dict[str, exp.DataType]],
11611161
unique_key: t.Sequence[exp.Expression],
1162+
when_matched: t.Optional[exp.When] = None,
11621163
) -> None:
11631164
source_queries, columns_to_types = self._get_source_queries_and_columns_to_types(
11641165
source_table, columns_to_types, target_table=target_table
@@ -1170,16 +1171,17 @@ def merge(
11701171
for part in unique_key
11711172
)
11721173
)
1173-
when_matched = exp.When(
1174-
matched=True,
1175-
source=False,
1176-
then=exp.Update(
1177-
expressions=[
1178-
exp.column(col, MERGE_TARGET_ALIAS).eq(exp.column(col, MERGE_SOURCE_ALIAS))
1179-
for col in columns_to_types
1180-
],
1181-
),
1182-
)
1174+
if not when_matched:
1175+
when_matched = exp.When(
1176+
matched=True,
1177+
source=False,
1178+
then=exp.Update(
1179+
expressions=[
1180+
exp.column(col, MERGE_TARGET_ALIAS).eq(exp.column(col, MERGE_SOURCE_ALIAS))
1181+
for col in columns_to_types
1182+
],
1183+
),
1184+
)
11831185
when_not_matched = exp.When(
11841186
matched=False,
11851187
source=False,

sqlmesh/core/engine_adapter/mixins.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def merge(
2424
source_table: QueryOrDF,
2525
columns_to_types: t.Optional[t.Dict[str, exp.DataType]],
2626
unique_key: t.Sequence[exp.Expression],
27+
when_matched: t.Optional[exp.When] = None,
2728
) -> None:
2829
"""
2930
Merge implementation for engine adapters that do not support merge natively.
@@ -35,6 +36,10 @@ def merge(
3536
within the temporary table are omitted.
3637
4. Drop the temporary table.
3738
"""
39+
if when_matched:
40+
raise SQLMeshError(
41+
"This engine does not support MERGE expressions and therefore `when_matched` is not supported."
42+
)
3843
if columns_to_types is None:
3944
columns_to_types = self.columns(target_table)
4045

sqlmesh/core/model/kind.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
SQLGlotPositiveInt,
1919
SQLGlotString,
2020
field_validator,
21+
field_validator_v1_args,
2122
model_validator,
2223
model_validator_v1_args,
2324
)
@@ -208,8 +209,39 @@ def to_expression(self, dialect: str = "", **kwargs: t.Any) -> d.ModelKind:
208209
class IncrementalByUniqueKeyKind(_Incremental):
209210
name: Literal[ModelKindName.INCREMENTAL_BY_UNIQUE_KEY] = ModelKindName.INCREMENTAL_BY_UNIQUE_KEY
210211
unique_key: t.List[exp.Expression]
212+
when_matched: t.Optional[exp.When] = None
213+
211214
_unique_key_validator = _unique_key_validator
212215

216+
@field_validator("when_matched", mode="before")
217+
@field_validator_v1_args
218+
def _when_matched_validator(
219+
cls, v: t.Optional[exp.When], values: t.Dict[str, t.Any]
220+
) -> t.Optional[exp.When]:
221+
def replace_table_references(expression: exp.Expression) -> exp.Expression:
222+
from sqlmesh.core.engine_adapter.base import (
223+
MERGE_SOURCE_ALIAS,
224+
MERGE_TARGET_ALIAS,
225+
)
226+
227+
if isinstance(expression, exp.Column):
228+
if expression.table.lower() == "target":
229+
expression.set(
230+
"table",
231+
exp.to_identifier(MERGE_TARGET_ALIAS),
232+
)
233+
elif expression.table.lower() == "source":
234+
expression.set(
235+
"table",
236+
exp.to_identifier(MERGE_SOURCE_ALIAS),
237+
)
238+
return expression
239+
240+
if not v:
241+
return v
242+
v.meta["dialect"] = values.get("dialect")
243+
return v.transform(replace_table_references)
244+
213245

214246
class IncrementalUnmanagedKind(_ModelKind):
215247
name: Literal[ModelKindName.INCREMENTAL_UNMANAGED] = ModelKindName.INCREMENTAL_UNMANAGED

sqlmesh/core/model/meta.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,3 +354,9 @@ def _partition_by_columns(self) -> t.List[exp.Column]:
354354
@property
355355
def managed_columns(self) -> t.Dict[str, exp.DataType]:
356356
return getattr(self.kind, "managed_columns", {})
357+
358+
@property
359+
def when_matched(self) -> t.Optional[exp.When]:
360+
if isinstance(self.kind, IncrementalByUniqueKeyKind):
361+
return self.kind.when_matched
362+
return None

sqlmesh/core/snapshot/evaluator.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -925,6 +925,7 @@ def insert(
925925
query_or_df,
926926
columns_to_types=model.columns_to_types,
927927
unique_key=model.unique_key,
928+
when_matched=model.when_matched,
928929
)
929930

930931
def append(
@@ -942,6 +943,7 @@ def append(
942943
query_or_df,
943944
columns_to_types=model.columns_to_types,
944945
unique_key=model.unique_key,
946+
when_matched=model.when_matched,
945947
)
946948

947949

tests/core/engine_adapter/test_base.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,53 @@ def test_merge_upsert_pandas(make_mocked_engine_adapter: t.Callable):
814814
)
815815

816816

817+
def test_merge_when_matched(make_mocked_engine_adapter: t.Callable, assert_exp_eq):
818+
adapter = make_mocked_engine_adapter(EngineAdapter)
819+
820+
adapter.merge(
821+
target_table="target",
822+
source_table=t.cast(exp.Select, parse_one('SELECT "ID", ts, val FROM source')),
823+
columns_to_types={
824+
"ID": exp.DataType.Type.INT,
825+
"ts": exp.DataType.Type.TIMESTAMP,
826+
"val": exp.DataType.Type.INT,
827+
},
828+
unique_key=[exp.to_identifier("ID", quoted=True)],
829+
when_matched=exp.When(
830+
matched=True,
831+
source=False,
832+
then=exp.Update(
833+
expressions=[
834+
exp.column("val", "__MERGE_TARGET__").eq(exp.column("val", "__MERGE_SOURCE__")),
835+
exp.column("ts", "__MERGE_TARGET__").eq(
836+
exp.Coalesce(
837+
this=exp.column("ts", "__MERGE_SOURCE__"),
838+
expressions=[exp.column("ts", "__MERGE_TARGET__")],
839+
)
840+
),
841+
],
842+
),
843+
),
844+
)
845+
846+
assert_exp_eq(
847+
adapter.cursor.execute.call_args[0][0],
848+
"""
849+
MERGE INTO "target" AS "__MERGE_TARGET__" USING (
850+
SELECT
851+
"ID",
852+
"ts",
853+
"val"
854+
FROM "source"
855+
) AS "__MERGE_SOURCE__"
856+
ON "__MERGE_TARGET__"."ID" = "__MERGE_SOURCE__"."ID"
857+
WHEN MATCHED THEN UPDATE SET "__MERGE_TARGET__"."val" = "__MERGE_SOURCE__"."val", "__MERGE_TARGET__"."ts" = COALESCE("__MERGE_SOURCE__"."ts", "__MERGE_TARGET__"."ts")
858+
WHEN NOT MATCHED THEN INSERT ("ID", "ts", "val")
859+
VALUES ("__MERGE_SOURCE__"."ID", "__MERGE_SOURCE__"."ts", "__MERGE_SOURCE__"."val")
860+
""",
861+
)
862+
863+
817864
def test_scd_type_2(make_mocked_engine_adapter: t.Callable):
818865
adapter = make_mocked_engine_adapter(EngineAdapter)
819866

tests/core/test_snapshot_evaluator.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
from sqlmesh.core.audit import StandaloneAudit
1010
from sqlmesh.core.dialect import to_schema
1111
from sqlmesh.core.engine_adapter import EngineAdapter, create_engine_adapter
12-
from sqlmesh.core.engine_adapter.base import InsertOverwriteStrategy
12+
from sqlmesh.core.engine_adapter.base import (
13+
MERGE_SOURCE_ALIAS,
14+
MERGE_TARGET_ALIAS,
15+
InsertOverwriteStrategy,
16+
)
1317
from sqlmesh.core.environment import EnvironmentNamingInfo
1418
from sqlmesh.core.macros import RuntimeStage, macro
1519
from sqlmesh.core.model import (
@@ -1009,6 +1013,64 @@ def test_insert_into_scd_type_2(adapter_mock, make_snapshot):
10091013
)
10101014

10111015

1016+
def test_create_incremental_by_unique_key_updated_at_exp(adapter_mock, make_snapshot):
1017+
evaluator = SnapshotEvaluator(adapter_mock)
1018+
model = load_sql_based_model(
1019+
parse( # type: ignore
1020+
"""
1021+
MODEL (
1022+
name test_schema.test_model,
1023+
kind INCREMENTAL_BY_UNIQUE_KEY (
1024+
unique_key [id],
1025+
when_matched WHEN MATCHED THEN UPDATE SET target.name = source.name, target.updated_at = COALESCE(source.updated_at, target.updated_at)
1026+
)
1027+
);
1028+
1029+
SELECT id::int, name::string, updated_at::timestamp FROM tbl;
1030+
"""
1031+
)
1032+
)
1033+
1034+
snapshot = make_snapshot(model)
1035+
snapshot.categorize_as(SnapshotChangeCategory.BREAKING)
1036+
1037+
evaluator.evaluate(
1038+
snapshot,
1039+
"2020-01-01",
1040+
"2020-01-02",
1041+
"2020-01-02",
1042+
snapshots={},
1043+
)
1044+
1045+
adapter_mock.merge.assert_called_once_with(
1046+
snapshot.table_name(),
1047+
model.render_query(),
1048+
columns_to_types={
1049+
"id": exp.DataType.build("INT"),
1050+
"name": exp.DataType.build("STRING"),
1051+
"updated_at": exp.DataType.build("TIMESTAMP"),
1052+
},
1053+
unique_key=[exp.to_column("id")],
1054+
when_matched=exp.When(
1055+
matched=True,
1056+
source=False,
1057+
then=exp.Update(
1058+
expressions=[
1059+
exp.column("name", MERGE_TARGET_ALIAS).eq(
1060+
exp.column("name", MERGE_SOURCE_ALIAS)
1061+
),
1062+
exp.column("updated_at", MERGE_TARGET_ALIAS).eq(
1063+
exp.Coalesce(
1064+
this=exp.column("updated_at", MERGE_SOURCE_ALIAS),
1065+
expressions=[exp.column("updated_at", MERGE_TARGET_ALIAS)],
1066+
)
1067+
),
1068+
],
1069+
),
1070+
),
1071+
)
1072+
1073+
10121074
def test_standalone_audit(mocker: MockerFixture, adapter_mock, make_snapshot):
10131075
evaluator = SnapshotEvaluator(adapter_mock)
10141076

0 commit comments

Comments
 (0)