fix(quicksight): resolve column aliases to correct upstream column in lineage (#26670)#27031
Conversation
|
Hi there 👋 Thanks for your contribution! The OpenMetadata team will review the PR shortly! Once it has been labeled as Let us know if you need any help! |
|
Hi there 👋 Thanks for your contribution! The OpenMetadata team will review the PR shortly! Once it has been labeled as Let us know if you need any help! |
There was a problem hiding this comment.
Pull request overview
This PR aims to fix incorrect QuickSight column-level lineage for datasets backed by CustomSql by using SQL parser-derived column mappings (including aliases) instead of relying solely on name-based matching.
Changes:
- Route CustomSql-based lineage generation through SQL parser column mappings to resolve aliased target columns to their true upstream source columns.
- Add unit tests for alias-based lineage mapping and fallback behavior when the parser produces no column lineage.
Reviewed changes
Copilot reviewed 2 out of 2 changed files in this pull request and generated 2 comments.
| File | Description |
|---|---|
ingestion/src/metadata/ingestion/source/dashboard/quicksight/metadata.py |
Introduces _build_column_lineage_from_parser and uses it for CustomSql query lineage to map source→alias columns via parsed SQL. |
ingestion/tests/unit/topology/dashboard/test_quicksight.py |
Adds unit tests to validate alias resolution and fallback-to-name-matching behavior. |
| for col_pair in lineage_parser.column_lineage or []: | ||
| try: | ||
| src_col = col_pair[0] | ||
| tgt_col = col_pair[-1] | ||
| from_col = get_column_fqn( | ||
| table_entity=from_entity, column=src_col.raw_name | ||
| ) | ||
| to_col = self._get_data_model_column_fqn( | ||
| data_model_entity=data_model_entity, | ||
| column=tgt_col.raw_name, | ||
| ) |
There was a problem hiding this comment.
_build_column_lineage_from_parser matches parsed columns to from_entity using only src_col.raw_name. In multi-table CustomSql queries, different source tables often share column names (e.g., id), so this can incorrectly attach a lineage pair to the wrong from_entity (because get_column_fqn will happily resolve id on any table that has it). Consider filtering lineage_parser.column_lineage by the source column’s parent table (e.g., src_col._parent) or by passing the current source table context into this method and using get_column_fqn(..., table=..., schema=..., database=...) so only pairs belonging to the current upstream table are emitted.
| @pytest.mark.order(9) | ||
| def test_build_column_lineage_from_parser_uses_aliases(self): | ||
| """ | ||
| When a CustomSql query uses column aliases (SELECT src_col AS alias_col), | ||
| _build_column_lineage_from_parser must resolve the source column name and | ||
| map it to the aliased data model column, not drop the lineage pair. | ||
| """ | ||
| from unittest.mock import MagicMock | ||
|
|
||
| src_col = MagicMock() | ||
| src_col.raw_name = "original_col" | ||
|
|
||
| tgt_col = MagicMock() | ||
| tgt_col.raw_name = "alias_col" | ||
|
|
||
| mock_parser = MagicMock() | ||
| mock_parser.column_lineage = [(src_col, tgt_col)] | ||
|
|
||
| src_fqn = "db.schema.my_table.original_col" | ||
| alias_fqn = "qs_service.dataset.alias_col" | ||
|
|
||
| mock_from_entity = MagicMock() | ||
| mock_data_model = MagicMock() | ||
|
|
||
| with patch( | ||
| "metadata.ingestion.source.dashboard.quicksight.metadata.get_column_fqn", | ||
| return_value=src_fqn, | ||
| ) as mock_get_col_fqn: | ||
| with patch.object( | ||
| self.quicksight, | ||
| "_get_data_model_column_fqn", | ||
| return_value=alias_fqn, | ||
| ) as mock_get_dm_col_fqn: | ||
| result = self.quicksight._build_column_lineage_from_parser( | ||
| mock_parser, mock_from_entity, mock_data_model | ||
| ) | ||
|
|
||
| mock_get_col_fqn.assert_called_once_with( | ||
| table_entity=mock_from_entity, column="original_col" | ||
| ) | ||
| mock_get_dm_col_fqn.assert_called_once_with( | ||
| data_model_entity=mock_data_model, column="alias_col" | ||
| ) | ||
| assert len(result) == 1 | ||
| assert result[0].fromColumns == [src_fqn] | ||
| assert result[0].toColumn == alias_fqn | ||
|
|
||
| @pytest.mark.order(10) | ||
| def test_build_column_lineage_from_parser_falls_back_when_no_parser_results(self): | ||
| """ | ||
| When lineage_parser.column_lineage is empty, _build_column_lineage_from_parser | ||
| must fall back to name-based matching via _get_column_lineage. | ||
| """ | ||
| mock_parser = MagicMock() | ||
| mock_parser.column_lineage = [] | ||
|
|
||
| fallback_lineage = [MagicMock()] | ||
| mock_from_entity = MagicMock() | ||
| mock_data_model = MagicMock() | ||
| mock_data_model.columns = [MagicMock(name=MagicMock(root="col_a"))] | ||
|
|
||
| with patch.object( | ||
| self.quicksight, | ||
| "_get_column_lineage", | ||
| return_value=fallback_lineage, | ||
| ) as mock_get_col_lineage: | ||
| result = self.quicksight._build_column_lineage_from_parser( | ||
| mock_parser, mock_from_entity, mock_data_model | ||
| ) | ||
|
|
||
| mock_get_col_lineage.assert_called_once() | ||
| assert result is fallback_lineage |
There was a problem hiding this comment.
The added unit tests validate alias mapping and the empty-parser fallback, but they don’t cover the most failure-prone scenario for this change: a CustomSql query that references multiple upstream tables where the same source column name exists in more than one table (e.g., t1.id and t2.id). Adding a test that asserts _build_column_lineage_from_parser only produces column lineage for the correct upstream table would prevent regressions of incorrect column-level lineage resolution.
|
Hi there 👋 Thanks for your contribution! The OpenMetadata team will review the PR shortly! Once it has been labeled as Let us know if you need any help! |
| src_col = MagicMock() | ||
| src_col.raw_name = "original_col" | ||
|
|
||
| tgt_col = MagicMock() | ||
| tgt_col.raw_name = "alias_col" | ||
|
|
||
| mock_parser = MagicMock() | ||
| mock_parser.column_lineage = [(src_col, tgt_col)] | ||
|
|
||
| src_fqn = "db.schema.my_table.original_col" |
There was a problem hiding this comment.
This test sets src_col.raw_name to a bare column name (original_col), but in real LineageParser.column_lineage output raw_name is frequently fully-qualified (e.g. testdb.public.users.id). As written, the test can pass even if _build_column_lineage_from_parser fails to handle fully-qualified raw_name values (and would then fall back to name-based matching).
To prevent regressions, consider updating the fixture to use a fully-qualified raw_name (and/or a quoted identifier) and assert that the implementation still resolves the correct source column FQN.
|
The Python checkstyle failed. Please run You can install the pre-commit hooks with |
🔴 Playwright Results — 1 failure(s), 22 flaky✅ 3601 passed · ❌ 1 failed · 🟡 22 flaky · ⏭️ 207 skipped
Genuine Failures (failed on all attempts)❌
|
Code Review 👍 Approved with suggestions 1 resolved / 2 findingsFixes QuickSight column alias resolution in lineage tracking by correctly mapping upstream columns. The MagicMock 💡 Edge Case: col_pair[0] and col_pair[-1] alias to same object for 1-element tuples📄 ingestion/src/metadata/ingestion/source/dashboard/quicksight/metadata.py:600-601 In Suggested fix✅ 1 resolved✅ Bug: MagicMock
|
| Compact |
|
Was this helpful? React with 👍 / 👎 | Gitar
|
The Python checkstyle failed. Please run You can install the pre-commit hooks with |
|
TeddyCr
left a comment
There was a problem hiding this comment.
can you check copilot comments, all valid
| for col_pair in lineage_parser.column_lineage or []: | ||
| try: | ||
| src_col = col_pair[0] | ||
| tgt_col = col_pair[-1] | ||
| from_col = get_column_fqn( | ||
| table_entity=from_entity, column=src_col.raw_name | ||
| ) | ||
| to_col = self._get_data_model_column_fqn( | ||
| data_model_entity=data_model_entity, | ||
| column=tgt_col.raw_name, | ||
| ) |
| src_col = MagicMock() | ||
| src_col.raw_name = "original_col" | ||
|
|
||
| tgt_col = MagicMock() | ||
| tgt_col.raw_name = "alias_col" | ||
|
|
||
| mock_parser = MagicMock() | ||
| mock_parser.column_lineage = [(src_col, tgt_col)] | ||
|
|
||
| src_fqn = "db.schema.my_table.original_col" |
| @pytest.mark.order(9) | ||
| def test_build_column_lineage_from_parser_uses_aliases(self): | ||
| """ | ||
| When a CustomSql query uses column aliases (SELECT src_col AS alias_col), | ||
| _build_column_lineage_from_parser must resolve the source column name and | ||
| map it to the aliased data model column, not drop the lineage pair. | ||
| """ | ||
| from unittest.mock import MagicMock | ||
|
|
||
| src_col = MagicMock() | ||
| src_col.raw_name = "original_col" | ||
|
|
||
| tgt_col = MagicMock() | ||
| tgt_col.raw_name = "alias_col" | ||
|
|
||
| mock_parser = MagicMock() | ||
| mock_parser.column_lineage = [(src_col, tgt_col)] | ||
|
|
||
| src_fqn = "db.schema.my_table.original_col" | ||
| alias_fqn = "qs_service.dataset.alias_col" | ||
|
|
||
| mock_from_entity = MagicMock() | ||
| mock_data_model = MagicMock() | ||
|
|
||
| with patch( | ||
| "metadata.ingestion.source.dashboard.quicksight.metadata.get_column_fqn", | ||
| return_value=src_fqn, | ||
| ) as mock_get_col_fqn: | ||
| with patch.object( | ||
| self.quicksight, | ||
| "_get_data_model_column_fqn", | ||
| return_value=alias_fqn, | ||
| ) as mock_get_dm_col_fqn: | ||
| result = self.quicksight._build_column_lineage_from_parser( | ||
| mock_parser, mock_from_entity, mock_data_model | ||
| ) | ||
|
|
||
| mock_get_col_fqn.assert_called_once_with( | ||
| table_entity=mock_from_entity, column="original_col" | ||
| ) | ||
| mock_get_dm_col_fqn.assert_called_once_with( | ||
| data_model_entity=mock_data_model, column="alias_col" | ||
| ) | ||
| assert len(result) == 1 | ||
| assert result[0].fromColumns == [src_fqn] | ||
| assert result[0].toColumn == alias_fqn | ||
|
|
||
| @pytest.mark.order(10) | ||
| def test_build_column_lineage_from_parser_falls_back_when_no_parser_results(self): | ||
| """ | ||
| When lineage_parser.column_lineage is empty, _build_column_lineage_from_parser | ||
| must fall back to name-based matching via _get_column_lineage. | ||
| """ | ||
| mock_parser = MagicMock() | ||
| mock_parser.column_lineage = [] | ||
|
|
||
| fallback_lineage = [MagicMock()] | ||
| mock_from_entity = MagicMock() | ||
| mock_data_model = MagicMock() | ||
| mock_data_model.columns = [MagicMock(name=MagicMock(root="col_a"))] | ||
|
|
||
| with patch.object( | ||
| self.quicksight, | ||
| "_get_column_lineage", | ||
| return_value=fallback_lineage, | ||
| ) as mock_get_col_lineage: | ||
| result = self.quicksight._build_column_lineage_from_parser( | ||
| mock_parser, mock_from_entity, mock_data_model | ||
| ) | ||
|
|
||
| mock_get_col_lineage.assert_called_once() | ||
| assert result is fallback_lineage |
|
hi @SaaiAravindhRaja, thank you for this pr, but this issue was assigned to another pr in progress here, we will reopen this pr if needed |



Fixes #26670
QuickSight datasets backed by
CustomSqlhave aliased columns (e.g.SELECT id AS relation_id). The connector was matching by column name instead of tracing the alias back through the SQL expression, leading to incorrect column-level lineage.