Skip to content

Commit 1c5f127

Browse files
fix(quicksight): resolve column aliases to correct upstream column in lineage
Fixes #26670
1 parent 7e7ed3b commit 1c5f127

2 files changed

Lines changed: 284 additions & 3 deletions

File tree

ingestion/src/metadata/ingestion/source/dashboard/quicksight/metadata.py

Lines changed: 94 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -45,13 +45,18 @@
4545
SourceUrl,
4646
Uuid,
4747
)
48-
from metadata.generated.schema.type.entityLineage import EntitiesEdge, LineageDetails
48+
from metadata.generated.schema.type.entityLineage import (
49+
ColumnLineage,
50+
EntitiesEdge,
51+
LineageDetails,
52+
)
4953
from metadata.generated.schema.type.entityLineage import Source as LineageSource
5054
from metadata.generated.schema.type.entityReference import EntityReference
5155
from metadata.ingestion.api.models import Either
5256
from metadata.ingestion.api.steps import InvalidSourceException
5357
from metadata.ingestion.lineage.models import ConnectionTypeDialectMapper, Dialect
5458
from metadata.ingestion.lineage.parser import LineageParser
59+
from metadata.ingestion.lineage.sql_lineage import get_column_fqn
5560
from metadata.ingestion.ometa.ometa_api import OpenMetadata
5661
from metadata.ingestion.source.dashboard.dashboard_service import (
5762
LINEAGE_MAP,
@@ -233,6 +238,91 @@ def _describe_data_sets(self, dataset_id, dashboard_details: DashboardDetail) ->
233238
logger.info(f"Cannot parse lineage from the dashboard: {dashboard_details.Name} to dataset due to: {err}")
234239
return dataset_id, []
235240

241+
def _build_column_lineage_from_parser(
    self,
    lineage_parser: LineageParser,
    from_entity: Table,
    data_model_entity: DashboardDataModel,
) -> List[ColumnLineage]:
    """
    Build column-level lineage using SQL parser alias mappings.

    When a QuickSight CustomSql query uses column aliases
    (e.g. ``SELECT id AS relation_id``), name-based matching fails
    because the alias name is matched against upstream columns instead
    of tracing back through the SQL expression. This method walks the
    (source column, target column) pairs exposed by
    ``lineage_parser.column_lineage`` so the true source column (``id``)
    is linked to the alias (``relation_id``).

    Pairs whose source column has parent information are kept only when
    one of the parents names ``from_entity``; this avoids attributing a
    shared column name (e.g. ``id``) to the wrong upstream table in a
    multi-table query. Pairs without parent information are always tried.

    Args:
        lineage_parser: parser whose ``column_lineage`` yields sequences
            holding the source column first and the target column last.
        from_entity: upstream table the lineage edge starts from.
        data_model_entity: QuickSight data model the edge points to.

    Returns:
        One ``ColumnLineage`` per resolvable pair. When the parser has no
        column lineage at all, the result of the name-based
        :meth:`_get_column_lineage` fallback is returned instead. When the
        parser has lineage but none belongs to ``from_entity``, the list
        is empty.

    Issue #26670.
    """
    # Read the parser output once; it is used both for the fallback
    # decision and for the pair iteration below.
    parser_pairs = lineage_parser.column_lineage or []

    # Only fall back to name-based matching when the parser found NO
    # column lineage globally (parse failure, too complex, no aliases).
    # If the parser DID produce lineage but none matches this specific
    # from_entity, an empty list is returned further down rather than
    # manufacturing incorrect cross-table lineage.
    if not parser_pairs:
        columns = [col.name.root for col in data_model_entity.columns or []]
        return self._get_column_lineage(
            from_entity, data_model_entity, columns
        )

    entity_table = from_entity.name.root.lower()
    column_lineage: List[ColumnLineage] = []

    for col_pair in parser_pairs:
        # Guard: parser may return single-element tuples in edge cases
        if len(col_pair) < 2:
            continue

        src_col = col_pair[0]
        tgt_col = col_pair[-1]

        raw_parent = src_col._parent  # pylint: disable=protected-access
        if raw_parent:
            # NOTE(review): sqllineage appears to keep ``_parent`` as a
            # *set* of candidate parents rather than a single table --
            # confirm against the pinned parser version. Stringifying the
            # set itself would never equal a table name, so every
            # candidate is inspected individually; a single parent object
            # is still accepted as-is.
            if isinstance(raw_parent, (set, frozenset, list, tuple)):
                candidates = raw_parent
            else:
                candidates = (raw_parent,)
            # Parents may be qualified ('<default>.table', 'schema.table');
            # the last dotted segment is the table name in both cases.
            parent_tables = {
                str(candidate).split(".")[-1].lower()
                for candidate in candidates
            }
            if entity_table not in parent_tables:
                continue

        # raw_name may be fully-qualified (e.g. 'schema.table.col');
        # keep just the column name portion.
        src_col_name = src_col.raw_name.split(".")[-1]
        tgt_col_name = tgt_col.raw_name.split(".")[-1]

        try:
            from_col_fqn = get_column_fqn(
                table_entity=from_entity, column=src_col_name
            )
            to_col_fqn = self._get_data_model_column_fqn(
                data_model_entity=data_model_entity,
                column=tgt_col_name,
            )
            if from_col_fqn and to_col_fqn:
                column_lineage.append(
                    ColumnLineage(
                        fromColumns=[from_col_fqn],
                        toColumn=to_col_fqn,
                    )
                )
        except Exception as exc:  # pylint: disable=broad-except
            # Best effort: one unresolvable pair must not abort the rest.
            logger.debug(
                f"Failed to build column lineage for {src_col_name} -> {tgt_col_name}: {exc}"
            )
            logger.debug(traceback.format_exc())

    return column_lineage
325+
236326
def _yield_lineage_from_query(
237327
self,
238328
data_model_entity,
@@ -308,8 +398,9 @@ def _yield_lineage_from_query(
308398
)
309399
for from_entity in from_entities or []:
310400
if from_entity is not None and data_model_entity is not None:
311-
columns = [col.name.root for col in data_model_entity.columns]
312-
column_lineage = self._get_column_lineage(from_entity, data_model_entity, columns)
401+
column_lineage = self._build_column_lineage_from_parser(
402+
lineage_parser, from_entity, data_model_entity
403+
)
313404
lineage_details.columnsLineage = column_lineage
314405
yield Either(
315406
right=AddLineageRequest(

ingestion/tests/unit/topology/dashboard/test_quicksight.py

Lines changed: 190 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -364,6 +364,7 @@ def describe_data_set_side_effect(**kwargs):
364364
col_names_b = {col.name.root for col in dm_b.columns}
365365
assert col_names_b == {"email", "created_at"}
366366

367+
@pytest.mark.order(9)
367368
def test_chart_source_state_populated(self):
368369
"""Verify register_record_chart populates chart_source_state after yield_dashboard_chart."""
369370
dashboard_details = DashboardDetail(**{**MOCK_DASHBOARD_DETAILS, "Version": mock_data["Version"]})
@@ -372,3 +373,192 @@ def test_chart_source_state_populated(self):
372373
assert len(self.quicksight.chart_source_state) == len(mock_data["Version"]["Sheets"])
373374
for fqn in self.quicksight.chart_source_state:
374375
assert "quicksight_source_test" in fqn
376+
377+
@pytest.mark.order(10)
378+
def test_build_column_lineage_from_parser_resolves_alias(self):
    """
    An aliased projection (``SELECT src_col AS alias_col``) must produce
    lineage from the upstream source column to the aliased data model
    column, instead of relying on both sides sharing a name.
    Issue #26670.
    """
    upstream_fqn = "postgres.public.relation_table.id"
    data_model_fqn = "quicksight_service.dataset.relation_id"

    # Single-table query: the source column carries no parent information.
    source_column = MagicMock()
    source_column.raw_name = "id"
    source_column._parent = None

    target_column = MagicMock()
    target_column.raw_name = "relation_id"

    parser = MagicMock()
    parser.column_lineage = [(source_column, target_column)]

    upstream_table = MagicMock()
    upstream_table.name.root = "relation_table"
    data_model = MagicMock()

    with patch(
        "metadata.ingestion.source.dashboard.quicksight.metadata.get_column_fqn",
        return_value=upstream_fqn,
    ) as table_fqn_lookup, patch.object(
        self.quicksight,
        "_get_data_model_column_fqn",
        return_value=data_model_fqn,
    ) as data_model_fqn_lookup:
        lineage = self.quicksight._build_column_lineage_from_parser(
            parser, upstream_table, data_model
        )

    # The upstream lookup uses the real column name, the data model
    # lookup uses the alias.
    table_fqn_lookup.assert_called_once_with(
        table_entity=upstream_table, column="id"
    )
    data_model_fqn_lookup.assert_called_once_with(
        data_model_entity=data_model, column="relation_id"
    )

    assert len(lineage) == 1
    assert lineage[0].fromColumns == [upstream_fqn]
    assert lineage[0].toColumn == data_model_fqn
424+
425+
@pytest.mark.order(11)
426+
def test_build_column_lineage_from_parser_multi_table_filters_correctly(self):
    """
    For a CustomSql join where several tables expose the same column name
    (e.g. t1.id and t2.id), only the pairs whose source column belongs to
    from_entity may end up in the lineage.
    Issue #26670.
    """
    upstream_fqn = "postgres.public.relation_table.id"
    data_model_fqn = "quicksight_service.dataset.relation_id"

    # Pair whose source column lives in the table under test.
    matching_source = MagicMock()
    matching_source.raw_name = "id"
    matching_source._parent = MagicMock()
    matching_source._parent.__str__.return_value = "relation_table"
    matching_target = MagicMock()
    matching_target.raw_name = "relation_id"

    # Pair with the same source column name 'id', but from another table.
    foreign_source = MagicMock()
    foreign_source.raw_name = "id"
    foreign_source._parent = MagicMock()
    foreign_source._parent.__str__.return_value = "other_table"
    foreign_target = MagicMock()
    foreign_target.raw_name = "other_relation_id"

    parser = MagicMock()
    parser.column_lineage = [
        (matching_source, matching_target),
        (foreign_source, foreign_target),
    ]

    upstream_table = MagicMock()
    upstream_table.name.root = "relation_table"
    data_model = MagicMock()

    with patch(
        "metadata.ingestion.source.dashboard.quicksight.metadata.get_column_fqn",
        return_value=upstream_fqn,
    ), patch.object(
        self.quicksight,
        "_get_data_model_column_fqn",
        return_value=data_model_fqn,
    ):
        lineage = self.quicksight._build_column_lineage_from_parser(
            parser, upstream_table, data_model
        )

    # The pair coming from 'other_table' must have been filtered out.
    assert len(lineage) == 1
    assert lineage[0].fromColumns == [upstream_fqn]
    assert lineage[0].toColumn == data_model_fqn
484+
485+
@pytest.mark.order(12)
486+
def test_build_column_lineage_no_fallback_when_parser_has_global_lineage(self):
    """
    Regression test for the multi-table fallback bug (Issue #26670).

    The parser produced column lineage, but every pair belongs to a
    different upstream table than from_entity (multi-table JOIN). The
    method must then return an empty list and leave the name-based
    fallback ``_get_column_lineage`` untouched, since calling it would
    manufacture incorrect cross-table column lineage.
    """
    # The only pair the parser reports originates from 'users_table'.
    foreign_source = MagicMock()
    foreign_source.raw_name = "user_id"
    foreign_source._parent = MagicMock()
    foreign_source._parent.__str__.return_value = "users_table"

    foreign_target = MagicMock()
    foreign_target.raw_name = "uid"

    parser = MagicMock()
    parser.column_lineage = [(foreign_source, foreign_target)]

    # The table under test is 'orders_table', so nothing matches it.
    upstream_table = MagicMock()
    upstream_table.name.root = "orders_table"
    data_model = MagicMock()

    with patch.object(self.quicksight, "_get_column_lineage") as name_based_fallback:
        lineage = self.quicksight._build_column_lineage_from_parser(
            parser, upstream_table, data_model
        )

    # No manufactured lineage, and the fallback was never consulted.
    assert lineage == []
    name_based_fallback.assert_not_called()
527+
528+
@pytest.mark.order(13)
529+
def test_build_column_lineage_from_parser_falls_back_when_empty(self):
    """
    With an empty ``lineage_parser.column_lineage`` (parser failed or no
    aliases), the method must delegate to the name-based matching in
    ``_get_column_lineage`` and hand back its result unchanged.
    Issue #26670.
    """
    parser = MagicMock()
    parser.column_lineage = []

    upstream_table = MagicMock()
    upstream_table.name.root = "relation_table"

    # Assign ``name`` after construction: passing name= to MagicMock()
    # would label the mock instead of creating the attribute.
    data_model_column = MagicMock()
    data_model_column.name = MagicMock()
    data_model_column.name.root = "col_a"

    data_model = MagicMock()
    data_model.columns = [data_model_column]

    expected_lineage = [MagicMock()]

    with patch.object(
        self.quicksight,
        "_get_column_lineage",
        return_value=expected_lineage,
    ) as name_based_fallback:
        lineage = self.quicksight._build_column_lineage_from_parser(
            parser, upstream_table, data_model
        )

    # The fallback receives the data model's column names and its return
    # value is passed through as-is.
    assert lineage is expected_lineage
    name_based_fallback.assert_called_once_with(
        upstream_table, data_model, ["col_a"]
    )

0 commit comments

Comments
 (0)