diff --git a/ingestion/src/metadata/ingestion/models/custom_basemodel_validation.py b/ingestion/src/metadata/ingestion/models/custom_basemodel_validation.py index e2a183d0371b..c4dd8d792e58 100644 --- a/ingestion/src/metadata/ingestion/models/custom_basemodel_validation.py +++ b/ingestion/src/metadata/ingestion/models/custom_basemodel_validation.py @@ -22,6 +22,9 @@ RESERVED_COLON_KEYWORD = "__reserved__colon__" RESERVED_ARROW_KEYWORD = "__reserved__arrow__" RESERVED_QUOTE_KEYWORD = "__reserved__quote__" +RESERVED_NEWLINE_KEYWORD = "__reserved__newline__" +RESERVED_CARRIAGE_RETURN_KEYWORD = "__reserved__carriage_return__" +RESERVED_TAB_KEYWORD = "__reserved__tab__" class TransformDirection(Enum): @@ -115,6 +118,9 @@ def revert_separators(value): value.replace(RESERVED_COLON_KEYWORD, "::") .replace(RESERVED_ARROW_KEYWORD, ">") .replace(RESERVED_QUOTE_KEYWORD, '"') + .replace(RESERVED_NEWLINE_KEYWORD, "\n") + .replace(RESERVED_CARRIAGE_RETURN_KEYWORD, "\r") + .replace(RESERVED_TAB_KEYWORD, "\t") ) @@ -123,6 +129,9 @@ def replace_separators(value): value.replace("::", RESERVED_COLON_KEYWORD) .replace(">", RESERVED_ARROW_KEYWORD) .replace('"', RESERVED_QUOTE_KEYWORD) + .replace("\n", RESERVED_NEWLINE_KEYWORD) + .replace("\r", RESERVED_CARRIAGE_RETURN_KEYWORD) + .replace("\t", RESERVED_TAB_KEYWORD) ) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/client.py b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/client.py index 0a4d32ffc916..2ee079dba527 100644 --- a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/client.py +++ b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/client.py @@ -542,11 +542,11 @@ def fetch_workspace_scan_status(self, scan_id: str) -> Optional[WorkSpaceScanRes return None def fetch_workspace_scan_result(self, scan_id: str) -> Optional[Workspaces]: # noqa: UP045 - """Get Workspace scan result by id method - Args: - scan_id: - Returns: - Workspaces + """Get Workspace scan result by id method. + + Parse each workspace individually so a single malformed workspace + (or any nested entity that still fails validation) does not invalidate + the whole scan-result response and drop the entire chunk of workspaces. """ try: logger.debug( @@ -554,7 +554,29 @@ def fetch_workspace_scan_result(self, scan_id: str) -> Optional[Workspaces]: # " to get workspace scan result" ) response_data = self.client.get(f"/myorg/admin/workspaces/scanResult/{scan_id}") - return Workspaces(**response_data) + if not response_data: + return None + parsed_workspaces: List[Group] = [] # noqa: UP006 + for raw_ws in response_data.get("workspaces", []) or []: # pyright: ignore[reportAttributeAccessIssue] + if isinstance(raw_ws, dict) and raw_ws.get("id") is not None: + try: + parsed_workspaces.append(Group(**raw_ws)) + except Exception as ws_exc: # pylint: disable=broad-except + logger.debug(traceback.format_exc()) + logger.warning( + "Skipping workspace [id=%s] in scan [%s] due to parse error: %s", + raw_ws.get("id"), + scan_id, + ws_exc, + ) + else: + workspace_entry_type = type(raw_ws).__name__ + logger.warning( + "Skipping a workspace in scan [%s] due to missing 'id' field or invalid format. Entry type: %s", + scan_id, + workspace_entry_type, + ) + return Workspaces(workspaces=parsed_workspaces) except Exception as exc: # pylint: disable=broad-except logger.debug(traceback.format_exc()) logger.warning(f"Error fetching workspace scan result: {exc}") diff --git a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/metadata.py b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/metadata.py index 28da6876b900..2a847952751e 100644 --- a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/metadata.py +++ b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/metadata.py @@ -306,6 +306,12 @@ def get_dashboard(self) -> Any: for dashboard in self.get_dashboards_list() or []: dashboard_details = self.get_dashboard_details(dashboard) dashboard_name = self.get_dashboard_name(dashboard_details) + if not dashboard_name: + logger.debug( + "Skipping PowerBI dashboard with empty name on workspace [%s]", + workspace.name, # pyright: ignore[reportOptionalMemberAccess] + ) + continue if filter_by_dashboard( self.source_config.dashboardFilterPattern, dashboard_name, @@ -338,7 +344,7 @@ def get_dashboards_list( """ return self.context.get().workspace.reports + self.context.get().workspace.dashboards # pyright: ignore[reportAttributeAccessIssue] - def get_dashboard_name(self, dashboard: Union[PowerBIDashboard, PowerBIReport]) -> str: # noqa: UP007 + def get_dashboard_name(self, dashboard: Union[PowerBIDashboard, PowerBIReport]) -> str | None: # noqa: UP007 # pyright: ignore[reportIncompatibleMethodOverride] """ Get Dashboard Name """ @@ -375,7 +381,9 @@ def _get_report_url(self, workspace_id: str, dashboard_details: PowerBIReport) - reports_prefix = RDL_REPORTS_PREFIX try: pages: Optional[List[ReportPage]] = self.client.api_client.fetch_report_pages(workspace_id, dashboard_id) # noqa: UP006, UP045 - if len(pages) >= 1: + if ( + pages and pages[0].name + ): # if there are pages and page has name then only add page id in url: # if there are pages and page has name then only add page id in url # get first page out of multiple pages otherwise # get page if of single page page_id = pages[0].name @@ -494,7 +502,6 @@ def yield_dashboard_chart(self, dashboard_details: Group) -> Iterable[Either[Cre if filter_by_chart(self.source_config.chartFilterPattern, chart_display_name): self.status.filter(chart_display_name, "Chart Pattern not Allowed") continue - self.state.add_dashboard_chart(dashboard_details.id, chart.id) chart_request = CreateChartRequest( name=EntityName(chart.id), displayName=chart_display_name, @@ -509,6 +516,7 @@ def yield_dashboard_chart(self, dashboard_details: Group) -> Iterable[Either[Cre service=FullyQualifiedEntityName(self.context.get().dashboard_service), # pyright: ignore[reportAttributeAccessIssue] ) yield Either(right=chart_request) + self.state.add_dashboard_chart(dashboard_details.id, chart.id) self.register_record_chart(chart_request=chart_request) except Exception as exc: yield Either( @@ -525,6 +533,12 @@ def _get_child_measures(self, table: PowerBiTable) -> List[Column]: # noqa: UP0 """ measures = [] for measure in table.measures or []: + if not measure.name: + logger.debug( + "Skipping PowerBI measure with empty name on table [%s]", + table.name, + ) + continue try: measure_type = DataType.MEASURE_VISIBLE if measure.isHidden: @@ -551,6 +565,12 @@ def _get_child_columns(self, table: PowerBiTable) -> List[Column]: # noqa: UP00 """ columns = [] for column in table.columns or []: + if not column.name: + logger.debug( + "Skipping PowerBI column with empty name on table [%s]", + table.name, + ) + continue try: parsed_column = { "dataTypeDisplay": (column.dataType if column.dataType else DataType.UNKNOWN.value), @@ -571,6 +591,12 @@ def _get_column_info(self, dataset: Dataset) -> Optional[List[Column]]: # noqa: """Build columns from dataset""" datasource_columns = [] for table in dataset.tables or []: + if not table.name: + logger.debug( + "Skipping PowerBI table with empty name on dataset [id=%s]", + dataset.id, + ) + continue try: table_display_name = None if self.service_connection.displayTableNameFromSource: @@ -603,6 +629,9 @@ def _get_dataflow_column_info(self, dataflow_export: DataflowExportResponse) -> """Build columns from dataflow export response entities""" datasource_columns = [] for entity in dataflow_export.entities or []: + if not entity.name: + logger.debug("Skipping PowerBI dataflow column entity with empty name") + continue try: parsed_table = { "dataTypeDisplay": "PowerBI Table", @@ -614,6 +643,12 @@ def _get_dataflow_column_info(self, dataflow_export: DataflowExportResponse) -> } child_columns = [] for attribute in entity.attributes or []: + if not attribute.name: + logger.debug( + "Skipping PowerBI dataflow attribute(column entity) with empty name on entity [%s]", + entity.name, + ) + continue try: parsed_column = { "dataTypeDisplay": (attribute.dataType if attribute.dataType else DataType.UNKNOWN.value), @@ -651,6 +686,12 @@ def _filtered_datamodels(self) -> list: return cached filtered: list = [] for dataset in self._get_datamodels_list() or []: + if not dataset.name: + logger.debug( + "Skipping PowerBI data model with empty name [id=%s]", + dataset.id, + ) + continue if filter_by_datamodel(self.source_config.dataModelFilterPattern, dataset.name): self.status.filter(dataset.name, "Data model filtered out.") continue @@ -662,55 +703,68 @@ def yield_datamodel(self, dashboard_details: Group) -> Iterable[Either[CreateDas """ Get All the Powerbi Datasets """ + if not self.source_config.includeDataModels: + return try: - if self.source_config.includeDataModels: - for dataset in self._filtered_datamodels(): - if isinstance(dataset, Dataset): - data_model_type = DataModelType.PowerBIDataModel.value - datamodel_columns = self._get_column_info(dataset) - source_url = self._get_dataset_url( - workspace_id=self.context.get().workspace.id, # pyright: ignore[reportAttributeAccessIssue] - dataset_id=dataset.id, - ) - elif isinstance(dataset, Dataflow): - data_model_type = DataModelType.PowerBIDataFlow.value - datamodel_columns = [] - source_url = self._get_dataflow_url( - workspace_id=self.context.get().workspace.id, # pyright: ignore[reportAttributeAccessIssue] - dataflow_id=dataset.id, - ) - # dataflow export api for detailed metadata - # api: https://api.powerbi.com/v1.0/myorg/admin/dataflows/DATAFLOW_ID/export - # doc: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/dataflows-export-dataflow-as-admin - dataflow_export = self.client.api_client.fetch_dataflow_export(dataflow_id=dataset.id) - if dataflow_export: - self.state.cache_dataflow_export(dataset.id, dataflow_export) - datamodel_columns = self._get_dataflow_column_info(dataflow_export) - else: - logger.warning(f"Unknown dataset type: {type(dataset)}, name: {dataset.name}") - continue - data_model_request = CreateDashboardDataModelRequest( - name=EntityName(dataset.id), - displayName=dataset.name, - description=(Markdown(dataset.description) if dataset.description else None), - service=FullyQualifiedEntityName(self.context.get().dashboard_service), # pyright: ignore[reportAttributeAccessIssue] - dataModelType=data_model_type, - serviceType=DashboardServiceType.PowerBI.value, - columns=datamodel_columns, - project=self.get_project_name(dashboard_details=dataset), - owners=self.get_owner_ref(dashboard_details=dataset), - sourceUrl=SourceUrl(source_url), - ) - yield Either(right=data_model_request) - self.register_record_datamodel(datamodel_request=data_model_request) + datasets = self._filtered_datamodels() except Exception as exc: yield Either( left=StackTraceError( - name=dataset.name, - error=f"Error yielding Data Model [{dataset.name}]: {exc}", + name="datamodels", + error=f"Error fetching PowerBI data models: {exc}", stackTrace=traceback.format_exc(), ) ) + return + for dataset in datasets: + try: + if isinstance(dataset, Dataset): + data_model_type = DataModelType.PowerBIDataModel.value + datamodel_columns = self._get_column_info(dataset) + source_url = self._get_dataset_url( + workspace_id=self.context.get().workspace.id, # pyright: ignore[reportAttributeAccessIssue] + dataset_id=dataset.id, + ) + elif isinstance(dataset, Dataflow): + data_model_type = DataModelType.PowerBIDataFlow.value + datamodel_columns = [] + source_url = self._get_dataflow_url( + workspace_id=self.context.get().workspace.id, # pyright: ignore[reportAttributeAccessIssue] + dataflow_id=dataset.id, + ) + # dataflow export api for detailed metadata + # api: https://api.powerbi.com/v1.0/myorg/admin/dataflows/DATAFLOW_ID/export + # doc: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/dataflows-export-dataflow-as-admin + dataflow_export = self.client.api_client.fetch_dataflow_export(dataflow_id=dataset.id) + if dataflow_export: + self.state.cache_dataflow_export(dataset.id, dataflow_export) + datamodel_columns = self._get_dataflow_column_info(dataflow_export) + else: + logger.warning(f"Unknown dataset type: {type(dataset)}, name: {dataset.name}") + continue + data_model_request = CreateDashboardDataModelRequest( # pyright: ignore[reportCallIssue] + name=EntityName(dataset.id), + displayName=dataset.name, + description=(Markdown(dataset.description) if dataset.description else None), + service=FullyQualifiedEntityName(self.context.get().dashboard_service), # pyright: ignore[reportAttributeAccessIssue] + dataModelType=data_model_type, + serviceType=DashboardServiceType.PowerBI.value, + columns=datamodel_columns, + project=self.get_project_name(dashboard_details=dataset), + owners=self.get_owner_ref(dashboard_details=dataset), + sourceUrl=SourceUrl(source_url), + ) + yield Either(right=data_model_request) # pyright: ignore[reportCallIssue] + self.register_record_datamodel(datamodel_request=data_model_request) + except Exception as exc: + dataset_name = dataset.name or dataset.id or "" + yield Either( # pyright: ignore[reportCallIssue] + left=StackTraceError( + name=dataset_name, + error=f"Error yielding Data Model [{dataset_name}]: {exc}", + stackTrace=traceback.format_exc(), + ) + ) def create_report_dashboard_lineage( self, @@ -1399,7 +1453,17 @@ def _get_table_and_datamodel_lineage( table_info_list = self._parse_table_info_from_source_exp(table, datamodel_entity) if not table_info_list: # if tables are not found from source expression - # try establishing lineage using powerbi's table name + # try establishing lineage using powerbi's table name. + # PowerBiTable.name is now Optional, so skip nameless tables here + # to match _get_column_info and avoid build_es_fqn_search_string + # raising on a None table_name (which surfaces as a noisy lineage + # error rather than a quiet skip). + if not table.name: + logger.debug( + "Skipping PowerBI table with empty name for lineage to datamodel [%s]", + datamodel_entity.name, + ) + return table_info_list = [{"table": table.name}] if isinstance(table_info_list, List): # noqa: UP006 for table_info in table_info_list: @@ -1435,7 +1499,12 @@ def _get_table_and_datamodel_lineage( fqn_search_string=fqn_search_string, ) if table_entity and datamodel_entity: - columns_list = [column.name for column in table.columns] + logger.debug( + "Creating lineage between db table=%s and datamodel=%s", + table_entity.name.root, # pyright: ignore[reportAttributeAccessIssue] + datamodel_entity.name.root, + ) + columns_list = [column.name for column in (table.columns or []) if column.name] column_lineage = self._get_column_lineage(table_entity, datamodel_entity, columns_list) yield self._get_add_lineage_request( to_entity=datamodel_entity, @@ -1877,10 +1946,20 @@ def create_dataflow_table_lineage( logger.debug(f"No table references found in dataflow [{datamodel.name}] M document") return - # Build a map of entity_name -> entity attributes for column lineage + # Build a map of entity_name -> entity attributes for column lineage. + # Skip nameless entities/attributes since both are now Optional and a + # None entity name can never match a parsed M-document reference, + # while None attribute names break the List[str] contract of + # _get_dataflow_column_lineage and produce noisy failed lookups. entity_attributes_map = {} for entity in dataflow_export.entities or []: - entity_attributes_map[entity.name] = [attr.name for attr in entity.attributes or []] + if not entity.name: + logger.debug( + "Skipping nameless dataflow entity while building attributes map for dataflow [%s]", + datamodel.name, + ) + continue + entity_attributes_map[entity.name] = [attr.name for attr in entity.attributes or [] if attr.name] for parsed_entity in parsed_entities: entity_name = parsed_entity["entity_name"] diff --git a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/models.py b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/models.py index 134f9bb3e6ca..3e6052073681 100644 --- a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/models.py +++ b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/models.py @@ -54,7 +54,7 @@ class PowerBIDashboard(BaseModel): """ id: str - displayName: str # noqa: N815 + displayName: str | None = None # noqa: N815 webUrl: Optional[str] = None # noqa: N815, UP045 embedUrl: Optional[str] = None # noqa: N815, UP045 tiles: Optional[List[Tile]] = [] # noqa: UP006, UP045 @@ -68,7 +68,7 @@ class PowerBIReport(BaseModel): """ id: str - name: str + name: str | None = None datasetId: Optional[str] = None # noqa: N815, UP045 users: Optional[List[PowerBIUser]] = [] # noqa: UP006, UP045 modifiedBy: Optional[str] = None # noqa: N815, UP045 @@ -112,7 +112,7 @@ class PowerBiColumns(BaseModel): Definition: https://learn.microsoft.com/en-us/rest/api/power-bi/push-datasets/datasets-get-tables-in-group#column """ - name: str + name: str | None = None dataType: Optional[str] = None # noqa: N815, UP045 columnType: Optional[str] = None # noqa: N815, UP045 description: Optional[str] = None # noqa: UP045 @@ -125,7 +125,7 @@ class PowerBiMeasureModel(BaseModel): dataType: str # noqa: N815 dataTypeDisplay: str # noqa: N815 - name: str + name: str | None = None displayName: Optional[str] = None # noqa: N815, UP045 description: str @@ -136,7 +136,7 @@ class PowerBiMeasures(BaseModel): Definition: https://learn.microsoft.com/en-us/rest/api/power-bi/push-datasets/datasets-get-tables-in-group#measure """ - name: str + name: str | None = None expression: Optional[Union[str, List[str]]] = None # noqa: UP006, UP007, UP045 description: Optional[str] = None # noqa: UP045 isHidden: Optional[bool] = False # noqa: N815, UP045 @@ -180,7 +180,7 @@ class PowerBiTable(BaseModel): Definition: https://learn.microsoft.com/en-us/rest/api/power-bi/push-datasets/datasets-get-tables-in-group#table """ - name: str + name: str | None = None columns: Optional[List[PowerBiColumns]] = None # noqa: UP006, UP045 measures: Optional[List[PowerBiMeasures]] = None # noqa: UP006, UP045 description: Optional[str] = None # noqa: UP045 @@ -212,7 +212,7 @@ class TablesResponse(BaseModel): class DatasetExpression(BaseModel): - name: str + name: str | None = None expression: Optional[Union[str, List[str]]] = None # noqa: UP006, UP007, UP045 @field_validator("expression", mode="before") @@ -240,7 +240,7 @@ class Dataset(BaseModel): """ id: str - name: str + name: str | None = None tables: Optional[List[PowerBiTable]] = [] # noqa: UP006, UP045 description: Optional[str] = None # noqa: UP045 users: Optional[List[PowerBIUser]] = [] # noqa: UP006, UP045 @@ -262,7 +262,7 @@ class DatasetResponse(BaseModel): class Dataflow(BaseModel): id: str = Field(alias="objectId") - name: str + name: str | None = None description: Optional[str] = None # noqa: UP045 users: Optional[List[PowerBIUser]] = [] # noqa: UP006, UP045 modifiedBy: Optional[str] = None # noqa: N815, UP045 @@ -357,7 +357,7 @@ class ReportPage(BaseModel): single report Page object """ - name: str + name: str | None = None displayName: Optional[str] = None # noqa: N815, UP045 @@ -410,7 +410,7 @@ class DataflowEntityAttribute(BaseModel): API doc: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/dataflows-export-dataflow-as-admin """ - name: str + name: str | None = None dataType: Optional[str] = None # noqa: N815, UP045 description: Optional[str] = None # noqa: UP045 @@ -422,7 +422,7 @@ class DataflowEntity(BaseModel): API doc: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/dataflows-export-dataflow-as-admin """ - name: str + name: str | None = None description: Optional[str] = None # noqa: UP045 attributes: Optional[List[DataflowEntityAttribute]] = [] # noqa: UP006, UP045 diff --git a/ingestion/tests/unit/models/test_custom_basemodel_validation.py b/ingestion/tests/unit/models/test_custom_basemodel_validation.py index d0529e886157..485908f6d397 100644 --- a/ingestion/tests/unit/models/test_custom_basemodel_validation.py +++ b/ingestion/tests/unit/models/test_custom_basemodel_validation.py @@ -40,8 +40,11 @@ from metadata.generated.schema.type.entityReference import EntityReference from metadata.ingestion.models.custom_basemodel_validation import ( RESERVED_ARROW_KEYWORD, + RESERVED_CARRIAGE_RETURN_KEYWORD, RESERVED_COLON_KEYWORD, + RESERVED_NEWLINE_KEYWORD, RESERVED_QUOTE_KEYWORD, + RESERVED_TAB_KEYWORD, TRANSFORMABLE_ENTITIES, TransformDirection, get_entity_config, @@ -228,6 +231,26 @@ def test_replace_separators_function(self): '"""', "__reserved__quote____reserved__quote____reserved__quote__", ), # Multiple quotes - each " replaced + ( + "line1\nline2", + "line1__reserved__newline__line2", + ), + ( + "row1\rrow2", + "row1__reserved__carriage_return__row2", + ), + ( + "col1\tcol2", + "col1__reserved__tab__col2", + ), + ( + "mixed\n\r\tend", + "mixed__reserved__newline____reserved__carriage_return____reserved__tab__end", + ), + ( + "student\ndetailed\ndata", + "student__reserved__newline__detailed__reserved__newline__data", + ), ] for input_val, expected in test_cases: @@ -263,6 +286,26 @@ def test_revert_separators_function(self): "__reserved__colon__:", ":::", ), # Multiple colons: __reserved__colon__ + : = :: + : = ::: + ( + "line1__reserved__newline__line2", + "line1\nline2", + ), + ( + "row1__reserved__carriage_return__row2", + "row1\rrow2", + ), + ( + "col1__reserved__tab__col2", + "col1\tcol2", + ), + ( + "mixed__reserved__newline____reserved__carriage_return____reserved__tab__end", + "mixed\n\r\tend", + ), + ( + "student__reserved__newline__detailed__reserved__newline__data", + "student\ndetailed\ndata", + ), ] for input_val, expected in test_cases: @@ -285,6 +328,12 @@ def test_round_trip_transformations(self): 'emoji🚀::data📊>chart"report', " spaced :: values ", # Leading/trailing spaces "special!@#$%^&*()_+-={}[]|\\:;'<>?,./", # Special characters (non-reserved) + "student\ndetailed\ndata", + "row1\rrow2", + "col1\tcol2", + 'all\nthe\r\twhitespace::and>specials"too', + "leading\n\r\twhitespace", + "trailing\t\r\n", ] for original in test_values: @@ -568,6 +617,107 @@ def test_dashboard_data_model_transformations(self): self.assertEqual(result.columns[0].children[0].name.root, "nested::metric") self.assertEqual(result.columns[0].children[1].name.root, "nested>dimension") + def test_whitespace_transformations_on_create_table(self): + """CreateTableRequest should encode \\n, \\r, \\t in name and column names.""" + create_request = CreateTableRequest( + name=EntityName("student\ndetailed\ndata"), + columns=[ + Column(name=ColumnName("col\twith\ttabs"), dataType=DataType.STRING), + Column(name=ColumnName("col\rwith\rreturns"), dataType=DataType.STRING), + ], + databaseSchema=FullyQualifiedEntityName("db.schema"), + ) + + result = transform_entity_names(create_request, CreateTableRequest) + + self.assertEqual( + result.name.root, + "student__reserved__newline__detailed__reserved__newline__data", + ) + self.assertEqual( + result.columns[0].name.root, + "col__reserved__tab__with__reserved__tab__tabs", + ) + self.assertEqual( + result.columns[1].name.root, + "col__reserved__carriage_return__with__reserved__carriage_return__returns", + ) + + def test_whitespace_transformations_on_fetch_table(self): + """Table fetch should decode \\n, \\r, \\t back to original characters.""" + table = Table( + id=self.sample_table_id, + name="student__reserved__newline__detailed__reserved__newline__data", + databaseSchema=self.sample_schema_ref, + fullyQualifiedName="db.schema.student_data", + columns=[ + Column( + name=ColumnName("col__reserved__tab__with__reserved__tab__tabs"), + dataType=DataType.STRING, + ), + Column( + name=ColumnName("col__reserved__carriage_return__with__reserved__carriage_return__returns"), + dataType=DataType.STRING, + ), + ], + ) + + result = transform_entity_names(table, Table) + + self.assertEqual(result.name.root, "student\ndetailed\ndata") + self.assertEqual(result.columns[0].name.root, "col\twith\ttabs") + self.assertEqual(result.columns[1].name.root, "col\rwith\rreturns") + + def test_whitespace_transformations_round_trip_on_dashboard_datamodel(self): + """CreateDashboardDataModel encode → DashboardDataModel decode preserves \\n/\\r/\\t.""" + original_name = "student\ndetailed\ndata" + original_column = "measure\twith\ttab" + original_child = "child\rwith\rreturn" + + create_request = CreateDashboardDataModelRequest( + name=EntityName(original_name), + displayName="Student Data", + dataModelType=DataModelType.PowerBIDataModel, + service=FullyQualifiedEntityName("service.powerbi"), + columns=[ + Column( + name=ColumnName(original_column), + dataType=DataType.STRUCT, + children=[Column(name=ColumnName(original_child), dataType=DataType.STRING)], + ) + ], + ) + + encoded = transform_entity_names(create_request, CreateDashboardDataModelRequest) + encoded_name = encoded.name.root + encoded_column = encoded.columns[0].name.root + encoded_child = encoded.columns[0].children[0].name.root + + # Encoded names must not contain raw whitespace control chars + for forbidden in ("\n", "\r", "\t"): + self.assertNotIn(forbidden, encoded_name) + self.assertNotIn(forbidden, encoded_column) + self.assertNotIn(forbidden, encoded_child) + + # Decode side: simulate the fetch path + fetch_model = DashboardDataModel( + id=uuid.uuid4(), + name=encoded_name, + dataModelType=DataModelType.PowerBIDataModel, + columns=[ + Column( + name=ColumnName(encoded_column), + dataType=DataType.STRUCT, + children=[Column(name=ColumnName(encoded_child), dataType=DataType.STRING)], + ) + ], + ) + + decoded = transform_entity_names(fetch_model, DashboardDataModel) + self.assertEqual(decoded.name.root, original_name) + self.assertEqual(decoded.columns[0].name.root, original_column) + self.assertEqual(decoded.columns[0].children[0].name.root, original_child) + def test_configuration_consistency(self): """Test consistency of configuration across the system.""" # Verify that all configured entities have consistent field mappings @@ -594,6 +744,9 @@ def test_reserved_keywords_constants(self): self.assertEqual(RESERVED_COLON_KEYWORD, "__reserved__colon__") self.assertEqual(RESERVED_ARROW_KEYWORD, "__reserved__arrow__") self.assertEqual(RESERVED_QUOTE_KEYWORD, "__reserved__quote__") + self.assertEqual(RESERVED_NEWLINE_KEYWORD, "__reserved__newline__") + self.assertEqual(RESERVED_CARRIAGE_RETURN_KEYWORD, "__reserved__carriage_return__") + self.assertEqual(RESERVED_TAB_KEYWORD, "__reserved__tab__") def test_reserved_keywords_uniqueness(self): """Test that reserved keywords are unique and don't conflict.""" @@ -601,6 +754,9 @@ def test_reserved_keywords_uniqueness(self): RESERVED_COLON_KEYWORD, RESERVED_ARROW_KEYWORD, RESERVED_QUOTE_KEYWORD, + RESERVED_NEWLINE_KEYWORD, + RESERVED_CARRIAGE_RETURN_KEYWORD, + RESERVED_TAB_KEYWORD, ] self.assertEqual(len(keywords), len(set(keywords)), "Reserved keywords should be unique") diff --git a/ingestion/tests/unit/models/test_custom_pydantic.py b/ingestion/tests/unit/models/test_custom_pydantic.py index b55927183655..1217b0259e52 100644 --- a/ingestion/tests/unit/models/test_custom_pydantic.py +++ b/ingestion/tests/unit/models/test_custom_pydantic.py @@ -558,8 +558,11 @@ def test_whitespace_handling(self): (" test :: name ", " test __reserved__colon__ name "), # Multiple spaces ("test :: name", "test __reserved__colon__ name"), - # Tabs and newlines (should be preserved) - ("test\t::\nname", "test\t__reserved__colon__\nname"), + # Tabs and newlines (now encoded as reserved keywords) + ( + "test\t::\nname", + "test__reserved__tab____reserved__colon____reserved__newline__name", + ), ] for input_name, expected in whitespace_cases: diff --git a/ingestion/tests/unit/topology/dashboard/test_powerbi_resilience.py b/ingestion/tests/unit/topology/dashboard/test_powerbi_resilience.py new file mode 100644 index 000000000000..b26b518abbf9 --- /dev/null +++ b/ingestion/tests/unit/topology/dashboard/test_powerbi_resilience.py @@ -0,0 +1,150 @@ +# Copyright 2025 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Resilience tests for the PowerBI connector. + +Covers two production failure modes: + +1. A single nullable ``name`` in the PowerBI admin scan response must not + invalidate the whole workspace batch (`Dataflow`, `Dataset`, + `PowerBIReport`, `PowerBIDashboard`, etc. all accept ``name=None``). +2. PowerBI/DAX names containing ``::`` are sanitized before being sent to + the OpenMetadata API, which enforces ``^((?!::).)*$`` on column names. +""" + +from unittest.mock import MagicMock + +import pytest + +from metadata.ingestion.source.dashboard.powerbi.models import ( + Dataflow, + DataflowEntity, + DataflowEntityAttribute, + Dataset, + DatasetExpression, + Group, + PowerBiColumns, + PowerBIDashboard, + PowerBiMeasureModel, + PowerBiMeasures, + PowerBIReport, + PowerBiTable, + ReportPage, + Workspaces, +) + +_LOOSENED_MODELS_NAME_FIELD = [ + (PowerBIReport, {"id": "r-1"}, "name"), + (PowerBiColumns, {}, "name"), + (PowerBiMeasureModel, {"dataType": "STRING", "dataTypeDisplay": "STRING", "description": ""}, "name"), + (PowerBiMeasures, {}, "name"), + (PowerBiTable, {}, "name"), + (DatasetExpression, {}, "name"), + (Dataset, {"id": "ds-1"}, "name"), + (Dataflow, {"objectId": "df-1"}, "name"), + (ReportPage, {}, "name"), + (DataflowEntityAttribute, {}, "name"), + (DataflowEntity, {}, "name"), +] + + +@pytest.mark.parametrize("model_cls, base_payload, field", _LOOSENED_MODELS_NAME_FIELD) +def test_loosened_name_field_accepts_none(model_cls, base_payload, field): + """Every loosened model parses cleanly when its ``name`` is null.""" + payload = {**base_payload, field: None} + instance = model_cls(**payload) + assert getattr(instance, field) is None + + +def test_powerbi_dashboard_display_name_accepts_none(): + """`PowerBIDashboard.displayName` is now optional.""" + dashboard = PowerBIDashboard(id="d-1", displayName=None) + assert dashboard.displayName is None + assert dashboard.id == "d-1" + + +def test_workspaces_round_trip_survives_nullable_nested_name(): + """A null ``Dataflow.name`` no longer breaks the parent ``Workspaces`` parse. + ``workspaces.83.dataflows.17.name: Input should be a valid string``. + """ + raw = { + "workspaces": [ + { + "id": "ws-1", + "name": "Sales", + "state": "Active", + "dataflows": [ + {"objectId": "df-good", "name": "Customers"}, + {"objectId": "df-bad", "name": None}, + ], + } + ] + } + workspaces = Workspaces(**raw) + assert len(workspaces.workspaces) == 1 + dataflows = workspaces.workspaces[0].dataflows + assert [df.id for df in dataflows] == ["df-good", "df-bad"] + assert dataflows[1].name is None + + +def test_fetch_workspace_scan_result_skips_one_bad_workspace(monkeypatch): + """A single un-parseable workspace must not drop the rest of the batch.""" + from metadata.ingestion.source.dashboard.powerbi import client as client_module + + api_client = client_module.PowerBiApiClient.__new__(client_module.PowerBiApiClient) + api_client.client = MagicMock() + api_client.client._base_url = "https://api.powerbi.com/v1.0" + + api_client.client.get.return_value = { + "workspaces": [ + {"id": "ws-good", "name": "Good", "state": "Active"}, + {"id": "ws-bad", "name": "Bad", "state": "Active", "dashboards": "not-a-list"}, + ] + } + + result = api_client.fetch_workspace_scan_result(scan_id="scan-1") + + assert result is not None + assert [ws.id for ws in result.workspaces] == ["ws-good"] + + +def test_fetch_workspace_scan_result_handles_empty_response(monkeypatch): + from metadata.ingestion.source.dashboard.powerbi import client as client_module + + api_client = client_module.PowerBiApiClient.__new__(client_module.PowerBiApiClient) + api_client.client = MagicMock() + api_client.client._base_url = "https://api.powerbi.com/v1.0" + api_client.client.get.return_value = None + + assert api_client.fetch_workspace_scan_result(scan_id="scan-1") is None + + +def test_loosened_models_preserve_provided_name(): + """Loosening to Optional must not silently change a provided value.""" + ds = Dataset(id="ds-1", name="Orders") + assert ds.name == "Orders" + df = Dataflow(objectId="df-1", name="Customers") + assert df.name == "Customers" + + +def test_group_with_nameless_dataflow_parses_via_workspaces(): + """End-to-end: the original scan payload shape with a null dataflow name.""" + workspaces = Workspaces( + workspaces=[ + Group( + id="ws-1", + name="WS", + state="Active", + dataflows=[Dataflow(objectId="df-1", name=None)], + ) + ] + ) + assert workspaces.workspaces[0].dataflows[0].name is None