Skip to content

Commit 2591526

Browse files
Fix #16888: SAS Viya 4 table type error fixes (#27222)
1 parent f8979cf commit 2591526

2 files changed

Lines changed: 555 additions & 44 deletions

File tree

ingestion/src/metadata/ingestion/source/database/sas/metadata.py

Lines changed: 168 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
import json
1919
import re
2020
import traceback
21+
from dataclasses import dataclass
2122
from datetime import datetime, timezone
2223
from typing import Any, Iterable, Optional, Tuple
2324

@@ -83,6 +84,86 @@
8384
logger = ingestion_logger()
8485

8586

87+
@dataclass(frozen=True)
class SASResourceContext:
    """Pieces of a SAS Information Catalog ``resourceId``.

    The SAS Data Tables REST API addresses a table as::

        /dataTables/dataSources/{provider}~fs~{host}~fs~{library}/tables/{table}

    where ``~fs~`` is a literal field separator (not URL-encoded).

    Known providers:

    * ``cas`` -- CAS (Cloud Analytic Services) table; *host* is the CAS
      server name, e.g. ``cas-shared-default``.
    * ``Compute`` -- SAS Compute session table; *host* is a session UUID,
      e.g. ``49736234-36b3-48d2-b2e2-e12aa365ce05``.

    Examples::

        /dataTables/dataSources/cas~fs~cas-shared-default~fs~Samples/tables/WATER_CLUSTER
        /dataTables/dataSources/Compute~fs~49736234-...~fs~PUBLIC/tables/LAS_TRAIN

    Reference: SAS REST API, Data Tables service —
    https://developer.sas.com/rest-apis/dataTables
    """

    provider: str  # e.g. "cas" or "Compute"
    host: str  # CAS server name or Compute session UUID
    library: str  # SAS library; maps onto the schema entity
    raw_resource_id: str  # unparsed resourceId, retained for diagnostics

    @property
    def database_name(self) -> str:
        """Database entity name, formed as ``<provider>.<host>``."""
        return ".".join((self.provider, self.host))


# Literal separator between provider/host/library in the dataSources segment.
_SAS_FIELD_SEPARATOR = "~fs~"


def parse_resource_id(resource_id: str) -> Optional[SASResourceContext]:
    """Split *resource_id* into its provider/host/library components.

    Returns ``None`` (never raises) when the id does not match the
    expected shape, so callers can cleanly fall back to the
    relationships-based lookup.
    """
    segments = resource_id.split("/")
    # Expected: ['', 'dataTables', 'dataSources', '<context>', 'tables', ...]
    if len(segments) < 4:
        logger.warning(
            "resourceId %r has fewer than 4 slash-delimited segments; "
            "cannot extract provider/host/library.",
            resource_id,
        )
        return None

    context_segment = segments[3]
    fields = context_segment.split(_SAS_FIELD_SEPARATOR)
    if len(fields) < 3:
        logger.warning(
            "resourceId context segment %r has %d field(s) (expected 3: "
            "provider, host, library); cannot derive database/schema.",
            context_segment,
            len(fields),
        )
        return None

    # Extra fields beyond the first three are tolerated, matching the
    # original behavior.
    provider, host, library = fields[:3]
    return SASResourceContext(
        provider=provider,
        host=host,
        library=library,
        raw_resource_id=resource_id,
    )
166+
86167
class SasSource(
87168
DatabaseServiceSource
88169
): # pylint: disable=too-many-instance-attributes,too-many-public-methods
@@ -232,53 +313,77 @@ def create_database_alt(self, db):
232313

233314
def create_database_schema(self, table):
    """
    Create database and schema entities for the given table.

    First tries to derive provider/host/library from the table's
    ``resourceId`` via ``parse_resource_id``. When the resourceId does
    not match the expected SAS Data Tables shape, or the resulting
    create/update call raises an HTTPError, falls back to a
    relationships-based lookup through the Information Catalog.
    """
    resource_id = table.get("resourceId", "")
    parsed = parse_resource_id(resource_id)

    # Guard clause: no parseable resourceId -> go straight to fallback.
    if parsed is None:
        return self._create_database_schema_from_relationships(table)

    try:
        self.db_name = parsed.database_name
        self.db_schema_name = parsed.library

        database_request = CreateDatabaseRequest(
            name=self.db_name,
            displayName=self.db_name,
            service=self.config.serviceName,
        )
        database_entity = self.metadata.create_or_update(data=database_request)

        schema_request = CreateDatabaseSchemaRequest(
            name=self.db_schema_name,
            database=database_entity.fullyQualifiedName,
        )
        return self.metadata.create_or_update(schema_request)
    except HTTPError as http_err:
        logger.debug(
            "Falling back to relationships-based schema lookup for "
            "%s after HTTP error: %s",
            resource_id,
            http_err,
        )
        return self._create_database_schema_from_relationships(table)
def _create_database_schema_from_relationships(self, table):
    """Fallback: derive database/schema from catalog relationships.

    Used when ``parse_resource_id`` returns ``None`` or the primary
    create fails. Locates the parent data store through the
    ``dataStoreDataSets`` relationship and builds the database entity
    via ``create_database_alt``.
    """
    # Well-known definitionId of the dataStoreDataSets relationship type.
    data_store_data_sets = "4b114f6e-1c2a-4060-9184-6809a612f27b"
    data_store_id = next(
        (
            relation["endpointId"]
            for relation in table.get("relationships", [])
            if relation["definitionId"] == data_store_data_sets
        ),
        None,
    )

    if data_store_id is None:
        logger.error(
            "Failed to derive database schema for SAS table '%s' (resourceId=%s): "
            "missing data store identifier because the expected "
            "'dataStoreDataSets' relationship was not found.",
            table.get("name", "<unknown>"),
            table.get("resourceId", "<missing>"),
        )
        return None

    data_store = self.sas_client.get_instance(data_store_id)
    database_entity = self.create_database_alt(data_store)
    self.db_schema_name = data_store["name"]
    schema_request = CreateDatabaseSchemaRequest(
        name=data_store["name"],
        database=database_entity.fullyQualifiedName,
    )
    return self.metadata.create_or_update(schema_request)

283388
def create_columns_alt(self, table):
284389
"""
@@ -439,6 +544,7 @@ def create_table_entity(self, table) -> Iterable[Either[CreateTableRequest]]:
439544
global table_fqn
440545

441546
table_entity, table_fqn = None, None
547+
table_name = table.get("name") if isinstance(table, dict) else None
442548

443549
try:
444550
table_url = self.sas_client.get_information_catalog_link(table["id"])
@@ -506,10 +612,13 @@ def create_table_entity(self, table) -> Iterable[Either[CreateTableRequest]]:
506612
custom_attributes = [
507613
custom_attribute["name"] for custom_attribute in TABLE_CUSTOM_ATTR
508614
]
615+
# Drop null values — OpenMetadata's custom-field types
616+
# (e.g. STRING_TYPE) reject null and fail the create with
617+
# "Custom field <name> has invalid JSON [$: null found, string expected]"
509618
extension_attributes = {
510619
attr: value
511620
for attr, value in table_extension.items()
512-
if attr in custom_attributes
621+
if attr in custom_attributes and value is not None
513622
}
514623

515624
table_request = CreateTableRequest(
@@ -529,6 +638,18 @@ def create_table_entity(self, table) -> Iterable[Either[CreateTableRequest]]:
529638
table_entity = self.metadata.get_by_name(
530639
entity=Table, fqn=self.get_table_fqn(table_name)
531640
)
641+
# If the table wasn't actually persisted (e.g. the sink
642+
# rejected the CreateTableRequest), skip the follow-up
643+
# patch/profile calls so we don't raise an AttributeError
644+
# that masks the real sink-side failure.
645+
if table_entity is None:
646+
logger.warning(
647+
f"Table [{table_name}] was not created in OpenMetadata; "
648+
"skipping description/extension/profile updates. "
649+
"Check the sink logs for the underlying error."
650+
)
651+
return
652+
532653
# update the description
533654
logger.debug(
534655
f"Updating description for {table_entity.id.root} with {table_description}"
@@ -595,10 +716,13 @@ def create_table_entity(self, table) -> Iterable[Either[CreateTableRequest]]:
595716

596717
except Exception as exc:
597718
logger.error(f"table failed to create: {table}")
719+
error_name = table_name or (
720+
table.get("id") if isinstance(table, dict) else "unknown"
721+
)
598722
yield Either(
599723
left=StackTraceError(
600-
name=table_name,
601-
error=f"Unexpected exception to create table [{table_name}]: {exc}",
724+
name=str(error_name),
725+
error=f"Unexpected exception to create table [{error_name}]: {exc}",
602726
stackTrace=traceback.format_exc(),
603727
)
604728
)
@@ -637,7 +761,7 @@ def create_lineage_table_source(self, table_extension, table_name):
637761
entity=Table, fqn=source_table_fqn
638762
)
639763

640-
if source_table_entity:
764+
if source_table_entity and target_table_entity:
641765
yield from self.create_table_lineage(
642766
source_table_entity, target_table_entity
643767
)

0 commit comments

Comments
 (0)