Skip to content

Commit ad9e1b7

Browse files
Containers: batch container data-model column tag retrieval to avoid subtree fan-out (open-metadata#27836)
* Containers with deep nesting caused performance issues due to per-column tag fetches.
* Batch the derived-tag fetch across data-model columns.

populateDataModelColumnTags previously called addDerivedTagsGracefully once per flattened column. That call internally batches across the column's own tags, but it still issues a separate derived-tag DB lookup for every column. On data models with many columns (or struct types with deep nesting) this becomes an N+1 pattern.

Refactor:
- Pre-compute Map<String, Column> hashToColumn once (LinkedHashMap to preserve column order) so we no longer hash each FQN twice — once for the target-hash list and again on lookup.
- After fetching tags by target hash, flatten all returned TagLabels into a single list and call TagLabelUtil.batchFetchDerivedTags(...) once for the whole data model.
- Per column, use addDerivedTagsWithPreFetched(columnTags, derivedMap) to avoid further DB lookups.
- Fall back to the per-column addDerivedTagsGracefully path if the batch derived-tag fetch raises, preserving existing semantics.

Net effect: total derived-tag DB queries drop from O(N) to 1 regardless of column count or nesting depth.

Co-authored-by: sonika-shah <58761340+sonika-shah@users.noreply.github.com>
1 parent 56a3cb7 commit ad9e1b7

33 files changed

Lines changed: 1757 additions & 220 deletions

File tree

conf/openmetadata.yaml

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -194,22 +194,6 @@ logging:
194194
archivedFileCount: 7
195195
timeZone: UTC
196196
maxFileSize: 50MB
197-
org.openmetadata.slowrequest:
198-
level: ${SLOW_REQUEST_LOG_LEVEL:-OFF}
199-
additive: false
200-
appenders:
201-
- type: file
202-
layout:
203-
type: om-event-layout
204-
format: ${LOG_FORMAT:-text}
205-
pattern: "%level [%d{ISO8601,UTC}] [%t] %logger{5} - %msg%n"
206-
appendLineSeparator: true
207-
threshold: WARN
208-
currentLogFilename: ./logs/slow-requests.log
209-
archivedLogFilenamePattern: ./logs/slow-requests-%d{yyyy-MM-dd}-%i.log.gz
210-
archivedFileCount: 7
211-
timeZone: UTC
212-
maxFileSize: 50MB
213197
org.openmetadata.service.util.OpenMetadataSetup:
214198
level: INFO
215199
appenders:

ingestion/src/metadata/ingestion/ometa/mixins/container_mixin.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
from metadata.generated.schema.entity.data.container import Container
2222
from metadata.generated.schema.entity.data.table import TableData
2323
from metadata.ingestion.ometa.client import REST
24+
from metadata.ingestion.ometa.models import EntityList
25+
from metadata.ingestion.ometa.utils import quote
2426
from metadata.utils.logger import ometa_logger
2527

2628
logger = ometa_logger()
@@ -105,6 +107,38 @@ def ingest_container_sample_data(self, container: Container, sample_data: TableD
105107
logger.warning(f"Error trying to PUT sample data for {container.fullyQualifiedName.root}: {exc}")
106108
return None
107109

110+
def list_container_children(
    self,
    container_fqn: str,
    limit: int = 100,
    offset: int = 0,
) -> EntityList[Container]:
    """
    Fetch one page of a Container's immediate children.

    Calls the dedicated ``/v1/containers/name/{fqn}/children`` endpoint.
    Prefer this over requesting the parent with ``fields=children``: that
    field is no longer served, because the inline payload is unbounded for
    buckets holding many objects.

    Rows come back as a slim projection (id, name, displayName, fqn,
    description, service). ``dataModel``, ``tags``, ``owners`` and
    ``extension`` are not populated; call :meth:`get_by_name` on the
    specific child when the full entity is required.

    Args:
        container_fqn: fully qualified name of the parent container; it is
            passed through ``quote`` before being placed in the URL.
        limit: page size sent as the ``limit`` query parameter.
        offset: starting position sent as the ``offset`` query parameter.

    Returns:
        An ``EntityList`` of ``Container`` objects together with the
        ``total``/``after``/``before`` paging values from the response.
        An empty list with ``total=0`` is returned when the response is
        not a dict.
    """
    response = self.client.get(f"/containers/name/{quote(container_fqn)}/children?limit={limit}&offset={offset}")
    if isinstance(response, dict):
        # ``data`` / ``paging`` may be missing or null; treat both as empty.
        children = [Container(**row) for row in response.get("data") or []]
        page_info = response.get("paging") or {}
        return EntityList(
            entities=children,
            # Fall back to the page length when the server omits ``total``.
            total=page_info.get("total", len(children)),
            after=page_info.get("after"),
            before=page_info.get("before"),
        )

    # Anything other than a dict payload is reported as "no children".
    return EntityList(entities=[], total=0)
141+
108142
def get_container_sample_data(self, container: Container) -> Optional[Container]: # noqa: UP045
109143
"""
110144
GET call for the /sampleData endpoint for a given Container

ingestion/src/metadata/sdk/entities/containers.py

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22
Containers entity SDK with fluent API
33
"""
44

5-
from typing import Type # noqa: UP035
5+
from typing import List, Type # noqa: UP035
66

77
from metadata.generated.schema.api.data.createContainer import CreateContainerRequest
88
from metadata.generated.schema.entity.data.container import Container
9-
from metadata.sdk.entities.base import BaseEntity
9+
from metadata.generated.schema.type.entityReference import EntityReference
10+
from metadata.sdk.entities.base import BaseEntity, EntityList
1011

1112

1213
class Containers(BaseEntity[Container, CreateContainerRequest]):
@@ -16,3 +17,53 @@ class Containers(BaseEntity[Container, CreateContainerRequest]):
1617
def entity_type(cls) -> Type[Container]: # noqa: UP006
1718
"""Return the Container entity type"""
1819
return Container
20+
21+
@classmethod
def list_children(
    cls,
    container_fqn: str,
    *,
    limit: int = 100,
    offset: int = 0,
) -> EntityList[Container]:
    """
    Fetch one page of a Container's immediate children.

    Delegates to the client's ``list_container_children``, which targets
    the dedicated ``/v1/containers/name/{fqn}/children`` endpoint. Prefer
    this over requesting the parent with ``fields=["children"]``: that
    field is no longer served, because the inline payload is unbounded for
    buckets holding many objects.

    Rows come back as a slim projection (id, name, displayName, fqn,
    description, service). ``dataModel``, ``tags``, ``owners`` and
    ``extension`` are not populated; call :meth:`retrieve_by_name` on the
    specific child when the full entity is required.

    Args:
        container_fqn: fully qualified name of the parent container.
        limit: page size forwarded to the client (keyword-only).
        offset: starting position forwarded to the client (keyword-only).

    Returns:
        An ``EntityList`` whose entities were passed through
        ``_coerce_entity`` and which carries the page's ``after`` /
        ``before`` values when present.
    """
    children_page = cls._get_client().list_container_children(container_fqn, limit=limit, offset=offset)
    return EntityList(
        entities=[cls._coerce_entity(child) for child in children_page.entities],
        # ``getattr`` with a default tolerates page objects lacking these attributes.
        after=getattr(children_page, "after", None),
        before=getattr(children_page, "before", None),
    )
49+
50+
@classmethod
def list_ancestors(cls, container_fqn: str) -> List[EntityReference]:  # noqa: UP006
    """
    Resolve a container's complete ancestor chain with one request.

    Issues a GET against ``{endpoint}/name/{fqn}/ancestors``.

    Args:
        container_fqn: fully qualified name of the container; it is passed
            through ``quote`` before being placed in the URL.

    Returns:
        ``EntityReference``s ordered from the root container (the immediate
        child of the storage service) down to the immediate parent of
        ``container_fqn``. The list is empty when the container is at the
        top level, and also when the response is not a list.
    """
    ometa = cls._get_client()
    rest = cls._get_rest_client(ometa)
    base_path = cls._get_endpoint_path(ometa)
    from metadata.ingestion.ometa.utils import (  # noqa: PLC0415
        quote,
    )

    payload = rest.get(f"{base_path}/name/{quote(container_fqn)}/ancestors")
    # Only a list payload is a valid ancestor chain; anything else means "none".
    return [EntityReference(**item) for item in payload] if isinstance(payload, list) else []

ingestion/tests/integration/auto_classification/containers/test_container_classification.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,11 @@ def test_containers_ingested(metadata: OpenMetadata, ingest_storage_metadata, se
3030
"""Verify containers were ingested with data models"""
3131
bucket = metadata.get_by_name(entity=Container, fqn=f"{service_name}.{bucket_name}", fields=["*"])
3232
assert bucket is not None
33-
assert bucket.children is not None
34-
assert len(bucket.children.root) >= 3
33+
34+
# `children` is no longer inlined into the parent payload — it's an unbounded
35+
# collection for object stores. Use the dedicated paginated endpoint.
36+
children = metadata.list_container_children(f"{service_name}.{bucket_name}")
37+
assert len(children.entities) >= 3
3538

3639
customers_container = metadata.get_by_name(
3740
entity=Container, fqn=f"{service_name}.{bucket_name}.customers", fields=["*"]

ingestion/tests/integration/s3/test_s3_storage.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@ def test_s3_ingestion(metadata, ingest_s3_storage, service_name):
2222

2323
# We should have the bucket and all its structured children
2424
bucket: Container = metadata.get_by_name(entity=Container, fqn=f"{service_name}.test-bucket", fields=["*"])
25-
# The bucket has children and no dataModel
26-
assert 7 == len(bucket.children.root) # noqa: SIM300
25+
# The bucket has children (via the dedicated paginated endpoint, not inlined
26+
# into the parent payload) and no dataModel
2727
assert not bucket.dataModel
28+
children = metadata.list_container_children(f"{service_name}.test-bucket")
29+
assert 7 == len(children.entities) # noqa: SIM300
2830

2931
# We can validate the children
3032
cities: Container = metadata.get_by_name(entity=Container, fqn=f"{service_name}.test-bucket.cities", fields=["*"])

0 commit comments

Comments
 (0)