Skip to content

Commit b3cc20f

Browse files
committed
GenAI metadata generation - resolve conflicts+add guardrails
1 parent d138ff8 commit b3cc20f

File tree

9 files changed: +2413 additions, -430 deletions

backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_folder_template.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ Input:
3434

3535
Output:
3636
{{
37-
"label": "NotEnoughData",
37+
"label": "my-folder-1",
3838
"description": "NotEnoughData"
3939
}}
4040

backend/dataall/modules/s3_datasets/aws/s3_dataset_client.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -81,7 +81,7 @@ def list_bucket_files(self, bucket_name, prefix):
8181
Bucket=bucket_name,
8282
Prefix=prefix,
8383
ExpectedBucketOwner=dataset.AwsAccountId,
84-
MaxKeys=1000,
84+
MaxKeys=100,
8585
)
8686
return response.get('Contents', [])
8787
except ClientError as e:

backend/dataall/modules/s3_datasets/db/dataset_column_repositories.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,8 @@ def paginate_active_columns_for_table(session, table_uri: str, filter: dict):
4848
return paginate(query=q, page=filter.get('page', 1), page_size=filter.get('pageSize', 10)).to_dict()
4949

5050
@staticmethod
51-
def list_active_columns_for_table(session, table_uri: str):
51+
def list_active_columns_for_table(session, table_uri: str, limit=None):
5252
q = DatasetColumnRepository.query_active_columns_for_table(session, table_uri)
53+
if limit:
54+
q = q.limit(limit)
5355
return q.all()

backend/dataall/modules/s3_datasets/db/dataset_location_repositories.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -97,9 +97,12 @@ def delete_dataset_locations(session, dataset_uri) -> bool:
9797
return True
9898

9999
@staticmethod
100-
def get_dataset_folders(session, dataset_uri):
100+
def get_dataset_folders(session, dataset_uri, limit=None):
101101
"""return the dataset folders"""
102-
return session.query(DatasetStorageLocation).filter(DatasetStorageLocation.datasetUri == dataset_uri).all()
102+
query = session.query(DatasetStorageLocation).filter(DatasetStorageLocation.datasetUri == dataset_uri)
103+
if limit:
104+
query = query.limit(limit)
105+
return query.all()
103106

104107
@staticmethod
105108
def paginated_dataset_locations(session, uri, data=None) -> dict:

backend/dataall/modules/s3_datasets/db/dataset_repositories.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -186,9 +186,12 @@ def update_glue_database_status(session, dataset_uri):
186186
dataset.glueDatabaseCreated = True
187187

188188
@staticmethod
189-
def get_dataset_tables(session, dataset_uri):
189+
def get_dataset_tables(session, dataset_uri, limit=None):
190190
"""return the dataset tables"""
191-
return session.query(DatasetTable).filter(DatasetTable.datasetUri == dataset_uri).all()
191+
query = session.query(DatasetTable).filter(DatasetTable.datasetUri == dataset_uri)
192+
if limit:
193+
query = query.limit(limit)
194+
return query.all()
192195

193196
@staticmethod
194197
def delete_dataset(session, dataset) -> bool:

backend/dataall/modules/s3_datasets/services/dataset_service.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -601,8 +601,8 @@ def generate_metadata_for_dataset(uri, metadata_types):
601601
context = get_context()
602602
with context.db_engine.scoped_session() as session:
603603
dataset = DatasetBaseRepository.get_dataset_by_uri(session, uri)
604-
tables = DatasetRepository.get_dataset_tables(session, dataset.datasetUri)
605-
folders = DatasetLocationRepository.get_dataset_folders(session, dataset.datasetUri)
604+
tables = DatasetRepository.get_dataset_tables(session, dataset.datasetUri, limit=50)
605+
folders = DatasetLocationRepository.get_dataset_folders(session, dataset.datasetUri, limit=50)
606606
metadata = BedrockClient().invoke_model_dataset_metadata(
607607
metadata_types=metadata_types,
608608
dataset=dataset,

backend/dataall/modules/s3_datasets/services/dataset_table_service.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@
3333
from dataall.base.db import exceptions
3434
from dataall.modules.s3_datasets.aws.bedrock_metadata_client import BedrockClient
3535
from dataall.modules.s3_datasets.db.dataset_column_repositories import DatasetColumnRepository
36+
from dataall.modules.s3_datasets.services.dataset_enums import MetadataGenerationTypes
3637

3738

3839
log = logging.getLogger(__name__)
@@ -207,12 +208,14 @@ def _delete_dataset_table_read_permission(session, table_uri):
207208
# 'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
208209
# )
209210
def generate_metadata_for_table(uri, metadata_types, sample_data):
211+
metadataTypesForTable = [MetadataGenerationTypes.Description, MetadataGenerationTypes.Tag]
212+
table_metadata_types = [item for item in metadata_types if item in metadataTypesForTable]
210213
context = get_context()
211214
with context.db_engine.scoped_session() as session:
212215
table = DatasetTableRepository.get_dataset_table_by_uri(session, uri)
213-
table_columns = DatasetColumnRepository.list_active_columns_for_table(session, table.tableUri)
216+
table_columns = DatasetColumnRepository.list_active_columns_for_table(session, table.tableUri, limit=50)
214217
metadata = BedrockClient().invoke_model_table_metadata(
215-
table=table, columns=table_columns, metadata_types=metadata_types, sample_data=sample_data
218+
table=table, columns=table_columns, metadata_types=table_metadata_types, sample_data=sample_data
216219
)
217220

218221
result = [{'targetUri': uri, 'targetType': 'Table', **metadata}]

frontend/src/modules/S3_Datasets/components/ReviewMetadataComponent.js

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -95,10 +95,10 @@ export const ReviewMetadataComponent = (props) => {
9595
if (targetIndex !== -1) {
9696
const updatedTarget = {
9797
...targets[targetIndex],
98-
description: response.data.generateMetadata[0].description,
99-
label: response.data.generateMetadata[0].label,
100-
tags: response.data.generateMetadata[0].tags,
101-
topics: response.data.generateMetadata[0].topics
98+
description: response.data.generateMetadata[0].description ?? '',
99+
label: response.data.generateMetadata[0].label ?? '',
100+
tags: response.data.generateMetadata[0].tags ?? '',
101+
topics: response.data.generateMetadata[0].topics ?? ''
102102
};
103103

104104
const updatedTargets = [...targets];

0 commit comments

Comments
 (0)