Skip to content

Commit b0c884c

Browse files
committed
GenAI metadata generation - resolve conflicts+add guardrails
1 parent d138ff8 commit b0c884c

File tree

7 files changed

+2404
-424
lines changed

7 files changed

+2404
-424
lines changed

backend/dataall/modules/s3_datasets/aws/s3_dataset_client.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -81,7 +81,7 @@ def list_bucket_files(self, bucket_name, prefix):
8181
Bucket=bucket_name,
8282
Prefix=prefix,
8383
ExpectedBucketOwner=dataset.AwsAccountId,
84-
MaxKeys=1000,
84+
MaxKeys=100,
8585
)
8686
return response.get('Contents', [])
8787
except ClientError as e:

backend/dataall/modules/s3_datasets/db/dataset_column_repositories.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,8 @@ def paginate_active_columns_for_table(session, table_uri: str, filter: dict):
4848
return paginate(query=q, page=filter.get('page', 1), page_size=filter.get('pageSize', 10)).to_dict()
4949

5050
@staticmethod
51-
def list_active_columns_for_table(session, table_uri: str):
51+
def list_active_columns_for_table(session, table_uri: str, limit=None):
5252
q = DatasetColumnRepository.query_active_columns_for_table(session, table_uri)
53+
if limit:
54+
q = q.limit(limit)
5355
return q.all()

backend/dataall/modules/s3_datasets/db/dataset_location_repositories.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -97,9 +97,12 @@ def delete_dataset_locations(session, dataset_uri) -> bool:
9797
return True
9898

9999
@staticmethod
100-
def get_dataset_folders(session, dataset_uri):
100+
def get_dataset_folders(session, dataset_uri, limit=None):
101101
"""return the dataset folders"""
102-
return session.query(DatasetStorageLocation).filter(DatasetStorageLocation.datasetUri == dataset_uri).all()
102+
query = session.query(DatasetStorageLocation).filter(DatasetStorageLocation.datasetUri == dataset_uri)
103+
if limit:
104+
query = query.limit(limit)
105+
return query.all()
103106

104107
@staticmethod
105108
def paginated_dataset_locations(session, uri, data=None) -> dict:

backend/dataall/modules/s3_datasets/db/dataset_repositories.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -186,9 +186,12 @@ def update_glue_database_status(session, dataset_uri):
186186
dataset.glueDatabaseCreated = True
187187

188188
@staticmethod
189-
def get_dataset_tables(session, dataset_uri):
189+
def get_dataset_tables(session, dataset_uri, limit=None):
190190
"""return the dataset tables"""
191-
return session.query(DatasetTable).filter(DatasetTable.datasetUri == dataset_uri).all()
191+
query = session.query(DatasetTable).filter(DatasetTable.datasetUri == dataset_uri)
192+
if limit:
193+
query = query.limit(limit)
194+
return query.all()
192195

193196
@staticmethod
194197
def delete_dataset(session, dataset) -> bool:

backend/dataall/modules/s3_datasets/services/dataset_service.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -601,8 +601,8 @@ def generate_metadata_for_dataset(uri, metadata_types):
601601
context = get_context()
602602
with context.db_engine.scoped_session() as session:
603603
dataset = DatasetBaseRepository.get_dataset_by_uri(session, uri)
604-
tables = DatasetRepository.get_dataset_tables(session, dataset.datasetUri)
605-
folders = DatasetLocationRepository.get_dataset_folders(session, dataset.datasetUri)
604+
tables = DatasetRepository.get_dataset_tables(session, dataset.datasetUri, limit=50)
605+
folders = DatasetLocationRepository.get_dataset_folders(session, dataset.datasetUri, limit=50)
606606
metadata = BedrockClient().invoke_model_dataset_metadata(
607607
metadata_types=metadata_types,
608608
dataset=dataset,

backend/dataall/modules/s3_datasets/services/dataset_table_service.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -210,7 +210,7 @@ def generate_metadata_for_table(uri, metadata_types, sample_data):
210210
context = get_context()
211211
with context.db_engine.scoped_session() as session:
212212
table = DatasetTableRepository.get_dataset_table_by_uri(session, uri)
213-
table_columns = DatasetColumnRepository.list_active_columns_for_table(session, table.tableUri)
213+
table_columns = DatasetColumnRepository.list_active_columns_for_table(session, table.tableUri, limit=50)
214214
metadata = BedrockClient().invoke_model_table_metadata(
215215
table=table, columns=table_columns, metadata_types=metadata_types, sample_data=sample_data
216216
)

0 commit comments

Comments
 (0)