Skip to content

Commit 48b4dc1

Browse files
committed
GenAI metadata generation - improvements
1 parent 586fb79 commit 48b4dc1

23 files changed

+3363
-934
lines changed

backend/dataall/base/cdkproxy/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
aws-cdk-lib==2.177.0
2-
boto3==1.35.26
2+
boto3==1.39.7
33
boto3-stubs==1.35.26
44
cdk-nag==2.7.2
5-
fastapi == 0.115.5
5+
fastapi == 0.116.1
66
PyYAML==6.0
77
requests==2.32.2
88
tabulate==0.8.9

backend/dataall/modules/s3_datasets/api/dataset/queries.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
get_file_upload_presigned_url,
66
list_datasets_owned_by_env_group,
77
list_dataset_tables_folders,
8-
read_sample_data,
98
)
109

1110
getDataset = gql.QueryField(
@@ -56,9 +55,3 @@
5655
type=gql.Ref('DatasetItemsSearchResult'),
5756
resolver=list_dataset_tables_folders,
5857
)
59-
listSampleData = gql.QueryField(
60-
name='listSampleData',
61-
args=[gql.Argument(name='tableUri', type=gql.NonNullableType(gql.String))],
62-
type=gql.Ref('QueryPreviewResult'), # basically returns nothing...?
63-
resolver=read_sample_data,
64-
) # return the data -> user invokes generateMetadata again + sample data ; similar api exists

backend/dataall/modules/s3_datasets/api/dataset/resolvers.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -193,11 +193,6 @@ def generate_metadata(
193193
raise Exception('Unsupported target type for metadata generation')
194194

195195

196-
def read_sample_data(context: Context, source: S3Dataset, tableUri: str):
197-
RequestValidator.validate_uri(param_name='tableUri', param_value=tableUri)
198-
return DatasetTableService.preview(uri=tableUri)
199-
200-
201196
def update_dataset_metadata(context: Context, source: S3Dataset, resourceUri: str):
202197
return DatasetService.update_dataset(uri=resourceUri, data=input)
203198

backend/dataall/modules/s3_datasets/api/table/queries.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
from dataall.base.api import gql
22
from dataall.modules.s3_datasets.api.table.input_types import DatasetTableFilter
3-
from dataall.modules.s3_datasets.api.table.resolvers import get_table, preview, list_table_data_filters
3+
from dataall.modules.s3_datasets.api.table.resolvers import (
4+
get_table,
5+
preview,
6+
read_sample_data,
7+
list_table_data_filters,
8+
)
49
from dataall.modules.s3_datasets.api.table.types import (
510
DatasetTable,
611
DatasetTableSearchResult,
@@ -38,6 +43,13 @@
3843
type=gql.Ref('QueryPreviewResult'),
3944
)
4045

46+
listTableSampleData = gql.QueryField(
47+
name='listTableSampleData',
48+
args=[gql.Argument(name='tableUri', type=gql.NonNullableType(gql.String))],
49+
type=gql.Ref('QueryPreviewResult'),
50+
resolver=read_sample_data,
51+
)
52+
4153
listTableDataFilters = gql.QueryField(
4254
name='listTableDataFilters',
4355
args=[

backend/dataall/modules/s3_datasets/api/table/resolvers.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@ def preview(context, source, tableUri: str = None):
3333
return DatasetTableService.preview(uri=tableUri)
3434

3535

36+
def read_sample_data(context: Context, source: S3Dataset, tableUri: str):
37+
if not tableUri:
38+
return None
39+
return DatasetTableService.read_table_sample(uri=tableUri)
40+
41+
3642
def get_glue_table_properties(context: Context, source: DatasetTable, **kwargs):
3743
if not source:
3844
return None

backend/dataall/modules/s3_datasets/aws/athena_table_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def get_table(self):
3737
)
3838
cursor = connection.cursor()
3939

40-
sql = 'select * from {table_identifier} limit 50'.format(
40+
sql = 'select * from {table_identifier} order by rand() limit 50'.format(
4141
table_identifier=sql_utils.Identifier(self._table.GlueDatabaseName, self._table.GlueTableName)
4242
)
4343
cursor.execute(sql) # nosemgrep

backend/dataall/modules/s3_datasets/aws/bedrock_metadata_client.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
import logging
22
import os
33

4-
from dataall.base.db import exceptions
54
from dataall.base.aws.sts import SessionHelper
65
from typing import List, Optional
76
from langchain_core.prompts import PromptTemplate
8-
from langchain_core.pydantic_v1 import BaseModel
9-
from langchain_aws import ChatBedrock as BedrockChat
7+
from pydantic import BaseModel
8+
from langchain_aws import ChatBedrockConverse
109
from langchain_core.output_parsers import JsonOutputParser
1110

1211
log = logging.getLogger(__name__)
@@ -22,12 +21,17 @@
2221
)
2322

2423

24+
class ColumnMetadata(BaseModel):
25+
label: str
26+
description: str
27+
28+
2529
class MetadataOutput(BaseModel):
2630
tags: Optional[List[str]] = None
2731
description: Optional[str] = None
2832
label: Optional[str] = None
2933
topics: Optional[List[str]] = None
30-
columns_metadata: Optional[List[dict]] = None
34+
subitem_descriptions: Optional[List[ColumnMetadata]] = None
3135

3236

3337
class BedrockClient:
@@ -39,10 +43,9 @@ def __init__(self):
3943
'max_tokens': 4096,
4044
'temperature': 0.5,
4145
'top_k': 250,
42-
'top_p': 0.5,
43-
'stop_sequences': ['\n\nHuman'],
46+
'stop_sequences': [],
4447
}
45-
self._model = BedrockChat(client=self._client, model_id=model_id, model_kwargs=model_kwargs)
48+
self._model = ChatBedrockConverse(client=self._client, model_id=model_id, **model_kwargs)
4649

4750
def invoke_model_dataset_metadata(self, metadata_types, dataset, tables, folders):
4851
try:
@@ -63,18 +66,20 @@ def invoke_model_dataset_metadata(self, metadata_types, dataset, tables, folders
6366
except Exception as e:
6467
raise e
6568

66-
def invoke_model_table_metadata(self, metadata_types, table, columns, sample_data, generate_columns_metadata=False):
69+
def invoke_model_table_metadata(self, metadata_types, table, columns, sample_data):
6770
try:
6871
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_TABLE_TEMPLATE_PATH)
6972
parser = JsonOutputParser(pydantic_object=MetadataOutput)
7073
chain = prompt_template | self._model | parser
74+
75+
# Check if subitem_descriptions is in the requested metadata types
76+
generate_columns = 'subitem_descriptions' in metadata_types
7177
context = {
7278
'metadata_types': metadata_types,
73-
'generate_columns_metadata': generate_columns_metadata,
79+
'generate_columns_metadata': generate_columns,
7480
'label': table.label,
7581
'description': table.description,
7682
'tags': table.tags,
77-
'topics': table.topics,
7883
'column_labels': [c.label for c in columns],
7984
'column_descriptions': [c.description for c in columns],
8085
'sample_data': sample_data,
@@ -93,7 +98,6 @@ def invoke_model_folder_metadata(self, metadata_types, folder, files):
9398
'label': folder.label,
9499
'description': folder.description,
95100
'tags': folder.tags,
96-
'topics': folder.topics,
97101
'file_names': files,
98102
}
99103
return chain.invoke(context)
Lines changed: 65 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,82 +1,76 @@
1-
Your task is to generate or improve the metadata fields of a Dataset.
2-
3-
Use the following input parameters:
4-
- Dataset name: {dataset_label}
5-
- Current dataset description: {description}
6-
- Current tags for dataset: {tags}
7-
- Current topics for dataset: {topics}
8-
- Table names in the dataset: {table_names}
9-
- Folder names in the dataset: {folder_names}
10-
11-
12-
There are 4 metadata fields that can be requested to you.
13-
1. label - 1 to 3 words that give a "title" to the Dataset. If provided, you can use the current Dataset name as starting point.
14-
2. description - less than 30 words that summarize the Tables and Folders contained in the Dataset. If provided, use the current description and tags as starting point; but mainly use the Table names and Folder names.
15-
3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Dataset. If there are current tags that represent additional information, add them to the list of tags. Do not return the label as a tag.
16-
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the Tables and Folders of the dataset. If there are current topics that represent additional information, add them to the list of topics.
17-
18-
There are some rules that you MUST follow:
19-
- If any of the input parameters is equal to "No description provided" or is None or [] do not use that particular input
20-
for generating the metadata fields.
21-
- This time the user has requested ONLY the following metadata fields: {metadata_types} Your response should strictly
22-
contain only the requested metadata fields.
23-
- Evaluate if the given parameters are sufficient for generating the requested metadata, if not, respond with
24-
"NotEnoughData" for all values of dictionary keys.
25-
- If the Table names and the Folder names are both none or [], return "Empty Dataset" as the description and "empty" as one of the tags.
26-
- Return the result as a Python dictionary where the keys are the requested metadata fields, all the keys must be
27-
lowercase and the values are the corresponding generated metadata.
28-
- Do not return any explanations, ONLY the Python dictionary.
29-
30-
---------------------------------------
31-
---------------------------------------
32-
Here are some examples:
33-
34-
Example 1.
35-
36-
Given the following input parameters:
37-
label: None,
38-
description: No description provided,
39-
tags: [],
40-
table_names: [],
41-
folder_names: [],
42-
metadata_types: ["label", "description", "tags", "topics"]
43-
44-
response = {{
1+
You are a metadata generation assistant for AWS data assets. Your task is to generate or enhance metadata fields for a Dataset based on the provided information.
2+
3+
INPUT PARAMETERS:
4+
- Dataset name: {dataset_label}
5+
- Current dataset description: {description}
6+
- Current tags for dataset: {tags}
7+
- Current topics for dataset: {topics}
8+
- Table names in the dataset: {table_names}
9+
- Table descriptions in the dataset: {table_descriptions}
10+
- Folder names in the dataset: {folder_names}
11+
12+
METADATA FIELDS REQUESTED: {metadata_types}
13+
You will only generate the fields listed above. Each field has specific requirements:
14+
15+
1. label - A concise title (1-3 words) for the Dataset. Use the current name as a starting point if available.
16+
2. description - A concise summary (<30 words) of the Dataset's contents, focusing primarily on the Tables and Folders it contains.
17+
3. tags - Up to 3 keywords highlighting the Dataset's main themes or content types. Do not duplicate the label as a tag.
18+
4. topics - 1-2 topics from this fixed list: ['Finances', 'HumanResources', 'Products', 'Services', 'Operations', 'Research', 'Sales', 'Orders', 'Sites', 'Energy', 'Customers', 'Misc']
19+
20+
RULES:
21+
- Ignore any input parameter that is "No description provided", None, or an empty list [].
22+
- Return ONLY the requested metadata fields as specified in {metadata_types}.
23+
- If insufficient data exists to generate meaningful metadata, return "NotEnoughData" for those fields.
24+
- If both Table names and Folder names are empty or None, use "Empty Dataset" as the description and include "empty" as a tag.
25+
- Return results as a Python dictionary with lowercase keys matching the requested metadata fields.
26+
- Provide ONLY the Python dictionary in your response, no explanations or additional text.
27+
28+
EXAMPLES:
29+
30+
Example 1: Insufficient data
31+
Input:
32+
- Dataset name: None
33+
- Current description: No description provided
34+
- Current tags: []
35+
- Table names: []
36+
- Folder names: []
37+
- Requested fields: ["label", "description", "tags", "topics"]
38+
39+
Output:
40+
{{
4541
"label": "NotEnoughData",
4642
"description": "Empty Dataset",
47-
"topics": "NotEnoughData",
48-
"tags": ["empty"]
43+
"tags": ["empty"],
44+
"topics": "NotEnoughData"
4945
}}
5046

51-
Example 2.
52-
Given the following input parameters:
53-
label: None,
54-
description: No description provided,
55-
tags: [],
56-
table_names: ["customer_orders", "product_inventory", "sales_transactions"],
57-
folder_names: ["orders", "inventory", "sales"],
58-
metadata_types: ["label", "description"]
59-
60-
response = {{
47+
Example 2: Sales data
48+
Input:
49+
- Dataset name: None
50+
- Current description: No description provided
51+
- Current tags: []
52+
- Table names: ["customer_orders", "product_inventory", "sales_transactions"]
53+
- Folder names: ["orders", "inventory", "sales"]
54+
- Requested fields: ["label", "description"]
55+
56+
Output:
57+
{{
6158
"label": "Sales and Inventory",
6259
"description": "Dataset containing customer orders, product inventory, and sales transactions information, organized into orders, inventory, and sales folders."
6360
}}
6461

65-
Example 3.
66-
Given the following input parameters:
67-
label: None,
68-
description: No description provided,
69-
tags: [],
70-
table_names: ["employee_data", "payroll", "performance_reviews"],,
71-
folder_names: ["hr_records", "financial", "evaluations"],
72-
metadata_types: ["label", "tags", "topics"]
73-
74-
response = {{
62+
Example 3: HR data
63+
Input:
64+
- Dataset name: None
65+
- Current description: No description provided
66+
- Current tags: []
67+
- Table names: ["employee_data", "payroll", "performance_reviews"]
68+
- Folder names: ["hr_records", "financial", "evaluations"]
69+
- Requested fields: ["label", "tags", "topics"]
70+
71+
Output:
72+
{{
7573
"label": "HR Management System",
7674
"tags": ["employee", "payroll", "performance"],
77-
"topics": ["HR", "Finance"]
75+
"topics": ["HumanResources", "Finances"]
7876
}}
79-
80-
81-
82-

0 commit comments

Comments (0)