Skip to content

Commit 48b4dc1

Browse files
committed
GenAI metadata generation - improvements
1 parent 586fb79 commit 48b4dc1

23 files changed

+3363
-934
lines changed

backend/dataall/base/cdkproxy/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
aws-cdk-lib==2.177.0
2-
boto3==1.35.26
2+
boto3==1.39.7
33
boto3-stubs==1.35.26
44
cdk-nag==2.7.2
5-
fastapi == 0.115.5
5+
fastapi == 0.116.1
66
PyYAML==6.0
77
requests==2.32.2
88
tabulate==0.8.9

backend/dataall/modules/s3_datasets/api/dataset/queries.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
get_file_upload_presigned_url,
66
list_datasets_owned_by_env_group,
77
list_dataset_tables_folders,
8-
read_sample_data,
98
)
109

1110
getDataset = gql.QueryField(
@@ -56,9 +55,3 @@
5655
type=gql.Ref('DatasetItemsSearchResult'),
5756
resolver=list_dataset_tables_folders,
5857
)
59-
listSampleData = gql.QueryField(
60-
name='listSampleData',
61-
args=[gql.Argument(name='tableUri', type=gql.NonNullableType(gql.String))],
62-
type=gql.Ref('QueryPreviewResult'), # basically returns nothing...?
63-
resolver=read_sample_data,
64-
) # return the data -> user invokes generateMetadata again + sample data ; similar api exists

backend/dataall/modules/s3_datasets/api/dataset/resolvers.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -193,11 +193,6 @@ def generate_metadata(
193193
raise Exception('Unsupported target type for metadata generation')
194194

195195

196-
def read_sample_data(context: Context, source: S3Dataset, tableUri: str):
197-
RequestValidator.validate_uri(param_name='tableUri', param_value=tableUri)
198-
return DatasetTableService.preview(uri=tableUri)
199-
200-
201196
def update_dataset_metadata(context: Context, source: S3Dataset, resourceUri: str):
202197
return DatasetService.update_dataset(uri=resourceUri, data=input)
203198

backend/dataall/modules/s3_datasets/api/table/queries.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
from dataall.base.api import gql
22
from dataall.modules.s3_datasets.api.table.input_types import DatasetTableFilter
3-
from dataall.modules.s3_datasets.api.table.resolvers import get_table, preview, list_table_data_filters
3+
from dataall.modules.s3_datasets.api.table.resolvers import (
4+
get_table,
5+
preview,
6+
read_sample_data,
7+
list_table_data_filters,
8+
)
49
from dataall.modules.s3_datasets.api.table.types import (
510
DatasetTable,
611
DatasetTableSearchResult,
@@ -38,6 +43,13 @@
3843
type=gql.Ref('QueryPreviewResult'),
3944
)
4045

46+
listTableSampleData = gql.QueryField(
47+
name='listTableSampleData',
48+
args=[gql.Argument(name='tableUri', type=gql.NonNullableType(gql.String))],
49+
type=gql.Ref('QueryPreviewResult'),
50+
resolver=read_sample_data,
51+
)
52+
4153
listTableDataFilters = gql.QueryField(
4254
name='listTableDataFilters',
4355
args=[

backend/dataall/modules/s3_datasets/api/table/resolvers.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@ def preview(context, source, tableUri: str = None):
3333
return DatasetTableService.preview(uri=tableUri)
3434

3535

36+
def read_sample_data(context: Context, source: S3Dataset, tableUri: str):
37+
if not tableUri:
38+
return None
39+
return DatasetTableService.read_table_sample(uri=tableUri)
40+
41+
3642
def get_glue_table_properties(context: Context, source: DatasetTable, **kwargs):
3743
if not source:
3844
return None

backend/dataall/modules/s3_datasets/aws/athena_table_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def get_table(self):
3737
)
3838
cursor = connection.cursor()
3939

40-
sql = 'select * from {table_identifier} limit 50'.format(
40+
sql = 'select * from {table_identifier} order by rand() limit 50'.format(
4141
table_identifier=sql_utils.Identifier(self._table.GlueDatabaseName, self._table.GlueTableName)
4242
)
4343
cursor.execute(sql) # nosemgrep

backend/dataall/modules/s3_datasets/aws/bedrock_metadata_client.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
import logging
22
import os
33

4-
from dataall.base.db import exceptions
54
from dataall.base.aws.sts import SessionHelper
65
from typing import List, Optional
76
from langchain_core.prompts import PromptTemplate
8-
from langchain_core.pydantic_v1 import BaseModel
9-
from langchain_aws import ChatBedrock as BedrockChat
7+
from pydantic import BaseModel
8+
from langchain_aws import ChatBedrockConverse
109
from langchain_core.output_parsers import JsonOutputParser
1110

1211
log = logging.getLogger(__name__)
@@ -22,12 +21,17 @@
2221
)
2322

2423

24+
class ColumnMetadata(BaseModel):
25+
label: str
26+
description: str
27+
28+
2529
class MetadataOutput(BaseModel):
2630
tags: Optional[List[str]] = None
2731
description: Optional[str] = None
2832
label: Optional[str] = None
2933
topics: Optional[List[str]] = None
30-
columns_metadata: Optional[List[dict]] = None
34+
subitem_descriptions: Optional[List[ColumnMetadata]] = None
3135

3236

3337
class BedrockClient:
@@ -39,10 +43,9 @@ def __init__(self):
3943
'max_tokens': 4096,
4044
'temperature': 0.5,
4145
'top_k': 250,
42-
'top_p': 0.5,
43-
'stop_sequences': ['\n\nHuman'],
46+
'stop_sequences': [],
4447
}
45-
self._model = BedrockChat(client=self._client, model_id=model_id, model_kwargs=model_kwargs)
48+
self._model = ChatBedrockConverse(client=self._client, model_id=model_id, **model_kwargs)
4649

4750
def invoke_model_dataset_metadata(self, metadata_types, dataset, tables, folders):
4851
try:
@@ -63,18 +66,20 @@ def invoke_model_dataset_metadata(self, metadata_types, dataset, tables, folders
6366
except Exception as e:
6467
raise e
6568

66-
def invoke_model_table_metadata(self, metadata_types, table, columns, sample_data, generate_columns_metadata=False):
69+
def invoke_model_table_metadata(self, metadata_types, table, columns, sample_data):
6770
try:
6871
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_TABLE_TEMPLATE_PATH)
6972
parser = JsonOutputParser(pydantic_object=MetadataOutput)
7073
chain = prompt_template | self._model | parser
74+
75+
# Check if subitem_descriptions is in the requested metadata types
76+
generate_columns = 'subitem_descriptions' in metadata_types
7177
context = {
7278
'metadata_types': metadata_types,
73-
'generate_columns_metadata': generate_columns_metadata,
79+
'generate_columns_metadata': generate_columns,
7480
'label': table.label,
7581
'description': table.description,
7682
'tags': table.tags,
77-
'topics': table.topics,
7883
'column_labels': [c.label for c in columns],
7984
'column_descriptions': [c.description for c in columns],
8085
'sample_data': sample_data,
@@ -93,7 +98,6 @@ def invoke_model_folder_metadata(self, metadata_types, folder, files):
9398
'label': folder.label,
9499
'description': folder.description,
95100
'tags': folder.tags,
96-
'topics': folder.topics,
97101
'file_names': files,
98102
}
99103
return chain.invoke(context)
Lines changed: 65 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,82 +1,76 @@
1-
Your task is to generate or improve the metadata fields of a Dataset.
2-
3-
Use the following input parameters:
4-
- Dataset name: {dataset_label}
5-
- Current dataset description: {description}
6-
- Current tags for dataset: {tags}
7-
- Current topics for dataset: {topics}
8-
- Table names in the dataset: {table_names}
9-
- Folder names in the dataset: {folder_names}
10-
11-
12-
There are 4 metadata fields that can be requested to you.
13-
1. label - 1 to 3 words that give a "title" to the Dataset. If provided, you can use the current Dataset name as starting point.
14-
2. description - less than 30 words that summarize the Tables and Folders contained in the Dataset. If provided, use the current description and tags as starting point; but mainly use the Table names and Folder names.
15-
3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Dataset. If there are current tags that represent additional information, add them to the list of tags. Do not return the label as a tag.
16-
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the Tables and Folders of the dataset. If there are current topics that represent additional information, add them to the list of topics.
17-
18-
There are some rules that you MUST follow:
19-
- If any of the input parameters is equal to "No description provided" or is None or [] do not use that particular input
20-
for generating the metadata fields.
21-
- This time the user has requested ONLY the following metadata fields: {metadata_types} Your response should strictly
22-
contain only the requested metadata fields.
23-
- Evaluate if the given parameters are sufficient for generating the requested metadata, if not, respond with
24-
"NotEnoughData" for all values of dictionary keys.
25-
- If the Table names and the Folder names are both none or [], return "Empty Dataset" as the description and "empty" as one of the tags.
26-
- Return the result as a Python dictionary where the keys are the requested metadata fields, all the keys must be
27-
lowercase and the values are the corresponding generated metadata.
28-
- Do not return any explanations, ONLY the Python dictionary.
29-
30-
---------------------------------------
31-
---------------------------------------
32-
Here are some examples:
33-
34-
Example 1.
35-
36-
Given the following input parameters:
37-
label: None,
38-
description: No description provided,
39-
tags: [],
40-
table_names: [],
41-
folder_names: [],
42-
metadata_types: ["label", "description", "tags", "topics"]
43-
44-
response = {{
1+
You are a metadata generation assistant for AWS data assets. Your task is to generate or enhance metadata fields for a Dataset based on the provided information.
2+
3+
INPUT PARAMETERS:
4+
- Dataset name: {dataset_label}
5+
- Current dataset description: {description}
6+
- Current tags for dataset: {tags}
7+
- Current topics for dataset: {topics}
8+
- Table names in the dataset: {table_names}
9+
- Table descriptions in the dataset: {table_descriptions}
10+
- Folder names in the dataset: {folder_names}
11+
12+
METADATA FIELDS REQUESTED: {metadata_types}
13+
You will only generate the fields listed above. Each field has specific requirements:
14+
15+
1. label - A concise title (1-3 words) for the Dataset. Use the current name as a starting point if available.
16+
2. description - A concise summary (<30 words) of the Dataset's contents, focusing primarily on the Tables and Folders it contains.
17+
3. tags - Up to 3 keywords highlighting the Dataset's main themes or content types. Do not duplicate the label as a tag.
18+
4. topics - 1-2 topics from this fixed list: ['Finances', 'HumanResources', 'Products', 'Services', 'Operations', 'Research', 'Sales', 'Orders', 'Sites', 'Energy', 'Customers', 'Misc']
19+
20+
RULES:
21+
- Ignore any input parameter that is "No description provided", None, or an empty list [].
22+
- Return ONLY the requested metadata fields as specified in {metadata_types}.
23+
- If insufficient data exists to generate meaningful metadata, return "NotEnoughData" for those fields.
24+
- If both Table names and Folder names are empty or None, use "Empty Dataset" as the description and include "empty" as a tag.
25+
- Return results as a Python dictionary with lowercase keys matching the requested metadata fields.
26+
- Provide ONLY the Python dictionary in your response, no explanations or additional text.
27+
28+
EXAMPLES:
29+
30+
Example 1: Insufficient data
31+
Input:
32+
- Dataset name: None
33+
- Current description: No description provided
34+
- Current tags: []
35+
- Table names: []
36+
- Folder names: []
37+
- Requested fields: ["label", "description", "tags", "topics"]
38+
39+
Output:
40+
{{
4541
"label": "NotEnoughData",
4642
"description": "Empty Dataset",
47-
"topics": "NotEnoughData",
48-
"tags": ["empty"]
43+
"tags": ["empty"],
44+
"topics": "NotEnoughData"
4945
}}
5046

51-
Example 2.
52-
Given the following input parameters:
53-
label: None,
54-
description: No description provided,
55-
tags: [],
56-
table_names: ["customer_orders", "product_inventory", "sales_transactions"],
57-
folder_names: ["orders", "inventory", "sales"],
58-
metadata_types: ["label", "description"]
59-
60-
response = {{
47+
Example 2: Sales data
48+
Input:
49+
- Dataset name: None
50+
- Current description: No description provided
51+
- Current tags: []
52+
- Table names: ["customer_orders", "product_inventory", "sales_transactions"]
53+
- Folder names: ["orders", "inventory", "sales"]
54+
- Requested fields: ["label", "description"]
55+
56+
Output:
57+
{{
6158
"label": "Sales and Inventory",
6259
"description": "Dataset containing customer orders, product inventory, and sales transactions information, organized into orders, inventory, and sales folders."
6360
}}
6461

65-
Example 3.
66-
Given the following input parameters:
67-
label: None,
68-
description: No description provided,
69-
tags: [],
70-
table_names: ["employee_data", "payroll", "performance_reviews"],,
71-
folder_names: ["hr_records", "financial", "evaluations"],
72-
metadata_types: ["label", "tags", "topics"]
73-
74-
response = {{
62+
Example 3: HR data
63+
Input:
64+
- Dataset name: None
65+
- Current description: No description provided
66+
- Current tags: []
67+
- Table names: ["employee_data", "payroll", "performance_reviews"]
68+
- Folder names: ["hr_records", "financial", "evaluations"]
69+
- Requested fields: ["label", "tags", "topics"]
70+
71+
Output:
72+
{{
7573
"label": "HR Management System",
7674
"tags": ["employee", "payroll", "performance"],
77-
"topics": ["HR", "Finance"]
75+
"topics": ["HumanResources", "Finances"]
7876
}}
79-
80-
81-
82-

0 commit comments

Comments (0)