Skip to content

Commit f4e6c64

Browse files
committed
FE work and Bedrock client cross-region
1 parent 2c3bbf9 commit f4e6c64

15 files changed

Lines changed: 146 additions & 142 deletions

backend/dataall/modules/s3_datasets/api/dataset/resolvers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ def generate_metadata(
169169
tableSampleData: dict = {},
170170
):
171171
RequestValidator.validate_uri(param_name='resourceUri', param_value=resourceUri)
172-
if metadataTypes not in [item.value for item in MetadataGenerationTypes]:
172+
if any(metadata_type not in [item.value for item in MetadataGenerationTypes] for metadata_type in metadataTypes):
173173
raise InvalidInput(
174174
'metadataType',
175175
metadataTypes,

backend/dataall/modules/s3_datasets/api/dataset/types.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,9 +142,9 @@
142142
gql.Field(name='targetUri', type=gql.String),
143143
gql.Field(name='targetType', type=gql.String),
144144
gql.Field(name='label', type=gql.String),
145-
gql.Field(name='topics', type=gql.ArrayType(gql.String)),
146-
gql.Field(name='tags', type=gql.ArrayType(gql.String)),
147145
gql.Field(name='description', type=gql.String),
146+
gql.Field(name='tags', type=gql.ArrayType(gql.String)),
147+
gql.Field(name='topics', type=gql.ArrayType(gql.String)),
148148
],
149149
)
150150

backend/dataall/modules/s3_datasets/aws/bedrock_metadata_client.py

Lines changed: 54 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
from dataall.base.db import exceptions
55
from dataall.base.aws.sts import SessionHelper
66
from typing import List, Optional
7-
from langchain.prompts import PromptTemplate
7+
from langchain_core.prompts import PromptTemplate
88
from langchain_core.pydantic_v1 import BaseModel
9-
from langchain_aws import BedrockLLM
9+
from langchain_aws import ChatBedrock as BedrockChat
1010
from langchain_core.output_parsers import JsonOutputParser
1111

1212
log = logging.getLogger(__name__)
@@ -34,65 +34,68 @@ class BedrockClient:
3434
def __init__(self):
3535
session = SessionHelper.get_session()
3636
self._client = session.client('bedrock-runtime', region_name=os.getenv('AWS_REGION', 'eu-west-1'))
37-
model_id = 'anthropic.claude-3-5-sonnet-20240620-v1:0'
37+
model_id = 'eu.anthropic.claude-3-5-sonnet-20240620-v1:0'
3838
model_kwargs = {
3939
'max_tokens': 4096,
4040
'temperature': 0.5,
4141
'top_k': 250,
4242
'top_p': 0.5,
4343
'stop_sequences': ['\n\nHuman'],
4444
}
45-
self._model = BedrockLLM(model_id=model_id, client=self._client, model_kwargs=model_kwargs)
45+
self._model = BedrockChat(client=self._client, model_id=model_id, model_kwargs=model_kwargs)
4646

4747
def invoke_model_dataset_metadata(self, metadata_types, dataset, tables, folders):
48-
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_DATASET_TEMPLATE_PATH)
49-
parser = JsonOutputParser(pydantic_object=MetadataOutput)
50-
chain = prompt_template | self._model | parser
51-
context = {
52-
'metadata_types': metadata_types,
53-
'label': dataset.label,
54-
'description': dataset.description,
55-
'tags': dataset.tags,
56-
'table_labels': [t.label for t in tables],
57-
'table_descriptions': [t.description for t in tables],
58-
'folder_labels': [f.label for f in folders],
59-
}
60-
response = chain.invoke(context)
61-
if response.startswith('Error:'):
62-
raise exceptions.ModelGuardrailException(response)
63-
return response
48+
try:
49+
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_DATASET_TEMPLATE_PATH)
50+
parser = JsonOutputParser(pydantic_object=MetadataOutput)
51+
chain = prompt_template | self._model | parser
52+
context = {
53+
'metadata_types': metadata_types,
54+
'dataset_label': dataset.label,
55+
'description': dataset.description,
56+
'tags': dataset.tags,
57+
'topics': dataset.topics,
58+
'table_names': [t.label for t in tables],
59+
'table_descriptions': [t.description for t in tables],
60+
'folder_names': [f.label for f in folders],
61+
}
62+
return chain.invoke(context)
63+
except Exception as e:
64+
raise e
6465

6566
def invoke_model_table_metadata(self, metadata_types, table, columns, sample_data, generate_columns_metadata=False):
66-
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_TABLE_TEMPLATE_PATH)
67-
parser = JsonOutputParser(pydantic_object=MetadataOutput)
68-
chain = prompt_template | self._model | parser
69-
context = {
70-
'metadata_types': metadata_types,
71-
'generate_columns_metadata': generate_columns_metadata,
72-
'label': table.label,
73-
'description': table.description,
74-
'tags': table.tags,
75-
'column_labels': [c.label for c in columns],
76-
'column_descriptions': [c.description for c in columns],
77-
'sample_data': sample_data,
78-
}
79-
response = chain.invoke(context)
80-
if response.startswith('Error:'):
81-
raise exceptions.ModelGuardrailException(response)
82-
return response
67+
try:
68+
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_TABLE_TEMPLATE_PATH)
69+
parser = JsonOutputParser(pydantic_object=MetadataOutput)
70+
chain = prompt_template | self._model | parser
71+
context = {
72+
'metadata_types': metadata_types,
73+
'generate_columns_metadata': generate_columns_metadata,
74+
'label': table.label,
75+
'description': table.description,
76+
'tags': table.tags,
77+
'topics': table.topics,
78+
'column_labels': [c.label for c in columns],
79+
'column_descriptions': [c.description for c in columns],
80+
'sample_data': sample_data,
81+
}
82+
return chain.invoke(context)
83+
except Exception as e:
84+
raise e
8385

8486
def invoke_model_folder_metadata(self, metadata_types, folder, files):
85-
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_FOLDER_TEMPLATE_PATH)
86-
parser = JsonOutputParser(pydantic_object=MetadataOutput)
87-
chain = prompt_template | self._model | parser
88-
context = {
89-
'metadata_types': metadata_types,
90-
'label': folder.label,
91-
'description': folder.description,
92-
'tags': folder.tags,
93-
'file_names': files,
94-
}
95-
response = chain.invoke(context)
96-
if response.startswith('Error:'):
97-
raise exceptions.ModelGuardrailException(response)
98-
return response
87+
try:
88+
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_FOLDER_TEMPLATE_PATH)
89+
parser = JsonOutputParser(pydantic_object=MetadataOutput)
90+
chain = prompt_template | self._model | parser
91+
context = {
92+
'metadata_types': metadata_types,
93+
'label': folder.label,
94+
'description': folder.description,
95+
'tags': folder.tags,
96+
'topics': folder.topics,
97+
'file_names': files,
98+
}
99+
return chain.invoke(context)
100+
except Exception as e:
101+
raise e

backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_dataset_template.txt

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,19 @@
11
Your task is to generate or improve the metadata fields of a Dataset.
22

33
Use the following input parameters:
4-
- Dataset name: {label}
4+
- Dataset name: {dataset_label}
55
- Current dataset description: {description}
66
- Current tags for dataset: {tags}
7+
- Current topics for dataset: {topics}
78
- Table names in the dataset: {table_names}
89
- Folder names in the dataset: {folder_names}
910

1011

1112
There are 4 metadata fields that can be requested to you.
1213
1. label - 1 to 3 words that give a "title" to the Dataset. If provided, you can use the current Dataset name as starting point.
1314
2. description - less than 30 words that summarize the Tables and Folders contained in the Dataset. If provided, use the current description and tags as starting point; but mainly use the Table names and Folder names.
14-
3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Dataset. If there are current tags that represent additional information, add them to the list of tags.
15-
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the Tables and Folders of the dataset.
15+
3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Dataset. If there are current tags that represent additional information, add them to the list of tags. Do not return the label as a tag.
16+
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the Tables and Folders of the dataset. If there are current topics that represent additional information, add them to the list of topics.
1617

1718
There are some rules that you MUST follow:
1819
- If any of the input parameters is equal to "No description provided" or is None or [] do not use that particular input
@@ -24,6 +25,7 @@ contain only the requested metadata fields.
2425
- If the Table names and the Folder names are both none or [], return "Empty Dataset" as the description and "empty" as one of the tags.
2526
- Return the result as a Python dictionary where the keys are the requested metadata fields, all the keys must be
2627
lowercase and the values are the corresponding generated metadata.
28+
- Do not return any explanations, ONLY the Python dictionary.
2729

2830
---------------------------------------
2931
---------------------------------------
@@ -39,12 +41,12 @@ Given the following input parameters:
3941
folder_names: [],
4042
metadata_types: ["label", "description", "tags", "topics"]
4143

42-
response = {
44+
response = {{
4345
"label": "NotEnoughData",
4446
"description": "Empty Dataset",
4547
"topics": "NotEnoughData",
4648
"tags": ["empty"]
47-
}
49+
}}
4850

4951
Example 2.
5052
Given the following input parameters:
@@ -55,10 +57,10 @@ Given the following input parameters:
5557
folder_names: ["orders", "inventory", "sales"],
5658
metadata_types: ["label", "description"]
5759

58-
response = {
60+
response = {{
5961
"label": "Sales and Inventory",
6062
"description": "Dataset containing customer orders, product inventory, and sales transactions information, organized into orders, inventory, and sales folders."
61-
}
63+
}}
6264

6365
Example 3.
6466
Given the following input parameters:
@@ -69,11 +71,11 @@ Given the following input parameters:
6971
folder_names: ["hr_records", "financial", "evaluations"],
7072
metadata_types: ["label", "tags", "topics"]
7173

72-
response = {
74+
response = {{
7375
"label": "HR Management System",
7476
"tags": ["employee", "payroll", "performance"],
7577
"topics": ["HR", "Finance"]
76-
}
78+
}}
7779

7880

7981

backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_folder_template.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@ Use the following input parameters:
44
- Folder name: {label},
55
- Current Folder description: {description}
66
- Current tags for Folder: {tags}
7+
- Current topics for Folder: {topics}
78
- File names (files stored inside the folder): {file_names}
89

910

1011
There are 4 metadata fields that can be requested to you.
1112
1. label - 1 to 3 words that give a "title" to the Folder. If provided, you can use the current Folder name as starting point.
1213
2. description - less than 30 words that summarize the files contained in the Folder. If provided, use the current description and tags as starting point; but mainly use the file names.
1314
3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Folder. If there are current tags that represent additional information, add them to the list of tags.
14-
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the files contained in the Folder.
15+
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the files contained in the Folder. If there are current topics that represent additional information, add them to the list of topics.
1516

1617
There are some rules that you MUST follow:
1718
- If any of the input parameters is equal to "No description provided" or is None or [] do not use that particular input

backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_table_template.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Use the following input parameters:
44
- Table name: {label}
55
- Current table description: {description}
66
- Current tags for table: {tags}
7+
- Current topics for table: {topics}
78
- Column names: {columns}
89
- Column Descriptions: {column_descriptions}
910
- Sample data: {sample_data}
@@ -14,7 +15,7 @@ There are 4 metadata fields that can be requested to you.
1415
1. label - 1 to 3 words that give a "title" to the Table. If provided, you can use the current Table name as starting point.
1516
2. description - less than 30 words that summarize the content of the table. If provided, take the current Table description as starting point but mainly use the sample data (if provided), and then the column names and descriptions to generate the table description.
1617
3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Table. If there are current tags that represent additional information, add them to the list of tags.
17-
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the Table description.
18+
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the Table description. If there are current topics that represent additional information, add them to the list of topics.
1819

1920

2021
There are some rules that you MUST follow:

backend/dataall/modules/s3_datasets/services/dataset_enums.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class MetadataGenerationTargets(Enum):
1212
class MetadataGenerationTypes(Enum):
1313
"""Describes the s3_datasets metadata generation types"""
1414

15-
Description = 'Description'
16-
Label = 'Label'
17-
Tag = 'Tag'
18-
Topic = 'Topic'
15+
Description = 'description'
16+
Label = 'label'
17+
Tag = 'tags'
18+
Topic = 'topics'

backend/dataall/modules/s3_datasets/services/dataset_location_service.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
from dataall.base.context import get_context
55
from dataall.core.permissions.services.resource_policy_service import ResourcePolicyService
66
from dataall.core.permissions.services.tenant_policy_service import TenantPolicyService
7-
from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
7+
8+
##TODO
9+
##from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
810
from dataall.modules.catalog.db.glossary_repositories import GlossaryRepository
911
from dataall.base.db.exceptions import ResourceShared, ResourceAlreadyExists
1012
from dataall.modules.s3_datasets.services.dataset_service import DatasetService
@@ -145,9 +147,9 @@ def _delete_dataset_folder_read_permission(session, dataset: S3Dataset, location
145147

146148
@staticmethod
147149
@ResourcePolicyService.has_resource_permission(UPDATE_DATASET_FOLDER)
148-
@ResourceThresholdService.check_invocation_count(
149-
'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
150-
)
150+
# @ResourceThresholdService.check_invocation_count(
151+
# 'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
152+
# )
151153
def generate_metadata_for_folder(uri, metadata_types):
152154
context = get_context()
153155
with context.db_engine.scoped_session() as session:

backend/dataall/modules/s3_datasets/services/dataset_service.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
from dataall.core.stacks.db.stack_repositories import StackRepository
2020
from dataall.core.stacks.db.stack_models import Stack
2121
from dataall.core.tasks.db.task_models import Task
22-
from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
22+
23+
##TODO
24+
##from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
2325
from dataall.modules.catalog.db.glossary_repositories import GlossaryRepository
2426
from dataall.modules.s3_datasets.db.dataset_bucket_repositories import DatasetBucketRepository
2527
from dataall.modules.shares_base.db.share_object_repositories import ShareObjectRepository
@@ -574,9 +576,9 @@ def list_dataset_tables_folders(uri, filter):
574576

575577
@staticmethod
576578
@ResourcePolicyService.has_resource_permission(UPDATE_DATASET)
577-
@ResourceThresholdService.check_invocation_count(
578-
'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
579-
)
579+
# @ResourceThresholdService.check_invocation_count(
580+
# 'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
581+
# )
580582
def generate_metadata_for_dataset(uri, metadata_types):
581583
context = get_context()
582584
with context.db_engine.scoped_session() as session:

backend/dataall/modules/s3_datasets/services/dataset_table_service.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
from dataall.base.context import get_context
44
from dataall.core.permissions.services.resource_policy_service import ResourcePolicyService
55
from dataall.core.permissions.services.tenant_policy_service import TenantPolicyService
6-
from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
6+
7+
##TODO
8+
##from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
79
from dataall.modules.catalog.db.glossary_repositories import GlossaryRepository
810
from dataall.core.environment.services.environment_service import EnvironmentService
911
from dataall.modules.s3_datasets.aws.athena_table_client import AthenaTableClient
@@ -190,9 +192,9 @@ def _delete_dataset_table_read_permission(session, table_uri):
190192

191193
@staticmethod
192194
@ResourcePolicyService.has_resource_permission(UPDATE_DATASET_TABLE)
193-
@ResourceThresholdService.check_invocation_count(
194-
'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
195-
)
195+
# @ResourceThresholdService.check_invocation_count(
196+
# 'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
197+
# )
196198
def generate_metadata_for_table(uri, metadata_types, sample_data):
197199
context = get_context()
198200
with context.db_engine.scoped_session() as session:

0 commit comments

Comments (0)