Skip to content

Commit f4e6c64

Browse files
committed
FE work and Bedrock client cross-region
1 parent 2c3bbf9 commit f4e6c64

15 files changed

Lines changed: 146 additions & 142 deletions

backend/dataall/modules/s3_datasets/api/dataset/resolvers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ def generate_metadata(
169169
tableSampleData: dict = {},
170170
):
171171
RequestValidator.validate_uri(param_name='resourceUri', param_value=resourceUri)
172-
if metadataTypes not in [item.value for item in MetadataGenerationTypes]:
172+
if any(metadata_type not in [item.value for item in MetadataGenerationTypes] for metadata_type in metadataTypes):
173173
raise InvalidInput(
174174
'metadataType',
175175
metadataTypes,

backend/dataall/modules/s3_datasets/api/dataset/types.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,9 +142,9 @@
142142
gql.Field(name='targetUri', type=gql.String),
143143
gql.Field(name='targetType', type=gql.String),
144144
gql.Field(name='label', type=gql.String),
145-
gql.Field(name='topics', type=gql.ArrayType(gql.String)),
146-
gql.Field(name='tags', type=gql.ArrayType(gql.String)),
147145
gql.Field(name='description', type=gql.String),
146+
gql.Field(name='tags', type=gql.ArrayType(gql.String)),
147+
gql.Field(name='topics', type=gql.ArrayType(gql.String)),
148148
],
149149
)
150150

backend/dataall/modules/s3_datasets/aws/bedrock_metadata_client.py

Lines changed: 54 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
from dataall.base.db import exceptions
55
from dataall.base.aws.sts import SessionHelper
66
from typing import List, Optional
7-
from langchain.prompts import PromptTemplate
7+
from langchain_core.prompts import PromptTemplate
88
from langchain_core.pydantic_v1 import BaseModel
9-
from langchain_aws import BedrockLLM
9+
from langchain_aws import ChatBedrock as BedrockChat
1010
from langchain_core.output_parsers import JsonOutputParser
1111

1212
log = logging.getLogger(__name__)
@@ -34,65 +34,68 @@ class BedrockClient:
3434
def __init__(self):
3535
session = SessionHelper.get_session()
3636
self._client = session.client('bedrock-runtime', region_name=os.getenv('AWS_REGION', 'eu-west-1'))
37-
model_id = 'anthropic.claude-3-5-sonnet-20240620-v1:0'
37+
model_id = 'eu.anthropic.claude-3-5-sonnet-20240620-v1:0'
3838
model_kwargs = {
3939
'max_tokens': 4096,
4040
'temperature': 0.5,
4141
'top_k': 250,
4242
'top_p': 0.5,
4343
'stop_sequences': ['\n\nHuman'],
4444
}
45-
self._model = BedrockLLM(model_id=model_id, client=self._client, model_kwargs=model_kwargs)
45+
self._model = BedrockChat(client=self._client, model_id=model_id, model_kwargs=model_kwargs)
4646

4747
def invoke_model_dataset_metadata(self, metadata_types, dataset, tables, folders):
48-
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_DATASET_TEMPLATE_PATH)
49-
parser = JsonOutputParser(pydantic_object=MetadataOutput)
50-
chain = prompt_template | self._model | parser
51-
context = {
52-
'metadata_types': metadata_types,
53-
'label': dataset.label,
54-
'description': dataset.description,
55-
'tags': dataset.tags,
56-
'table_labels': [t.label for t in tables],
57-
'table_descriptions': [t.description for t in tables],
58-
'folder_labels': [f.label for f in folders],
59-
}
60-
response = chain.invoke(context)
61-
if response.startswith('Error:'):
62-
raise exceptions.ModelGuardrailException(response)
63-
return response
48+
try:
49+
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_DATASET_TEMPLATE_PATH)
50+
parser = JsonOutputParser(pydantic_object=MetadataOutput)
51+
chain = prompt_template | self._model | parser
52+
context = {
53+
'metadata_types': metadata_types,
54+
'dataset_label': dataset.label,
55+
'description': dataset.description,
56+
'tags': dataset.tags,
57+
'topics': dataset.topics,
58+
'table_names': [t.label for t in tables],
59+
'table_descriptions': [t.description for t in tables],
60+
'folder_names': [f.label for f in folders],
61+
}
62+
return chain.invoke(context)
63+
except Exception as e:
64+
raise e
6465

6566
def invoke_model_table_metadata(self, metadata_types, table, columns, sample_data, generate_columns_metadata=False):
66-
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_TABLE_TEMPLATE_PATH)
67-
parser = JsonOutputParser(pydantic_object=MetadataOutput)
68-
chain = prompt_template | self._model | parser
69-
context = {
70-
'metadata_types': metadata_types,
71-
'generate_columns_metadata': generate_columns_metadata,
72-
'label': table.label,
73-
'description': table.description,
74-
'tags': table.tags,
75-
'column_labels': [c.label for c in columns],
76-
'column_descriptions': [c.description for c in columns],
77-
'sample_data': sample_data,
78-
}
79-
response = chain.invoke(context)
80-
if response.startswith('Error:'):
81-
raise exceptions.ModelGuardrailException(response)
82-
return response
67+
try:
68+
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_TABLE_TEMPLATE_PATH)
69+
parser = JsonOutputParser(pydantic_object=MetadataOutput)
70+
chain = prompt_template | self._model | parser
71+
context = {
72+
'metadata_types': metadata_types,
73+
'generate_columns_metadata': generate_columns_metadata,
74+
'label': table.label,
75+
'description': table.description,
76+
'tags': table.tags,
77+
'topics': table.topics,
78+
'column_labels': [c.label for c in columns],
79+
'column_descriptions': [c.description for c in columns],
80+
'sample_data': sample_data,
81+
}
82+
return chain.invoke(context)
83+
except Exception as e:
84+
raise e
8385

8486
def invoke_model_folder_metadata(self, metadata_types, folder, files):
85-
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_FOLDER_TEMPLATE_PATH)
86-
parser = JsonOutputParser(pydantic_object=MetadataOutput)
87-
chain = prompt_template | self._model | parser
88-
context = {
89-
'metadata_types': metadata_types,
90-
'label': folder.label,
91-
'description': folder.description,
92-
'tags': folder.tags,
93-
'file_names': files,
94-
}
95-
response = chain.invoke(context)
96-
if response.startswith('Error:'):
97-
raise exceptions.ModelGuardrailException(response)
98-
return response
87+
try:
88+
prompt_template = PromptTemplate.from_file(METADATA_GENERATION_FOLDER_TEMPLATE_PATH)
89+
parser = JsonOutputParser(pydantic_object=MetadataOutput)
90+
chain = prompt_template | self._model | parser
91+
context = {
92+
'metadata_types': metadata_types,
93+
'label': folder.label,
94+
'description': folder.description,
95+
'tags': folder.tags,
96+
'topics': folder.topics,
97+
'file_names': files,
98+
}
99+
return chain.invoke(context)
100+
except Exception as e:
101+
raise e

backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_dataset_template.txt

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,19 @@
11
Your task is to generate or improve the metadata fields of a Dataset.
22

33
Use the following input parameters:
4-
- Dataset name: {label}
4+
- Dataset name: {dataset_label}
55
- Current dataset description: {description}
66
- Current tags for dataset: {tags}
7+
- Current topics for dataset: {topics}
78
- Table names in the dataset: {table_names}
89
- Folder names in the dataset: {folder_names}
910

1011

1112
There are 4 metadata fields that can be requested to you.
1213
1. label - 1 to 3 words that give a "title" to the Dataset. If provided, you can use the current Dataset name as starting point.
1314
2. description - less than 30 words that summarize the Tables and Folders contained in the Dataset. If provided, use the current description and tags as starting point; but mainly use the Table names and Folder names.
14-
3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Dataset. If there are current tags that represent additional information, add them to the list of tags.
15-
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the Tables and Folders of the dataset.
15+
3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Dataset. If there are current tags that represent additional information, add them to the list of tags. Do not return the label as a tag.
16+
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the Tables and Folders of the dataset. If there are current topics that represent additional information, add them to the list of topics.
1617

1718
There are some rules that you MUST follow:
1819
- If any of the input parameters is equal to "No description provided" or is None or [] do not use that particular input
@@ -24,6 +25,7 @@ contain only the requested metadata fields.
2425
- If the Table names and the Folder names are both none or [], return "Empty Dataset" as the description and "empty" as one of the tags.
2526
- Return the result as a Python dictionary where the keys are the requested metadata fields, all the keys must be
2627
lowercase and the values are the corresponding generated metadata.
28+
- Do not return any explanations, ONLY the Python dictionary.
2729

2830
---------------------------------------
2931
---------------------------------------
@@ -39,12 +41,12 @@ Given the following input parameters:
3941
folder_names: [],
4042
metadata_types: ["label", "description", "tags", "topics"]
4143

42-
response = {
44+
response = {{
4345
"label": "NotEnoughData",
4446
"description": "Empty Dataset",
4547
"topics": "NotEnoughData",
4648
"tags": ["empty"]
47-
}
49+
}}
4850

4951
Example 2.
5052
Given the following input parameters:
@@ -55,10 +57,10 @@ Given the following input parameters:
5557
folder_names: ["orders", "inventory", "sales"],
5658
metadata_types: ["label", "description"]
5759

58-
response = {
60+
response = {{
5961
"label": "Sales and Inventory",
6062
"description": "Dataset containing customer orders, product inventory, and sales transactions information, organized into orders, inventory, and sales folders."
61-
}
63+
}}
6264

6365
Example 3.
6466
Given the following input parameters:
@@ -69,11 +71,11 @@ Given the following input parameters:
6971
folder_names: ["hr_records", "financial", "evaluations"],
7072
metadata_types: ["label", "tags", "topics"]
7173

72-
response = {
74+
response = {{
7375
"label": "HR Management System",
7476
"tags": ["employee", "payroll", "performance"],
7577
"topics": ["HR", "Finance"]
76-
}
78+
}}
7779

7880

7981

backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_folder_template.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@ Use the following input parameters:
44
- Folder name: {label},
55
- Current Folder description: {description}
66
- Current tags for Folder: {tags}
7+
- Current topics for Folder: {topics}
78
- File names (files stored inside the folder): {file_names}
89

910

1011
There are 4 metadata fields that can be requested to you.
1112
1. label - 1 to 3 words that give a "title" to the Folder. If provided, you can use the current Folder name as starting point.
1213
2. description - less than 30 words that summarize the files contained in the Folder. If provided, use the current description and tags as starting point; but mainly use the file names.
1314
3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Folder. If there are current tags that represent additional information, add them to the list of tags.
14-
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the files contained in the Folder.
15+
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the files contained in the Folder. If there are current topics that represent additional information, add them to the list of topics.
1516

1617
There are some rules that you MUST follow:
1718
- If any of the input parameters is equal to "No description provided" or is None or [] do not use that particular input

backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_table_template.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Use the following input parameters:
44
- Table name: {label}
55
- Current table description: {description}
66
- Current tags for table: {tags}
7+
- Current topics for table: {topics}
78
- Column names: {columns}
89
- Column Descriptions: {column_descriptions}
910
- Sample data: {sample_data}
@@ -14,7 +15,7 @@ There are 4 metadata fields that can be requested to you.
1415
1. label - 1 to 3 words that give a "title" to the Table. If provided, you can use the current Table name as starting point.
1516
2. description - less than 30 words that summarize the content of the table. If provided, take the current Table description as starting point but mainly use the sample data (if provided), and then the column names and descriptions to generate the table description.
1617
3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Table. If there are current tags that represent additional information, add them to the list of tags.
17-
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the Table description.
18+
4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the Table description. If there are current topics that represent additional information, add them to the list of topics.
1819

1920

2021
There are some rules that you MUST follow:

backend/dataall/modules/s3_datasets/services/dataset_enums.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class MetadataGenerationTargets(Enum):
1212
class MetadataGenerationTypes(Enum):
1313
"""Describes the s3_datasets metadata generation types"""
1414

15-
Description = 'Description'
16-
Label = 'Label'
17-
Tag = 'Tag'
18-
Topic = 'Topic'
15+
Description = 'description'
16+
Label = 'label'
17+
Tag = 'tags'
18+
Topic = 'topics'

backend/dataall/modules/s3_datasets/services/dataset_location_service.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
from dataall.base.context import get_context
55
from dataall.core.permissions.services.resource_policy_service import ResourcePolicyService
66
from dataall.core.permissions.services.tenant_policy_service import TenantPolicyService
7-
from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
7+
8+
##TODO
9+
##from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
810
from dataall.modules.catalog.db.glossary_repositories import GlossaryRepository
911
from dataall.base.db.exceptions import ResourceShared, ResourceAlreadyExists
1012
from dataall.modules.s3_datasets.services.dataset_service import DatasetService
@@ -145,9 +147,9 @@ def _delete_dataset_folder_read_permission(session, dataset: S3Dataset, location
145147

146148
@staticmethod
147149
@ResourcePolicyService.has_resource_permission(UPDATE_DATASET_FOLDER)
148-
@ResourceThresholdService.check_invocation_count(
149-
'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
150-
)
150+
# @ResourceThresholdService.check_invocation_count(
151+
# 'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
152+
# )
151153
def generate_metadata_for_folder(uri, metadata_types):
152154
context = get_context()
153155
with context.db_engine.scoped_session() as session:

backend/dataall/modules/s3_datasets/services/dataset_service.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
from dataall.core.stacks.db.stack_repositories import StackRepository
2020
from dataall.core.stacks.db.stack_models import Stack
2121
from dataall.core.tasks.db.task_models import Task
22-
from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
22+
23+
##TODO
24+
##from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
2325
from dataall.modules.catalog.db.glossary_repositories import GlossaryRepository
2426
from dataall.modules.s3_datasets.db.dataset_bucket_repositories import DatasetBucketRepository
2527
from dataall.modules.shares_base.db.share_object_repositories import ShareObjectRepository
@@ -574,9 +576,9 @@ def list_dataset_tables_folders(uri, filter):
574576

575577
@staticmethod
576578
@ResourcePolicyService.has_resource_permission(UPDATE_DATASET)
577-
@ResourceThresholdService.check_invocation_count(
578-
'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
579-
)
579+
# @ResourceThresholdService.check_invocation_count(
580+
# 'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
581+
# )
580582
def generate_metadata_for_dataset(uri, metadata_types):
581583
context = get_context()
582584
with context.db_engine.scoped_session() as session:

backend/dataall/modules/s3_datasets/services/dataset_table_service.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
from dataall.base.context import get_context
44
from dataall.core.permissions.services.resource_policy_service import ResourcePolicyService
55
from dataall.core.permissions.services.tenant_policy_service import TenantPolicyService
6-
from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
6+
7+
##TODO
8+
##from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
79
from dataall.modules.catalog.db.glossary_repositories import GlossaryRepository
810
from dataall.core.environment.services.environment_service import EnvironmentService
911
from dataall.modules.s3_datasets.aws.athena_table_client import AthenaTableClient
@@ -190,9 +192,9 @@ def _delete_dataset_table_read_permission(session, table_uri):
190192

191193
@staticmethod
192194
@ResourcePolicyService.has_resource_permission(UPDATE_DATASET_TABLE)
193-
@ResourceThresholdService.check_invocation_count(
194-
'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
195-
)
195+
# @ResourceThresholdService.check_invocation_count(
196+
# 'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
197+
# )
196198
def generate_metadata_for_table(uri, metadata_types, sample_data):
197199
context = get_context()
198200
with context.db_engine.scoped_session() as session:

0 commit comments

Comments (0)