Skip to content

Commit ba78df7

Browse files
committed
GenAI metadata generation - improvements
1 parent cfef943 commit ba78df7

File tree

11 files changed

+617
-23
lines changed

11 files changed

+617
-23
lines changed

backend/dataall/modules/s3_datasets/api/dataset/resolvers.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ def generate_metadata(
175175
tableSampleData: dict = {},
176176
):
177177
RequestValidator.validate_uri(param_name='resourceUri', param_value=resourceUri)
178+
RequestValidator.validate_table_sample_data(tableSampleData)
178179
if any(metadata_type not in [item.value for item in MetadataGenerationTypes] for metadata_type in metadataTypes):
179180
raise InvalidInput(
180181
'metadataType',
@@ -264,3 +265,92 @@ def validate_import_request(data):
264265
RequestValidator.validate_creation_request(data)
265266
if not data.get('bucketName'):
266267
raise RequiredParameter('bucketName')
268+
269+
@staticmethod
270+
def validate_table_sample_data(table_sample_data: dict):
271+
"""
272+
Validates tableSampleData parameter structure to match readTableSampleData API output.
273+
Expected structure:
274+
{
275+
"fields": [JSON string objects with "name" property],
276+
"rows": [JSON array strings matching field count]
277+
}
278+
"""
279+
if not table_sample_data:
280+
return # Empty dict is allowed as default parameter
281+
282+
if not isinstance(table_sample_data, dict):
283+
raise InvalidInput(
284+
param_name='tableSampleData',
285+
param_value=str(table_sample_data),
286+
constraint='must be a dictionary object',
287+
)
288+
289+
# Validate fields array
290+
fields = table_sample_data.get('fields')
291+
if fields is not None:
292+
if not isinstance(fields, list):
293+
raise InvalidInput(
294+
param_name='tableSampleData.fields', param_value=str(fields), constraint='must be an array'
295+
)
296+
297+
for i, field in enumerate(fields):
298+
if not isinstance(field, str):
299+
raise InvalidInput(
300+
param_name=f'tableSampleData.fields[{i}]',
301+
param_value=str(field),
302+
constraint='must be a JSON string',
303+
)
304+
305+
try:
306+
import json
307+
308+
field_obj = json.loads(field)
309+
if not isinstance(field_obj, dict) or 'name' not in field_obj:
310+
raise InvalidInput(
311+
param_name=f'tableSampleData.fields[{i}]',
312+
param_value=field,
313+
constraint='must be a JSON object with "name" property',
314+
)
315+
except json.JSONDecodeError:
316+
raise InvalidInput(
317+
param_name=f'tableSampleData.fields[{i}]', param_value=field, constraint='must be valid JSON'
318+
)
319+
320+
# Validate rows array
321+
rows = table_sample_data.get('rows')
322+
if rows is not None:
323+
if not isinstance(rows, list):
324+
raise InvalidInput(
325+
param_name='tableSampleData.rows', param_value=str(rows), constraint='must be an array'
326+
)
327+
328+
expected_field_count = len(fields) if fields else 0
329+
330+
for i, row in enumerate(rows):
331+
if not isinstance(row, str):
332+
raise InvalidInput(
333+
param_name=f'tableSampleData.rows[{i}]',
334+
param_value=str(row),
335+
constraint='must be a JSON string',
336+
)
337+
338+
try:
339+
import json
340+
341+
row_array = json.loads(row)
342+
if not isinstance(row_array, list):
343+
raise InvalidInput(
344+
param_name=f'tableSampleData.rows[{i}]', param_value=row, constraint='must be a JSON array'
345+
)
346+
347+
if expected_field_count > 0 and len(row_array) != expected_field_count:
348+
raise InvalidInput(
349+
param_name=f'tableSampleData.rows[{i}]',
350+
param_value=row,
351+
constraint=f'must contain {expected_field_count} elements to match fields count',
352+
)
353+
except json.JSONDecodeError:
354+
raise InvalidInput(
355+
param_name=f'tableSampleData.rows[{i}]', param_value=row, constraint='must be valid JSON'
356+
)

backend/dataall/modules/s3_datasets/api/table/queries.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@
4343
type=gql.Ref('QueryPreviewResult'),
4444
)
4545

46-
listTableSampleData = gql.QueryField(
47-
name='listTableSampleData',
46+
readTableSampleData = gql.QueryField(
47+
name='readTableSampleData',
4848
args=[gql.Argument(name='tableUri', type=gql.NonNullableType(gql.String))],
4949
type=gql.Ref('QueryPreviewResult'),
5050
resolver=read_sample_data,

backend/dataall/modules/s3_datasets/services/dataset_service.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,7 @@ def list_dataset_tables_folders(uri, filter):
591591
return DatasetRepository.paginated_dataset_tables_folders(session, uri, filter)
592592

593593
@staticmethod
594+
@TenantPolicyService.has_tenant_permission(MANAGE_DATASETS)
594595
@ResourcePolicyService.has_resource_permission(UPDATE_DATASET)
595596
##TODO Uncomment the following to use the ResourceThresholdService once https://github.com/data-dot-all/dataall/pull/1653 is merged
596597
# @ResourceThresholdService.check_invocation_count(

frontend/src/modules/S3_Datasets/components/ReviewMetadataComponent.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import { useClient } from 'services';
1818
import { updateDatasetTable } from 'modules/Tables/services';
1919
import { updateDatasetStorageLocation } from 'modules/Folders/services';
2020
import {
21-
listTableSampleData,
21+
readTableSampleData,
2222
updateDataset,
2323
generateMetadataBedrock
2424
} from '../services';
@@ -55,12 +55,12 @@ export const ReviewMetadataComponent = (props) => {
5555
try {
5656
setLoadingSampleData(true);
5757
const response = await client.query(
58-
listTableSampleData({
58+
readTableSampleData({
5959
tableUri: table.targetUri
6060
})
6161
);
6262
if (!response.errors) {
63-
openSampleDataPopup(response.data.listTableSampleData);
63+
openSampleDataPopup(response.data.readTableSampleData);
6464
setTargetUri(table.targetUri);
6565
enqueueSnackbar('Successfully read sample data', {
6666
variant: 'success'
@@ -152,7 +152,7 @@ export const ReviewMetadataComponent = (props) => {
152152
metadataTypes: Object.entries(selectedMetadataTypes)
153153
.filter(([key, value]) => value === true)
154154
.map(([key]) => key),
155-
sampleData: sampleDataWithoutTypename
155+
tableSampleData: sampleDataWithoutTypename
156156
})
157157
);
158158

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
export * from './batchUpdateTableColumnDescriptions';
21
export * from './createDataset';
32
export * from './deleteDataset';
43
export * from './generateDatasetAccessToken';
@@ -7,7 +6,7 @@ export * from './getDatasetPresignedUrl';
76
export * from './importDataset';
87
export * from './listDatasetStorageLocations';
98
export * from './listDatasetTablesFolders';
10-
export * from './listTableSampleData';
9+
export * from './readTableSampleData';
1110
export * from './startGlueCrawler';
1211
export * from './syncTables';
1312
export * from './updateDataset';

frontend/src/modules/S3_Datasets/services/listTableSampleData.js

Lines changed: 0 additions & 15 deletions
This file was deleted (renamed; its contents moved to the new file below).
frontend/src/modules/S3_Datasets/services/readTableSampleData.js
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
import { gql } from 'apollo-boost';

/**
 * Builds the Apollo query descriptor for the readTableSampleData API:
 * fetches the sample fields and rows of the table identified by tableUri.
 */
export const readTableSampleData = ({ tableUri }) => {
  const query = gql`
    query readTableSampleData($tableUri: String!) {
      readTableSampleData(tableUri: $tableUri) {
        fields
        rows
      }
    }
  `;
  return {
    variables: { tableUri },
    query
  };
};

0 commit comments

Comments
 (0)