Skip to content

Commit 5c1e40c

Browse files
feat(data-quality): Add impact ranking for data quality checks
Implement Data Quality Checks Impact feature for issue #26658 - Add scoring model that calculates impact based on: - Downstream usage (40% weight) - Consumer count (30% weight) - Recent incidents (30% weight) - Add REST API endpoint for impact ranking - Add frontend components to display impact scores - Add schema for dataQualityCheckImpact This helps prioritize which data quality checks are most critical to fix by ranking them based on their business impact.
1 parent 5926abd commit 5c1e40c

8 files changed

Lines changed: 514 additions & 4 deletions

File tree

openmetadata-service/src/main/java/org/openmetadata/service/jdbi3/TestSuiteRepository.java

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,7 @@ public DataQualityReport getDataQualityReport(
359359
return searchRepository.genericAggregation(q, index, searchAggregation, subjectContext);
360360
}
361361

362-
public DataQualityReport getDataQualityReport(
362+
public DataQualityReport getDataQualityReport(
363363
String q, String aggQuery, String index, String domain, SubjectContext subjectContext)
364364
throws IOException {
365365
String queryWithDomain = addDomainFilter(q, domain, index);
@@ -368,6 +368,94 @@ public DataQualityReport getDataQualityReport(
368368
queryWithDomain, index, searchAggregation, subjectContext);
369369
}
370370

371+
public List<Map<String, Object>> getDataQualityCheckImpact(
372+
int limit, String testCaseStatus, SubjectContext subjectContext) throws IOException {
373+
374+
long thirtyDaysAgo = System.currentTimeMillis() - (30L * 24 * 60 * 60 * 1000);
375+
376+
String query = buildTestCaseImpactQuery(testCaseStatus, thirtyDaysAgo);
377+
378+
List<Map<String, Object>> testCases =
379+
searchRepository.searchTestCasesForImpact(query, 0, limit * 2, subjectContext);
380+
381+
List<Map<String, Object>> rankedResults = new ArrayList<>();
382+
Map<String, Integer> maxValues = calculateMaxValues(testCases);
383+
384+
int downstreamMax = maxValues.getOrDefault("downstreamUsage", 100);
385+
int consumerMax = maxValues.getOrDefault("consumerCount", 50);
386+
int incidentMax = maxValues.getOrDefault("recentIncidents", 10);
387+
388+
for (Map<String, Object> testCase : testCases) {
389+
int downstreamUsage =
390+
((Number) testCase.getOrDefault("downstreamUsage", 0)).intValue();
391+
int consumerCount = ((Number) testCase.getOrDefault("consumerCount", 0)).intValue();
392+
int recentIncidents =
393+
((Number) testCase.getOrDefault("recentIncidents", 0)).intValue();
394+
395+
double normalizedDownstream =
396+
downstreamMax > 0 ? Math.min((double) downstreamUsage / downstreamMax, 1.0) : 0;
397+
double normalizedConsumer =
398+
consumerMax > 0 ? Math.min((double) consumerCount / consumerMax, 1.0) : 0;
399+
double incidentFactor =
400+
incidentMax > 0 ? Math.min((double) recentIncidents / incidentMax, 1.0) : 0;
401+
402+
double impactScore =
403+
(0.4 * normalizedDownstream) + (0.3 * normalizedConsumer) + (0.3 * incidentFactor);
404+
impactScore = Math.round(impactScore * 100.0) / 100.0;
405+
406+
Map<String, Object> result = new HashMap<>(testCase);
407+
result.put("impactScore", impactScore * 100);
408+
rankedResults.add(result);
409+
}
410+
411+
rankedResults.sort(
412+
(a, b) -> {
413+
double scoreA = ((Number) a.getOrDefault("impactScore", 0.0)).doubleValue();
414+
double scoreB = ((Number) b.getOrDefault("impactScore", 0.0)).doubleValue();
415+
return Double.compare(scoreB, scoreA);
416+
});
417+
418+
return rankedResults.stream().limit(limit).collect(Collectors.toList());
419+
}
420+
421+
private String buildTestCaseImpactQuery(String testCaseStatus, long thirtyDaysAgo) {
422+
StringBuilder query = new StringBuilder();
423+
query.append("{\"query\": {\"bool\": {\"must\": [");
424+
425+
if (testCaseStatus != null && !testCaseStatus.isEmpty()) {
426+
query.append(String.format("{\"term\": {\"testCaseStatus\": \"%s\"}},", testCaseStatus.toLowerCase()));
427+
}
428+
429+
query.append(
430+
String.format(
431+
"{\"range\": {\"timestamp\": {\"gte\": %d}}}}}", thirtyDaysAgo / 1000));
432+
return query.toString();
433+
}
434+
435+
private Map<String, Integer> calculateMaxValues(List<Map<String, Object>> testCases) {
436+
Map<String, Integer> maxValues = new HashMap<>();
437+
int maxDownstream = 0;
438+
int maxConsumer = 0;
439+
int maxIncident = 0;
440+
441+
for (Map<String, Object> tc : testCases) {
442+
int downstream =
443+
((Number) tc.getOrDefault("downstreamUsage", 0)).intValue();
444+
int consumer = ((Number) tc.getOrDefault("consumerCount", 0)).intValue();
445+
int incident = ((Number) tc.getOrDefault("recentIncidents", 0)).intValue();
446+
447+
if (downstream > maxDownstream) maxDownstream = downstream;
448+
if (consumer > maxConsumer) maxConsumer = consumer;
449+
if (incident > maxIncident) maxIncident = incident;
450+
}
451+
452+
maxValues.put("downstreamUsage", Math.max(maxDownstream, 1));
453+
maxValues.put("consumerCount", Math.max(maxConsumer, 1));
454+
maxValues.put("recentIncidents", Math.max(maxIncident, 1));
455+
456+
return maxValues;
457+
}
458+
371459
private String addDomainFilter(String query, String domain, String index) {
372460
if (nullOrEmpty(domain)) {
373461
return query;

openmetadata-service/src/main/java/org/openmetadata/service/resources/dqtests/TestSuiteResource.java

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,10 +542,53 @@ public DataQualityReport getDataQualityReport(
542542
if (nullOrEmpty(aggregationQuery) || nullOrEmpty(index)) {
543543
throw new IllegalArgumentException("aggregationQuery and index are required parameters");
544544
}
545-
SubjectContext subjectContext = getSubjectContext(securityContext);
545+
SubjectContext subjectContext = getSubjectContext(securityContext);
546546
return repository.getDataQualityReport(query, aggregationQuery, index, domain, subjectContext);
547547
}
548548

549+
@GET
550+
@Path("/dataQualityCheckImpact")
551+
@Operation(
552+
operationId = "getDataQualityCheckImpact",
553+
summary = "Get Data Quality Check Impact Ranking",
554+
description =
555+
"""
556+
Get data quality checks ranked by impact score. The impact score is calculated based on:
557+
- Downstream usage (number of downstream entities using the data)
558+
- Consumer count (number of direct consumers)
559+
- Recent incidents (failed test results in the last 30 days)
560+
This helps prioritize which data quality checks are most critical to fix.
561+
""",
562+
responses = {
563+
@ApiResponse(
564+
responseCode = "200",
565+
description = "List of data quality checks ranked by impact",
566+
content =
567+
@Content(
568+
mediaType = "application/json",
569+
schema = @Schema(implementation = List.class)))
570+
})
571+
public List<?> getDataQualityCheckImpact(
572+
@Context UriInfo uriInfo,
573+
@Context SecurityContext securityContext,
574+
@Parameter(
575+
description = "Number of results to return",
576+
schema = @Schema(type = "integer", defaultValue = "10"))
577+
@QueryParam("limit")
578+
@DefaultValue("10")
579+
int limit,
580+
@Parameter(
581+
description = "Filter by test case status (e.g., Failed, Success)",
582+
schema = @Schema(type = "string"))
583+
@QueryParam("testCaseStatus")
584+
String testCaseStatus)
585+
throws IOException {
586+
List<AuthRequest> authRequests = getAuthRequestsForListOps();
587+
authorizer.authorizeRequests(securityContext, authRequests, AuthorizationLogic.ANY);
588+
SubjectContext subjectContext = getSubjectContext(securityContext);
589+
return repository.getDataQualityCheckImpact(limit, testCaseStatus, subjectContext);
590+
}
591+
549592
@POST
550593
@Operation(
551594
operationId = "createLogicalTestSuite",

openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2784,11 +2784,63 @@ public Response searchEntityRelationship(
27842784
fqn, upstreamDepth, downstreamDepth, queryFilter, deleted);
27852785
}
27862786

2787-
public Response searchDataQualityLineage(
2787+
public Response searchDataQualityLineage(
27882788
String fqn, int upstreamDepth, String queryFilter, boolean deleted) throws IOException {
27892789
return searchClient.searchDataQualityLineage(fqn, upstreamDepth, queryFilter, deleted);
27902790
}
27912791

2792+
@SuppressWarnings("unchecked")
2793+
public List<Map<String, Object>> searchTestCasesForImpact(
2794+
String query, int from, int size, SubjectContext subjectContext) throws IOException {
2795+
SearchRequest searchRequest =
2796+
new SearchRequest()
2797+
.withIndex(Entity.TEST_CASE)
2798+
.withQuery(query)
2799+
.withFrom(from)
2800+
.withSize(size)
2801+
.withSortFieldParam("timestamp")
2802+
.withDeleted(false)
2803+
.withSortOrder("desc");
2804+
2805+
Response response = search(searchRequest, subjectContext);
2806+
2807+
if (response.getStatus() != 200) {
2808+
return new ArrayList<>();
2809+
}
2810+
2811+
String json = (String) response.getEntity();
2812+
List<Map<String, Object>> results = new ArrayList<>();
2813+
2814+
try {
2815+
JsonNode hitsNode = JsonUtils.extractValue(json, HITS, HITS);
2816+
if (hitsNode == null || !hitsNode.isArray()) {
2817+
return results;
2818+
}
2819+
2820+
for (Iterator<JsonNode> it = hitsNode.elements(); it.hasNext(); ) {
2821+
JsonNode jsonNode = it.next();
2822+
JsonNode sourceNode = JsonUtils.extractValue(jsonNode.toString(), SEARCH_SOURCE);
2823+
if (sourceNode != null) {
2824+
Map<String, Object> doc = new HashMap<>();
2825+
doc.put("testCaseId", JsonUtils.extractValue(sourceNode.toString(), ID));
2826+
doc.put(
2827+
"testCaseFullyQualifiedName",
2828+
JsonUtils.extractValue(sourceNode.toString(), FULLY_QUALIFIED_NAME));
2829+
doc.put("entityFQN", JsonUtils.extractValue(sourceNode.toString(), "entityFQN"));
2830+
doc.put("testCaseStatus", JsonUtils.extractValue(sourceNode.toString(), "testCaseStatus"));
2831+
doc.put("timestamp", JsonUtils.extractValue(sourceNode.toString(), "timestamp"));
2832+
doc.put("downstreamUsage", 0);
2833+
doc.put("consumerCount", 0);
2834+
results.add(doc);
2835+
}
2836+
}
2837+
} catch (Exception e) {
2838+
LOG.error("Error parsing search test cases for impact", e);
2839+
}
2840+
2841+
return results;
2842+
}
2843+
27922844
public Response searchSchemaEntityRelationship(
27932845
String fqn, int upstreamDepth, int downstreamDepth, String queryFilter, boolean deleted)
27942846
throws IOException {
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
{
2+
"$id": "https://open-metadata.org/schema/tests/dataQualityCheckImpact.json",
3+
"$schema": "http://json-schema.org/draft-07/schema#",
4+
"title": "DataQualityCheckImpact",
5+
"description": "Data Quality Check Impact model for ranking checks by business criticality.",
6+
"type": "object",
7+
"javaType": "org.openmetadata.schema.tests.DataQualityCheckImpact",
8+
"properties": {
9+
"testCaseId": {
10+
"description": "Unique identifier of the test case.",
11+
"$ref": "../type/basic.json#/definitions/uuid"
12+
},
13+
"testCaseFullyQualifiedName": {
14+
"description": "Fully qualified name of the test case.",
15+
"$ref": "../type/basic.json#/definitions/fullyQualifiedEntityName"
16+
},
17+
"testSuiteId": {
18+
"description": "Unique identifier of the test suite.",
19+
"$ref": "../type/basic.json#/definitions/uuid"
20+
},
21+
"testSuiteFullyQualifiedName": {
22+
"description": "Fully qualified name of the test suite.",
23+
"$ref": "../type/basic.json#/definitions/fullyQualifiedEntityName"
24+
},
25+
"entityFullyQualifiedName": {
26+
"description": "The data entity this test case is testing.",
27+
"$ref": "../type/basic.json#/definitions/fullyQualifiedEntityName"
28+
},
29+
"entityType": {
30+
"description": "The type of entity being tested.",
31+
"type": "string"
32+
},
33+
"impactScore": {
34+
"description": "Calculated impact score (0-100) based on downstream usage, consumers, and incidents.",
35+
"type": "number",
36+
"minimum": 0,
37+
"maximum": 100
38+
},
39+
"downstreamUsage": {
40+
"description": "Number of downstream entities using this data.",
41+
"type": "integer",
42+
"minimum": 0
43+
},
44+
"consumerCount": {
45+
"description": "Number of direct consumers of this data.",
46+
"type": "integer",
47+
"minimum": 0
48+
},
49+
"recentIncidents": {
50+
"description": "Number of failed test results in the last 30 days.",
51+
"type": "integer",
52+
"minimum": 0
53+
},
54+
"lastFailedAt": {
55+
"description": "Timestamp of the most recent test failure.",
56+
"$ref": "../type/basic.json#/definitions/timestamp"
57+
},
58+
"testCaseStatus": {
59+
"description": "Current status of the test case.",
60+
"$ref": "./basic.json#/definitions/testCaseStatus"
61+
},
62+
"dataQualityDimension": {
63+
"description": "Data quality dimension category.",
64+
"type": "string"
65+
}
66+
},
67+
"required": [
68+
"testCaseId",
69+
"testCaseFullyQualifiedName",
70+
"impactScore",
71+
"testCaseStatus"
72+
],
73+
"additionalProperties": false
74+
}

0 commit comments

Comments
 (0)