|
37 | 37 | import org.apache.druid.java.util.common.Pair; |
38 | 38 | import org.apache.druid.java.util.common.StringUtils; |
39 | 39 | import org.apache.druid.java.util.common.concurrent.ScheduledExecutors; |
| 40 | +import org.apache.druid.java.util.common.guava.Sequence; |
40 | 41 | import org.apache.druid.java.util.common.guava.Sequences; |
41 | 42 | import org.apache.druid.java.util.emitter.EmittingLogger; |
42 | 43 | import org.apache.druid.java.util.emitter.service.ServiceEmitter; |
|
47 | 48 | import org.apache.druid.metadata.TestDerbyConnector; |
48 | 49 | import org.apache.druid.query.DruidMetrics; |
49 | 50 | import org.apache.druid.query.QueryContexts; |
| 51 | +import org.apache.druid.query.QueryInterruptedException; |
| 52 | +import org.apache.druid.query.QueryTimeoutException; |
50 | 53 | import org.apache.druid.query.TableDataSource; |
51 | 54 | import org.apache.druid.query.aggregation.CountAggregatorFactory; |
52 | 55 | import org.apache.druid.query.aggregation.DoubleSumAggregatorFactory; |
|
112 | 115 | import java.util.Set; |
113 | 116 | import java.util.concurrent.CountDownLatch; |
114 | 117 | import java.util.concurrent.TimeUnit; |
| 118 | +import java.util.concurrent.atomic.AtomicReference; |
115 | 119 | import java.util.stream.Collectors; |
116 | 120 |
|
117 | 121 | public class CoordinatorSegmentMetadataCacheTest extends CoordinatorSegmentMetadataCacheTestBase |
@@ -2455,4 +2459,121 @@ private void verifyFoo2DSSchema(CoordinatorSegmentMetadataCache schema) |
2455 | 2459 | Assert.assertEquals("m1", columnNames.get(2)); |
2456 | 2460 | Assert.assertEquals(ColumnType.LONG, fooRowSignature.getColumnType(columnNames.get(2)).get()); |
2457 | 2461 | } |
| 2462 | + |
| 2463 | + /** |
| 2464 | + * A failure refreshing one dataSource (e.g. a SegmentMetadataQuery timeout) must not abort the whole |
| 2465 | + * refresh cycle: other dataSources are still refreshed, and a {@code segment/schemaCache/refresh/failed} |
| 2466 | + * metric is emitted for the failing dataSource. |
| 2467 | + */ |
| 2468 | + @Test |
| 2469 | + public void testRefreshFailureForOneDatasourceIsIsolated() throws InterruptedException, IOException |
| 2470 | + { |
| 2471 | + final StubServiceEmitter emitter = new StubServiceEmitter("test", "test"); |
| 2472 | + final CoordinatorSegmentMetadataCache schema = new CoordinatorSegmentMetadataCache( |
| 2473 | + getQueryLifecycleFactory(walker), |
| 2474 | + serverView, |
| 2475 | + SEGMENT_CACHE_CONFIG_DEFAULT, |
| 2476 | + new NoopEscalator(), |
| 2477 | + new InternalQueryConfig(), |
| 2478 | + emitter, |
| 2479 | + segmentSchemaCache, |
| 2480 | + backFillQueue, |
| 2481 | + segmentsMetadataManager, |
| 2482 | + segmentsMetadataManagerConfigSupplier |
| 2483 | + ) |
| 2484 | + { |
| 2485 | + @Override |
| 2486 | + public Sequence<SegmentAnalysis> runSegmentMetadataQuery(Iterable<SegmentId> segments) |
| 2487 | + { |
| 2488 | + // Simulate a metadata query that times out for DATASOURCE1 but succeeds for everything else. |
| 2489 | + final SegmentId first = segments.iterator().next(); |
| 2490 | + if (DATASOURCE1.equals(first.getDataSource())) { |
| 2491 | + throw new QueryTimeoutException("test-induced timeout for " + DATASOURCE1); |
| 2492 | + } |
| 2493 | + return super.runSegmentMetadataQuery(segments); |
| 2494 | + } |
| 2495 | + }; |
| 2496 | + |
| 2497 | + schema.onLeaderStart(); |
| 2498 | + schema.awaitInitialization(); |
| 2499 | + |
| 2500 | + final Set<SegmentId> allSegmentIds = schema.getSegmentMetadataSnapshot().keySet(); |
| 2501 | + emitter.flush(); |
| 2502 | + |
| 2503 | + // Must not propagate the DATASOURCE1 failure. |
| 2504 | + final Set<SegmentId> refreshed = schema.refreshSegments(allSegmentIds); |
| 2505 | + |
| 2506 | + // The healthy dataSources were still refreshed despite DATASOURCE1 failing. |
| 2507 | + Assert.assertTrue( |
| 2508 | + "expected a refreshed segment from " + DATASOURCE2, |
| 2509 | + refreshed.stream().anyMatch(id -> DATASOURCE2.equals(id.getDataSource())) |
| 2510 | + ); |
| 2511 | + Assert.assertTrue( |
| 2512 | + "expected a refreshed segment from " + SOME_DATASOURCE, |
| 2513 | + refreshed.stream().anyMatch(id -> SOME_DATASOURCE.equals(id.getDataSource())) |
| 2514 | + ); |
| 2515 | + // The failing dataSource produced no refreshed segments. |
| 2516 | + Assert.assertTrue( |
| 2517 | + "expected no refreshed segments from the failing " + DATASOURCE1, |
| 2518 | + refreshed.stream().noneMatch(id -> DATASOURCE1.equals(id.getDataSource())) |
| 2519 | + ); |
| 2520 | + |
| 2521 | + // A failure metric was emitted, dimensioned by the failing dataSource. |
| 2522 | + final List<Number> failures = emitter.getMetricValues( |
| 2523 | + Metric.REFRESH_FAILED, |
| 2524 | + ImmutableMap.of(DruidMetrics.DATASOURCE, DATASOURCE1) |
| 2525 | + ); |
| 2526 | + Assert.assertFalse("expected a refresh/failed metric for " + DATASOURCE1, failures.isEmpty()); |
| 2527 | + Assert.assertEquals(1, failures.get(0).intValue()); |
| 2528 | + } |
| 2529 | + |
| 2530 | + @Test |
| 2531 | + public void testLocalInterruptionPropagatesButWrappedQueryFailureIsIsolated() throws IOException |
| 2532 | + { |
| 2533 | + final StubServiceEmitter emitter = new StubServiceEmitter("test", "test"); |
| 2534 | + final AtomicReference<Throwable> causeRef = new AtomicReference<>(); |
| 2535 | + final CoordinatorSegmentMetadataCache schema = new CoordinatorSegmentMetadataCache( |
| 2536 | + getQueryLifecycleFactory(walker), |
| 2537 | + serverView, |
| 2538 | + SEGMENT_CACHE_CONFIG_DEFAULT, |
| 2539 | + new NoopEscalator(), |
| 2540 | + new InternalQueryConfig(), |
| 2541 | + emitter, |
| 2542 | + segmentSchemaCache, |
| 2543 | + backFillQueue, |
| 2544 | + segmentsMetadataManager, |
| 2545 | + segmentsMetadataManagerConfigSupplier |
| 2546 | + ) |
| 2547 | + { |
| 2548 | + @Override |
| 2549 | + public Sequence<SegmentAnalysis> runSegmentMetadataQuery(Iterable<SegmentId> segments) |
| 2550 | + { |
| 2551 | + throw new QueryInterruptedException(causeRef.get()); |
| 2552 | + } |
| 2553 | + }; |
| 2554 | + |
| 2555 | + final Set<SegmentId> segments = ImmutableSet.of( |
| 2556 | + SegmentId.of(DATASOURCE1, Intervals.of("2000/2001"), "v1", 0) |
| 2557 | + ); |
| 2558 | + |
| 2559 | + // Genuine local interruption: propagate, restore the interrupt flag, record no failure. |
| 2560 | + causeRef.set(new InterruptedException("test interrupt")); |
| 2561 | + Assert.assertThrows(QueryInterruptedException.class, () -> schema.refreshSegments(segments)); |
| 2562 | + Assert.assertTrue("interrupt flag should be restored", Thread.interrupted()); // also clears it for the next case |
| 2563 | + Assert.assertTrue( |
| 2564 | + "local interruption must not emit a refresh/failed metric", |
| 2565 | + emitter.getMetricEvents(Metric.REFRESH_FAILED).isEmpty() |
| 2566 | + ); |
| 2567 | + |
| 2568 | + // Wrapped ordinary failure (no InterruptedException cause): isolate it - no propagation, failure recorded. |
| 2569 | + causeRef.set(new RuntimeException("wrapped query failure")); |
| 2570 | + schema.refreshSegments(segments); // must not throw |
| 2571 | + Assert.assertFalse("interrupt flag must not be set for a wrapped failure", Thread.currentThread().isInterrupted()); |
| 2572 | + final List<Number> failures = emitter.getMetricValues( |
| 2573 | + Metric.REFRESH_FAILED, |
| 2574 | + ImmutableMap.of(DruidMetrics.DATASOURCE, DATASOURCE1) |
| 2575 | + ); |
| 2576 | + Assert.assertFalse("a wrapped query failure should emit refresh/failed", failures.isEmpty()); |
| 2577 | + Assert.assertEquals(1, failures.get(0).intValue()); |
| 2578 | + } |
2458 | 2579 | } |
0 commit comments