10 | 10 | #![recursion_limit = "256"] |
11 | 11 |
12 | 12 | use std::collections::BTreeMap; |
| 13 | +use std::sync::Arc; |
13 | 14 |
14 | 15 | use insta::assert_debug_snapshot; |
15 | 16 | use itertools::Itertools; |
16 | 17 | use mz_audit_log::{EventDetails, EventType, EventV1, IdNameV1, VersionedEvent}; |
17 | 18 | use mz_catalog::durable::objects::serialization::proto; |
18 | 19 | use mz_catalog::durable::objects::{DurableType, IdAlloc}; |
19 | 20 | use mz_catalog::durable::{ |
20 | | - CatalogError, DurableCatalogError, FenceError, Item, TestCatalogStateBuilder, |
21 | | - USER_ITEM_ALLOC_KEY, test_bootstrap_args, |
| 21 | + CatalogError, Database, DurableCatalogError, FenceError, Item, Metrics, |
| 22 | + TestCatalogStateBuilder, USER_ITEM_ALLOC_KEY, test_bootstrap_args, |
22 | 23 | }; |
23 | 24 | use mz_ore::assert_ok; |
24 | 25 | use mz_ore::collections::HashSet; |
| 26 | +use mz_ore::metrics::MetricsRegistry; |
25 | 27 | use mz_ore::now::SYSTEM_TIME; |
26 | 28 | use mz_persist_client::PersistClient; |
27 | 29 | use mz_proto::RustType; |
@@ -553,3 +555,188 @@ async fn test_persist_ddl_detection_with_batch_allocated_ids() { |
553 | 555 |
554 | 556 | Box::new(state).expire().await; |
555 | 557 | } |
| 558 | + |
| 559 | +/// Regression test for incident-970: quadratic consolidation during catalog sync. |
| 560 | +/// |
| 561 | +/// When a reader syncs through K timestamps, apply_updates() was calling |
| 562 | +/// consolidate() on the entire snapshot for each timestamp, resulting in |
| 563 | +/// O(K * N log N) work instead of O(N log N). This test verifies that syncing |
| 564 | +/// through many timestamps consolidates the snapshot only a small, bounded |
| 565 | +/// number of times, not once per timestamp. |
| 566 | +#[mz_ore::test(tokio::test)] |
| 567 | +#[cfg_attr(miri, ignore)] |
| 568 | +async fn test_persist_sync_consolidation_not_quadratic() { |
| 569 | + let persist_client = PersistClient::new_for_tests().await; |
| 570 | + let metrics = Arc::new(Metrics::new(&MetricsRegistry::new())); |
| 571 | + let state_builder = |
| 572 | + TestCatalogStateBuilder::new(persist_client).with_default_deploy_generation(); |
| 573 | + // Share metrics between writer and reader so we can observe consolidation counts. |
| 574 | + let state_builder = state_builder.with_metrics(Arc::clone(&metrics)); |
| 575 | + |
| 576 | + // Open a writer catalog. |
| 577 | + let mut writer = state_builder |
| 578 | + .clone() |
| 579 | + .unwrap_build() |
| 580 | + .await |
| 581 | + .open(SYSTEM_TIME().into(), &test_bootstrap_args()) |
| 582 | + .await |
| 583 | + .unwrap() |
| 584 | + .0; |
| 585 | + let _ = writer.sync_to_current_updates().await.unwrap(); |
| 586 | + |
| 587 | + // Open a read-only catalog, caught up to the current upper. |
| 588 | + let mut reader = state_builder |
| 589 | + .clone() |
| 590 | + .unwrap_build() |
| 591 | + .await |
| 592 | + .open_read_only(&test_bootstrap_args()) |
| 593 | + .await |
| 594 | + .unwrap(); |
| 595 | + let _ = reader.sync_to_current_updates().await.unwrap(); |
| 596 | + |
| 597 | + // Writer creates many databases, each in its own transaction at a distinct |
| 598 | + // timestamp. This mirrors the incident scenario where DDL happened across |
| 599 | + // many timestamps while a read-only envd was restarting. |
| 600 | + let num_timestamps: u64 = 100; |
| 601 | + for i in 0..num_timestamps { |
| 602 | + let mut txn = writer.transaction().await.unwrap(); |
| 603 | + txn.insert_user_database( |
| 604 | + &format!("db_{i}"), |
| 605 | + RoleId::User(1), |
| 606 | + Vec::new(), |
| 607 | + &HashSet::new(), |
| 608 | + ) |
| 609 | + .unwrap(); |
| 610 | + let _ = txn.get_and_commit_op_updates(); |
| 611 | + let commit_ts = txn.upper(); |
| 612 | + txn.commit(commit_ts).await.unwrap(); |
| 613 | + } |
| 614 | + |
| 615 | + // Record the consolidation counter before the reader syncs. |
| 616 | + let consolidations_before = metrics.snapshot_consolidations.get(); |
| 617 | + |
| 618 | + // Reader syncs through all timestamps. With the quadratic bug, this would |
| 619 | + // call consolidate() once per timestamp (num_timestamps times). With the |
| 620 | + // fix, it should consolidate only a handful of times, not per timestamp. |
| 621 | + let updates = reader.sync_to_current_updates().await.unwrap(); |
| 622 | + let consolidations_after = metrics.snapshot_consolidations.get(); |
| 623 | + let consolidations_during_sync = consolidations_after - consolidations_before; |
| 624 | + |
| 625 | + // Verify correctness: reader received updates and can see all databases. |
| 626 | + assert!( |
| 627 | + !updates.is_empty(), |
| 628 | + "reader should have received updates from writer" |
| 629 | + ); |
| 630 | + let snapshot = reader.snapshot().await.unwrap(); |
| 631 | + for i in 0..num_timestamps { |
| 632 | + let db_name = format!("db_{i}"); |
| 633 | + let found = snapshot.databases.values().any(|db| db.name == db_name); |
| 634 | + assert!(found, "database {db_name} not found in reader snapshot"); |
| 635 | + } |
| 636 | + |
| 637 | + // The key assertion: consolidation should happen O(log N) times during |
| 638 | + // the sync (from the doubling strategy), NOT once per timestamp (which |
| 639 | + // would be num_timestamps = 100). We allow a generous bound here. |
| 640 | + assert!( |
| 641 | + consolidations_during_sync < 10, |
| 642 | + "sync through {num_timestamps} timestamps triggered {consolidations_during_sync} \ |
| 643 | + snapshot consolidations, suggesting quadratic behavior (expected < 10)" |
| 644 | + ); |
| 645 | + |
| 646 | + Box::new(writer).expire().await; |
| 647 | + Box::new(reader).expire().await; |
| 648 | +} |
| 649 | + |
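// Illustrative sketch of the "doubling strategy" that the regression test
// above and the churn test below refer to. This is NOT the actual
// `apply_updates`/`sync_inner` implementation in mz_catalog; the type and
// method names (`ConsolidatingBuffer`, `apply`, `max_len`) are hypothetical.
// The idea: keep appending updates, and consolidate only once the buffer has
// grown to roughly twice the size it had after the previous consolidation.
// Amortized over a sync this keeps total consolidation work near O(N log N)
// rather than O(K * N log N), and the buffer never holds much more than ~2x
// the consolidated size.

/// Hypothetical buffer of `(key, diff)` updates that consolidates lazily.
#[allow(dead_code)]
struct ConsolidatingBuffer<K> {
    /// Pending `(key, diff)` pairs; `consolidate` sums diffs for equal keys
    /// and drops entries whose diffs cancel to zero.
    updates: Vec<(K, i64)>,
    /// Length of `updates` immediately after the last consolidation.
    consolidated_len: usize,
    /// High-water mark of `updates.len()`, analogous in spirit to the
    /// `snapshot_max_entries` metric read by the tests in this file.
    max_len: usize,
}

#[allow(dead_code)]
impl<K: Ord> ConsolidatingBuffer<K> {
    fn new() -> Self {
        Self {
            updates: Vec::new(),
            consolidated_len: 0,
            max_len: 0,
        }
    }

    /// Append a batch of updates, consolidating only when the buffer has at
    /// least doubled since the last consolidation (the "doubling strategy"),
    /// rather than once per batch.
    fn apply(&mut self, batch: impl IntoIterator<Item = (K, i64)>) {
        self.updates.extend(batch);
        self.max_len = self.max_len.max(self.updates.len());
        if self.updates.len() > 2 * self.consolidated_len.max(1) {
            self.consolidate();
        }
    }

    /// Sort, sum diffs for equal keys, and drop entries that cancel out.
    fn consolidate(&mut self) {
        self.updates.sort_by(|(a, _), (b, _)| a.cmp(b));
        let mut consolidated: Vec<(K, i64)> = Vec::with_capacity(self.updates.len());
        for (key, diff) in self.updates.drain(..) {
            match consolidated.last_mut() {
                Some((last_key, last_diff)) if *last_key == key => *last_diff += diff,
                _ => consolidated.push((key, diff)),
            }
        }
        consolidated.retain(|(_, diff)| *diff != 0);
        self.updates = consolidated;
        self.consolidated_len = self.updates.len();
    }
}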
| 650 | +/// Verify that the reader's snapshot stays bounded during sync catch-up, even |
| 651 | +/// when the writer churns the same object many times across timestamps. Without |
| 652 | +/// the doubling consolidation in `sync_inner`, the snapshot would grow with |
| 653 | +/// every retract+insert pair; with it, the snapshot stays within ~2x the live |
| 654 | +/// catalog size. |
| 655 | +#[mz_ore::test(tokio::test)] |
| 656 | +#[cfg_attr(miri, ignore)] |
| 657 | +async fn test_persist_sync_snapshot_stays_bounded_under_churn() { |
| 658 | + let persist_client = PersistClient::new_for_tests().await; |
| 659 | + let metrics = Arc::new(Metrics::new(&MetricsRegistry::new())); |
| 660 | + let state_builder = TestCatalogStateBuilder::new(persist_client) |
| 661 | + .with_default_deploy_generation() |
| 662 | + .with_metrics(Arc::clone(&metrics)); |
| 663 | + |
| 664 | + // Open writer, create one database to churn. |
| 665 | + let mut writer = state_builder |
| 666 | + .clone() |
| 667 | + .unwrap_build() |
| 668 | + .await |
| 669 | + .open(SYSTEM_TIME().into(), &test_bootstrap_args()) |
| 670 | + .await |
| 671 | + .unwrap() |
| 672 | + .0; |
| 673 | + let _ = writer.sync_to_current_updates().await.unwrap(); |
| 674 | + |
| 675 | + let mut txn = writer.transaction().await.unwrap(); |
| 676 | + let (db_id, db_oid) = txn |
| 677 | + .insert_user_database("churn_db", RoleId::User(1), Vec::new(), &HashSet::new()) |
| 678 | + .unwrap(); |
| 679 | + let _ = txn.get_and_commit_op_updates(); |
| 680 | + let commit_ts = txn.upper(); |
| 681 | + txn.commit(commit_ts).await.unwrap(); |
| 682 | + |
| 683 | + // Open reader, sync to current state. |
| 684 | + let mut reader = state_builder |
| 685 | + .unwrap_build() |
| 686 | + .await |
| 687 | + .open_read_only(&test_bootstrap_args()) |
| 688 | + .await |
| 689 | + .unwrap(); |
| 690 | + let _ = reader.sync_to_current_updates().await.unwrap(); |
| 691 | + let peak_before = metrics.snapshot_max_entries.get(); |
| 692 | + |
| 693 | + // Rename the same database 200 times, each in its own transaction. |
| 694 | + let num_renames: u64 = 200; |
| 695 | + let mut db = Database { |
| 696 | + id: db_id, |
| 697 | + oid: db_oid, |
| 698 | + name: "churn_db".to_string(), |
| 699 | + owner_id: RoleId::User(1), |
| 700 | + privileges: Vec::new(), |
| 701 | + }; |
| 702 | + for i in 0..num_renames { |
| 703 | + let mut txn = writer.transaction().await.unwrap(); |
| 704 | + db.name = format!("churn_db_{i}"); |
| 705 | + txn.update_database(db.id, db.clone()).unwrap(); |
| 706 | + let _ = txn.get_and_commit_op_updates(); |
| 707 | + let commit_ts = txn.upper(); |
| 708 | + txn.commit(commit_ts).await.unwrap(); |
| 709 | + } |
| 710 | + |
| 711 | + // Reader syncs through all 200 renames. |
| 712 | + let _ = reader.sync_to_current_updates().await.unwrap(); |
| 713 | + |
| 714 | + // Verify correctness: only one database, with the final name. |
| 715 | + let snapshot = reader.snapshot().await.unwrap(); |
| 716 | + let churn_dbs: Vec<_> = snapshot |
| 717 | + .databases |
| 718 | + .values() |
| 719 | + .filter(|d| d.name.starts_with("churn_db")) |
| 720 | + .collect(); |
| 721 | + assert_eq!(churn_dbs.len(), 1, "{churn_dbs:#?}"); |
| 722 | + assert_eq!(churn_dbs[0].name, format!("churn_db_{}", num_renames - 1)); |
| 723 | + |
| 724 | + // The key assertion: the snapshot high-water mark should stay bounded, |
| 725 | + // not grow proportionally to num_renames. The doubling consolidation |
| 726 | + // keeps it within ~2x the live catalog size. |
| 727 | + let peak_after = metrics.snapshot_max_entries.get(); |
| 728 | + let peak_delta = peak_after - peak_before; |
| 729 | + // Without consolidation the high-water mark would grow by ~387 entries |
| 730 | + // over the 200 renames; with the doubling consolidation, the delta |
| 731 | + // should be much smaller. |
| 732 | + let bounded = peak_before * 2; |
| 733 | + assert!( |
| 734 | + peak_delta < bounded, |
| 735 | + "peak unconsolidated snapshot grew by {peak_delta} over {num_renames} \ |
| 736 | + renames (peak_before={peak_before}, peak_after={peak_after}); \ |
| 737 | + expected < {bounded}" |
| 738 | + ); |
| 739 | + |
| 740 | + Box::new(writer).expire().await; |
| 741 | + Box::new(reader).expire().await; |
| 742 | +} |
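// A small, self-contained illustration of the bound the churn test above
// asserts via `snapshot_max_entries`, using the hypothetical
// `ConsolidatingBuffer` sketch from earlier (not the real catalog code):
// churning the same key many times never lets the unconsolidated buffer grow
// past a small constant, independent of the churn count.
#[allow(dead_code)]
fn churn_demo() {
    let mut buf = ConsolidatingBuffer::new();
    // One "live" entry, analogous to the single database being renamed above.
    buf.apply([("churn_db", 1i64)]);
    for _ in 0..200 {
        // Each rename contributes a retraction of the old row plus an
        // insertion of the new one, i.e. two more pending updates.
        buf.apply([("churn_db", -1), ("churn_db", 1)]);
    }
    // The high-water mark stays tiny and does not grow with the churn count,
    // the same shape of bound the test above places on `snapshot_max_entries`.
    assert!(buf.max_len <= 4, "peak unconsolidated size was {}", buf.max_len);
}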