diff --git a/.github/workflows/docker-build-test.yaml b/.github/workflows/docker-build-test.yaml index fe5454e26fd..128529f383d 100644 --- a/.github/workflows/docker-build-test.yaml +++ b/.github/workflows/docker-build-test.yaml @@ -268,9 +268,9 @@ jobs: secrets: inherit with: GIT_SHA: ${{ needs.determine-docker-build-metadata.outputs.gitSha }} - FORGE_TEST_SUITE: realistic_env_max_load + FORGE_TEST_SUITE: land_blocking IMAGE_TAG: ${{ needs.determine-docker-build-metadata.outputs.gitSha }} - FORGE_RUNNER_DURATION_SECS: 480 + FORGE_RUNNER_DURATION_SECS: 600 COMMENT_HEADER: forge-e2e # Use the cache ID as the Forge namespace so we can limit Forge test concurrency on k8s, since Forge # test lifecycle is separate from that of GHA. This protects us from the case where many Forge tests are triggered diff --git a/.github/workflows/forge-continuous-land-blocking-test.yaml b/.github/workflows/forge-continuous-land-blocking-test.yaml index e91057e0dd0..b5414a4daba 100644 --- a/.github/workflows/forge-continuous-land-blocking-test.yaml +++ b/.github/workflows/forge-continuous-land-blocking-test.yaml @@ -86,7 +86,7 @@ jobs: secrets: inherit with: GIT_SHA: ${{ needs.determine-docker-build-metadata.outputs.gitSha }} - FORGE_TEST_SUITE: realistic_env_max_load + FORGE_TEST_SUITE: land_blocking IMAGE_TAG: ${{ needs.determine-docker-build-metadata.outputs.gitSha }} FORGE_RUNNER_DURATION_SECS: 480 FORGE_CLUSTER_NAME: ${{ inputs.FORGE_CLUSTER_NAME }} diff --git a/testsuite/forge-cli/src/suites/land_blocking.rs b/testsuite/forge-cli/src/suites/land_blocking.rs index 0f9ac0fac1b..b005367bda3 100644 --- a/testsuite/forge-cli/src/suites/land_blocking.rs +++ b/testsuite/forge-cli/src/suites/land_blocking.rs @@ -2,7 +2,10 @@ // Licensed pursuant to the Innovation-Enabling Source Code License, available at https://github.com/aptos-labs/aptos-core/blob/main/LICENSE use super::ungrouped::mixed_compatible_emit_job; -use crate::{suites::realistic_environment::realistic_env_max_load_test, 
TestCommand}; +use crate::{ + suites::realistic_environment::realistic_env_p90_latency_test, + TestCommand, +}; use aptos_forge::{success_criteria::SuccessCriteria, ForgeConfig}; use aptos_testcases::{ compatibility_test::SimpleValidatorUpgrade, framework_upgrade::FrameworkUpgrade, @@ -16,8 +19,8 @@ pub(crate) fn get_land_blocking_test( test_cmd: &TestCommand, ) -> Option { let test = match test_name { - "land_blocking" | "realistic_env_max_load" => { - realistic_env_max_load_test(duration, test_cmd, 7, 0, 3) + "land_blocking" | "realistic_env_max_load" | "realistic_env_p90_latency" => { + realistic_env_p90_latency_test() }, "compat" => compat(), "framework_upgrade" => framework_upgrade(), diff --git a/testsuite/forge-cli/src/suites/realistic_environment.rs b/testsuite/forge-cli/src/suites/realistic_environment.rs index beab2f2073d..e25e6d92857 100644 --- a/testsuite/forge-cli/src/suites/realistic_environment.rs +++ b/testsuite/forge-cli/src/suites/realistic_environment.rs @@ -24,7 +24,7 @@ use aptos_sdk::types::on_chain_config::{ }; use aptos_testcases::{ load_vs_perf_benchmark::{LoadVsPerfBenchmark, TransactionWorkload, Workloads}, - multi_region_network_test::MultiRegionNetworkEmulationTest, + multi_region_network_test::{MultiRegionNetworkEmulationConfig, MultiRegionNetworkEmulationTest}, performance_test::PerformanceBenchmark, two_traffics_test::TwoTrafficsTest, CompositeNetworkTest, @@ -53,6 +53,7 @@ pub(crate) fn get_realistic_env_test( "realistic_env_graceful_overload" => realistic_env_graceful_overload(duration), "realistic_network_tuned_for_throughput" => realistic_network_tuned_for_throughput_test(), "realistic_env_max_load_encrypted" => realistic_env_max_load_encrypted_test(duration), + "realistic_env_p90_latency" => realistic_env_p90_latency_test(), _ => return None, // The test name does not match a realistic-env test }; Some(test) @@ -714,6 +715,47 @@ pub(crate) fn realistic_network_tuned_for_throughput_test() -> ForgeConfig { forge_config } +/// A 
latency-focused test that runs at a moderate TPS with a mainnet-like validator distribution: +/// ~70% EU (split across two EU regions), ~20% US East, and ~10% Asia. The geographic bias +/// matches real mainnet topology so that the P90 latency thresholds are meaningful; with an even +/// four-region split the test would under-weight EU and over-weight Asia relative to mainnet. +pub(crate) fn realistic_env_p90_latency_test() -> ForgeConfig { + let num_validators = 20; + + ForgeConfig::default() + .with_initial_validator_count(NonZeroUsize::new(num_validators).unwrap()) + .add_network_test(CompositeNetworkTest::new( + MultiRegionNetworkEmulationTest::new_with_config( + MultiRegionNetworkEmulationConfig::four_regions_mainnet_like(num_validators), + ), + PerformanceBenchmark, + )) + .with_emit_job( + EmitJobRequest::default() + .mode(EmitJobMode::ConstTps { tps: 3500 }) + .latency_polling_interval(Duration::from_millis(100)), + ) + .with_genesis_helm_config_fn(Arc::new(|helm_values| { + // No epoch change so latency measurements are stable. 
+ helm_values["chain"]["epoch_duration_secs"] = (24 * 3600).into(); + helm_values["chain"]["on_chain_consensus_config"] = + serde_yaml::to_value(OnChainConsensusConfig::default_for_genesis()) + .expect("must serialize"); + helm_values["chain"]["on_chain_execution_config"] = + serde_yaml::to_value(OnChainExecutionConfig::default_for_genesis()) + .expect("must serialize"); + })) + .with_success_criteria( + SuccessCriteria::new(3000) + .add_no_restarts() + .add_wait_for_catchup_s(60) + .add_latency_threshold(1.5, LatencyType::P50) + .add_latency_threshold(2.5, LatencyType::P90) + .add_latency_threshold(4.0, LatencyType::P99) + .add_chain_progress(RELIABLE_REAL_ENV_PROGRESS_THRESHOLD.clone()), + ) +} + pub fn wrap_with_realistic_env( num_validators: usize, test: T, diff --git a/testsuite/testcases/src/multi_region_network_test.rs b/testsuite/testcases/src/multi_region_network_test.rs index 10b87a9b6c9..ca745f89fcc 100644 --- a/testsuite/testcases/src/multi_region_network_test.rs +++ b/testsuite/testcases/src/multi_region_network_test.rs @@ -63,12 +63,31 @@ pub(crate) fn chunk_peers(mut peers: Vec>, num_chunks: usize) -> Vec chunks } +/// Splits peers into consecutive chunks sized by `counts`, clamping each count to the peers still +/// available. The last chunk ignores its count and absorbs every remaining peer. +fn chunk_peers_with_counts(mut peers: Vec>, counts: &[usize]) -> Vec> { + let mut chunks = vec![]; + for (i, &count) in counts.iter().enumerate() { + let take = if i == counts.len() - 1 { + peers.len() + } else { + count.min(peers.len()) + }; + let remaining = peers.split_off(take); + chunks.push(peers.iter().flatten().cloned().collect()); + peers = remaining; + } + chunks +} + /// Creates a table of peers grouped by region. The peers are divided into N groups, where N is the -/// number of regions provided in the link stats table. Any remaining peers are added to the first -/// group. +/// number of regions provided in the link stats table. 
+/// If `region_counts` is provided, each group gets exactly that many peers (last group absorbs any +/// remainder). Otherwise peers are distributed evenly across regions. fn create_link_stats_table_with_peer_groups( peers: Vec>, link_stats_table: &LinkStatsTable, + region_counts: Option<&[usize]>, ) -> LinkStatsTableWithPeerGroups { // Verify that we have enough grouped peers to simulate the link stats table assert!(peers.len() >= link_stats_table.len()); @@ -85,7 +104,17 @@ fn create_link_stats_table_with_peer_groups( ); // Create the link stats table with peer groups - let peer_chunks = chunk_peers(peers, number_of_regions); + let peer_chunks = match region_counts { + Some(counts) => { + assert_eq!( + counts.len(), + number_of_regions, + "region_counts length must match number of regions" + ); + chunk_peers_with_counts(peers, counts) + }, + None => chunk_peers(peers, number_of_regions), + }; peer_chunks .into_iter() .zip(link_stats_table.iter()) @@ -217,6 +246,8 @@ pub struct MultiRegionNetworkEmulationConfig { pub link_stats_table: LinkStatsTable, pub inter_region_config: InterRegionNetEmConfig, pub intra_region_config: Option, + /// Optional per-region peer counts. If None, peers are distributed evenly across regions. + pub region_counts: Option>, } impl Default for MultiRegionNetworkEmulationConfig { @@ -225,6 +256,7 @@ impl Default for MultiRegionNetworkEmulationConfig { link_stats_table: get_link_stats_table(FOUR_REGION_LINK_STATS), inter_region_config: InterRegionNetEmConfig::default(), intra_region_config: Some(IntraRegionNetEmConfig::default()), + region_counts: None, } } } @@ -250,6 +282,35 @@ impl MultiRegionNetworkEmulationConfig { ..Default::default() } } + + /// A four-region config approximating the mainnet validator distribution with weights of + /// 70% EU, 20% North America, and 10% Asia. 
The regions in the CSV are sorted + /// lexicographically, so the weights correspond to: + /// "1-gcp--eu-west2" (Netherlands / Ireland / UK) — 30% + /// "2-gcp--eu-west6" (Germany / France / CH) — 40% + /// "3-gcp--us-east4" (US East / Canada) — 20% + /// "4-gcp--as-southeast1" (Tokyo / Singapore) — 10% + /// Asia is intentionally over-represented relative to mainnet (2%) so that + /// inter-continental tail latency is exercised even with a small validator set. + pub fn four_regions_mainnet_like(num_validators: usize) -> Self { + // Weights in the same lexicographic order as the BTreeMap keys in the CSV. + let weights = [30usize, 40, 20, 10]; + let total_weight: usize = weights.iter().sum(); + let mut counts: Vec = weights + .iter() + .map(|&w| num_validators * w / total_weight) + .collect(); + // Distribute any integer-division remainder to front regions. + let allocated: usize = counts.iter().sum(); + for i in 0..(num_validators - allocated) { + counts[i % weights.len()] += 1; + } + Self { + link_stats_table: get_link_stats_table(FOUR_REGION_LINK_STATS), + region_counts: Some(counts), + ..Default::default() + } + } } /// A test to emulate network conditions for a multi-region setup. @@ -327,6 +388,7 @@ pub fn create_multi_region_swarm_network_chaos( let peer_groups = create_link_stats_table_with_peer_groups( all_peers, &network_emulation_config.link_stats_table, + network_emulation_config.region_counts.as_deref(), ); // Create the inter and intra network emulation configs @@ -385,42 +447,34 @@ mod tests { fn test_create_multi_region_swarm_network_chaos() { aptos_logger::Logger::new().init(); - // Create a config with 8 peers and multiple regions + // Default config: four regions, with intra-region netem. + // 4 intra-region + C(4,2)*2 inter-region = 4 + 12 = 16 group netems. 
+ + // Create a config with 8 peers across 4 regions (2 per region) let all_peers: Vec<_> = (0..8).map(|_| vec![PeerId::random()]).collect(); let netem = create_multi_region_swarm_network_chaos(all_peers, None); + assert_eq!(netem.group_netems.len(), 16); - // Verify the number of group netems - assert_eq!(netem.group_netems.len(), 10); - - // Create a config with 10 peers and multiple regions - let all_peers: Vec<_> = (0..10).map(|_| vec![PeerId::random()]).collect(); + // Create a config with 12 peers across 4 regions (3 per region) + let all_peers: Vec<_> = (0..12).map(|_| vec![PeerId::random()]).collect(); let netem = create_multi_region_swarm_network_chaos(all_peers.clone(), None); + assert_eq!(netem.group_netems.len(), 16); - // Verify the resulting group netems - assert_eq!(netem.group_netems.len(), 10); - assert_eq!(netem.group_netems[0].source_nodes.len(), 4); - assert_eq!(netem.group_netems[0].target_nodes.len(), 4); + // Intra-region netems come first (sorted by BTreeMap key order). + // First region lexicographically is "1-gcp--eu-west2". + assert_eq!(netem.group_netems[0].source_nodes.len(), 3); + assert_eq!(netem.group_netems[0].target_nodes.len(), 3); assert_eq!(netem.group_netems[0], GroupNetEm { - name: "aws--ap-northeast-1-self-netem".to_owned(), + name: "1-gcp--eu-west2-self-netem".to_owned(), rate_in_mbps: 10000, - source_nodes: vec![ - all_peers[0][0], - all_peers[1][0], - all_peers[8][0], - all_peers[9][0], - ], - target_nodes: vec![ - all_peers[0][0], - all_peers[1][0], - all_peers[8][0], - all_peers[9][0], - ], - delay_latency_ms: 50, - delay_jitter_ms: 5, - delay_correlation_percentage: 50, + source_nodes: vec![all_peers[0][0], all_peers[1][0], all_peers[2][0]], + target_nodes: vec![all_peers[0][0], all_peers[1][0], all_peers[2][0]], + delay_latency_ms: 20, + delay_jitter_ms: 0, + delay_correlation_percentage: 20, loss_percentage: 1, - loss_correlation_percentage: 50 - }) + loss_correlation_percentage: 20, + }); } #[test]