Skip to content

Commit 30a41f4

Browse files
danielxiangzl and claude committed
[forge] Latency baseline with faulty validators at 4k TPS
Experiment branch for testing leader reputation improvements.

- Last 2 validators (by ordered index) skip proposals 10% of the time (both regular and opt proposals on round % 10 == 0)
- realistic_env_max_load at ConstTps 4k, no fullnodes, no epoch change
- 15 min duration override (bypasses CI default of 480s)
- Add duration_override field to ForgeConfig for test-controlled duration

Not for merge to main.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c340ec2 commit 30a41f4

6 files changed

Lines changed: 53 additions & 24 deletions

File tree

.github/workflows/docker-build-test.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -270,7 +270,7 @@ jobs:
270270
GIT_SHA: ${{ needs.determine-docker-build-metadata.outputs.gitSha }}
271271
FORGE_TEST_SUITE: realistic_env_max_load
272272
IMAGE_TAG: ${{ needs.determine-docker-build-metadata.outputs.gitSha }}
273-
FORGE_RUNNER_DURATION_SECS: 480
273+
FORGE_RUNNER_DURATION_SECS: 1800
274274
COMMENT_HEADER: forge-e2e
275275
# Use the cache ID as the Forge namespace so we can limit Forge test concurrency on k8s, since Forge
276276
# test lifecycle is separate from that of GHA. This protects us from the case where many Forge tests are triggered

consensus/src/round_manager.rs

Lines changed: 30 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -482,8 +482,25 @@ impl RoundManager {
482482
.expect("Sending to a self loopback unbounded channel cannot fail");
483483
}
484484

485+
// HACK: Simulate proposer failures. The last 2 validators (by ordered index)
486+
// skip their proposal 10% of the time (round % 10 == 0).
487+
let should_skip_proposal = {
488+
let author = self.proposal_generator.author();
489+
let ordered = self.epoch_state.verifier.get_ordered_account_addresses();
490+
let n = ordered.len();
491+
let idx = ordered.iter().position(|a| *a == author);
492+
matches!(idx, Some(i) if i >= n - 2) && new_round_event.round % 10 == 0
493+
};
494+
if should_skip_proposal {
495+
warn!(
496+
self.new_log(LogEvent::NewRound),
497+
"HACK: skipping proposal for round {}", new_round_event.round
498+
);
499+
}
500+
485501
// If the current proposer is the leading, try to propose a regular block if not opt proposed already
486502
if is_current_proposer
503+
&& !should_skip_proposal
487504
&& self
488505
.proposal_generator
489506
.can_propose_in_round(new_round_event.round)
@@ -1427,9 +1444,19 @@ impl RoundManager {
14271444

14281445
let parent = parent_vote.vote_data().proposed().clone();
14291446
let opt_proposal_round = parent.round() + 1;
1430-
if self
1431-
.proposer_election
1432-
.is_valid_proposer(self.proposal_generator.author(), opt_proposal_round)
1447+
// HACK: Skip opt proposal for faulty validators too.
1448+
let should_skip_opt = {
1449+
let author = self.proposal_generator.author();
1450+
let ordered = self.epoch_state.verifier.get_ordered_account_addresses();
1451+
let n = ordered.len();
1452+
let idx = ordered.iter().position(|a| *a == author);
1453+
matches!(idx, Some(i) if i >= n - 2) && opt_proposal_round % 10 == 0
1454+
};
1455+
1456+
if !should_skip_opt
1457+
&& self
1458+
.proposer_election
1459+
.is_valid_proposer(self.proposal_generator.author(), opt_proposal_round)
14331460
{
14341461
let expected_grandparent_round = parent
14351462
.round()

testsuite/forge-cli/src/suites/land_blocking.rs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,8 @@ pub(crate) fn get_land_blocking_test(
1717
) -> Option<ForgeConfig> {
1818
let test = match test_name {
1919
"land_blocking" | "realistic_env_max_load" => {
20-
realistic_env_max_load_test(duration, test_cmd, 7, 0, 3)
20+
realistic_env_max_load_test(duration, test_cmd, 7, 0, 0)
21+
.with_duration_override(Duration::from_secs(900))
2122
},
2223
"compat" => compat(),
2324
"framework_upgrade" => framework_upgrade(),

testsuite/forge-cli/src/suites/realistic_environment.rs

Lines changed: 4 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -432,30 +432,18 @@ pub(crate) fn realistic_env_max_load_test(
432432
}
433433

434434
// Create the test
435-
let mempool_backlog = if ha_proxy { 28000 } else { 38000 };
436435
ForgeConfig::default()
437436
.with_initial_validator_count(NonZeroUsize::new(num_validators).unwrap())
438437
.with_initial_fullnode_count(num_vfns)
439438
.add_network_test(wrap_with_realistic_env(num_validators, TwoTrafficsTest {
440439
inner_traffic: EmitJobRequest::default()
441-
.mode(EmitJobMode::MaxLoad { mempool_backlog })
440+
.mode(EmitJobMode::ConstTps { tps: 4000 })
442441
.init_gas_price_multiplier(20),
443-
inner_success_criteria: SuccessCriteria::new(
444-
if ha_proxy {
445-
7000
446-
} else if long_running {
447-
// This is for forge stable
448-
11000
449-
} else {
450-
// During land time we want to be less strict, otherwise we flaky fail
451-
10000
452-
},
453-
),
442+
inner_success_criteria: SuccessCriteria::new(3500),
454443
}))
455444
.with_genesis_helm_config_fn(Arc::new(move |helm_values| {
456-
// Have single epoch change in land blocking, and a few on long-running
457-
helm_values["chain"]["epoch_duration_secs"] =
458-
(if long_running { 600 } else { 300 }).into();
445+
// No epoch change so measurements are stable.
446+
helm_values["chain"]["epoch_duration_secs"] = (24 * 3600).into();
459447
helm_values["chain"]["on_chain_consensus_config"] =
460448
serde_yaml::to_value(OnChainConsensusConfig::default_for_genesis())
461449
.expect("must serialize");

testsuite/forge/src/config.rs

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ use aptos_config::config::{
1010
OverrideNodeConfig,
1111
};
1212
use aptos_framework::ReleaseBundle;
13-
use std::{num::NonZeroUsize, sync::Arc};
13+
use std::{num::NonZeroUsize, sync::Arc, time::Duration};
1414

1515
/// A PFN deployment configuration. Each entry maps to one helm release via the PFN deployer.
1616
pub struct PfnDeployment {
@@ -72,6 +72,9 @@ pub struct ForgeConfig {
7272
/// Retain debug logs and above for all nodes instead of just the first 5 nodes
7373
pub retain_debug_logs: bool,
7474

75+
/// Override the CLI-specified test duration
76+
pub duration_override: Option<Duration>,
77+
7578
/// URL to download the trusted setup blob for chunky DKG into the validator init-container
7679
pub decryption_setup_blob_url: Option<String>,
7780
}
@@ -199,6 +202,11 @@ impl ForgeConfig {
199202
self
200203
}
201204

205+
pub fn with_duration_override(mut self, duration: Duration) -> Self {
206+
self.duration_override = Some(duration);
207+
self
208+
}
209+
202210
pub fn with_multi_region_config(mut self) -> Self {
203211
self.multi_region_config = true;
204212
self
@@ -435,6 +443,7 @@ impl Default for ForgeConfig {
435443
fullnode_resource_override: NodeResourceOverride::default(),
436444
retain_debug_logs: false,
437445
decryption_setup_blob_url: None,
446+
duration_override: None,
438447
}
439448
}
440449
}

testsuite/forge/src/runner.rs

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -311,7 +311,7 @@ impl<'cfg, F: Factory> Forge<'cfg, F> {
311311
&initial_version,
312312
&genesis_version,
313313
self.tests.genesis_config.as_ref(),
314-
self.global_duration + Duration::from_secs(NAMESPACE_CLEANUP_DURATION_BUFFER_SECS),
314+
self.tests.duration_override.unwrap_or(self.global_duration) + Duration::from_secs(NAMESPACE_CLEANUP_DURATION_BUFFER_SECS),
315315
self.tests.genesis_helm_config_fn.clone(),
316316
self.tests.build_node_helm_config_fn(retain_debug_logs),
317317
self.tests.existing_db_tag.clone(),
@@ -343,12 +343,16 @@ impl<'cfg, F: Factory> Forge<'cfg, F> {
343343

344344
let logs_location = swarm.logs_location();
345345
let swarm = Arc::new(tokio::sync::RwLock::new(swarm));
346+
let effective_duration = self
347+
.tests
348+
.duration_override
349+
.unwrap_or(self.global_duration);
346350
for test in self.filter_tests(&self.tests.network_tests) {
347351
let network_ctx = NetworkContext::new(
348352
CoreContext::from_rng(&mut rng),
349353
swarm.clone(),
350354
&mut report,
351-
self.global_duration,
355+
effective_duration,
352356
self.tests.emit_job_request.clone(),
353357
self.tests.success_criteria.clone(),
354358
);

0 commit comments

Comments
 (0)