Skip to content

Commit 6848b54

Browse files
core, graph, store: Fix unfail retry when deployment head behind error block
When a subgraph encounters a non-deterministic error and restarts from a checkpoint before the error block, the unfail mechanism would incorrectly stop retrying after the first attempt returned Noop. This happened because `should_try_unfail_non_deterministic` was unconditionally set to `false` after the first unfail attempt, even when the deployment head hadn't reached the error block yet. Fix: Add `UnfailOutcome::BehindErrorBlock` variant to distinguish "deployment head behind error block" (keep retrying) from other Noop cases (stop retrying). Fixes #6205
1 parent 68bb8ac commit 6848b54

5 files changed

Lines changed: 90 additions & 9 deletions

File tree

core/src/subgraph/runner/mod.rs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1303,12 +1303,14 @@ where
13031303
.unfail_non_deterministic_error(&block_ptr)
13041304
.await?;
13051305

1306-
// Stop trying to unfail.
1307-
self.state.should_try_unfail_non_deterministic = false;
1306+
// Stop trying unless we're still behind the error block.
1307+
if outcome != UnfailOutcome::BehindErrorBlock {
1308+
self.state.should_try_unfail_non_deterministic = false;
13081309

1309-
if let UnfailOutcome::Unfailed = outcome {
1310-
self.metrics.subgraph.deployment_status.running();
1311-
self.state.backoff.reset();
1310+
if outcome == UnfailOutcome::Unfailed {
1311+
self.metrics.subgraph.deployment_status.running();
1312+
self.state.backoff.reset();
1313+
}
13121314
}
13131315
}
13141316

graph/src/components/store/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -668,8 +668,12 @@ pub enum EntityOperation {
668668

669669
#[derive(Debug, PartialEq)]
670670
pub enum UnfailOutcome {
671+
/// Nothing to do - no error exists, or error is of wrong type (e.g., deterministic).
671672
Noop,
673+
/// Successfully unfailed the subgraph.
672674
Unfailed,
675+
/// The deployment head is still behind the error block, retry on subsequent blocks.
676+
BehindErrorBlock,
673677
}
674678

675679
#[derive(Clone, PartialEq, Eq, Debug)]

store/postgres/src/deployment_store.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1872,7 +1872,8 @@ impl DeploymentStore {
18721872

18731873
Ok(UnfailOutcome::Unfailed)
18741874
}
1875-
// NOOP, the deployment head is still before where non-deterministic error happened.
1875+
// The deployment head is still before where non-deterministic error happened.
1876+
// Return BehindErrorBlock so the caller knows to retry on subsequent blocks.
18761877
block_range => {
18771878
info!(
18781879
self.logger,
@@ -1884,7 +1885,7 @@ impl DeploymentStore {
18841885
"error_block_hash" => subgraph_error.block_hash.as_ref().map(|hash| format!("0x{}", hex::encode(hash))),
18851886
);
18861887

1887-
Ok(UnfailOutcome::Noop)
1888+
Ok(UnfailOutcome::BehindErrorBlock)
18881889
}
18891890
}
18901891
}.scope_boxed()).await

store/test-store/tests/postgres/subgraph.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1215,14 +1215,14 @@ fn fail_unfail_non_deterministic_error_noop() {
12151215
// Fail the subgraph with a non-deterministic error, but with an advanced block.
12161216
writable.fail_subgraph(error).await.unwrap();
12171217

1218-
// Since the block range of the block won't match the deployment head, this will be NOOP.
1218+
// Since the deployment head is behind the error block, this returns BehindErrorBlock.
12191219
let outcome = writable
12201220
.unfail_non_deterministic_error(&BLOCKS[1])
12211221
.await
12221222
.unwrap();
12231223

12241224
// State continues the same besides a new error added to the database.
1225-
assert_eq!(outcome, UnfailOutcome::Noop);
1225+
assert_eq!(outcome, UnfailOutcome::BehindErrorBlock);
12261226
assert_eq!(count().await, 2);
12271227
let vi = get_version_info(&store, NAME).await;
12281228
assert_eq!(NAME, vi.deployment_id.as_str());

tests/tests/runner_tests.rs

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1418,3 +1418,77 @@ async fn skip_duplicates() {
14181418
})
14191419
);
14201420
}
1421+
1422+
/// Test that the unfail mechanism retries until the deployment head reaches
1423+
/// the error block. This is a regression test for issue #6205.
1424+
///
1425+
/// Scenario:
1426+
/// 1. Sync to block 1
1427+
/// 2. Inject non-deterministic error at block 3 (ahead of head)
1428+
/// 3. Run runner to block 3
1429+
/// 4. At blocks 1-2: unfail returns BehindErrorBlock, keeps trying
1430+
/// 5. At block 3: unfail succeeds, health → Healthy
1431+
#[graph::test]
1432+
async fn non_deterministic_unfail_retries_until_error_block() -> anyhow::Result<()> {
1433+
let RunnerTestRecipe { stores, test_info } =
1434+
RunnerTestRecipe::new("non_deterministic_unfail_retries", "end-block").await;
1435+
1436+
let blocks = {
1437+
let block_0 = genesis();
1438+
let mut block_1 = empty_block(block_0.ptr(), test_ptr(1));
1439+
let mut block_2 = empty_block(block_1.ptr(), test_ptr(2));
1440+
let mut block_3 = empty_block(block_2.ptr(), test_ptr(3));
1441+
1442+
// Add triggers to exercise normal block processing.
1443+
push_test_log(&mut block_1, "a");
1444+
push_test_log(&mut block_2, "b");
1445+
push_test_log(&mut block_3, "c");
1446+
1447+
vec![block_0, block_1, block_2, block_3]
1448+
};
1449+
1450+
let chain = chain(&test_info.test_name, blocks, &stores, None).await;
1451+
let ctx = fixture::setup(&test_info, &stores, &chain, None, None).await;
1452+
1453+
// Advance head to block 1.
1454+
ctx.start_and_sync_to(test_ptr(1)).await;
1455+
1456+
// Inject non-deterministic fatal error at block 3 (ahead of current head).
1457+
let writable = ctx
1458+
.store
1459+
.clone()
1460+
.writable(ctx.logger.clone(), ctx.deployment.id, Arc::new(vec![]))
1461+
.await
1462+
.unwrap();
1463+
1464+
writable
1465+
.fail_subgraph(SubgraphError {
1466+
subgraph_id: ctx.deployment.hash.clone(),
1467+
message: "injected transient error".to_string(),
1468+
block_ptr: Some(test_ptr(3)),
1469+
handler: None,
1470+
deterministic: false,
1471+
})
1472+
.await
1473+
.unwrap();
1474+
1475+
writable.flush().await.unwrap();
1476+
1477+
// Precondition: deployment is failed with non-deterministic error.
1478+
let status = ctx.indexing_status().await;
1479+
assert!(status.health == SubgraphHealth::Failed);
1480+
assert!(!status.fatal_error.as_ref().unwrap().deterministic);
1481+
1482+
// Run runner to block 3. The unfail mechanism should:
1483+
// - Return BehindErrorBlock at blocks 1-2 (keep trying)
1484+
// - Return Unfailed at block 3 (success)
1485+
let stop_block = test_ptr(3);
1486+
let _runner = ctx.runner(stop_block).await.run_for_test(false).await?;
1487+
1488+
// Postcondition: deployment is healthy, no fatal error.
1489+
let status = ctx.indexing_status().await;
1490+
assert!(status.health == SubgraphHealth::Healthy);
1491+
assert!(status.fatal_error.is_none());
1492+
1493+
Ok(())
1494+
}

0 commit comments

Comments
 (0)