Skip to content

Commit afbc5a0

Browse files
committed
Fixup: Fix flaky test and ensure sync from imported checkpoint
Previously, the compute node might not have synced the latest round, so checkpoint creation could fail. This is fixed by querying the compute node explicitly.
1 parent a00d4a9 commit afbc5a0

1 file changed

Lines changed: 70 additions & 23 deletions

File tree

go/oasis-test-runner/scenario/e2e/runtime/checkpoint_create_import.go

Lines changed: 70 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -48,43 +48,57 @@ func (sc *checkpointCreateImportImpl) Run(ctx context.Context, childEnv *env.Env
4848
return err
4949
}
5050

51+
src := sc.Net.ComputeWorkers()[0]
52+
srcCtrl, err := oasis.NewController(src.SocketPath())
53+
if err != nil {
54+
return fmt.Errorf("failed to create controller for the source node: %w", err)
55+
}
56+
5157
// Use height - 3 so that blocks at h, h+1, h+2 all exist in the block store.
52-
blk, err := sc.Net.Controller().Consensus.GetBlock(ctx, consensus.HeightLatest)
58+
blk, err := srcCtrl.Consensus.GetBlock(ctx, consensus.HeightLatest)
5359
if err != nil {
5460
return fmt.Errorf("failed to get latest consensus block: %w", err)
5561
}
56-
height := blk.Height - 3
57-
58-
rtBlk, err := sc.Net.ClientController().Roothash.GetLatestBlock(ctx, &roothash.RuntimeRequest{
62+
candidateHeight := blk.Height - 3
63+
rtState, err := srcCtrl.Roothash.GetRuntimeState(ctx, &roothash.RuntimeRequest{
5964
RuntimeID: KeyValueRuntimeID,
60-
Height: height,
65+
Height: candidateHeight,
6166
})
6267
if err != nil {
63-
return fmt.Errorf("failed to get runtime block at height %d: %w", height, err)
68+
return fmt.Errorf("failed to get runtime state for height %d: %w", candidateHeight, err)
6469
}
65-
round := rtBlk.Header.Round
6670

67-
sc.Logger.Info("creating checkpoints",
68-
"height", height,
69-
"round", round,
70-
"runtime_id", KeyValueRuntimeID,
71-
)
71+
// Pick runtime state's LastBlockHeight as the consensus checkpoint height else
72+
// runtime light history indexer might miss authoritative light block for the
73+
// corresponding runtime round.
74+
cpRound := rtState.LastBlock.Header.Round
75+
cpHeight := rtState.LastBlockHeight
7276

73-
cpDir := filepath.Join(childEnv.Dir(), "checkpoint")
77+
// Ensure runtime round is synced before stopping the node and creating a checkpoint for it.
78+
if err := srcCtrl.WaitRuntimeRound(ctx, KeyValueRuntimeID, cpRound); err != nil {
79+
return fmt.Errorf("waiting runtime round %d: %w", cpRound, err)
80+
}
7481

7582
// Stop compute worker 0 (source node).
76-
source := sc.Net.ComputeWorkers()[0]
77-
if err := source.StopGracefully(); err != nil {
83+
if err := src.StopGracefully(); err != nil {
7884
return fmt.Errorf("failed to stop source compute worker: %w", err)
7985
}
8086

8187
// Create checkpoints from the source node's data.
88+
cpDir := filepath.Join(childEnv.Dir(), "checkpoint")
89+
90+
sc.Logger.Info("creating checkpoints",
91+
"height", cpHeight,
92+
"round", cpRound,
93+
"runtime_id", KeyValueRuntimeID,
94+
)
95+
8296
args := []string{
8397
"storage", "checkpoint", "create",
84-
"--config", source.ConfigFile(),
85-
"--height", fmt.Sprintf("%d", height),
98+
"--config", src.ConfigFile(),
99+
"--height", fmt.Sprintf("%d", cpHeight),
86100
"--runtime", KeyValueRuntimeID.Hex(),
87-
"--round", fmt.Sprintf("%d", round),
101+
"--round", fmt.Sprintf("%d", cpRound),
88102
"--output-dir", cpDir,
89103
"--debug.dont_blame_oasis",
90104
"--debug.allow_test_keys",
@@ -96,7 +110,7 @@ func (sc *checkpointCreateImportImpl) Run(ctx context.Context, childEnv *env.Env
96110
sc.Logger.Info("checkpoints created successfully")
97111

98112
// Start the source compute worker again.
99-
if err := source.Start(); err != nil {
113+
if err := src.Start(); err != nil {
100114
return fmt.Errorf("failed to restart source compute worker: %w", err)
101115
}
102116

@@ -135,15 +149,48 @@ func (sc *checkpointCreateImportImpl) Run(ctx context.Context, childEnv *env.Env
135149
return fmt.Errorf("failed to start target node: %w", err)
136150
}
137151

138-
// Wait for the target node to sync.
139-
sc.Logger.Info("waiting for target node to sync")
140-
ctrl, err := oasis.NewController(target.SocketPath())
152+
targetCtrl, err := oasis.NewController(target.SocketPath())
141153
if err != nil {
142154
return fmt.Errorf("failed to create controller for target node: %w", err)
143155
}
144-
if err := ctrl.WaitReady(ctx); err != nil {
156+
157+
// Ensure target node syncs to the tip of the chain from the imported checkpoints.
158+
sc.Logger.Info("waiting for target node to sync")
159+
if err := targetCtrl.WaitReady(ctx); err != nil {
145160
return fmt.Errorf("target node failed to sync: %w", err)
146161
}
162+
sc.Logger.Info("target node is ready")
163+
164+
// Manually ensure that runtime state was synced up to the latest round as
165+
// WaitReady only guarantees consensus sync.
166+
latestBlk, err := sc.Net.ClientController().Roothash.GetLatestBlock(ctx, &roothash.RuntimeRequest{
167+
RuntimeID: KeyValueRuntimeID,
168+
Height: consensus.HeightLatest,
169+
})
170+
if err != nil {
171+
return fmt.Errorf("failed to get latest runtime block: %w", err)
172+
}
173+
latestRound := latestBlk.Header.Round
174+
sc.Logger.Info("waiting the target node to have runtime state synced")
175+
if err := targetCtrl.WaitRuntimeRound(ctx, KeyValueRuntimeID, latestRound); err != nil {
176+
return fmt.Errorf("waiting synced runtime round %d: %w", latestRound, err)
177+
}
178+
sc.Logger.Info("target node has runtime state synced")
179+
180+
// Ensure target synced from the imported checkpoint and not from the genesis.
181+
status, err := targetCtrl.NodeController.GetStatus(ctx)
182+
if err != nil {
183+
return fmt.Errorf("failed to get target node status: %w", err)
184+
}
185+
if lastRetainedHeight := status.Consensus.LastRetainedHeight; lastRetainedHeight != cpHeight {
186+
sc.Logger.Info("last retained height is not equal to the imported checkpoint height",
187+
"cp_height", cpHeight,
188+
"last_retained_height", lastRetainedHeight)
189+
return fmt.Errorf("failed to ensure consensus synced from the imported checkpoint")
190+
}
191+
// No need to assert target node didn't sync runtime state from the genesis,
192+
// since runtime genesis sync cannot succeed with a missing runtime light history
193+
// (the case when consensus is synced using an imported checkpoint, asserted above).
147194

148195
return nil
149196
}

0 commit comments

Comments
 (0)