Skip to content

Commit bf3f601

Browse files
authored
Merge pull request #497 from bootjp/feat/metrics-etcd
raftengine: persist peers during migration to prevent split-brain leader election
2 parents 551390c + 4dfedc7 commit bf3f601

5 files changed

Lines changed: 93 additions & 1 deletion

File tree

internal/raftengine/etcd/migrate.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,14 @@ func seedMigrationDir(tempDir string, peers []Peer, snapshotData []byte) error {
9696
if err := closePersist(disk.Persist); err != nil {
9797
return err
9898
}
99+
// Persist the peer list so the engine discovers all cluster members on
100+
// first open even when the caller's FactoryConfig.Peers is empty (the
101+
// common case during migration, where --raftBootstrapMembers is not
102+
// repeated). Without this file the engine falls back to a single-node
103+
// configuration and every node elects itself leader independently.
104+
if err := savePersistedPeers(tempDir, state.Snapshot.Metadata.Index, peers); err != nil {
105+
return err
106+
}
99107
return nil
100108
}
101109

internal/raftengine/etcd/migrate_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,49 @@ func TestMigrateFSMStoreSeedsEtcdDataDir(t *testing.T) {
9898
require.NoError(t, err)
9999
require.Equal(t, []byte("one"), value)
100100
}
101+
102+
func TestMigrateFSMStorePersistsSingleNodePeer(t *testing.T) {
103+
sourcePath := filepath.Join(t.TempDir(), "fsm.db")
104+
source, err := store.NewPebbleStore(sourcePath)
105+
require.NoError(t, err)
106+
require.NoError(t, source.Close())
107+
108+
destDataDir := filepath.Join(t.TempDir(), "raft")
109+
peers := []Peer{{NodeID: 1, ID: "n1", Address: "127.0.0.1:7001"}}
110+
_, err = MigrateFSMStore(sourcePath, destDataDir, peers)
111+
require.NoError(t, err)
112+
113+
loaded, ok, err := LoadPersistedPeers(destDataDir)
114+
require.NoError(t, err)
115+
require.True(t, ok, "persisted peers file must exist after single-node migration")
116+
require.Len(t, loaded, 1)
117+
require.Equal(t, peers[0].NodeID, loaded[0].NodeID)
118+
}
119+
120+
func TestMigrateFSMStorePersistsMultiNodePeers(t *testing.T) {
121+
sourcePath := filepath.Join(t.TempDir(), "fsm.db")
122+
source, err := store.NewPebbleStore(sourcePath)
123+
require.NoError(t, err)
124+
require.NoError(t, source.Close())
125+
126+
destDataDir := filepath.Join(t.TempDir(), "raft")
127+
peers := []Peer{
128+
{NodeID: 1, ID: "n1", Address: "127.0.0.1:7001"},
129+
{NodeID: 2, ID: "n2", Address: "127.0.0.1:7002"},
130+
{NodeID: 3, ID: "n3", Address: "127.0.0.1:7003"},
131+
}
132+
_, err = MigrateFSMStore(sourcePath, destDataDir, peers)
133+
require.NoError(t, err)
134+
135+
// Persisted peers must exist so the engine discovers all cluster members
136+
// even when FactoryConfig.Peers is empty (the common post-migration case).
137+
loaded, ok, err := LoadPersistedPeers(destDataDir)
138+
require.NoError(t, err)
139+
require.True(t, ok, "persisted peers file must exist after migration")
140+
require.Len(t, loaded, 3)
141+
for i, peer := range loaded {
142+
require.Equal(t, peers[i].NodeID, peer.NodeID)
143+
require.Equal(t, peers[i].ID, peer.ID)
144+
require.Equal(t, peers[i].Address, peer.Address)
145+
}
146+
}

multiraft_runtime.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,8 @@ func detectRaftEngineFromDataDir(dir string) (raftEngineType, bool, error) {
140140
return "", false, err
141141
}
142142
etcdArtifacts, err := hasRaftArtifacts(dir,
143+
"wal",
144+
"snap",
143145
filepath.Join("member", "wal"),
144146
filepath.Join("member", "snap"),
145147
"etcd-raft-state.bin",

multiraft_runtime_test.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,4 +224,31 @@ func TestEnsureRaftEngineDataDir(t *testing.T) {
224224
require.True(t, ok)
225225
require.Equal(t, raftEngineEtcd, engineType)
226226
})
227+
228+
t.Run("detects bare wal dir as etcd artifact", func(t *testing.T) {
229+
dir := t.TempDir()
230+
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wal"), 0o755))
231+
engineType, ok, err := detectRaftEngineFromDataDir(dir)
232+
require.NoError(t, err)
233+
require.True(t, ok)
234+
require.Equal(t, raftEngineEtcd, engineType)
235+
})
236+
237+
t.Run("detects bare snap dir as etcd artifact", func(t *testing.T) {
238+
dir := t.TempDir()
239+
require.NoError(t, os.MkdirAll(filepath.Join(dir, "snap"), 0o755))
240+
engineType, ok, err := detectRaftEngineFromDataDir(dir)
241+
require.NoError(t, err)
242+
require.True(t, ok)
243+
require.Equal(t, raftEngineEtcd, engineType)
244+
})
245+
246+
t.Run("detects mixed engine artifacts with bare wal", func(t *testing.T) {
247+
dir := t.TempDir()
248+
require.NoError(t, os.WriteFile(filepath.Join(dir, "raft.db"), []byte("dummy"), 0o600))
249+
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wal"), 0o755))
250+
_, _, err := detectRaftEngineFromDataDir(dir)
251+
require.Error(t, err)
252+
require.ErrorIs(t, err, ErrMixedRaftEngineArtifacts)
253+
})
227254
}

scripts/engine-migrate.sh

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,16 @@ sudo -n "$MIGRATE_BIN" \
288288
-peers "$PEERS"
289289
290290
echo " moving etcd artifacts into place"
291-
sudo -n mv "${migrate_dest}/data/member" "${node_dir}/member"
291+
for artifact in wal snap etcd-raft-peers.bin; do
292+
if sudo -n test -e "${migrate_dest}/data/${artifact}"; then
293+
sudo -n mv "${migrate_dest}/data/${artifact}" "${node_dir}/${artifact}"
294+
fi
295+
done
296+
# Fallback: handle legacy member/ directory layout produced by older
297+
# migration binaries so WAL/snap data is never silently dropped.
298+
if sudo -n test -d "${migrate_dest}/data/member"; then
299+
sudo -n mv "${migrate_dest}/data/member" "${node_dir}/member"
300+
fi
292301
sudo -n rm -rf "$migrate_dest"
293302
294303
echo " archiving hashicorp raft artifacts"

0 commit comments

Comments
 (0)