Skip to content

Commit 048b0ef

Browse files
committed
feat: [ha] force resync button from studio
New red "Emergency Recovery" card on the Cluster tab. It lists each database with a "Resync from Leader" button, shown only when the local node is a follower. Clicking confirms (warning that local-only changes are discarded), POSTs to the local node's resync endpoint, and refreshes.
1 parent f671adb commit 048b0ef

7 files changed

Lines changed: 371 additions & 0 deletions

File tree

ha-raft/src/main/java/com/arcadedb/server/ha/raft/ArcadeStateMachine.java

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -996,6 +996,62 @@ private void installFromLeaderForBootstrap(final String dbName) {
996996
}
997997
}
998998

999+
/**
1000+
* Operator-triggered emergency recovery: drop the local copy of {@code dbName} and re-acquire a
1001+
* fresh full snapshot from the current leader. This is the manual equivalent of the automatic
1002+
* snapshot install path ({@link #notifyInstallSnapshotFromLeader}) and uses the same crash-safe
1003+
* {@link SnapshotInstaller} machinery as {@link #installFromLeaderForBootstrap}.
1004+
* <p>
1005+
* The intended use case is a follower that has diverged from the leader (e.g. a
1006+
* {@link WALVersionGapException} reported "snapshot resync required"): the diverged page versions
1007+
* can never be reconciled by applying further deltas, so the only safe fix is to replace the local
1008+
* files with the leader's authoritative copy. After install the local database matches the leader's
1009+
* snapshot point; any Raft log entries replayed afterwards that predate the snapshot are skipped by
1010+
* the page-version guard in {@code applyChanges}, and forward replication resumes normally.
1011+
* <p>
1012+
* Runs synchronously on the caller thread (the HTTP worker thread). Refuses to run on the leader
1013+
* (it holds the authoritative copy) and when no leader is currently known.
1014+
*
1015+
* @param dbName name of the database to resync from the leader
1016+
* @throws ReplicationException if Raft HA is not enabled, this node is the leader, the leader is
1017+
* unknown, or the snapshot install fails
1018+
*/
1019+
public void resyncDatabaseFromLeader(final String dbName) {
1020+
final RaftHAServer raft = raftHAServer;
1021+
if (raft == null)
1022+
throw new ReplicationException("Cannot resync database '" + dbName + "': Raft HA is not enabled");
1023+
1024+
if (raft.isLeader())
1025+
throw new ReplicationException("Cannot resync database '" + dbName
1026+
+ "' on the leader: the leader holds the authoritative copy. Run the resync on the diverged follower.");
1027+
1028+
if (raft.getLeaderHttpAddress() == null)
1029+
throw new ReplicationException("Cannot resync database '" + dbName
1030+
+ "': the leader is currently unknown (election in progress?). Retry once a leader is elected.");
1031+
1032+
LogManager.instance().log(this, Level.WARNING,
1033+
"Operator-triggered resync of database '%s' from leader: dropping local copy and re-acquiring full snapshot", dbName);
1034+
1035+
try {
1036+
final String databasePath;
1037+
if (server.existsDatabase(dbName)) {
1038+
final DatabaseInternal db = (DatabaseInternal) server.getDatabase(dbName);
1039+
databasePath = db.getDatabasePath();
1040+
db.getEmbedded().close();
1041+
server.removeDatabase(dbName);
1042+
} else {
1043+
databasePath = server.getConfiguration().getValueAsString(GlobalConfiguration.SERVER_DATABASE_DIRECTORY)
1044+
+ File.separator + dbName;
1045+
}
1046+
// Resolve the leader address on each retry (it can change mid-operation if leadership moves).
1047+
final String clusterToken = raft.getClusterToken();
1048+
SnapshotInstaller.install(dbName, databasePath, raft::getLeaderHttpAddress, clusterToken, server);
1049+
LogManager.instance().log(this, Level.INFO, "Database '%s' resynced from leader on operator request", dbName);
1050+
} catch (final IOException e) {
1051+
throw new ReplicationException("Failed to resync database '" + dbName + "' from leader", e);
1052+
}
1053+
}
1054+
9991055
/**
10001056
* Returns the bootstrap baseline committed for {@code dbName}, or {@code null} if no
10011057
* {@link RaftLogEntryType#BOOTSTRAP_FINGERPRINT_ENTRY} has been applied for it. Visible to

ha-raft/src/main/java/com/arcadedb/server/ha/raft/GetClusterHandler.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,21 @@ public ExecutionResponse execute(final HttpServerExchange exchange, final Server
8989
}
9090
response.put("peers", peers);
9191

92+
// Per-database list, used by Studio to render per-database actions (e.g. the emergency
93+
// "Resync from Leader" control on followers) and to surface bootstrap baselines when present.
94+
final JSONArray databases = new JSONArray();
95+
for (final String dbName : httpServer.getServer().getDatabaseNames()) {
96+
final JSONObject dbJson = new JSONObject();
97+
dbJson.put("name", dbName);
98+
final ArcadeStateMachine.BootstrapBaseline baseline = stateMachine.getBootstrapBaseline(dbName);
99+
if (baseline != null) {
100+
dbJson.put("bootstrapLastTxId", baseline.lastTxId());
101+
dbJson.put("bootstrapFingerprint", baseline.fingerprint());
102+
}
103+
databases.put(dbJson);
104+
}
105+
response.put("databases", databases);
106+
92107
return new ExecutionResponse(200, response.toString());
93108
}
94109
}
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
/*
2+
* Copyright 2021-present Arcade Data Ltd (info@arcadedata.com)
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*
16+
* SPDX-FileCopyrightText: 2021-present Arcade Data Ltd (info@arcadedata.com)
17+
* SPDX-License-Identifier: Apache-2.0
18+
*/
19+
package com.arcadedb.server.ha.raft;
20+
21+
import com.arcadedb.serializer.json.JSONObject;
22+
import com.arcadedb.server.http.HttpServer;
23+
import com.arcadedb.server.http.handler.AbstractServerHttpHandler;
24+
import com.arcadedb.server.http.handler.ExecutionResponse;
25+
import com.arcadedb.server.security.ServerSecurityUser;
26+
import io.undertow.server.HttpServerExchange;
27+
28+
/**
29+
* POST /api/v1/cluster/resync/{database} - emergency recovery that forces THIS node to drop its
30+
* local copy of {@code database} and re-acquire a fresh full snapshot from the current leader.
31+
* <p>
32+
* Intended for a follower that has diverged from the leader (e.g. a {@code WALVersionGapException}
33+
* reporting "snapshot resync required"). The endpoint always operates on the node that receives the
34+
* request, so an operator points Studio (or curl) at the diverged follower and triggers it there.
35+
* Refuses to run on the leader, which holds the authoritative copy.
36+
*
37+
* @author Luca Garulli (l.garulli@arcadedata.com)
38+
*/
39+
public class PostResyncDatabaseHandler extends AbstractServerHttpHandler {
40+
41+
private final RaftHAPlugin plugin;
42+
43+
public PostResyncDatabaseHandler(final HttpServer httpServer, final RaftHAPlugin plugin) {
44+
super(httpServer);
45+
this.plugin = plugin;
46+
}
47+
48+
@Override
49+
protected boolean mustExecuteOnWorkerThread() {
50+
// Closing the database and downloading the snapshot from the leader is blocking I/O.
51+
return true;
52+
}
53+
54+
@Override
55+
public ExecutionResponse execute(final HttpServerExchange exchange, final ServerSecurityUser user,
56+
final JSONObject payload) {
57+
checkRootUser(user);
58+
59+
final RaftHAServer raftHAServer = plugin.getRaftHAServer();
60+
if (raftHAServer == null)
61+
return new ExecutionResponse(400, new JSONObject().put("error", "Raft HA is not enabled").toString());
62+
63+
// Extract database name from path: /api/v1/cluster/resync/{database}
64+
final String path = exchange.getRelativePath();
65+
final String databaseName = (path.startsWith("/") ? path.substring(1) : path).trim();
66+
67+
if (databaseName.isEmpty())
68+
return new ExecutionResponse(400, new JSONObject().put("error", "Database name is required in path").toString());
69+
70+
if (!PostVerifyDatabaseHandler.VALID_DATABASE_NAME.matcher(databaseName).matches())
71+
return new ExecutionResponse(400, new JSONObject().put("error", "Invalid database name").toString());
72+
73+
if (raftHAServer.isLeader())
74+
return new ExecutionResponse(400, new JSONObject().put("error",
75+
"Cannot resync database '" + databaseName
76+
+ "' on the leader: the leader holds the authoritative copy. Run the resync on the diverged follower.").toString());
77+
78+
if (raftHAServer.getLeaderHttpAddress() == null)
79+
return new ExecutionResponse(503, new JSONObject().put("error",
80+
"Cannot resync database '" + databaseName
81+
+ "': the leader is currently unknown (election in progress?). Retry once a leader is elected.").toString());
82+
83+
try {
84+
raftHAServer.getStateMachine().resyncDatabaseFromLeader(databaseName);
85+
} catch (final Exception e) {
86+
return new ExecutionResponse(500, new JSONObject().put("error",
87+
"Resync of database '" + databaseName + "' failed: " + e.getMessage()).toString());
88+
}
89+
90+
return new ExecutionResponse(200, new JSONObject()
91+
.put("result", "Database '" + databaseName + "' resynced from leader")
92+
.put("database", databaseName)
93+
.put("localServer", httpServer.getServer().getServerName()).toString());
94+
}
95+
}

ha-raft/src/main/java/com/arcadedb/server/ha/raft/RaftHAPlugin.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ public void registerAPI(final HttpServer httpServer, final PathHandler routes) {
143143
routes.addExactPath("/api/v1/cluster/stepdown", new PostStepDownHandler(httpServer, this));
144144
routes.addExactPath("/api/v1/cluster/leave", new PostLeaveHandler(httpServer, this));
145145
routes.addPrefixPath("/api/v1/cluster/verify/", new PostVerifyDatabaseHandler(httpServer, this));
146+
routes.addPrefixPath("/api/v1/cluster/resync/", new PostResyncDatabaseHandler(httpServer, this));
146147
// Issue #4147: pre-bootstrap state RPC, used by the bootstrap leader at first cluster
147148
// formation to collect each peer's (fingerprint, lastTxId) per database.
148149
routes.addExactPath("/api/v1/cluster/bootstrap-state", new PostBootstrapStateHandler(httpServer, this));
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
/*
2+
* Copyright 2021-present Arcade Data Ltd (info@arcadedata.com)
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*
16+
* SPDX-FileCopyrightText: 2021-present Arcade Data Ltd (info@arcadedata.com)
17+
* SPDX-License-Identifier: Apache-2.0
18+
*/
19+
package com.arcadedb.server.ha.raft;
20+
21+
import com.arcadedb.database.Database;
22+
import com.arcadedb.graph.MutableVertex;
23+
import com.arcadedb.serializer.json.JSONObject;
24+
import org.junit.jupiter.api.Test;
25+
26+
import java.net.HttpURLConnection;
27+
import java.net.URI;
28+
import java.nio.charset.StandardCharsets;
29+
import java.util.Base64;
30+
31+
import static org.assertj.core.api.Assertions.assertThat;
32+
33+
/**
34+
* Integration tests for the operator-triggered emergency resync endpoint
35+
* (POST /api/v1/cluster/resync/{database}) and {@link ArcadeStateMachine#resyncDatabaseFromLeader}.
36+
*/
37+
class RaftForceResyncIT extends BaseRaftHATest {
38+
39+
@Override
40+
protected int getServerCount() {
41+
// 3 nodes so a majority (2) is still available while one follower is being resynced.
42+
return 3;
43+
}
44+
45+
@Test
46+
void followerResyncFromLeaderKeepsDataAndResumesReplication() throws Exception {
47+
final int leaderIndex = findLeaderIndex();
48+
assertThat(leaderIndex).as("A Raft leader must be elected").isGreaterThanOrEqualTo(0);
49+
final int followerIndex = (leaderIndex + 1) % getServerCount();
50+
51+
final Database leaderDb = getServerDatabase(leaderIndex, getDatabaseName());
52+
leaderDb.transaction(() -> {
53+
if (!leaderDb.getSchema().existsType("ResyncTest"))
54+
leaderDb.getSchema().createVertexType("ResyncTest");
55+
for (int i = 0; i < 25; i++)
56+
leaderDb.newVertex("ResyncTest").set("index", i).save();
57+
});
58+
59+
assertClusterConsistency();
60+
assertThat(getServerDatabase(followerIndex, getDatabaseName()).countType("ResyncTest", true))
61+
.as("Follower has the initial data before resync").isEqualTo(25);
62+
63+
// Trigger the emergency resync on the follower: it drops its local copy and re-downloads a full
64+
// snapshot from the leader.
65+
final JSONObject response = resync(followerIndex, getDatabaseName());
66+
assertThat(response.getString("result", "")).contains("resynced from leader");
67+
68+
assertThat(getServerDatabase(followerIndex, getDatabaseName()).countType("ResyncTest", true))
69+
.as("Follower has the same data after resync").isEqualTo(25);
70+
71+
// Forward replication must resume after the resync: new writes on the leader reach the follower.
72+
leaderDb.transaction(() -> {
73+
for (int i = 25; i < 40; i++)
74+
leaderDb.newVertex("ResyncTest").set("index", i).save();
75+
});
76+
77+
assertClusterConsistency();
78+
assertThat(getServerDatabase(followerIndex, getDatabaseName()).countType("ResyncTest", true))
79+
.as("Follower receives writes committed after the resync").isEqualTo(40);
80+
}
81+
82+
@Test
83+
void resyncOnLeaderIsRejected() throws Exception {
84+
final int leaderIndex = findLeaderIndex();
85+
assertThat(leaderIndex).isGreaterThanOrEqualTo(0);
86+
87+
final int httpPort = getServer(leaderIndex).getHttpServer().getPort();
88+
final HttpURLConnection conn = (HttpURLConnection) new URI(
89+
"http://localhost:" + httpPort + "/api/v1/cluster/resync/" + getDatabaseName()).toURL().openConnection();
90+
conn.setRequestMethod("POST");
91+
conn.setRequestProperty("Authorization", basicAuth());
92+
93+
assertThat(conn.getResponseCode()).isEqualTo(400);
94+
final String body = new String(conn.getErrorStream().readAllBytes(), StandardCharsets.UTF_8);
95+
assertThat(new JSONObject(body).getString("error", "")).contains("leader");
96+
conn.disconnect();
97+
}
98+
99+
private JSONObject resync(final int serverIndex, final String databaseName) throws Exception {
100+
final int httpPort = getServer(serverIndex).getHttpServer().getPort();
101+
final HttpURLConnection conn = (HttpURLConnection) new URI(
102+
"http://localhost:" + httpPort + "/api/v1/cluster/resync/" + databaseName).toURL().openConnection();
103+
conn.setRequestMethod("POST");
104+
conn.setRequestProperty("Authorization", basicAuth());
105+
try {
106+
assertThat(conn.getResponseCode()).isEqualTo(200);
107+
return new JSONObject(new String(conn.getInputStream().readAllBytes(), StandardCharsets.UTF_8));
108+
} finally {
109+
conn.disconnect();
110+
}
111+
}
112+
113+
private static String basicAuth() {
114+
return "Basic " + Base64.getEncoder().encodeToString(
115+
("root:" + DEFAULT_PASSWORD_FOR_TESTS).getBytes(StandardCharsets.UTF_8));
116+
}
117+
}

studio/src/main/resources/static/cluster.html

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,29 @@ <h6 style="color: var(--text-secondary); font-weight: 600;">Cluster Nodes</h6>
144144
</div>
145145
</div>
146146

147+
<!-- Emergency recovery: force this follower to drop and re-acquire a database from the
148+
leader. Hidden on the leader (it holds the authoritative copy) and toggled by
149+
renderResyncRecovery in studio-cluster.js. -->
150+
<div class="card mb-3" id="clusterResyncCard" style="display:none; border: 1px solid #fca5a5; border-radius: 10px;">
151+
<div class="card-header py-2" style="background: #fef2f2; border-bottom: 1px solid #fca5a5; border-radius: 10px 10px 0 0;">
152+
<i class="fas fa-sync-alt" style="color: #ef4444;"></i>
153+
<span style="font-weight: 600; font-size: 0.88rem; margin-left: 6px; color: #dc2626;">Emergency Recovery</span>
154+
<span style="font-size: 0.72rem; color: var(--text-secondary); margin-left: 8px;">
155+
— resync a diverged database from the leader
156+
</span>
157+
</div>
158+
<div class="card-body py-3">
159+
<p style="font-size: 0.78rem; color: var(--text-secondary); margin-bottom: 10px;">
160+
Drops this node's local copy of the selected database and re-downloads a fresh full
161+
snapshot from the leader. Use when this follower has diverged (e.g. a
162+
<span style="font-family: monospace;">WALVersionGapException / snapshot resync required</span> error).
163+
</p>
164+
<div id="clusterResyncList">
165+
<!-- Dynamically populated by renderResyncRecovery -->
166+
</div>
167+
</div>
168+
</div>
169+
147170
</div>
148171

149172
<!-- Right column: Danger -->

0 commit comments

Comments
 (0)