Skip to content

Commit 56b9b80

Browse files
authored
HDDS-14699. Fix orphan snapshot versions handling when snapshot chain tableKey mapping is stale (#9810)
1 parent ca00857 commit 56b9b80

3 files changed

Lines changed: 76 additions & 6 deletions

File tree

hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -284,12 +284,23 @@ public OmSnapshotManager(OzoneManager ozoneManager) throws IOException {
284284

285285
public static boolean isSnapshotPurged(SnapshotChainManager chainManager, OMMetadataManager omMetadataManager,
286286
UUID snapshotId, TransactionInfo transactionInfo) throws IOException {
287+
boolean purgeFlushed = transactionInfo != null &&
288+
isTransactionFlushedToDisk(omMetadataManager, transactionInfo);
287289
String tableKey = chainManager.getTableKey(snapshotId);
288290
if (tableKey == null) {
289-
return true;
291+
// Snapshot chain is rebuilt from DB on every OM restart (loadFromSnapshotInfoTable),
292+
// but entries committed to the Raft log (but not yet flushed) after the restart
293+
// are applied in addToDBBatch (skipping validateAndUpdateCache), so they never call
294+
// addSnapshot(). This can affect any OM (leader or follower) after a restart.
295+
//
296+
// Need to fall back to transactionInfo. null means no purge has been recorded. Treat as active.
297+
LOG.debug("snapshotId {} has null tableKey in SnapshotChainManager. "
298+
+ "transactionInfo={} purgeFlushed={}. Returning {}",
299+
snapshotId, transactionInfo, purgeFlushed, purgeFlushed);
300+
return purgeFlushed;
290301
}
291-
return !omMetadataManager.getSnapshotInfoTable().isExist(tableKey) && transactionInfo != null &&
292-
isTransactionFlushedToDisk(omMetadataManager, transactionInfo);
302+
boolean inDB = omMetadataManager.getSnapshotInfoTable().isExist(tableKey);
303+
return !inDB && purgeFlushed;
293304
}
294305

295306
/**
@@ -371,8 +382,12 @@ public OmSnapshot load(@Nonnull UUID snapshotId) throws IOException {
371382
}
372383
try (OmSnapshotLocalDataManager.ReadableOmSnapshotLocalDataMetaProvider snapshotLocalDataProvider =
373384
snapshotLocalDataManager.getOmSnapshotLocalDataMeta(snapshotInfo)) {
385+
final OmSnapshotLocalDataManager.SnapshotVersionsMeta snapshotMeta = snapshotLocalDataProvider.getMeta();
386+
if (snapshotMeta == null) {
387+
throw new OMException("Snapshot local metadata is missing for snapshotId: " + snapshotId, FILE_NOT_FOUND);
388+
}
374389
snapshotMetadataManager = getSnapshotOmMetadataManager(snapshotInfo,
375-
snapshotLocalDataProvider.getMeta().getVersion(), maxOpenSstFilesInSnapshotDb, conf);
390+
snapshotMeta.getVersion(), maxOpenSstFilesInSnapshotDb, conf);
376391
}
377392
} catch (IOException e) {
378393
LOG.error("Failed to retrieve snapshot: {}", snapshotTableKey, e);

hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,10 @@ private boolean loadFromSnapshotInfoTable(OMMetadataManager metadataManager) {
296296
globalSnapshotChain.clear();
297297
snapshotChainByPath.clear();
298298
latestSnapshotIdByPath.clear();
299+
if (!snapshotIdToTableKey.isEmpty()) {
300+
LOG.debug("Clearing snapshotIdToTableKey (had {} entries) on thread {}",
301+
snapshotIdToTableKey.size(), Thread.currentThread().getName());
302+
}
299303
snapshotIdToTableKey.clear();
300304

301305
while (keyIter.hasNext()) {
@@ -353,6 +357,8 @@ public synchronized void addSnapshot(SnapshotInfo snapshotInfo)
353357
// store snapshot ID to snapshot DB table key in the map
354358
snapshotIdToTableKey.put(snapshotInfo.getSnapshotId(),
355359
snapshotInfo.getTableKey());
360+
LOG.debug("Added to snapshotIdToTableKey: snapshotId={} tableKey={}",
361+
snapshotInfo.getSnapshotId(), snapshotInfo.getTableKey());
356362
}
357363

358364
/**
@@ -378,7 +384,8 @@ public synchronized boolean deleteSnapshot(SnapshotInfo snapshotInfo)
378384
*/
379385
public synchronized void removeFromSnapshotIdToTable(UUID snapshotId) throws IOException {
380386
validateSnapshotChain();
381-
snapshotIdToTableKey.remove(snapshotId);
387+
String tableKey = snapshotIdToTableKey.remove(snapshotId);
388+
LOG.debug("Removed from snapshotIdToTableKey: snapshotId={} tableKey={}", snapshotId, tableKey);
382389
}
383390

384391
/**
@@ -570,7 +577,12 @@ public UUID previousPathSnapshot(String snapshotPath,
570577
}
571578

572579
public String getTableKey(UUID snapshotId) {
573-
return snapshotIdToTableKey.get(snapshotId);
580+
String tableKey = snapshotIdToTableKey.get(snapshotId);
581+
if (tableKey == null) {
582+
LOG.debug("getTableKey returned null for snapshotId={}. snapshotIdToTableKey has {} entries",
583+
snapshotId, snapshotIdToTableKey.size());
584+
}
585+
return tableKey;
574586
}
575587

576588
public LinkedHashMap<UUID, SnapshotChainInfo> getSnapshotChainPath(

hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOmSnapshotLocalDataManager.java

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
import org.apache.hadoop.ozone.om.OmSnapshotLocalData;
8383
import org.apache.hadoop.ozone.om.OmSnapshotLocalDataYaml;
8484
import org.apache.hadoop.ozone.om.OmSnapshotManager;
85+
import org.apache.hadoop.ozone.om.SnapshotChainManager;
8586
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
8687
import org.apache.hadoop.ozone.om.lock.DAGLeveledResource;
8788
import org.apache.hadoop.ozone.om.lock.HierarchicalResourceLockManager;
@@ -1027,6 +1028,48 @@ public void testInitWithMissingYamlFiles(boolean needsUpgrade) throws IOExceptio
10271028
}
10281029
}
10291030

1031+
/**
1032+
* Regression test for NullPointerException in OmSnapshotManager#createCacheLoader.
1033+
* <p>
1034+
* isSnapshotPurged() now falls back to transactionInfo when getTableKey() returns null.
1035+
* A null transactionInfo means no purge was ever recorded for this snapshot in its YAML,
1036+
* so the snapshot is treated as active and the orphan check should correctly skip it.
1037+
*/
1038+
@Test
1039+
public void testCheckOrphanSnapshotVersionsWithStaleSnapshotChain() throws IOException {
1040+
localDataManager = getNewOmSnapshotLocalDataManager();
1041+
UUID snapshotId = createSnapshotLocalData(localDataManager, 1).get(0);
1042+
1043+
// Snapshot must be in versionNodeMap before the orphan check.
1044+
assertNotNull(localDataManager.getVersionNodeMapUnmodifiable().get(snapshotId));
1045+
1046+
// Use the real isSnapshotPurged
1047+
snapshotUtilMock.when(() -> OmSnapshotManager.isSnapshotPurged(any(), any(), any(), any()))
1048+
.thenCallRealMethod();
1049+
1050+
// Simulate a stale SnapshotChainManager: getTableKey returns null for the
1051+
// snapshot because the snapshot chain has not been correctly updated
1052+
SnapshotChainManager staleChain = mock(SnapshotChainManager.class);
1053+
when(staleChain.getTableKey(snapshotId)).thenReturn(null);
1054+
1055+
localDataManager.checkOrphanSnapshotVersions(omMetadataManager, staleChain, snapshotId);
1056+
1057+
// Before the fix: isSnapshotPurged returned true for any null tableKey, so the snapshot
1058+
// was removed from versionNodeMap. getMeta() then returned null, causing NullPointerException
1059+
1060+
// After the fix: null transactionInfo means no purge has been recorded, assuming active snapshot.
1061+
// versionNodeMap entry will survive the orphan check. getMeta() will be non-null.
1062+
1063+
assertNotNull(localDataManager.getVersionNodeMapUnmodifiable().get(snapshotId),
1064+
"Active snapshot was removed erroneously from versionNodeMap due to stale SnapshotChainManager");
1065+
1066+
try (OmSnapshotLocalDataManager.ReadableOmSnapshotLocalDataMetaProvider provider =
1067+
localDataManager.getOmSnapshotLocalDataMeta(snapshotId)) {
1068+
assertNotNull(provider.getMeta(),
1069+
"getMeta() returned null. Calling getVersion() on it throws NullPointerException");
1070+
}
1071+
}
1072+
10301073
@Test
10311074
public void testInitWithInvalidPathThrowsException() throws IOException {
10321075
UUID snapshotId = UUID.randomUUID();

0 commit comments

Comments
 (0)