Skip to content

Commit dd8fd7e

Browse files
authored
HDFS-17821. Fix SBN repeatedly do checkpoint after fsimage transfer failure to part of the multiple NNs (#7876)
Reviewed-by: Tao Li <[email protected]> Signed-off-by: Tao Li <[email protected]>
1 parent 4f9efcb commit dd8fd7e

File tree

2 files changed

+46
-3
lines changed

2 files changed

+46
-3
lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ public TransferFsImage.TransferResult call()
342342
throw ie;
343343
}
344344

345-
if (!ioes.isEmpty()) {
345+
if (ioes.size() > activeNNAddresses.size() / 2) {
346346
throw MultipleIOException.createIOException(ioes);
347347
}
348348
}

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -721,8 +721,51 @@ private void doCreate() throws IOException {
721721
out.write(42);
722722
out.close();
723723
}
724-
725-
724+
725+
/**
726+
* Test that the checkpoint still succeeds when no more than half of the fsimage uploads fail.
727+
*/
728+
@Test
729+
@Timeout(value = 300)
730+
public void testPutFsimagePartFailed() throws Exception {
731+
for (int i = 1; i < NUM_NNS; i++) {
732+
cluster.shutdownNameNode(i);
733+
734+
// Configure the checkpoint so that it is actually triggered via DFS_NAMENODE_CHECKPOINT_PERIOD_KEY
735+
cluster.getConfiguration(i).setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 3);
736+
cluster.getConfiguration(i).setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 1000);
737+
}
738+
doEdits(0, 10);
739+
cluster.transitionToStandby(0);
740+
741+
for (int i = 1; i < NUM_NNS; i++) {
742+
cluster.restartNameNode(i, false);
743+
}
744+
cluster.waitClusterUp();
745+
setNNs();
746+
747+
for (int i = 0; i < NUM_NNS; i++) {
748+
// Once the standby catches up, it should do a checkpoint
749+
// and save to local directories.
750+
HATestUtil.waitForCheckpoint(cluster, i, ImmutableList.of(12));
751+
}
752+
753+
long snnCheckpointTime1 = nns[1].getNamesystem().getStandbyLastCheckpointTime();
754+
cluster.transitionToActive(0);
755+
cluster.transitionToObserver(2);
756+
cluster.shutdownNameNode(2);
757+
758+
doEdits(11, 20);
759+
nns[0].getRpcServer().rollEditLog();
760+
HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(23));
761+
762+
long snnCheckpointTime2 = nns[1].getNamesystem().getStandbyLastCheckpointTime();
763+
764+
// Make sure that the standby namenode checkpoint succeeds and updates the lastCheckpointTime
765+
// even though sending the fsimage to nn2 failed because nn2 is shut down.
766+
assertTrue(snnCheckpointTime2 > snnCheckpointTime1);
767+
}
768+
726769
/**
727770
* A codec which just slows down the saving of the image significantly
728771
* by sleeping a few milliseconds on every write. This makes it easy to

0 commit comments

Comments
 (0)