
Commit d2d6944

Authored by Xizt, Tejas Kulkarni, and vazois
Resetting replication recovery when cluster reset is issued (#1319)
* Resetting replication recovery when cluster reset is issued
* Validating using cluster endpoint during reset
* Missed flag change
* Using reset cancellation token
* Merge fix
* Added other recovery statuses during reset recovery
* Using storeWrapper.serverOptions.ReplicaAttachTimeout for replica diskless sync
* Ran dotnet formatter
* Added reset during replication tests
* Ran dotnet format
* Removed if debug
* if debug encompasses using statements
* Version bump

Co-authored-by: Tejas Kulkarni <[email protected]>
Co-authored-by: Vasileios Zois <[email protected]>
1 parent bf2618a commit d2d6944

File tree: 7 files changed, +214 -5 lines

Version.props

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 <Project>
   <!-- VersionPrefix property for builds and packages -->
   <PropertyGroup>
-    <VersionPrefix>1.0.83</VersionPrefix>
+    <VersionPrefix>1.0.84</VersionPrefix>
   </PropertyGroup>
 </Project>

libs/cluster/Server/ClusterManagerWorkerState.cs

Lines changed: 7 additions & 2 deletions

@@ -100,6 +100,10 @@ public ReadOnlySpan<byte> TryReset(bool soft, int expirySeconds = 60)
             try
             {
                 SuspendConfigMerge();
+
+                // Reset recovery operations before proceeding with reset
+                clusterProvider.replicationManager.ResetRecovery();
+
                 var resp = CmdStrings.RESP_OK;
                 while (true)
                 {
@@ -113,8 +117,9 @@ public ReadOnlySpan<byte> TryReset(bool soft, int expirySeconds = 60)
                     this.clusterConnectionStore.CloseAll();

                     var newNodeId = soft ? current.LocalNodeId : Generator.CreateHexId();
-                    var address = current.LocalNodeIp;
-                    var port = current.LocalNodePort;
+                    var endpoint = clusterProvider.storeWrapper.GetClusterEndpoint();
+                    var address = endpoint.Address.ToString();
+                    var port = endpoint.Port;

                     var configEpoch = soft ? current.LocalNodeConfigEpoch : 0;
                     var expiry = DateTimeOffset.UtcNow.Ticks + TimeSpan.FromSeconds(expirySeconds).Ticks;
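
The point of this hunk is ordering and endpoint resolution: any in-flight recovery is cancelled before the node identity, address, and config epoch are rebuilt, and the address/port are now taken from the endpoint the cluster actually listens on rather than cached node fields. A minimal sketch of that shape, using placeholder types rather than Garnet's ClusterManager:

// Minimal sketch (placeholder types, not Garnet's): recovery is cancelled first,
// then node identity is re-derived from the live cluster endpoint.
using System.Net;
using System.Threading;

class ResetOrderingSketch
{
    readonly CancellationTokenSource resetHandler = new();

    // Stands in for ReplicationManager.ResetRecovery(): wake any attach task that is
    // blocked so it can unwind before the reset rewrites the local config.
    void ResetRecovery() => resetHandler.Cancel();

    public (string address, int port) TryReset(IPEndPoint clusterEndpoint)
    {
        ResetRecovery();                                   // 1. cancel in-flight recovery first
        var address = clusterEndpoint.Address.ToString();  // 2. derive identity from the endpoint
        var port = clusterEndpoint.Port;                   //    the node actually listens on
        return (address, port);                            // 3. caller rebuilds the config with these
    }
}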

libs/cluster/Server/Replication/ReplicaOps/ReplicaDisklessSync.cs

Lines changed: 14 additions & 1 deletion

@@ -4,9 +4,11 @@
 using System;
 using System.Net;
 using System.Text;
+using System.Threading;
 using System.Threading.Tasks;
 using Garnet.client;
 using Garnet.cluster.Server.Replication;
+using Garnet.common;
 using Microsoft.Extensions.Logging;

 namespace Garnet.cluster
@@ -58,6 +60,7 @@ async Task<string> TryBeginReplicaSync(bool downgradeLock)
             var disklessSync = clusterProvider.serverOptions.ReplicaDisklessSync;
             var disableObjects = clusterProvider.serverOptions.DisableObjects;
             GarnetClientSession gcs = null;
+            resetHandler ??= new CancellationTokenSource();
             try
             {
                 if (!clusterProvider.serverOptions.EnableFastCommit)
@@ -119,7 +122,12 @@ async Task<string> TryBeginReplicaSync(bool downgradeLock)
                     currentReplicationOffset: ReplicationOffset,
                     checkpointEntry: checkpointEntry);

-                var resp = await gcs.ExecuteAttachSync(syncMetadata.ToByteArray()).ConfigureAwait(false);
+                using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ctsRepManager.Token, resetHandler.Token);
+
+                // Exception injection point for testing cluster reset during diskless replication
+                await ExceptionInjectionHelper.WaitOnSet(ExceptionInjectionType.Replication_InProgress_During_Diskless_Replica_Attach_Sync).WaitAsync(storeWrapper.serverOptions.ReplicaAttachTimeout, linkedCts.Token).ConfigureAwait(false);
+
+                var resp = await gcs.ExecuteAttachSync(syncMetadata.ToByteArray()).WaitAsync(storeWrapper.serverOptions.ReplicaAttachTimeout, linkedCts.Token).ConfigureAwait(false);
             }
             catch (Exception ex)
             {
@@ -144,6 +152,11 @@ async Task<string> TryBeginReplicaSync(bool downgradeLock)
                 }
                 gcs?.Dispose();
                 recvCheckpointHandler?.Dispose();
+                if (!resetHandler.TryReset())
+                {
+                    resetHandler.Dispose();
+                    resetHandler = new CancellationTokenSource();
+                }
             }
             return null;
         }
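
The attach is now bounded three ways: by the replication manager's lifetime token, by the new per-reset token, and by ReplicaAttachTimeout via Task.WaitAsync. A self-contained sketch of that cancellation shape, where AttachAsync stands in for ExecuteAttachSync and the 30-second timeout is only an example value:

// Sketch of the linked-cancellation pattern used above; AttachAsync is a stand-in
// for the real ExecuteAttachSync call, not Garnet code.
using System;
using System.Threading;
using System.Threading.Tasks;

class LinkedCancellationSketch
{
    // An attach that never completes on its own, so cancellation/timeout decide the outcome.
    static async Task<string> AttachAsync()
    {
        await Task.Delay(Timeout.Infinite);
        return "+OK";
    }

    static async Task Main()
    {
        using var managerCts = new CancellationTokenSource(); // lives as long as the replication manager
        using var resetCts = new CancellationTokenSource();   // cancelled when CLUSTER RESET is issued

        // Either source cancels the linked token; the attach also gives up after the timeout.
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(managerCts.Token, resetCts.Token);

        resetCts.CancelAfter(TimeSpan.FromMilliseconds(100));  // simulate a reset arriving mid-attach
        try
        {
            var resp = await AttachAsync()
                .WaitAsync(TimeSpan.FromSeconds(30), linkedCts.Token)  // mirrors ReplicaAttachTimeout
                .ConfigureAwait(false);
            Console.WriteLine(resp);
        }
        catch (OperationCanceledException)
        {
            Console.WriteLine("attach cancelled by reset");            // expected path in this sketch
        }
        catch (TimeoutException)
        {
            Console.WriteLine("attach timed out");
        }
    }
}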

libs/cluster/Server/Replication/ReplicaOps/ReplicaReceiveCheckpoint.cs

Lines changed: 13 additions & 1 deletion

@@ -6,9 +6,11 @@
 using System.IO;
 using System.Net;
 using System.Text;
+using System.Threading;
 using System.Threading.Tasks;
 using Garnet.client;
 using Garnet.cluster.Server.Replication;
+using Garnet.common;
 using Garnet.server;
 using Microsoft.Extensions.Logging;
 using Tsavorite.core;
@@ -72,6 +74,7 @@ async Task<string> ReplicaSyncAttachTask(bool downgradeLock)
         {
             Debug.Assert(IsRecovering);
             GarnetClientSession gcs = null;
+            resetHandler ??= new CancellationTokenSource();
             try
             {
                 // Immediately try to connect to a primary, so we FAIL
@@ -139,12 +142,16 @@ async Task<string> ReplicaSyncAttachTask(bool downgradeLock)
                 // 4. Replica responds with aofStartAddress sync
                 // 5. Primary will initiate aof sync task
                 // 6. Primary releases checkpoint
+                using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ctsRepManager.Token, resetHandler.Token);
+
+                // Exception injection point for testing cluster reset during disk-based replication
+                await ExceptionInjectionHelper.WaitOnSet(ExceptionInjectionType.Replication_InProgress_During_DiskBased_Replica_Attach_Sync).WaitAsync(storeWrapper.serverOptions.ReplicaAttachTimeout, linkedCts.Token).ConfigureAwait(false);
                 var resp = await gcs.ExecuteReplicaSync(
                     nodeId,
                     PrimaryReplId,
                     cEntry.ToByteArray(),
                     storeWrapper.appendOnlyFile.BeginAddress,
-                    storeWrapper.appendOnlyFile.TailAddress).WaitAsync(storeWrapper.serverOptions.ReplicaAttachTimeout, ctsRepManager.Token).ConfigureAwait(false);
+                    storeWrapper.appendOnlyFile.TailAddress).WaitAsync(storeWrapper.serverOptions.ReplicaAttachTimeout, linkedCts.Token).ConfigureAwait(false);
             }
             catch (Exception ex)
             {
@@ -167,6 +174,11 @@ async Task<string> ReplicaSyncAttachTask(bool downgradeLock)
                 }
                 recvCheckpointHandler?.Dispose();
                 gcs?.Dispose();
+                if (!resetHandler.TryReset())
+                {
+                    resetHandler.Dispose();
+                    resetHandler = new CancellationTokenSource();
+                }
             }
             return null;
         }
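
Both attach paths finish with the same cleanup: if the per-reset token fired during the attach, the source is spent and is replaced; otherwise CancellationTokenSource.TryReset rearms it for the next attempt. A small sketch of that reuse-or-replace pattern, assuming (as the code above does) that only one attach runs at a time:

// Sketch of the reuse-or-replace pattern for the per-reset CancellationTokenSource.
// TryReset (available since .NET 6) succeeds only if no cancellation was ever requested.
using System;
using System.Threading;

class ResetHandlerReuseSketch
{
    CancellationTokenSource resetHandler = new();

    // Runs in the finally of an attach attempt: keep the source if it was never fired,
    // otherwise replace it so the next attach starts with an armed token.
    public void FinishAttach()
    {
        if (!resetHandler.TryReset())
        {
            resetHandler.Dispose();
            resetHandler = new CancellationTokenSource();
        }
    }

    static void Main()
    {
        var s = new ResetHandlerReuseSketch();
        s.resetHandler.Cancel();          // simulate CLUSTER RESET arriving mid-attach
        s.FinishAttach();
        Console.WriteLine(s.resetHandler.IsCancellationRequested); // False: a fresh source was installed
    }
}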

libs/cluster/Server/Replication/ReplicationManager.cs

Lines changed: 17 additions & 0 deletions

@@ -25,6 +25,7 @@ internal sealed partial class ReplicationManager : IDisposable
         readonly CheckpointStore checkpointStore;
         readonly ReplicationSyncManager replicationSyncManager;
         readonly CancellationTokenSource ctsRepManager = new();
+        CancellationTokenSource resetHandler = new();

         readonly int pageSizeBits;

@@ -454,6 +455,20 @@ public void EndRecovery(RecoveryStatus nextRecoveryStatus, bool downgradeLock)
             }
         }

+        public void ResetRecovery()
+        {
+            switch (currentRecoveryStatus)
+            {
+                case RecoveryStatus.ClusterReplicate:
+                case RecoveryStatus.ClusterFailover:
+                case RecoveryStatus.ReplicaOfNoOne:
+                case RecoveryStatus.CheckpointRecoveredAtReplica:
+                case RecoveryStatus.InitializeRecover:
+                    resetHandler.Cancel();
+                    break;
+            }
+        }
+
         public void Dispose()
         {
             _disposed = true;
@@ -470,6 +485,8 @@ public void Dispose()
             replicaReplayTaskCts.Dispose();
             ctsRepManager.Cancel();
             ctsRepManager.Dispose();
+            resetHandler.Cancel();
+            resetHandler.Dispose();
             aofTaskStore.Dispose();
             aofProcessor?.Dispose();
             networkPool?.Dispose();
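
ResetRecovery cancels the handler only when the node is actually inside one of the listed recovery phases; an idle node leaves the token armed so a later attach can still be interrupted by a future reset. A condensed sketch of that guard with a placeholder status enum (the case labels mirror the hunk above; the surrounding class is not ReplicationManager):

// Condensed sketch of the ResetRecovery guard; the enum is a placeholder and the real
// RecoveryStatus in Garnet may contain additional members.
using System.Threading;

enum RecoveryStatus
{
    NoRecovery,
    ClusterReplicate,
    ClusterFailover,
    ReplicaOfNoOne,
    CheckpointRecoveredAtReplica,
    InitializeRecover
}

class ResetRecoverySketch
{
    readonly CancellationTokenSource resetHandler = new();
    RecoveryStatus currentRecoveryStatus = RecoveryStatus.ClusterReplicate;

    // Cancel only while a recovery phase is in flight; otherwise keep the token armed.
    public void ResetRecovery()
    {
        switch (currentRecoveryStatus)
        {
            case RecoveryStatus.ClusterReplicate:
            case RecoveryStatus.ClusterFailover:
            case RecoveryStatus.ReplicaOfNoOne:
            case RecoveryStatus.CheckpointRecoveredAtReplica:
            case RecoveryStatus.InitializeRecover:
                resetHandler.Cancel();
                break;
        }
    }
}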

libs/common/ExceptionInjectionType.cs

Lines changed: 8 additions & 0 deletions

@@ -53,5 +53,13 @@ public enum ExceptionInjectionType
         /// Delay response on receive checkpoint to trigger timeout
         /// </summary>
         Replication_Timeout_On_Receive_Checkpoint,
+        /// <summary>
+        /// Replication InProgress during disk-based replica attach sync operation
+        /// </summary>
+        Replication_InProgress_During_DiskBased_Replica_Attach_Sync,
+        /// <summary>
+        /// Replication InProgress during diskless replica attach sync operation
+        /// </summary>
+        Replication_InProgress_During_Diskless_Replica_Attach_Sync,
     }
 }
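
These two members feed the debug-only exception-injection mechanism: tests enable a named injection point and the attach paths await it, which is how the tests below hold a replica inside the attach long enough to issue CLUSTER RESET HARD. A generic sketch of such a gate follows; it is not Garnet's ExceptionInjectionHelper, whose implementation is not part of this diff:

// Generic sketch of a debug-only injection gate: tests flip a named switch and the code
// under test blocks at WaitOnSet until the switch is cleared or cancellation fires.
#if DEBUG
using System.Collections.Concurrent;
using System.Threading;
using System.Threading.Tasks;

static class InjectionGateSketch
{
    static readonly ConcurrentDictionary<string, bool> enabled = new();

    public static void Enable(string point) => enabled[point] = true;
    public static void Disable(string point) => enabled[point] = false;

    // Blocks while the named injection point is enabled; callers wrap this in
    // WaitAsync with a timeout and cancellation token, as the attach paths above do.
    public static async Task WaitOnSet(string point, CancellationToken token = default)
    {
        while (enabled.TryGetValue(point, out var on) && on)
            await Task.Delay(10, token);
    }
}
#endif
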
ClusterResetDuringReplicationTests (new test file)

Lines changed: 154 additions & 0 deletions

@@ -0,0 +1,154 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#if DEBUG
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using Garnet.common;
using Microsoft.Extensions.Logging;
using NUnit.Framework;
using NUnit.Framework.Legacy;

namespace Garnet.test.cluster.ReplicationTests
{
    /// <summary>
    /// These tests simulate scenarios where a replica gets stuck during replication attach and verify that
    /// CLUSTER RESET HARD can properly cancel ongoing operations and allow the replica to be reused.
    /// </summary>
    [NonParallelizable]
    public class ClusterResetDuringReplicationTests
    {
        ClusterTestContext context;

        readonly int createInstanceTimeout = (int)System.TimeSpan.FromSeconds(30).TotalSeconds;
        const int testTimeout = 120_000;

        readonly Dictionary<string, LogLevel> monitorTests = [];

        [SetUp]
        public void Setup()
        {
            context = new ClusterTestContext();
            context.Setup(monitorTests, testTimeoutSeconds: testTimeout);
        }

        [TearDown]
        public void TearDown()
        {
            context?.TearDown();
        }

        /// <summary>
        /// Test CLUSTER RESET HARD functionality during diskless replication attach.
        /// This test simulates a scenario where a replica gets stuck while attaching to a primary
        /// and verifies that CLUSTER RESET HARD can properly cancel the operation and reset the node.
        /// </summary>
        [Test, Order(1), CancelAfter(testTimeout)]
        [Category("REPLICATION")]
        public async Task ClusterResetHardDuringDisklessReplicationAttach(CancellationToken cancellationToken)
        {
            var primaryIndex = 0;
            var replicaIndex = 1;
            var nodes_count = 2;

            // Create instances with diskless sync enabled
            context.CreateInstances(nodes_count, disableObjects: false, enableAOF: true, enableDisklessSync: true, timeout: createInstanceTimeout);
            context.CreateConnection();

            // Setup primary
            _ = context.clusterTestUtils.AddDelSlotsRange(primaryIndex, [(0, 16383)], addslot: true, logger: context.logger);
            context.clusterTestUtils.SetConfigEpoch(primaryIndex, primaryIndex + 1, logger: context.logger);
            context.clusterTestUtils.SetConfigEpoch(replicaIndex, replicaIndex + 1, logger: context.logger);
            context.clusterTestUtils.Meet(primaryIndex, replicaIndex, logger: context.logger);

            // Ensure nodes know each other
            context.clusterTestUtils.WaitUntilNodeIsKnown(primaryIndex, replicaIndex, logger: context.logger);

            try
            {
                ExceptionInjectionHelper.EnableException(ExceptionInjectionType.Replication_InProgress_During_Diskless_Replica_Attach_Sync);

                var resp = context.clusterTestUtils.ClusterReplicate(replicaNodeIndex: replicaIndex, primaryNodeIndex: primaryIndex, failEx: false, async: true, logger: context.logger);

                await Task.Delay(1000, cancellationToken);

                // Verify that the replica is in a replicating state
                var replicationInfo = context.clusterTestUtils.GetReplicationInfo(replicaIndex, [ReplicationInfoItem.RECOVER_STATUS], logger: context.logger);
                ClassicAssert.AreEqual("ClusterReplicate", replicationInfo[0].Item2);

                // Issuing CLUSTER RESET HARD while replication is ongoing/stuck.
                var resetResp = context.clusterTestUtils.ClusterReset(replicaIndex, soft: false, expiry: 60, logger: context.logger);
                ClassicAssert.AreEqual("OK", resetResp);

                // Verify that the node is no longer in recovery state
                replicationInfo = context.clusterTestUtils.GetReplicationInfo(replicaIndex, [ReplicationInfoItem.RECOVER_STATUS], logger: context.logger);
                ClassicAssert.AreEqual("NoRecovery", replicationInfo[0].Item2);

                // Verify the node role is back to PRIMARY (default after reset)
                var role = context.clusterTestUtils.RoleCommand(replicaIndex, logger: context.logger);
                ClassicAssert.AreEqual("master", role.Value);
            }
            finally
            {
                ExceptionInjectionHelper.DisableException(ExceptionInjectionType.Replication_InProgress_During_Diskless_Replica_Attach_Sync);
            }
        }

        /// <summary>
        /// Test CLUSTER RESET HARD functionality during disk-based replication attach.
        /// This test simulates a scenario where a replica gets stuck while attaching to a primary
        /// and verifies that CLUSTER RESET HARD can properly cancel the operation and reset the node.
        /// </summary>
        [Test, Order(2), CancelAfter(testTimeout)]
        [Category("REPLICATION")]
        public async Task ClusterResetHardDuringDiskBasedReplicationAttach(CancellationToken cancellationToken)
        {
            var primaryIndex = 0;
            var replicaIndex = 1;
            var nodes_count = 2;

            // Create instances with diskless sync disabled
            context.CreateInstances(nodes_count, disableObjects: false, enableAOF: true, enableDisklessSync: false, timeout: createInstanceTimeout);
            context.CreateConnection();

            // Setup primary
            _ = context.clusterTestUtils.AddDelSlotsRange(primaryIndex, [(0, 16383)], addslot: true, logger: context.logger);
            context.clusterTestUtils.SetConfigEpoch(primaryIndex, primaryIndex + 1, logger: context.logger);
            context.clusterTestUtils.SetConfigEpoch(replicaIndex, replicaIndex + 1, logger: context.logger);
            context.clusterTestUtils.Meet(primaryIndex, replicaIndex, logger: context.logger);

            context.clusterTestUtils.WaitUntilNodeIsKnown(primaryIndex, replicaIndex, logger: context.logger);

            try
            {
                ExceptionInjectionHelper.EnableException(ExceptionInjectionType.Replication_InProgress_During_DiskBased_Replica_Attach_Sync);

                var resp = context.clusterTestUtils.ClusterReplicate(replicaNodeIndex: replicaIndex, primaryNodeIndex: primaryIndex, failEx: false, async: true, logger: context.logger);

                await Task.Delay(1000, cancellationToken);

                // Verify that the replica is in a replicating state
                var replicationInfo = context.clusterTestUtils.GetReplicationInfo(replicaIndex, [ReplicationInfoItem.RECOVER_STATUS], logger: context.logger);
                ClassicAssert.AreEqual("ClusterReplicate", replicationInfo[0].Item2);

                // Issuing CLUSTER RESET HARD while replication is ongoing/stuck.
                var resetResp = context.clusterTestUtils.ClusterReset(replicaIndex, soft: false, expiry: 60, logger: context.logger);
                ClassicAssert.AreEqual("OK", resetResp);

                // Verify that the node is no longer in recovery state
                replicationInfo = context.clusterTestUtils.GetReplicationInfo(replicaIndex, [ReplicationInfoItem.RECOVER_STATUS], logger: context.logger);
                ClassicAssert.AreEqual("NoRecovery", replicationInfo[0].Item2);

                // Verify the node role is back to PRIMARY (default after reset)
                var role = context.clusterTestUtils.RoleCommand(replicaIndex, logger: context.logger);
                ClassicAssert.AreEqual("master", role.Value);
            }
            finally
            {
                ExceptionInjectionHelper.DisableException(ExceptionInjectionType.Replication_InProgress_During_DiskBased_Replica_Attach_Sync);
            }
        }
    }
}
#endif
