1+ // Copyright (c) Microsoft Corporation.
2+ // Licensed under the MIT license.
3+
4+ #if DEBUG
5+ using System . Collections . Generic ;
6+ using System . Threading ;
7+ using System . Threading . Tasks ;
8+ using Garnet . common ;
9+ using Microsoft . Extensions . Logging ;
10+ using NUnit . Framework ;
11+ using NUnit . Framework . Legacy ;
12+
namespace Garnet.test.cluster.ReplicationTests
{
    /// <summary>
    /// These tests simulate scenarios where a replica gets stuck or is in replication attach and verify that
    /// CLUSTER RESET HARD can properly cancel ongoing operations and allow the replica to be reused.
    /// </summary>
    [NonParallelizable]
    public class ClusterResetDuringReplicationTests
    {
        ClusterTestContext context;

        // Value handed to CreateInstances as its "timeout" argument; evaluates to 30.
        readonly int createInstanceTimeout = (int)System.TimeSpan.FromSeconds(30).TotalSeconds;

        // Per-test budget expressed in seconds; this is what the testTimeoutSeconds argument of Setup receives.
        const int testTimeoutSeconds = 120;

        // Same budget in milliseconds, the unit NUnit's CancelAfter attribute expects.
        const int testTimeout = testTimeoutSeconds * 1000;

        // Fixed pause between issuing CLUSTER REPLICATE and inspecting the replica's recovery status.
        const int attachSettleDelayMs = 1000;

        readonly Dictionary<string, LogLevel> monitorTests = [];

        /// <summary>
        /// Creates a fresh cluster test context before every test.
        /// </summary>
        [SetUp]
        public void Setup()
        {
            context = new ClusterTestContext();
            // Pass seconds here: the parameter is named testTimeoutSeconds, whereas testTimeout is in milliseconds.
            context.Setup(monitorTests, testTimeoutSeconds: testTimeoutSeconds);
        }

        /// <summary>
        /// Tears down the cluster test context after every test (no-op if Setup never created one).
        /// </summary>
        [TearDown]
        public void TearDown()
        {
            context?.TearDown();
        }

        /// <summary>
        /// Test CLUSTER RESET HARD functionality during diskless replication attach.
        /// This test simulates a scenario where a replica gets stuck while attaching to a primary
        /// and verifies that CLUSTER RESET HARD can properly cancel the operation and reset the node.
        /// </summary>
        /// <param name="cancellationToken">Token supplied by NUnit's CancelAfter; signalled when the test budget elapses.</param>
        [Test, Order(1), CancelAfter(testTimeout)]
        [Category("REPLICATION")]
        public async Task ClusterResetHardDuringDisklessReplicationAttach(CancellationToken cancellationToken)
        {
            await ResetHardWhileReplicaAttachInProgress(
                enableDisklessSync: true,
                ExceptionInjectionType.Replication_InProgress_During_Diskless_Replica_Attach_Sync,
                cancellationToken);
        }

        /// <summary>
        /// Test CLUSTER RESET HARD functionality during diskbased replication attach.
        /// This test simulates a scenario where a replica gets stuck while attaching to a primary
        /// and verifies that CLUSTER RESET HARD can properly cancel the operation and reset the node.
        /// </summary>
        /// <param name="cancellationToken">Token supplied by NUnit's CancelAfter; signalled when the test budget elapses.</param>
        [Test, Order(2), CancelAfter(testTimeout)]
        [Category("REPLICATION")]
        public async Task ClusterResetHardDuringDiskBasedReplicationAttach(CancellationToken cancellationToken)
        {
            await ResetHardWhileReplicaAttachInProgress(
                enableDisklessSync: false,
                ExceptionInjectionType.Replication_InProgress_During_DiskBased_Replica_Attach_Sync,
                cancellationToken);
        }

        /// <summary>
        /// Shared scenario for both tests: builds a two-node cluster, starts CLUSTER REPLICATE on the replica
        /// with the given exception injection enabled, then issues CLUSTER RESET HARD and checks the node state.
        /// </summary>
        /// <param name="enableDisklessSync">Forwarded to CreateInstances to select diskless or disk-based sync.</param>
        /// <param name="attachInjection">
        /// Injection point enabled for the duration of the scenario and always disabled again afterwards.
        /// NOTE(review): judging by its name it keeps the attach in progress; confirm against the injection site.
        /// </param>
        /// <param name="cancellationToken">Cancels the settle delay when the test budget elapses.</param>
        async Task ResetHardWhileReplicaAttachInProgress(
            bool enableDisklessSync,
            ExceptionInjectionType attachInjection,
            CancellationToken cancellationToken)
        {
            const int primaryIndex = 0;
            const int replicaIndex = 1;
            const int nodeCount = 2;

            var utils = default(object);
            _ = utils;

            // Create instances with the requested sync mode
            context.CreateInstances(nodeCount, disableObjects: false, enableAOF: true, enableDisklessSync: enableDisklessSync, timeout: createInstanceTimeout);
            context.CreateConnection();

            // Setup primary: it owns the full slot range, and each node gets a distinct config epoch
            _ = context.clusterTestUtils.AddDelSlotsRange(primaryIndex, [(0, 16383)], addslot: true, logger: context.logger);
            context.clusterTestUtils.SetConfigEpoch(primaryIndex, primaryIndex + 1, logger: context.logger);
            context.clusterTestUtils.SetConfigEpoch(replicaIndex, replicaIndex + 1, logger: context.logger);
            context.clusterTestUtils.Meet(primaryIndex, replicaIndex, logger: context.logger);

            // Ensure nodes know each other
            context.clusterTestUtils.WaitUntilNodeIsKnown(primaryIndex, replicaIndex, logger: context.logger);

            try
            {
                ExceptionInjectionHelper.EnableException(attachInjection);

                // Fire-and-forget attach: async mode, and failures are not raised (failEx: false).
                _ = context.clusterTestUtils.ClusterReplicate(replicaNodeIndex: replicaIndex, primaryNodeIndex: primaryIndex, failEx: false, async: true, logger: context.logger);

                // Give the replica time to enter the attach path before inspecting it.
                await Task.Delay(attachSettleDelayMs, cancellationToken);

                // Verify that the replica is in a replicating state
                var replicationInfo = context.clusterTestUtils.GetReplicationInfo(replicaIndex, [ReplicationInfoItem.RECOVER_STATUS], logger: context.logger);
                ClassicAssert.AreEqual("ClusterReplicate", replicationInfo[0].Item2);

                // Issuing CLUSTER RESET HARD while replication is ongoing/stuck.
                var resetResp = context.clusterTestUtils.ClusterReset(replicaIndex, soft: false, expiry: 60, logger: context.logger);
                ClassicAssert.AreEqual("OK", resetResp);

                // Verify that the node is no longer in recovery state
                replicationInfo = context.clusterTestUtils.GetReplicationInfo(replicaIndex, [ReplicationInfoItem.RECOVER_STATUS], logger: context.logger);
                ClassicAssert.AreEqual("NoRecovery", replicationInfo[0].Item2);

                // Verify the node role is back to PRIMARY (default after reset)
                var role = context.clusterTestUtils.RoleCommand(replicaIndex, logger: context.logger);
                ClassicAssert.AreEqual("master", role.Value);
            }
            finally
            {
                // Always clear the injection so it cannot leak into later tests.
                ExceptionInjectionHelper.DisableException(attachInjection);
            }
        }
    }
}
154+ #endif
0 commit comments