1717import contextlib
1818import ccf .ledger
1919from reconfiguration import test_ledger_invariants
20+ import subprocess
2021
2122from loguru import logger as LOG
2223
@@ -711,6 +712,141 @@ def wait_for_new_view(node, original_view, timeout_multiplier):
711712 return network
712713
713714
715+ @reqs .supports_methods ("/app/log/public" )
716+ def test_recovery_elections (orig_network , args ):
717+ # Ensure we have 3 nodes
718+ original_size = orig_network .resize (3 , args )
719+
720+ old_primary , _ = orig_network .find_nodes ()
721+ with old_primary .client ("user0" ) as c :
722+ LOG .warning ("Writing some initial state" )
723+ for _ in range (300 ):
724+ r = c .post (
725+ "/app/log/public" ,
726+ {
727+ "id" : 42 ,
728+ "msg" : "Uninteresting recoverable transactions" ,
729+ },
730+ )
731+ assert r .status_code == 200 , r
732+
733+ r = c .get ("/node/network" )
734+ assert r .status_code == 200 , r
735+ previous_identity = orig_network .save_service_identity (args )
736+ c .wait_for_commit (
737+ orig_network .consortium .set_recovery_threshold (old_primary , 1 )
738+ )
739+ orig_network .stop_all_nodes (skip_verification = True )
740+ current_ledger_dir , committed_ledger_dirs = old_primary .get_ledger ()
741+
742+ # Create a recovery network, where we will manually take the recovery steps (transition to open and submit share)
743+ network = infra .network .Network (
744+ args .nodes ,
745+ args .binary_dir ,
746+ args .debug_nodes ,
747+ args .perf_nodes ,
748+ existing_network = orig_network ,
749+ )
750+ network .start_in_recovery (
751+ args ,
752+ ledger_dir = current_ledger_dir ,
753+ committed_ledger_dirs = committed_ledger_dirs ,
754+ )
755+ new_primary , new_backups = network .find_nodes ()
756+ network .consortium .transition_service_to_open (
757+ new_primary , previous_service_identity = previous_identity
758+ )
759+
760+ with new_primary .client ("user0" ) as c :
761+ previous_identity = network .save_service_identity (args )
762+
763+ member = network .consortium .get_active_recovery_participants ()[0 ]
764+
765+ # We need to delay a backup's private recovery process until:
766+ # - The primary has completed its private recovery, and fully opened the network
767+ # - The backup has called and won an election
768+ # So that the backup node _is primary_ at the point it completes private recovery.
769+ # We force the delay by injecting a delay into the file operations of the backup,
770+ # and force an election (after the primary has completed its recovery) by killing
771+ # the original primary node.
772+ backup = new_backups [0 ]
773+ LOG .info (f"Using strace to inject delays in file IO of { backup } " )
774+ assert not backup .remote .check_done ()
775+
776+ strace_command = [
777+ "strace" ,
778+ f"--attach={ backup .remote .remote .proc .pid } " ,
779+ "--inject=lseek:delay_exit=10s" ,
780+ "-tt" ,
781+ "--trace=lseek,read,open,openat" ,
782+ "--decode-fds=all" ,
783+ "--output=strace_output.txt" ,
784+ ]
785+ LOG .warning (f"About to run strace: { strace_command } " )
786+ strace_process = subprocess .Popen (
787+ strace_command ,
788+ stdout = subprocess .PIPE ,
789+ stderr = subprocess .PIPE ,
790+ )
791+
792+ member .get_and_submit_recovery_share (new_primary )
793+ network .recovery_count += 1
794+
795+ LOG .info ("Confirming that primary completes private recovery" )
796+ network .wait_for_state (
797+ new_primary ,
798+ infra .node .State .PART_OF_NETWORK .value ,
799+ timeout = 30 ,
800+ )
801+
802+ election_s = args .election_timeout_ms / 1000
803+ LOG .info (
804+ f"Holding backup stalled via strace for { election_s } , to trigger an election"
805+ )
806+ time .sleep (election_s )
807+
808+ LOG .info ("Ending strace, and terminating primary node" )
809+ strace_process .terminate ()
810+ strace_process .communicate ()
811+
812+ new_primary .stop ()
813+
814+ LOG .info (
815+ f"Give { backup } time to finish its recovery (including becoming primary), and confirm that it dies in the process"
816+ )
817+ time .sleep (election_s )
818+ # The result of all of that is that this node, which had become primary while it
819+ # completed its private recovery, crashed at the end of recovery (rather than)
820+ # producing an invalid ledger)
821+ assert backup .remote .check_done ()
822+
823+ network .ignore_errors_on_shutdown ()
824+ network .stop_all_nodes (skip_verification = True )
825+ current_ledger_dir , committed_ledger_dirs = backup .get_ledger ()
826+
827+ LOG .info (
828+ "Trying a further recovery, to confirm that the ledger is in a recoverable state"
829+ )
830+ recovery_network = infra .network .Network (
831+ args .nodes ,
832+ args .binary_dir ,
833+ args .debug_nodes ,
834+ args .perf_nodes ,
835+ existing_network = network ,
836+ )
837+ recovery_network .start_in_recovery (
838+ args ,
839+ ledger_dir = current_ledger_dir ,
840+ committed_ledger_dirs = committed_ledger_dirs ,
841+ )
842+ recovery_network .recover (args )
843+
844+ # Restore original network size
845+ recovery_network .resize (original_size , args )
846+
847+ return recovery_network
848+
849+
714850def run (args ):
715851 txs = app .LoggingTxs ("user0" )
716852
@@ -737,6 +873,7 @@ def run(args):
737873 # HTTP2 doesn't support forwarding
738874 if not args .http2 :
739875 test_session_consistency (network , args )
876+ network = test_recovery_elections (network , args )
740877 test_ledger_invariants (network , args )
741878
742879
0 commit comments