Ensure that end-of-private-recovery service open transaction is written at-most-once (#6926)

eddyashton · achamayou · web-flow · commit 7b64e99a0157 · 2025-03-28T18:56:41.000Z
Co-authored-by: Amaury Chamayou <amchamay@microsoft.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ## [6.0.0]
 
-[6.0.0]: https://github.com/microsoft/CCF/releases/tag/6.0.0
+[6.0.0]: https://github.com/microsoft/CCF/releases/tag/ccf-6.0.0
 
 ### Important
 
@@ -134,6 +134,7 @@ It also adds COSE_Sign1 ledger signatures, to support the generation of COSE rec
 - Fixed thread-safety issues when CCF nodes attempted to contact non-TLS servers. This previously could cause errors when running SNP builds with multiple worker threads (#6836).
 - SNP nodes will no longer crash when run on firmware returning v3 attestations (#6841).
 - Fixed potential races in indexing interfaces and `LazyStrategy` (#6886).
+- Fixed a bug which could produce an invalid secret chain (containing duplicate ledger secret sealing entries) in the ledger if an election occurred during private recovery (#6926).
 
 ### Removed
 
diff --git a/scripts/setup-ci.sh b/scripts/setup-ci.sh
@@ -59,6 +59,7 @@ chmod +x /opt/pebble/pebble_linux-amd64 /opt/pebble/pebble-challtestsrv_linux-am
 
 # partitions test
 tdnf -y install iptables
+tdnf -y install strace
 
 # For packaging
 tdnf -y install rpm-build
diff --git a/src/node/node_state.h b/src/node/node_state.h
@@ -1216,7 +1216,10 @@ namespace ccf
       {
         auto entry = ::consensus::LedgerEnclave::get_entry(data, size);
 
-        LOG_INFO_FMT("Deserialising private ledger entry [{}]", entry.size());
+        LOG_INFO_FMT(
+          "Deserialising private ledger entry {} [{}]",
+          last_recovered_idx + 1,
+          entry.size());
 
         // When reading the private ledger, deserialise in the recovery store
         ccf::kv::ApplyResult result = ccf::kv::ApplyResult::FAIL;
@@ -1309,6 +1312,29 @@ namespace ccf
 
         auto tx = network.tables->create_tx();
 
+        {
+          // Ensure this transition happens at-most-once, by checking that no
+          // other node has already advanced the state
+          auto service = tx.ro<ccf::Service>(Tables::SERVICE);
+          auto active_service = service->get();
+
+          if (!active_service.has_value())
+          {
+            throw std::logic_error(fmt::format(
+              "Error in {}: no value in {}", __func__, Tables::SERVICE));
+          }
+
+          if (
+            active_service->status !=
+            ServiceStatus::WAITING_FOR_RECOVERY_SHARES)
+          {
+            throw std::logic_error(fmt::format(
+              "Error in {}: current service status is {}",
+              __func__,
+              active_service->status));
+          }
+        }
+
         // Clear recovery shares that were submitted to initiate the recovery
         // procedure
         ShareManager::clear_submitted_recovery_shares(tx);
diff --git a/tests/infra/network.py b/tests/infra/network.py
@@ -1698,7 +1698,7 @@ def save_service_identity(self, args):
         with open(previous_identity, "w", encoding="utf-8") as f:
             f.write(current_ident)
         args.previous_service_identity_file = previous_identity
-        return args
+        return current_ident
 
     def identity(self, name=None):
         if name is not None:
diff --git a/tests/partitions_test.py b/tests/partitions_test.py
@@ -17,6 +17,7 @@
 import contextlib
 import ccf.ledger
 from reconfiguration import test_ledger_invariants
+import subprocess
 
 from loguru import logger as LOG
 
@@ -711,6 +712,141 @@ def wait_for_new_view(node, original_view, timeout_multiplier):
     return network
 
 
+@reqs.supports_methods("/app/log/public")
+def test_recovery_elections(orig_network, args):
+    # Ensure we have 3 nodes
+    original_size = orig_network.resize(3, args)
+
+    old_primary, _ = orig_network.find_nodes()
+    with old_primary.client("user0") as c:
+        LOG.warning("Writing some initial state")
+        for _ in range(300):
+            r = c.post(
+                "/app/log/public",
+                {
+                    "id": 42,
+                    "msg": "Uninteresting recoverable transactions",
+                },
+            )
+            assert r.status_code == 200, r
+
+        r = c.get("/node/network")
+        assert r.status_code == 200, r
+        previous_identity = orig_network.save_service_identity(args)
+        c.wait_for_commit(
+            orig_network.consortium.set_recovery_threshold(old_primary, 1)
+        )
+    orig_network.stop_all_nodes(skip_verification=True)
+    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()
+
+    # Create a recovery network, where we will manually take the recovery steps (transition to open and submit share)
+    network = infra.network.Network(
+        args.nodes,
+        args.binary_dir,
+        args.debug_nodes,
+        args.perf_nodes,
+        existing_network=orig_network,
+    )
+    network.start_in_recovery(
+        args,
+        ledger_dir=current_ledger_dir,
+        committed_ledger_dirs=committed_ledger_dirs,
+    )
+    new_primary, new_backups = network.find_nodes()
+    network.consortium.transition_service_to_open(
+        new_primary, previous_service_identity=previous_identity
+    )
+
+    with new_primary.client("user0") as c:
+        previous_identity = network.save_service_identity(args)
+
+    member = network.consortium.get_active_recovery_participants()[0]
+
+    # We need to delay a backup's private recovery process until:
+    # - The primary has completed its private recovery, and fully opened the network
+    # - The backup has called and won an election
+    # So that the backup node _is primary_ at the point it completes private recovery.
+    # We force the delay by injecting a delay into the file operations of the backup,
+    # and force an election (after the primary has completed its recovery) by killing
+    # the original primary node.
+    backup = new_backups[0]
+    LOG.info(f"Using strace to inject delays in file IO of {backup}")
+    assert not backup.remote.check_done()
+
+    strace_command = [
+        "strace",
+        f"--attach={backup.remote.remote.proc.pid}",
+        "--inject=lseek:delay_exit=10s",
+        "-tt",
+        "--trace=lseek,read,open,openat",
+        "--decode-fds=all",
+        "--output=strace_output.txt",
+    ]
+    LOG.warning(f"About to run strace: {strace_command}")
+    strace_process = subprocess.Popen(
+        strace_command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+
+    member.get_and_submit_recovery_share(new_primary)
+    network.recovery_count += 1
+
+    LOG.info("Confirming that primary completes private recovery")
+    network.wait_for_state(
+        new_primary,
+        infra.node.State.PART_OF_NETWORK.value,
+        timeout=30,
+    )
+
+    election_s = args.election_timeout_ms / 1000
+    LOG.info(
+        f"Holding backup stalled via strace for {election_s}, to trigger an election"
+    )
+    time.sleep(election_s)
+
+    LOG.info("Ending strace, and terminating primary node")
+    strace_process.terminate()
+    strace_process.communicate()
+
+    new_primary.stop()
+
+    LOG.info(
+        f"Give {backup} time to finish its recovery (including becoming primary), and confirm that it dies in the process"
+    )
+    time.sleep(election_s)
+    # The result of all of that is that this node, which had become primary while it
+    # completed its private recovery, crashed at the end of recovery (rather than
+    # producing an invalid ledger)
+    assert backup.remote.check_done()
+
+    network.ignore_errors_on_shutdown()
+    network.stop_all_nodes(skip_verification=True)
+    current_ledger_dir, committed_ledger_dirs = backup.get_ledger()
+
+    LOG.info(
+        "Trying a further recovery, to confirm that the ledger is in a recoverable state"
+    )
+    recovery_network = infra.network.Network(
+        args.nodes,
+        args.binary_dir,
+        args.debug_nodes,
+        args.perf_nodes,
+        existing_network=network,
+    )
+    recovery_network.start_in_recovery(
+        args,
+        ledger_dir=current_ledger_dir,
+        committed_ledger_dirs=committed_ledger_dirs,
+    )
+    recovery_network.recover(args)
+
+    # Restore original network size
+    recovery_network.resize(original_size, args)
+
+    return recovery_network
+
+
 def run(args):
     txs = app.LoggingTxs("user0")
 
@@ -737,6 +873,7 @@ def run(args):
         # HTTP2 doesn't support forwarding
         if not args.http2:
             test_session_consistency(network, args)
+        network = test_recovery_elections(network, args)
         test_ledger_invariants(network, args)