Skip to content

Commit d2d56a6

Browse files
committed
[consensus] fix deadlock during recovery (#24292)
## Description #24024 introduced a potential deadlock when recovering consensus. Possible sequence of events: 1. Thread A: acquires `dag_state` read lock for the duration of the function call in `recover_blocks_after_round(observer.dag_state.read().gc_round())` 2. Thread B: `CommitFinalizer::run()` tries to acquire the `dag_state` write lock, and is blocked on the reader lock from (1) 3. Thread A: `recover_and_vote_on_blocks()` tries to acquire read lock on `dag_state` again. This usually succeeds but in this case because of (2), this is blocked, causing a deadlock. Also: - Run the last step of `TransactionCertifier` recovery in a blocking thread. - Improve logging. ## Test plan CI The affected validator is fixed.
1 parent 78afac0 commit d2d56a6

File tree

4 files changed

+28
-9
lines changed

4 files changed

+28
-9
lines changed

consensus/core/src/commit_observer.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,16 @@ impl CommitObserver {
7777
// Recover blocks needed for future commits (and block proposals).
7878
// Some blocks might have been recovered as committed blocks in recover_and_send_commits().
7979
// They will just be ignored.
80-
observer
81-
.transaction_certifier
82-
.recover_blocks_after_round(observer.dag_state.read().gc_round());
80+
tokio::runtime::Handle::current()
81+
.spawn_blocking({
82+
let transaction_certifier = observer.transaction_certifier.clone();
83+
let gc_round = observer.dag_state.read().gc_round();
84+
move || {
85+
transaction_certifier.recover_blocks_after_round(gc_round);
86+
}
87+
})
88+
.await
89+
.expect("Spawn blocking should not fail");
8390

8491
observer
8592
}

consensus/core/src/transaction_certifier.rs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ impl TransactionCertifier {
100100
.scan_blocks_by_author(authority_index, recovery_start_round)
101101
.unwrap();
102102
info!(
103-
"Recovering and voting on {} blocks for authority {} {}",
103+
"Recovered and voting on {} blocks from authority {} {}",
104104
blocks.len(),
105105
authority_index,
106106
context.committee.authority(authority_index).hostname
@@ -116,11 +116,21 @@ impl TransactionCertifier {
116116
/// Because own votes on blocks are not stored, input blocks are voted on if they can be
117117
/// included in a future proposed block.
118118
pub(crate) fn recover_and_vote_on_blocks(&self, blocks: Vec<VerifiedBlock>) {
119-
let dag_state = self.dag_state.read();
119+
let (gc_round, hard_linked) = {
120+
let dag_state = self.dag_state.read();
121+
(
122+
dag_state.gc_round(),
123+
blocks
124+
.iter()
125+
.map(|b| dag_state.is_hard_linked(&b.reference()))
126+
.collect::<Vec<_>>(),
127+
)
128+
};
120129
let voted_blocks = blocks
121130
.into_iter()
122-
.map(|b| {
123-
if b.round() <= dag_state.gc_round() || dag_state.is_hard_linked(&b.reference()) {
131+
.zip(hard_linked)
132+
.map(|(b, linked)| {
133+
if b.round() <= gc_round || linked {
124134
// Voting is unnecessary for blocks already included in own proposed blocks,
125135
// or outside of local DAG GC bound.
126136
(b, vec![])

crates/sui-core/src/authority_server.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1441,7 +1441,7 @@ impl ValidatorService {
14411441
Ok((tonic::Response::new(response), Weight::zero()))
14421442
}
14431443

1444-
#[instrument(name= "ValidatorService::wait_for_effects_response", level = "error", skip_all, err(level = "debug"), fields(consensus_position = ?request.consensus_position, fast_path_effects = tracing::field::Empty))]
1444+
#[instrument(name= "ValidatorService::wait_for_effects_response", level = "error", skip_all, fields(consensus_position = ?request.consensus_position, fast_path_effects = tracing::field::Empty))]
14451445
async fn wait_for_effects_response(
14461446
&self,
14471447
request: WaitForEffectsRequest,

crates/sui-core/src/consensus_validator.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ impl SuiTxValidator {
176176

177177
let tx_digest = *tx.digest();
178178
if let Err(error) = self.vote_transaction(&epoch_store, tx) {
179-
debug!(?tx_digest, "Transaction rejected during voting: {error}");
179+
debug!(?tx_digest, "Voting to reject transaction: {error}");
180180
self.metrics
181181
.transaction_reject_votes
182182
.with_label_values(&[error.to_variant_name()])
@@ -191,6 +191,8 @@ impl SuiTxValidator {
191191
},
192192
&error,
193193
);
194+
} else {
195+
debug!(?tx_digest, "Voting to accept transaction");
194196
}
195197
}
196198

0 commit comments

Comments
 (0)