Skip to content

Commit 6146294

Browse files
akichidismwtian
andcommitted
[MFP] Refactor the use of scoring - use latencies instead (#23741)
## Description The current scoring mechanism is working well but it is less intuitive and it currently contains as well unnecessary calculations. An alternative is to use instead the calculated average latencies (end to end) and expose those as the "scores". The reliability metric will now be used to penalise the latency with the max latency been chosen the `10s` - anything above that doesn't have much meaning anyways. I've also steered away from re-using the "score" term so it becomes clear end to end what we are measuring here. ## Test plan CI/PT --- ## Release notes Check each box that your changes affect. If none of the boxes relate to your changes, release notes aren't required. For each box you select, include information after the relevant heading that describes the impact of your changes that a user might notice and any actions they must take to implement updates. - [ ] Protocol: - [ ] Nodes (Validators and Full nodes): - [ ] gRPC: - [ ] JSON-RPC: - [ ] GraphQL: - [ ] CLI: - [ ] Rust SDK: --------- Co-authored-by: Mingwei Tian <[email protected]>
1 parent 7f1b811 commit 6146294

File tree

8 files changed

+317
-376
lines changed

8 files changed

+317
-376
lines changed

crates/sui-config/src/validator_client_monitor_config.rs

Lines changed: 7 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -113,12 +113,6 @@ pub struct ValidatorClientMonitorConfig {
113113
#[serde(default = "default_health_check_timeout")]
114114
pub health_check_timeout: Duration,
115115

116-
/// Weight configuration for score calculation.
117-
///
118-
/// Determines how different factors contribute to validator selection.
119-
#[serde(default)]
120-
pub score_weights: ScoreWeights,
121-
122116
/// Cooldown period after failures before considering a validator again.
123117
///
124118
/// Should be long enough to allow transient issues to resolve,
@@ -132,85 +126,29 @@ pub struct ValidatorClientMonitorConfig {
132126
/// Higher values are more tolerant of intermittent issues.
133127
#[serde(default = "default_max_consecutive_failures")]
134128
pub max_consecutive_failures: u32,
135-
}
136129

137-
/// Weights for different factors in score calculation
138-
#[derive(Debug, Clone, Serialize, Deserialize)]
139-
#[serde(rename_all = "kebab-case")]
140-
pub struct ScoreWeights {
141-
/// Weight for latency (lower is better).
130+
/// Weight for reliability.
142131
///
143-
/// This is the overall weight for all latency scores combined.
144-
/// Individual operation latencies are weighted separately below.
145-
#[serde(default = "default_latency_weight")]
146-
pub latency: f64,
147-
148-
/// Weight for success rate.
149-
///
150-
/// Higher values prioritize reliability over performance.
132+
/// Controls importance of reliability when adjusting the validator's latency for transaction submission
133+
/// selection. The higher the weight, the more penalty is given to unreliable validators.
134+
/// Default to 2.0. Value should be positive.
151135
#[serde(default = "default_reliability_weight")]
152-
pub reliability: f64,
153-
154-
/// Weight for submit transaction latency.
155-
///
156-
/// Controls importance of transaction submission speed.
157-
#[serde(default = "default_submit_latency_weight")]
158-
pub submit_latency_weight: f64,
159-
160-
/// Weight for effects retrieval latency.
161-
///
162-
/// Controls importance of effects query speed.
163-
/// Often the most critical operation for application responsiveness.
164-
#[serde(default = "default_effects_latency_weight")]
165-
pub effects_latency_weight: f64,
166-
167-
/// Weight for health check latency.
168-
///
169-
/// Usually less critical than actual operations.
170-
#[serde(default = "default_health_check_latency_weight")]
171-
pub health_check_latency_weight: f64,
172-
173-
/// Weight for fast path latency.
174-
///
175-
/// Controls importance of finalization speed.
176-
#[serde(default = "default_fast_path_latency_weight")]
177-
pub fast_path_latency_weight: f64,
178-
179-
/// Weight for consensus latency.
180-
///
181-
/// Controls importance of consensus speed.
182-
#[serde(default = "default_consensus_latency_weight")]
183-
pub consensus_latency_weight: f64,
136+
pub reliability_weight: f64,
184137
}
185138

186139
impl Default for ValidatorClientMonitorConfig {
187140
fn default() -> Self {
188141
Self {
189142
health_check_interval: default_health_check_interval(),
190143
health_check_timeout: default_health_check_timeout(),
191-
score_weights: ScoreWeights::default(),
192144
failure_cooldown: default_failure_cooldown(),
193145
max_consecutive_failures: default_max_consecutive_failures(),
194-
}
195-
}
196-
}
197-
198-
impl Default for ScoreWeights {
199-
fn default() -> Self {
200-
Self {
201-
latency: default_latency_weight(),
202-
reliability: default_reliability_weight(),
203-
submit_latency_weight: default_submit_latency_weight(),
204-
effects_latency_weight: default_effects_latency_weight(),
205-
health_check_latency_weight: default_health_check_latency_weight(),
206-
fast_path_latency_weight: default_fast_path_latency_weight(),
207-
consensus_latency_weight: default_consensus_latency_weight(),
146+
reliability_weight: default_reliability_weight(),
208147
}
209148
}
210149
}
211150

212151
// Default value functions
213-
214152
fn default_health_check_interval() -> Duration {
215153
Duration::from_secs(10)
216154
}
@@ -227,30 +165,6 @@ fn default_max_consecutive_failures() -> u32 {
227165
100
228166
}
229167

230-
fn default_latency_weight() -> f64 {
231-
0.9
232-
}
233-
234168
fn default_reliability_weight() -> f64 {
235-
0.1
236-
}
237-
238-
fn default_submit_latency_weight() -> f64 {
239-
0.0
240-
}
241-
242-
fn default_effects_latency_weight() -> f64 {
243-
0.0
244-
}
245-
246-
fn default_health_check_latency_weight() -> f64 {
247-
0.0
248-
}
249-
250-
fn default_fast_path_latency_weight() -> f64 {
251-
1.0
252-
}
253-
254-
fn default_consensus_latency_weight() -> f64 {
255-
1.0
169+
2.0
256170
}

crates/sui-core/src/transaction_driver/metrics.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ pub struct TransactionDriverMetrics {
1818
pub(crate) settlement_finality_latency: HistogramVec,
1919
pub(crate) total_transactions_submitted: IntCounterVec,
2020
pub(crate) submit_transaction_retries: Histogram,
21-
pub(crate) submit_transaction_latency: Histogram,
21+
pub(crate) submit_transaction_latency: HistogramVec,
2222
pub(crate) validator_submit_transaction_errors: IntCounterVec,
2323
pub(crate) validator_submit_transaction_successes: IntCounterVec,
2424
pub(crate) executed_transactions: IntCounter,
@@ -60,11 +60,12 @@ impl TransactionDriverMetrics {
6060
registry,
6161
)
6262
.unwrap(),
63-
submit_transaction_latency: register_histogram_with_registry!(
63+
submit_transaction_latency: register_histogram_vec_with_registry!(
6464
"transaction_driver_submit_transaction_latency",
6565
"Time in seconds to successfully submit a transaction to a validator.\n\
6666
Includes all retries and measures from the start of submission\n\
6767
until a validator accepts the transaction.",
68+
&["validator", "tx_type", "ping"],
6869
mysten_metrics::LATENCY_SEC_BUCKETS.to_vec(),
6970
registry,
7071
)

crates/sui-core/src/transaction_driver/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ where
121121
async fn run_latency_checks(self: Arc<Self>) {
122122
const INTERVAL_BETWEEN_RUNS: Duration = Duration::from_secs(15);
123123
const MAX_JITTER: Duration = Duration::from_secs(10);
124-
const PING_REQUEST_TIMEOUT: Duration = Duration::from_millis(5_000);
124+
const PING_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
125125

126126
let mut interval = interval(INTERVAL_BETWEEN_RUNS);
127127
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);

crates/sui-core/src/transaction_driver/transaction_submitter.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,10 @@ impl TransactionSubmitter {
144144
.submit_transaction_retries
145145
.observe(retries as f64);
146146
let elapsed = start_time.elapsed().as_secs_f64();
147-
self.metrics.submit_transaction_latency.observe(elapsed);
147+
self.metrics
148+
.submit_transaction_latency
149+
.with_label_values(&[&display_name, tx_type.as_str(), ping_label])
150+
.observe(elapsed);
148151

149152
return Ok((name, result));
150153
}

crates/sui-core/src/validator_client_monitor/metrics.rs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@ pub struct ValidatorClientMetrics {
2222
/// Failure count per validator and operation type
2323
pub operation_failure: IntCounterVec,
2424

25-
/// Current performance score per validator
26-
pub performance_score: GaugeVec,
25+
/// Current performance per validator. The performance is the average latency of the validator
26+
/// weighted by the reliability of the validator.
27+
pub performance: GaugeVec,
2728

2829
/// Consecutive failures per validator
2930
pub consecutive_failures: IntGaugeVec,
@@ -57,9 +58,10 @@ impl ValidatorClientMetrics {
5758
)
5859
.unwrap(),
5960

60-
performance_score: register_gauge_vec_with_registry!(
61-
"validator_client_observed_score",
62-
"Current client-observed score per validator",
61+
performance: register_gauge_vec_with_registry!(
62+
"validator_client_observed_performance",
63+
"Current client-observed performance per validator. The performance is the average latency of the validator
64+
weighted by the reliability of the validator.",
6365
&["validator", "tx_type"],
6466
registry,
6567
)

crates/sui-core/src/validator_client_monitor/monitor.rs

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ pub struct ValidatorClientMonitor<A: Clone> {
3939
metrics: Arc<ValidatorClientMetrics>,
4040
client_stats: RwLock<ClientObservedStats>,
4141
authority_aggregator: Arc<ArcSwap<AuthorityAggregator<A>>>,
42-
cached_scores: RwLock<HashMap<TxType, HashMap<AuthorityName, f64>>>,
42+
cached_latencies: RwLock<HashMap<TxType, HashMap<AuthorityName, f64>>>,
4343
}
4444

4545
impl<A> ValidatorClientMonitor<A>
@@ -61,7 +61,7 @@ where
6161
metrics,
6262
client_stats: RwLock::new(ClientObservedStats::new(config)),
6363
authority_aggregator,
64-
cached_scores: RwLock::new(HashMap::new()),
64+
cached_latencies: RwLock::new(HashMap::new()),
6565
});
6666

6767
let monitor_clone = monitor.clone();
@@ -155,41 +155,42 @@ where
155155
}
156156
}
157157

158-
self.update_cached_scores(&authority_agg);
158+
self.update_cached_latencies(&authority_agg);
159159
}
160160
}
161161
}
162162

163163
impl<A: Clone> ValidatorClientMonitor<A> {
164-
/// Calculate and cache scores for all validators.
164+
/// Calculate and cache latencies for all validators.
165165
///
166166
/// This method is called periodically after health checks complete to update
167-
/// the cached validator scores.
168-
fn update_cached_scores(&self, authority_agg: &AuthorityAggregator<A>) {
167+
/// the cached validator latencies. Those are the end to end latencies as calculated for each validator
168+
/// taking into account the reliability of the validator.
169+
fn update_cached_latencies(&self, authority_agg: &AuthorityAggregator<A>) {
169170
let committee = &authority_agg.committee;
170-
let mut cached_scores = self.cached_scores.write();
171+
let mut cached_latencies = self.cached_latencies.write();
171172

172173
for tx_type in TxType::iter() {
173-
let score_map = self
174+
let latencies_map = self
174175
.client_stats
175176
.read()
176177
.get_all_validator_stats(committee, tx_type);
177178

178-
for (validator, score) in score_map.iter() {
179+
for (validator, latency) in latencies_map.iter() {
179180
debug!(
180-
"Validator {}, tx type {}: score {}",
181+
"Validator {}, tx type {}: latency {}",
181182
validator,
182183
tx_type.as_str(),
183-
score
184+
*latency
184185
);
185186
let display_name = authority_agg.get_display_name(validator);
186187
self.metrics
187-
.performance_score
188+
.performance
188189
.with_label_values(&[&display_name, tx_type.as_str()])
189-
.set(*score);
190+
.set(*latency);
190191
}
191192

192-
cached_scores.insert(tx_type, score_map);
193+
cached_latencies.insert(tx_type, latencies_map);
193194
}
194195
}
195196

@@ -239,15 +240,15 @@ impl<A: Clone> ValidatorClientMonitor<A> {
239240
/// is called, and we need to maintain an invariant that the selected
240241
/// validators are always in the committee passed in.
241242
///
242-
/// Also the tx type is passed in so that we can select validators based on their respective scores
243+
/// Also the tx type is passed in so that we can select validators based on their respective latencies
243244
/// for the transaction type.
244245
///
245246
/// We shuffle the top k validators to avoid the same validator being selected
246247
/// too many times in a row and getting overloaded.
247248
///
248249
/// Returns a vector containing:
249-
/// 1. The top `k` validators by score (shuffled)
250-
/// 2. The remaining validators ordered by score (not shuffled)
250+
/// 1. The top `k` validators by latency (shuffled)
251+
/// 2. The remaining validators ordered by latency (not shuffled)
251252
pub fn select_shuffled_preferred_validators(
252253
&self,
253254
committee: &Committee,
@@ -256,30 +257,34 @@ impl<A: Clone> ValidatorClientMonitor<A> {
256257
) -> Vec<AuthorityName> {
257258
let mut rng = rand::thread_rng();
258259

259-
let cached_scores = self.cached_scores.read();
260-
let Some(cached_scores) = cached_scores.get(&tx_type) else {
260+
let cached_latencies = self.cached_latencies.read();
261+
let Some(cached_latencies) = cached_latencies.get(&tx_type) else {
261262
let mut validators: Vec<_> = committee.names().cloned().collect();
262263
validators.shuffle(&mut rng);
263264
return validators;
264265
};
265266

266-
// Since the cached scores are updated periodically, it is possible that it was ran on
267+
// Since the cached latencies are updated periodically, it is possible that it was ran on
267268
// an out-of-date committee.
268-
let mut validator_with_scores: Vec<_> = committee
269+
let mut validator_with_latencies: Vec<_> = committee
269270
.names()
270-
.map(|v| (*v, cached_scores.get(v).cloned().unwrap_or(0.0)))
271+
.map(|v| (*v, cached_latencies.get(v).cloned().unwrap_or(0.0)))
271272
.collect();
272-
validator_with_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
273+
// Sort by latency in ascending order. We want to select the validators with the lowest latencies.
274+
validator_with_latencies.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
273275

274-
let k = k.min(validator_with_scores.len());
275-
validator_with_scores[..k].shuffle(&mut rng);
276+
let k = k.min(validator_with_latencies.len());
277+
validator_with_latencies[..k].shuffle(&mut rng);
276278

277-
validator_with_scores.into_iter().map(|(v, _)| v).collect()
279+
validator_with_latencies
280+
.into_iter()
281+
.map(|(v, _)| v)
282+
.collect()
278283
}
279284

280285
#[cfg(test)]
281-
pub fn force_update_cached_scores(&self, authority_agg: &AuthorityAggregator<A>) {
282-
self.update_cached_scores(authority_agg);
286+
pub fn force_update_cached_latencies(&self, authority_agg: &AuthorityAggregator<A>) {
287+
self.update_cached_latencies(authority_agg);
283288
}
284289

285290
#[cfg(test)]

0 commit comments

Comments
 (0)