Skip to content

Commit faa2119

Browse files
authored
feat(multipath): add back basic metrics (#3672)
## Description This adds back basic metrics to `feat-multipath`. * Refactor the `conn_` metrics (they were all currently unused on `feat-multipath`) into something meaningful: Track the total number of opened and closed connections - their diff at any time is the number of currently-active conne * Adds `transport_ip_paths_added`, `transport_ip_paths_removed`, `transport_relay_paths_added`, `transport_relay_paths_removed` counters * Removes metrics that are unused and - IMO - not going to come back * Comments out metrics that are unused and likely to come back ## Breaking Changes <!-- Optional, if there are any breaking changes document them, including how to migrate older code. --> ## Notes & open questions <!-- Any notes, remarks or open questions you have to make about the PR. --> ## Change checklist <!-- Remove any that are not relevant. --> - [ ] Self-review. - [ ] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [ ] Tests if relevant. - [ ] All breaking changes documented. - [ ] List all breaking changes in the above "Breaking Changes" section. - [ ] Open an issue or PR on any number0 repos that are affected by this breaking change. Give guidance on how the updates should be handled or do the actual updates themselves. The major ones are: - [ ] [`quic-rpc`](https://github.com/n0-computer/quic-rpc) - [ ] [`iroh-gossip`](https://github.com/n0-computer/iroh-gossip) - [ ] [`iroh-blobs`](https://github.com/n0-computer/iroh-blobs) - [ ] [`dumbpipe`](https://github.com/n0-computer/dumbpipe) - [ ] [`sendme`](https://github.com/n0-computer/sendme)
1 parent 3eff16d commit faa2119

File tree

3 files changed

+80
-57
lines changed

3 files changed

+80
-57
lines changed

iroh/src/magicsock/metrics.rs

Lines changed: 17 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use iroh_metrics::{Counter, Histogram, MetricsGroup};
1+
use iroh_metrics::{Counter, MetricsGroup};
22
use serde::{Deserialize, Serialize};
33

44
/// Enum of metrics for the module
@@ -14,11 +14,8 @@ pub struct Metrics {
1414
pub send_ipv4: Counter,
1515
pub send_ipv6: Counter,
1616
pub send_relay: Counter,
17-
pub send_relay_error: Counter,
1817

1918
// Data packets (non-disco)
20-
pub send_data: Counter,
21-
pub send_data_network_down: Counter,
2219
pub recv_data_relay: Counter,
2320
pub recv_data_ipv4: Counter,
2421
pub recv_data_ipv6: Counter,
@@ -38,15 +35,23 @@ pub struct Metrics {
3835

3936
/*
4037
* Connection Metrics
38+
*
39+
* These all only count connections that completed the TLS handshake successfully. This means
40+
* that short lived 0RTT connections are potentially not included in these counts.
4141
*/
42-
/// The number of direct connections we have made to peers.
43-
pub num_direct_conns_added: Counter,
44-
/// The number of direct connections we have lost to peers.
45-
pub num_direct_conns_removed: Counter,
46-
/// The number of connections to peers we have added over relay.
47-
pub num_relay_conns_added: Counter,
48-
/// The number of connections to peers we have removed over relay.
49-
pub num_relay_conns_removed: Counter,
42+
/// Number of connections opened (only handshaked connections are counted).
43+
pub num_conns_opened: Counter,
44+
/// Number of connections closed (only handshaked connections are counted).
45+
pub num_conns_closed: Counter,
46+
47+
/// Number of IP transport paths opened.
48+
pub transport_ip_paths_added: Counter,
49+
/// Number of IP transport paths closed.
50+
pub transport_ip_paths_removed: Counter,
51+
/// Number of relay transport paths opened.
52+
pub transport_relay_paths_added: Counter,
53+
/// Number of relay transport paths closed.
54+
pub transport_relay_paths_removed: Counter,
5055

5156
pub actor_tick_main: Counter,
5257
pub actor_tick_msg: Counter,
@@ -55,36 +60,4 @@ pub struct Metrics {
5560
pub actor_tick_direct_addr_heartbeat: Counter,
5661
pub actor_link_change: Counter,
5762
pub actor_tick_other: Counter,
58-
59-
/// Number of endpoints we have attempted to contact.
60-
pub endpoints_contacted: Counter,
61-
/// Number of endpoints we have managed to contact directly.
62-
pub endpoints_contacted_directly: Counter,
63-
64-
/// Number of connections with a successful handshake.
65-
pub connection_handshake_success: Counter,
66-
/// Number of connections with a successful handshake that became direct.
67-
pub connection_became_direct: Counter,
68-
/// Histogram of connection latency in milliseconds across all endpoint connections.
69-
#[default(Histogram::new(vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, f64::INFINITY]))]
70-
pub connection_latency_ms: Histogram,
71-
72-
/*
73-
* Path Congestion Metrics
74-
*/
75-
/// Number of times a path was marked as outdated due to consecutive ping failures.
76-
pub path_marked_outdated: Counter,
77-
/// Number of ping failures recorded across all paths.
78-
pub path_ping_failures: Counter,
79-
/// Number of consecutive failure resets (path recovered).
80-
pub path_failure_resets: Counter,
81-
/// Histogram of packet loss rates (0.0-1.0) observed on UDP paths.
82-
#[default(Histogram::new(vec![0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0]))]
83-
pub path_packet_loss_rate: Histogram,
84-
/// Histogram of RTT variance (in milliseconds) as a congestion indicator.
85-
#[default(Histogram::new(vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0]))]
86-
pub path_rtt_variance_ms: Histogram,
87-
/// Histogram of path quality scores (0.0-1.0).
88-
#[default(Histogram::new(vec![0.0, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0]))]
89-
pub path_quality_score: Histogram,
9063
}

iroh/src/magicsock/remote_map/remote_state.rs

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -192,15 +192,15 @@ impl RemoteStateActor {
192192
Self {
193193
endpoint_id,
194194
local_endpoint_id,
195-
metrics,
195+
metrics: metrics.clone(),
196196
local_direct_addrs,
197197
relay_mapped_addrs,
198198
discovery,
199199
connections: FxHashMap::default(),
200200
connections_close: Default::default(),
201201
path_events: Default::default(),
202202
addr_events: Default::default(),
203-
paths: Default::default(),
203+
paths: RemotePathState::new(metrics),
204204
last_holepunch: None,
205205
selected_path: Default::default(),
206206
scheduled_holepunch: None,
@@ -273,11 +273,7 @@ impl RemoteStateActor {
273273
self.trigger_holepunching().await;
274274
}
275275
Some(conn_id) = self.connections_close.next(), if !self.connections_close.is_empty() => {
276-
self.connections.remove(&conn_id);
277-
if self.connections.is_empty() {
278-
trace!("last connection closed - clearing selected_path");
279-
self.selected_path.set(None).ok();
280-
}
276+
self.handle_connection_close(conn_id);
281277
}
282278
res = self.local_direct_addrs.updated() => {
283279
if let Err(n0_watcher::Disconnected) = res {
@@ -397,6 +393,7 @@ impl RemoteStateActor {
397393
) {
398394
let pub_open_paths = Watchable::default();
399395
if let Some(conn) = handle.upgrade() {
396+
self.metrics.num_conns_opened.inc();
400397
// Remove any conflicting stable_ids from the local state.
401398
let conn_id = ConnId(conn.stable_id());
402399
self.connections.remove(&conn_id);
@@ -487,6 +484,16 @@ impl RemoteStateActor {
487484
self.trigger_discovery();
488485
}
489486

487+
fn handle_connection_close(&mut self, conn_id: ConnId) {
488+
if self.connections.remove(&conn_id).is_some() {
489+
self.metrics.num_conns_closed.inc();
490+
}
491+
if self.connections.is_empty() {
492+
trace!("last connection closed - clearing selected_path");
493+
self.selected_path.set(None).ok();
494+
}
495+
}
496+
490497
fn handle_discovery_item(&mut self, item: Option<Result<DiscoveryItem, DiscoveryError>>) {
491498
match item {
492499
None => {
@@ -1052,7 +1059,6 @@ impl ConnectionState {
10521059
self.paths.insert(path_id, remote.clone());
10531060
self.open_paths.insert(path_id, remote.clone());
10541061
self.path_ids.insert(remote, path_id);
1055-
10561062
self.update_pub_path_info();
10571063
}
10581064

iroh/src/magicsock/remote_map/remote_state/path_state.rs

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
//! The state kept for each network path to a remote endpoint.
22
3-
use std::collections::{HashMap, HashSet, VecDeque};
3+
use std::{
4+
collections::{HashMap, HashSet, VecDeque},
5+
sync::Arc,
6+
};
47

58
use n0_error::e;
69
use n0_future::time::Instant;
@@ -9,7 +12,7 @@ use tokio::sync::oneshot;
912
use tracing::trace;
1013

1114
use super::Source;
12-
use crate::{discovery::DiscoveryError, magicsock::transports};
15+
use crate::{discovery::DiscoveryError, magicsock::transports, metrics::MagicsockMetrics};
1316

1417
/// Maximum number of IP paths we keep around per endpoint.
1518
pub(super) const MAX_IP_PATHS: usize = 30;
@@ -23,7 +26,7 @@ pub(super) const MAX_INACTIVE_IP_PATHS: usize = 10;
2326
///
2427
/// Also stores a list of resolve requests which are triggered once at least one path is known,
2528
/// or once this struct is notified of a failed discovery run.
26-
#[derive(Debug, Default)]
29+
#[derive(Debug)]
2730
pub(super) struct RemotePathState {
2831
/// All possible paths we are aware of.
2932
///
@@ -32,6 +35,7 @@ pub(super) struct RemotePathState {
3235
paths: FxHashMap<transports::Addr, PathState>,
3336
/// Pending resolve requests from [`Self::resolve_remote`].
3437
pending_resolve_requests: VecDeque<oneshot::Sender<Result<(), DiscoveryError>>>,
38+
metrics: Arc<MagicsockMetrics>,
3539
}
3640

3741
/// Describes the usability of this path, i.e. whether it has ever been opened,
@@ -51,10 +55,21 @@ pub(super) enum PathStatus {
5155
}
5256

5357
impl RemotePathState {
58+
pub(super) fn new(metrics: Arc<MagicsockMetrics>) -> Self {
59+
Self {
60+
paths: Default::default(),
61+
pending_resolve_requests: Default::default(),
62+
metrics,
63+
}
64+
}
5465
/// Insert a new address of an open path into our list of paths.
5566
///
5667
/// This will emit pending resolve requests and trigger pruning paths.
5768
pub(super) fn insert_open_path(&mut self, addr: transports::Addr, source: Source) {
69+
match addr {
70+
transports::Addr::Ip(_) => self.metrics.transport_ip_paths_added.inc(),
71+
transports::Addr::Relay(_, _) => self.metrics.transport_relay_paths_added.inc(),
72+
};
5873
let state = self.paths.entry(addr).or_default();
5974
state.status = PathStatus::Open;
6075
state.sources.insert(source.clone(), Instant::now());
@@ -68,6 +83,14 @@ impl RemotePathState {
6883
/// `RemotePathState`
6984
pub(super) fn abandoned_path(&mut self, addr: &transports::Addr) {
7085
if let Some(state) = self.paths.get_mut(addr) {
86+
if matches!(state.status, PathStatus::Open) {
87+
match addr {
88+
transports::Addr::Ip(_) => self.metrics.transport_ip_paths_removed.inc(),
89+
transports::Addr::Relay(_, _) => {
90+
self.metrics.transport_relay_paths_removed.inc()
91+
}
92+
};
93+
}
7194
match state.status {
7295
PathStatus::Open | PathStatus::Inactive(_) => {
7396
state.status = PathStatus::Inactive(Instant::now());
@@ -491,7 +514,7 @@ mod tests {
491514

492515
#[test]
493516
fn test_insert_open_path() {
494-
let mut state = RemotePathState::default();
517+
let mut state = RemotePathState::new(Default::default());
495518
let addr = ip_addr(1000);
496519
let source = Source::Udp;
497520

@@ -509,25 +532,31 @@ mod tests {
509532

510533
#[test]
511534
fn test_abandoned_path() {
512-
let mut state = RemotePathState::default();
535+
let metrics = Arc::new(MagicsockMetrics::default());
536+
let mut state = RemotePathState::new(metrics.clone());
513537

514538
// Test: Open goes to Inactive
515539
let addr_open = ip_addr(1000);
516540
state.insert_open_path(addr_open.clone(), Source::Udp);
517541
assert!(matches!(state.paths[&addr_open].status, PathStatus::Open));
542+
assert_eq!(metrics.transport_ip_paths_added.get(), 1);
518543

519544
state.abandoned_path(&addr_open);
520545
assert!(matches!(
521546
state.paths[&addr_open].status,
522547
PathStatus::Inactive(_)
523548
));
549+
assert_eq!(metrics.transport_ip_paths_added.get(), 1);
550+
assert_eq!(metrics.transport_ip_paths_removed.get(), 1);
524551

525552
// Test: Inactive stays Inactive
526553
state.abandoned_path(&addr_open);
527554
assert!(matches!(
528555
state.paths[&addr_open].status,
529556
PathStatus::Inactive(_)
530557
));
558+
assert_eq!(metrics.transport_ip_paths_added.get(), 1);
559+
assert_eq!(metrics.transport_ip_paths_removed.get(), 1);
531560

532561
// Test: Unknown goes to Unusable
533562
let addr_unknown = ip_addr(2000);
@@ -536,18 +565,33 @@ mod tests {
536565
state.paths[&addr_unknown].status,
537566
PathStatus::Unknown
538567
));
568+
assert_eq!(metrics.transport_ip_paths_added.get(), 1);
569+
assert_eq!(metrics.transport_ip_paths_removed.get(), 1);
539570

540571
state.abandoned_path(&addr_unknown);
541572
assert!(matches!(
542573
state.paths[&addr_unknown].status,
543574
PathStatus::Unusable
544575
));
576+
assert_eq!(metrics.transport_ip_paths_added.get(), 1);
577+
assert_eq!(metrics.transport_ip_paths_removed.get(), 1);
545578

546579
// Test: Unusable stays Unusable
547580
state.abandoned_path(&addr_unknown);
548581
assert!(matches!(
549582
state.paths[&addr_unknown].status,
550583
PathStatus::Unusable
551584
));
585+
assert_eq!(metrics.transport_ip_paths_added.get(), 1);
586+
assert_eq!(metrics.transport_ip_paths_removed.get(), 1);
587+
588+
// Test: Unusable can go to open
589+
state.insert_open_path(addr_unknown.clone(), Source::Udp);
590+
assert!(matches!(
591+
state.paths[&addr_unknown].status,
592+
PathStatus::Open
593+
));
594+
assert_eq!(metrics.transport_ip_paths_added.get(), 2);
595+
assert_eq!(metrics.transport_ip_paths_removed.get(), 1);
552596
}
553597
}

0 commit comments

Comments
 (0)