
Commit 5d4c40e

feat(provide): slow reprovide alerts when SweepEnabled (#11021)
Co-authored-by: Marcin Rataj <[email protected]>
1 parent 886ac22 commit 5d4c40e

2 files changed: +144 additions, 0 deletions

core/node/provider.go

Lines changed: 128 additions & 0 deletions
@@ -39,11 +39,21 @@ import (
 // The size of a batch that will be used for calculating average announcement
 // time per CID, inside of boxo/provider.ThroughputReport
 // and in 'ipfs stats provide' report.
+// Used when Provide.DHT.SweepEnabled=false
 const sampledBatchSize = 1000
 
 // Datastore key used to store previous reprovide strategy.
 const reprovideStrategyKey = "/reprovideStrategy"
 
+// Interval between reprovide queue monitoring checks for slow reprovide alerts.
+// Used when Provide.DHT.SweepEnabled=true
+const reprovideAlertPollInterval = 15 * time.Minute
+
+// Number of consecutive polling intervals with sustained queue growth before
+// triggering a slow reprovide alert (3 intervals = 45 minutes).
+// Used when Provide.DHT.SweepEnabled=true
+const consecutiveAlertsThreshold = 3
+
 // DHTProvider is an interface for providing keys to a DHT swarm. It holds a
 // state of keys to be advertised, and is responsible for periodically
 // publishing provider records for these keys to the DHT swarm before the
@@ -508,9 +518,127 @@ func SweepingProviderOpt(cfg *config.Config) fx.Option {
 		})
 	})
 
+	// extractSweepingProvider extracts a SweepingProvider from the given provider interface.
+	// It handles unwrapping buffered and dual providers, always selecting WAN for dual DHT.
+	// Returns nil if the provider is not a sweeping provider type.
+	var extractSweepingProvider func(prov any) *dhtprovider.SweepingProvider
+	extractSweepingProvider = func(prov any) *dhtprovider.SweepingProvider {
+		switch p := prov.(type) {
+		case *dhtprovider.SweepingProvider:
+			return p
+		case *ddhtprovider.SweepingProvider:
+			return p.WAN
+		case *buffered.SweepingProvider:
+			// Recursively extract from the inner provider
+			return extractSweepingProvider(p.Provider)
+		default:
+			return nil
+		}
+	}
+
+	type alertInput struct {
+		fx.In
+		Provider DHTProvider
+	}
+	reprovideAlert := fx.Invoke(func(lc fx.Lifecycle, in alertInput) {
+		prov := extractSweepingProvider(in.Provider)
+
+		var (
+			cancel context.CancelFunc
+			done   = make(chan struct{})
+		)
+
+		lc.Append(fx.Hook{
+			OnStart: func(ctx context.Context) error {
+				if prov == nil {
+					return nil
+				}
+				gcCtx, c := context.WithCancel(context.Background())
+				cancel = c
+				go func() {
+					defer close(done)
+
+					ticker := time.NewTicker(reprovideAlertPollInterval)
+					defer ticker.Stop()
+
+					var (
+						queueSize, prevQueueSize         int
+						queuedWorkers, prevQueuedWorkers bool
+						count                            int
+					)
+
+					for {
+						select {
+						case <-gcCtx.Done():
+							return
+						case <-ticker.C:
+						}
+
+						stats := prov.Stats()
+						queuedWorkers = stats.Workers.QueuedPeriodic > 0
+						queueSize = stats.Queues.PendingRegionReprovides
+
+						// Alert if reprovide queue keeps growing and all periodic workers are busy.
+						// Requires consecutiveAlertsThreshold intervals of sustained growth.
+						if prevQueuedWorkers && queuedWorkers && queueSize > prevQueueSize {
+							count++
+							if count >= consecutiveAlertsThreshold {
+								logger.Errorf(`
+🔔🔔🔔 Reprovide Operations Too Slow 🔔🔔🔔
+
+Your node is falling behind on DHT reprovides, which will affect content availability.
+
+Keyspace regions enqueued for reprovide:
+%s ago:\t%d
+Now:\t%d
+
+All periodic workers are busy!
+Active workers:\t%d / %d (max)
+Active workers types:\t%d periodic, %d burst
+Dedicated workers:\t%d periodic, %d burst
+
+Solutions (try in order):
+1. Increase Provide.DHT.MaxWorkers (current %d)
+2. Increase Provide.DHT.DedicatedPeriodicWorkers (current %d)
+3. Set Provide.DHT.SweepEnabled=false and Routing.AcceleratedDHTClient=true (last resort, not recommended)
+
+See how the reprovide queue is processed in real-time with 'watch ipfs provide stat --all --compact'
+
+See docs: https://github.com/ipfs/kubo/blob/master/docs/config.md#providedhtmaxworkers`,
+									reprovideAlertPollInterval.Truncate(time.Minute).String(), prevQueueSize, queueSize,
+									stats.Workers.Active, stats.Workers.Max,
+									stats.Workers.ActivePeriodic, stats.Workers.ActiveBurst,
+									stats.Workers.DedicatedPeriodic, stats.Workers.DedicatedBurst,
+									stats.Workers.Max, stats.Workers.DedicatedPeriodic)
+							}
+						} else if !queuedWorkers {
+							count = 0
+						}
+
+						prevQueueSize, prevQueuedWorkers = queueSize, queuedWorkers
+					}
+				}()
+				return nil
+			},
+			OnStop: func(ctx context.Context) error {
+				// Cancel the alert loop
+				if cancel != nil {
+					cancel()
+				}
+				select {
+				case <-done:
+				case <-ctx.Done():
+					return ctx.Err()
+				}
+				return nil
+			},
+		})
+	})
+
 	return fx.Options(
 		sweepingReprovider,
 		initKeystore,
+		reprovideAlert,
 	)
 }
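
Side note on the alert condition above: it reduces to a small state machine over successive Stats() samples, firing only after consecutiveAlertsThreshold consecutive polls in which the reprovide queue grew while periodic workers stayed queued. Below is a minimal, self-contained sketch of just that check; the sample type and the poll values are illustrative and not part of this commit.

package main

import "fmt"

// consecutiveAlertsThreshold mirrors the constant added in core/node/provider.go.
const consecutiveAlertsThreshold = 3

// sample holds the two stats fields the alert loop reads on each poll.
type sample struct {
	queueSize     int  // stats.Queues.PendingRegionReprovides
	queuedWorkers bool // stats.Workers.QueuedPeriodic > 0
}

func main() {
	// Hypothetical poll results, one per reprovideAlertPollInterval (15 min).
	polls := []sample{
		{10, true}, {12, true}, {15, true}, {19, true}, {19, true}, {25, false},
	}

	var prev sample
	var count int
	for i, cur := range polls {
		if prev.queuedWorkers && cur.queuedWorkers && cur.queueSize > prev.queueSize {
			// Queue grew while periodic workers were already queued up.
			count++
			if count >= consecutiveAlertsThreshold {
				fmt.Printf("poll %d: alert (queue %d -> %d after %d growing polls)\n",
					i, prev.queueSize, cur.queueSize, count)
			}
		} else if !cur.queuedWorkers {
			// Workers caught up with the queue: reset the streak, as the real loop does.
			count = 0
		}
		prev = cur
	}
}

With these sample values the sketch prints an alert on the fourth poll, i.e. after three consecutive 15-minute intervals of growth, matching the "3 intervals = 45 minutes" comment in the diff.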

docs/changelogs/v0.39.md

Lines changed: 16 additions & 0 deletions
@@ -11,6 +11,7 @@ This release was brought to you by the [Shipyard](https://ipshipyard.com/) team.
 - [Overview](#overview)
 - [🔦 Highlights](#-highlights)
   - [📊 Detailed statistics for Sweep provider with `ipfs provide stat`](#-detailed-statistics-for-sweep-provider-with-ipfs-provide-stat)
+  - [🔔 Sweep provider slow reprovide warnings](#-sweep-provider-slow-reprovide-warnings)
   - [🪦 Deprecated `go-ipfs` name no longer published](#-deprecated-go-ipfs-name-no-longer-published)
   - [📦️ Important dependency updates](#-important-dependency-updates)
 - [📝 Changelog](#-changelog)
@@ -61,6 +62,21 @@ provider statistics instead of the default WAN DHT stats.
 > [`Provide.DHT.SweepEnabled`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedhtsweepenabled)).
 > Legacy provider shows basic statistics without flag support.
 
+#### 🔔 Sweep provider slow reprovide warnings
+
+Kubo now monitors DHT reprovide operations when `Provide.DHT.SweepEnabled=true`
+and alerts you if your node is falling behind on reprovides.
+
+When the reprovide queue consistently grows and all periodic workers are busy,
+a warning displays with:
+
+- Queue size and worker utilization details
+- Recommended solutions: increase `Provide.DHT.MaxWorkers` or `Provide.DHT.DedicatedPeriodicWorkers`
+- Command to monitor real-time progress: `watch ipfs provide stat --all --compact`
+
+The alert polls every 15 minutes and only triggers after sustained growth
+across multiple intervals. The legacy provider is unaffected by this change.
+
 #### 🪦 Deprecated `go-ipfs` name no longer published
 
 The `go-ipfs` name was deprecated in 2022 and renamed to `kubo`. Starting with this release, we have stopped publishing Docker images and distribution binaries under the old `go-ipfs` name.
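
Returning to the slow reprovide warning described above: the remediation it recommends boils down to raising two values under `Provide.DHT`. The sketch below is illustrative only (not part of this commit) and shows what that change looks like on disk, assuming the default repo location; in practice `ipfs config --json Provide.DHT.MaxWorkers 32` and `ipfs config --json Provide.DHT.DedicatedPeriodicWorkers 16` followed by a daemon restart achieve the same thing, and the values 32 and 16 are hypothetical starting points, not recommendations from this commit.

package main

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	// Assumes the default repo path; IPFS_PATH may point elsewhere.
	// Edit the config while the daemon is stopped, then start it again.
	path := filepath.Join(os.Getenv("HOME"), ".ipfs", "config")

	raw, err := os.ReadFile(path)
	if err != nil {
		panic(err)
	}
	var cfg map[string]any
	if err := json.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}

	// Walk (or create) the Provide.DHT object and raise the worker limits.
	provide, _ := cfg["Provide"].(map[string]any)
	if provide == nil {
		provide = map[string]any{}
		cfg["Provide"] = provide
	}
	dht, _ := provide["DHT"].(map[string]any)
	if dht == nil {
		dht = map[string]any{}
		provide["DHT"] = dht
	}
	dht["MaxWorkers"] = 32               // hypothetical value; tune for your node
	dht["DedicatedPeriodicWorkers"] = 16 // hypothetical value

	out, err := json.MarshalIndent(cfg, "", "  ")
	if err != nil {
		panic(err)
	}
	if err := os.WriteFile(path, out, 0600); err != nil {
		panic(err)
	}
	fmt.Println("updated", path)
}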
