Commit 97722e9

perf: replace runtime.ReadMemStats with runtime/metrics (#1724)
## Which problem is this PR solving?

[runtime.ReadMemStats](https://pkg.go.dev/runtime#ReadMemStats) requires a `stopTheWorld` call to retrieve memory usage stats. Since Go 1.16, the newer [runtime/metrics](https://pkg.go.dev/runtime/metrics#pkg-overview) package does not. Avoiding the `stopTheWorld` call matters here because we are now calling `ReadMemStats` every 100ms; in Refinery 3.0, `ReadMemStats` is only called when the peer queue is empty, which is far less frequent and less disruptive. I'm hoping this PR lets us keep accurate memory usage reporting while interrupting the main work of Refinery less.

## Short description of the changes

- Use `runtime/metrics` to read the `"/memory/classes/heap/objects:bytes"` stat in `checkAlloc`.

If this new package proves to work well, we can also use it to report an estimated CPU usage. I will leave that work for a future PR.
1 parent ff10385 commit 97722e9
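
For readers who haven't used the `runtime/metrics` package, here is a minimal, standalone sketch of the read pattern this commit adopts. The metric name is the one the diff registers as `memMetricName`; the surrounding `main` function and print statement are illustrative only, not Refinery code.

```go
package main

import (
	"fmt"
	"runtime/metrics"
)

func main() {
	// Describe the metric we want. The sample slice can be allocated once
	// and reused on every read, as the commit does in Start()/checkAlloc.
	sample := make([]metrics.Sample, 1)
	sample[0].Name = "/memory/classes/heap/objects:bytes"

	// Read fills in the Value for each requested sample; unlike
	// runtime.ReadMemStats, it does not require a stop-the-world pause.
	metrics.Read(sample)

	if sample[0].Value.Kind() == metrics.KindUint64 {
		fmt.Printf("live heap objects: %d bytes\n", sample[0].Value.Uint64())
	}
}
```

The metric reports bytes occupied by live (and not-yet-swept) heap objects, essentially what the old code was reading via `mem.Alloc`.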

File tree

1 file changed: +27 −18 lines changed


collect/collect.go

Lines changed: 27 additions & 18 deletions
```diff
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"os"
 	"runtime"
+	rtmetrics "runtime/metrics"
 	"sync"
 	"time"
 
@@ -37,6 +38,7 @@ const (
 	defaultKeptDecisionTickerInterval = 1 * time.Second
 
 	collectorHealthKey = "collector"
+	memMetricName      = "/memory/classes/heap/objects:bytes"
 )
 
 var ErrWouldBlock = errors.New("Dropping span as channel buffer is full. Span will not be processed and will be lost.")
@@ -113,6 +115,8 @@ type InMemCollector struct {
 	done chan struct{}
 
 	hostname string
+
+	memMetricSample []rtmetrics.Sample // Memory monitoring using runtime/metrics
 }
 
 // These are the names of the metrics we use to track the number of events sent to peers through the router.
@@ -209,6 +213,10 @@ func (i *InMemCollector) Start() error {
 		}
 	}
 
+	// Initialize runtime/metrics sample for efficient memory monitoring
+	i.memMetricSample = make([]rtmetrics.Sample, 1)
+	i.memMetricSample[0].Name = memMetricName
+
 	i.workers = make([]*CollectorWorker, numWorkers)
 
 	for loopID := range i.workers {
@@ -261,29 +269,31 @@ func (i *InMemCollector) reloadConfigs() {
 	// TODO add resizing the LRU sent trace cache on config reload
 }
 
+// checkAlloc performs memory monitoring using runtime/metrics instead of
+// runtime.ReadMemStats. This avoids stop-the-world pauses that can impact performance.
 func (i *InMemCollector) checkAlloc(ctx context.Context) {
-	_, span := otelutil.StartSpan(ctx, i.Tracer, "checkAlloc")
-	defer span.End()
-
 	inMemConfig := i.Config.GetCollectionConfig()
 	maxAlloc := inMemConfig.GetMaxAlloc()
 	i.Metrics.Store("MEMORY_MAX_ALLOC", float64(maxAlloc))
 
-	var mem runtime.MemStats
-	runtime.ReadMemStats(&mem)
-	i.Metrics.Gauge("memory_heap_allocation", float64(mem.Alloc))
-	if maxAlloc == 0 || mem.Alloc < uint64(maxAlloc) {
+	rtmetrics.Read(i.memMetricSample)
+	currentAlloc := i.memMetricSample[0].Value.Uint64()
+
+	i.Metrics.Gauge("memory_heap_allocation", float64(currentAlloc))
+
+	// Check if we're within memory budget
+	if maxAlloc == 0 || currentAlloc < uint64(maxAlloc) {
 		return
 	}
+
+	// Memory over budget - trigger cache eviction
 	i.Metrics.Increment("collector_cache_eviction")
 
-	// Figure out what fraction of the total cache we should remove. We'd like it to be
-	// enough to get us below the max capacity, but not TOO much below.
-	// Because our impact numbers are only the data size, reducing by enough to reach
-	// max alloc will actually do more than that.
-	totalToRemove := mem.Alloc - uint64(maxAlloc)
+	// Calculate how much to remove from each worker
+	totalToRemove := currentAlloc - uint64(maxAlloc)
 	perWorkerToRemove := int(totalToRemove) / len(i.workers)
 
+	// Coordinate eviction across all workers
 	var wg sync.WaitGroup
 	var cacheSizeBefore, cacheSizeAfter int
 	wg.Add(len(i.workers))
@@ -296,21 +306,20 @@ func (i *InMemCollector) checkAlloc(ctx context.Context) {
 	}
 	wg.Wait()
 
+	// Calculate actual eviction results
 	for _, worker := range i.workers {
 		cacheSizeAfter += worker.GetCacheSize()
 	}
 
-	// Treat any MaxAlloc overage as an error so we know it's happening
+	// Log the eviction
 	i.Logger.Warn().
-		WithField("alloc", mem.Alloc).
+		WithField("alloc", currentAlloc).
 		WithField("old_trace_count", cacheSizeBefore).
 		WithField("new_trace_count", cacheSizeAfter).
 		Logf("Making some trace decisions early due to memory overrun.")
 
-	// Manually GC here - without this we can easily end up evicting more than we
-	// need to, since total alloc won't be updated until after a GC pass.
+	// Manually trigger GC to reclaim memory immediately
 	runtime.GC()
-	return
 }
 
 // monitor runs background maintenance tasks including:
@@ -356,7 +365,7 @@ func (i *InMemCollector) monitor() {
 			i.Metrics.Count("span_received", totalReceived)
 			i.Metrics.Count("spans_waiting", totalWaiting)
 
-			// Send traces early if we're over memory budget
+			// Check memory and evict if needed (using runtime/metrics - no STW)
 			i.checkAlloc(ctx)
 		case <-i.reload:
 			i.reloadConfigs()
```
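
The PR description also floats using `runtime/metrics` to estimate CPU usage in a later PR. That work is not part of this commit, but a hypothetical sketch could look like the following, assuming Go 1.20+ where the `/cpu/classes/*` metrics are available; the metric names and the one-second sampling interval are illustrative, not Refinery code.

```go
package main

import (
	"fmt"
	"runtime/metrics"
	"time"
)

func main() {
	// Hypothetical follow-up, not part of this commit: sample the cumulative
	// CPU-second counters twice and derive utilization from the deltas.
	names := []string{
		"/cpu/classes/total:cpu-seconds", // total available CPU time (GOMAXPROCS over wall clock)
		"/cpu/classes/idle:cpu-seconds",  // available CPU time not used by Go code or the runtime
	}
	prev := make([]metrics.Sample, len(names))
	curr := make([]metrics.Sample, len(names))
	for i, n := range names {
		prev[i].Name, curr[i].Name = n, n
	}

	metrics.Read(prev)
	time.Sleep(1 * time.Second)
	metrics.Read(curr)

	total := curr[0].Value.Float64() - prev[0].Value.Float64()
	idle := curr[1].Value.Float64() - prev[1].Value.Float64()
	if total > 0 {
		fmt.Printf("estimated CPU utilization over the last second: %.1f%%\n", 100*(total-idle)/total)
	}
}
```

The runtime documents these CPU metrics as estimates intended to be compared as ratios between two reads, which is what the delta computation above does.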
