Skip to content

Commit 0d21d39

Browse files
authored
[Profiler] Generate heap snapshot (#7681)
## Summary of changes Allow the generation of heap snapshots that are sent to the backend ## Reason for change For memory leak workflow, we would like to compare the surviving objects count to identify the leaky ones. These heap snapshots will be transformed into class histograms to look like what is generated by Java. ## Implementation details The .NET runtime allows the creation of "gcdump" thanks to dotnet-gcdump tool. Behind the scene, an induced GC is triggered when an event pipe session is created with the right keyword/verbosity. Undocumented events are then emitted by the runtime at the end of the GC - read https://chnasarre.medium.com/net-gcdump-internals-fcce5d327be7?source=friends_link&sk=3225ff119458adafc0e6935951fcc323 for more details. ## Test coverage Test added as needed ## Other details <!-- Fixes #{issue} --> <!-- ⚠️ Note: Where possible, please obtain 2 approvals prior to merging. Unless CODEOWNERS specifies otherwise, for external teams it is typically best to have one review from a team member, and one review from apm-dotnet. Trivial changes do not require 2 reviews. MergeQueue is NOT enabled in this repository. If you have write access to the repo, the PR has 1-2 approvals (see above), and all of the required checks have passed, you can use the Squash and Merge button to merge the PR. If you don't have write access, or you need help, reach out in the #apm-dotnet channel in Slack. -->
1 parent 0a4d09b commit 0d21d39

40 files changed

+1373
-72
lines changed

profiler/src/Demos/Samples.BuggyBits/Properties/launchSettings.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,17 @@
5353
"DOTNET_gcServer=": "1",
5454
"DOTNET_GCHeapCount": "3",
5555

56+
"DD_PROFILING_MANAGED_ACTIVATION_ENABLED": "0",
5657
"DD_PROFILING_ENABLED": "1",
5758
"DD_PROFILING_CPU_ENABLED": "1",
5859
"DD_PROFILING_WALLTIME_ENABLED": "1",
5960
"DD_PROFILING_EXCEPTION_ENABLED": "0",
6061
"DD_PROFILING_LOCK_ENABLED": "1",
6162
"DD_PROFILING_ALLOCATION_ENABLED": "1",
62-
"DD_PROFILING_HEAP_ENABLED": "0",
63+
"DD_PROFILING_HEAP_ENABLED": "1",
6364
"DD_INTERNAL_PROFILING_HTTP_ENABLED": "1",
6465
"DD_INTERNAL_PROFILING_FORCE_HTTP_SAMPLING": "1",
66+
"DD_PROFILING_HEAPSNAPSHOT_ENABLED": "1",
6567

6668
"DD_PROFILING_GC_ENABLED": "1",
6769
"DD_INTERNAL_PROFILING_TIMESTAMPS_AS_LABEL_ENABLED": "1",

profiler/src/Demos/Samples.Computer01/MemoryLeak.cs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
using System;
77
using System.Collections.Generic;
8+
using System.Runtime.InteropServices;
89

910
namespace Samples.Computer01
1011
{
@@ -46,7 +47,13 @@ private void AllocateWithLeak()
4647
//root[count] = new byte[BufferSize];
4748
//root[0, count] = new byte[BufferSize];
4849
#endif
49-
GC.Collect();
50+
// 1 instead of 2 so the heap snapshot manager won't mess up
51+
// i.e. it could detect this GC as the collection used to generate the GCBulkXXX events
52+
GC.Collect(1);
53+
54+
//// DEBUG: just to avoid flooding the logs
55+
// Console.WriteLine("Press ENTER to continue...");
56+
// Console.ReadLine();
5057

5158
count++;
5259
}

profiler/src/Demos/Samples.Computer01/Properties/launchSettings.json

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,17 @@
88
"Computer01+Profiler": {
99
"commandName": "Executable",
1010
"executablePath": "$(BaseOutputPath)\\$(ConfigBasedRelativeOutputPath)\\profiler\\src\\Demos\\Samples.Computer01\\$(TargetFramework)\\Samples.Computer01.exe",
11-
"commandLineArgs": "--scenario 10 --threads 4 --param 250",
11+
"commandLineArgs": "--scenario 7 --threads 4",
1212
"environmentVariables": {
1313
"COR_ENABLE_PROFILING": "1",
1414
"COR_PROFILER": "{BD1A650D-AC5D-4896-B64F-D6FA25D6B26A}",
1515
"COR_PROFILER_PATH_64": "$(BuildOutputRoot)\\bin\\$(Configuration)-x64\\profiler\\src\\ProfilerEngine\\Datadog.Profiler.Native.Windows\\Datadog.Profiler.Native.dll",
1616
"COR_PROFILER_PATH_32": "$(BuildOutputRoot)\\bin\\$(Configuration)-x86\\profiler\\src\\ProfilerEngine\\Datadog.Profiler.Native.Windows\\Datadog.Profiler.Native.dll",
17-
1817
"CORECLR_ENABLE_PROFILING": "1",
1918
"CORECLR_PROFILER": "{BD1A650D-AC5D-4896-B64F-D6FA25D6B26A}",
2019
"CORECLR_PROFILER_PATH_64": "$(BuildOutputRoot)\\bin\\$(Configuration)-x64\\profiler\\src\\ProfilerEngine\\Datadog.Profiler.Native.Windows\\Datadog.Profiler.Native.dll",
2120
"CORECLR_PROFILER_PATH_32": "$(BuildOutputRoot)\\bin\\$(Configuration)-x86\\profiler\\src\\ProfilerEngine\\Datadog.Profiler.Native.Windows\\Datadog.Profiler.Native.dll",
22-
2321
"COMPlus_EnableDiagnostics": "1",
24-
2522
"DD_PROFILING_ENABLED": "1",
2623
"DD_INTERNAL_PROFILING_ETW_ENABLED": "1",
2724
"DD_PROFILING_CPU_ENABLED": "1",
@@ -30,53 +27,49 @@
3027
"DD_PROFILING_ALLOCATION_ENABLED": "0",
3128
"DD_PROFILING_LOCK_ENABLED": "1",
3229
"DD_PROFILING_GC_ENABLED": "1",
33-
3430
"DD_INTERNAL_PROFILING_DEBUG_INFO_ENABLED": "0",
3531
"DD_TRACE_DEBUG": "1",
3632
"DD_INTERNAL_PROFILING_OUTPUT_DIR": "$(PROGRAMDATA)\\Datadog-APM\\Pprof-files\\DotNet",
37-
3833
"DD_ENV": "apm-profiling-local",
3934
"DD_SERVICE": "dd-dotnet-computer01-framework",
40-
4135
"DD_INTERNAL_USE_DEVELOPMENT_CONFIGURATION": "true"
4236
},
4337
"nativeDebugging": true
4438
},
4539
"Tracer+Profiler": {
4640
"commandName": "Executable",
4741
"executablePath": "$(BaseOutputPath)\\$(ConfigBasedRelativeOutputPath)\\profiler\\src\\Demos\\Samples.Computer01\\$(TargetFramework)\\Samples.Computer01.exe",
48-
"commandLineArgs": "--scenario 10 --threads 4 --param 250",
42+
"commandLineArgs": "--scenario 13 --iterations 25 --param 10000",
4943
"environmentVariables": {
5044
"COR_ENABLE_PROFILING": "1",
5145
"COR_PROFILER": "{846F5F1C-F9AE-4B07-969E-05C26BC060D8}",
52-
5346
"CORECLR_ENABLE_PROFILING": "1",
5447
"CORECLR_PROFILER": "{846F5F1C-F9AE-4B07-969E-05C26BC060D8}",
55-
5648
"COMPlus_EnableDiagnostics": "1",
57-
58-
"DOTNET_gcServer": "0",
49+
"DOTNET_gcServer": "1",
5950
"DOTNET_gcConcurrent": "1",
6051

52+
"DD_PROFILING_MANAGED_ACTIVATION_ENABLED": "0",
6153
"DD_PROFILING_ENABLED": "1",
6254
"DD_INTERNAL_PROFILING_ETW_ENABLED": "1",
6355
"DD_PROFILING_CPU_ENABLED": "1",
64-
"DD_PROFILING_WALLTIME_ENABLED": "0",
65-
"DD_PROFILING_EXCEPTION_ENABLED": "0",
66-
"DD_PROFILING_ALLOCATION_ENABLED": "0",
56+
"DD_PROFILING_WALLTIME_ENABLED": "1",
57+
"DD_PROFILING_EXCEPTION_ENABLED": "1",
58+
"DD_PROFILING_ALLOCATION_ENABLED": "1",
6759
"DD_PROFILING_LOCK_ENABLED": "1",
6860
"DD_PROFILING_GC_ENABLED": "1",
61+
"DD_PROFILING_HEAPSNAPSHOT_ENABLED": "1",
62+
"DD_INTERNAL_PROFILING_HEAPSNAPSHOT_MEMORY_PRESSURE_THRESHOLD": "0",
6963

7064
"DD_INTERNAL_PROFILING_DEBUG_INFO_ENABLED": "0",
71-
"DD_TRACE_DEBUG": "1",
7265
"DD_INTERNAL_PROFILING_OUTPUT_DIR": "$(PROGRAMDATA)\\Datadog-APM\\Pprof-files\\DotNet",
73-
7466
"DD_ENV": "apm-profiling-local",
75-
"DD_SERVICE": "dd-dotnet-computer01-gc",
67+
"DD_SERVICE": "dd-dotnet-computer01-classhistogram",
7668

69+
"DD_TRACE_DEBUG": "1",
7770
"DD_INTERNAL_USE_DEVELOPMENT_CONFIGURATION": "true"
7871
},
7972
"nativeDebugging": true
8073
}
8174
}
82-
}
75+
}

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Windows/EtwEventsManager.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ EtwEventsManager::EtwEventsManager(
3939
_parser = std::make_unique<ClrEventsParser>(
4040
nullptr, // to avoid duplicates with what is done in EtwEventsHandler
4141
nullptr, // to avoid duplicates with what is done in EtwEventsHandler
42-
pGCSuspensionsListener);
42+
pGCSuspensionsListener,
43+
nullptr // no GC dump for .NET Framework (TODO: how to trigger it from ETW?)
44+
);
4345
_logger = std::make_unique<ProfilerLogger>();
4446
_IpcClient = nullptr;
4547
_IpcServer = nullptr;

profiler/src/ProfilerEngine/Datadog.Profiler.Native/ClrEventsParser.cpp

Lines changed: 63 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,13 @@ void ClrEventsParser::LogGcEvent(
4141
ClrEventsParser::ClrEventsParser(
4242
IAllocationsListener* pAllocationListener,
4343
IContentionListener* pContentionListener,
44-
IGCSuspensionsListener* pGCSuspensionsListener)
44+
IGCSuspensionsListener* pGCSuspensionsListener,
45+
IGCDumpListener* pGCDumpListener)
4546
:
4647
_pAllocationListener{pAllocationListener},
4748
_pContentionListener{pContentionListener},
48-
_pGCSuspensionsListener{pGCSuspensionsListener}
49+
_pGCSuspensionsListener{pGCSuspensionsListener},
50+
_pGCDumpListener{pGCDumpListener}
4951
{
5052
ResetGC(_gcInProgress);
5153
ResetGC(_currentBGC);
@@ -69,7 +71,10 @@ void ClrEventsParser::ParseEvent(
6971
ULONG cbEventData,
7072
LPCBYTE eventData)
7173
{
72-
if (KEYWORD_GC == (keywords & KEYWORD_GC))
74+
if (
75+
(KEYWORD_GC == (keywords & KEYWORD_GC)) ||
76+
(KEYWORD_GCHEAPDUMP == (keywords & KEYWORD_GCHEAPDUMP)) // GCBulkXXX events are treated as GC events
77+
)
7378
{
7479
ParseGcEvent(timestamp, id, version, cbEventData, eventData);
7580
}
@@ -242,6 +247,47 @@ ClrEventsParser::ParseGcEvent(std::chrono::nanoseconds timestamp, DWORD id, DWOR
242247
return;
243248
}
244249

250+
// GC dump related events
251+
if (id == EVENT_GC_BULK_NODE)
252+
{
253+
// get the list of objects in the GC heap dump
254+
LogGcEvent("OnGCBulkNode");
255+
256+
if (_pGCDumpListener != nullptr)
257+
{
258+
GCBulkNodePayload payload{0};
259+
ULONG offset = 0;
260+
if (!EventsParserHelper::Read<GCBulkNodePayload>(payload, pEventData, cbEventData, offset))
261+
{
262+
return;
263+
}
264+
265+
// sanity check
266+
_pGCDumpListener->OnBulkNodes(
267+
payload.Index,
268+
payload.Count,
269+
(GCBulkNodeValue*)(pEventData + offset));
270+
}
271+
}
272+
else if (id == EVENT_GC_BULK_EDGE)
273+
{
274+
// get the list of references between objects in the GC heap dump
275+
LogGcEvent("OnGCBulkEdge");
276+
277+
if (_pGCDumpListener != nullptr)
278+
{
279+
GCBulkEdgePayload payload{0};
280+
ULONG offset = 0;
281+
if (!EventsParserHelper::Read<GCBulkEdgePayload>(payload, pEventData, cbEventData, offset))
282+
{
283+
_pGCDumpListener->OnBulkEdges(
284+
payload.Index,
285+
payload.Count,
286+
(GCBulkEdgeValue*)(pEventData + offset));
287+
}
288+
}
289+
}
290+
245291
// the rest of events are related to garbage collections lifetime
246292
// read https://medium.com/criteo-engineering/spying-on-net-garbage-collector-with-net-core-eventpipes-9f2a986d5705?source=friends_link&sk=baf9a7766fb5c7899b781f016803597f
247293
// for more details about the state machine
@@ -480,11 +526,12 @@ void ClrEventsParser::NotifyGarbageCollectionEnd(
480526
std::chrono::nanoseconds endTimestamp,
481527
uint64_t gen2Size,
482528
uint64_t lohSize,
483-
uint64_t pohSize)
529+
uint64_t pohSize,
530+
uint32_t memPressure)
484531
{
485532
for (auto& pGarbageCollectionsListener : _pGarbageCollectionsListeners)
486533
{
487-
LogGcEvent("OnGarbageCollectionEnd: ", number, " ", generation, " ", reason, " ", type);
534+
LogGcEvent("OnGarbageCollectionEnd: #", number, " gen", generation, " ", reason, " ", type, " ", memPressure, "%");
488535

489536
pGarbageCollectionsListener->OnGarbageCollectionEnd(
490537
number,
@@ -497,7 +544,8 @@ void ClrEventsParser::NotifyGarbageCollectionEnd(
497544
endTimestamp,
498545
gen2Size,
499546
lohSize,
500-
pohSize);
547+
pohSize,
548+
memPressure);
501549
}
502550
}
503551

@@ -525,6 +573,7 @@ void ClrEventsParser::ResetGC(GCDetails& gc)
525573
gc.gen2Size = 0;
526574
gc.lohSize = 0;
527575
gc.pohSize = 0;
576+
gc.memPressure = 0;
528577
}
529578

530579
void ClrEventsParser::InitializeGC(std::chrono::nanoseconds timestamp, GCDetails& gc, GCStartPayload& payload)
@@ -541,6 +590,7 @@ void ClrEventsParser::InitializeGC(std::chrono::nanoseconds timestamp, GCDetails
541590
gc.gen2Size = 0;
542591
gc.lohSize = 0;
543592
gc.pohSize = 0;
593+
gc.memPressure = 0;
544594
}
545595

546596
void ClrEventsParser::OnGCTriggered()
@@ -623,7 +673,8 @@ void ClrEventsParser::OnGCRestartEEEnd(std::chrono::nanoseconds timestamp)
623673
timestamp,
624674
gc.gen2Size,
625675
gc.lohSize,
626-
gc.pohSize);
676+
gc.pohSize,
677+
gc.memPressure);
627678
ResetGC(gc);
628679
}
629680
}
@@ -641,7 +692,6 @@ void ClrEventsParser::OnGCHeapStats(std::chrono::nanoseconds timestamp, uint64_t
641692
gc.gen2Size = gen2Size;
642693
gc.lohSize = lohSize;
643694
gc.pohSize = pohSize;
644-
645695
if (gc.HasGlobalHeapHistoryBeenReceived && (gc.Generation == 2) && (gc.Type == GCType::BackgroundGC))
646696
{
647697
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(timestamp - gc.StartTimestamp).count();
@@ -658,7 +708,8 @@ void ClrEventsParser::OnGCHeapStats(std::chrono::nanoseconds timestamp, uint64_t
658708
timestamp,
659709
gc.gen2Size,
660710
gc.lohSize,
661-
gc.pohSize);
711+
gc.pohSize,
712+
gc.memPressure);
662713
ResetGC(gc);
663714
}
664715
}
@@ -673,6 +724,7 @@ void ClrEventsParser::OnGCGlobalHeapHistory(std::chrono::nanoseconds timestamp,
673724
return;
674725
}
675726
gc.HasGlobalHeapHistoryBeenReceived = true;
727+
gc.memPressure = payload.MemPressure;
676728

677729
// check if the collection was compacting
678730
gc.IsCompacting =
@@ -694,7 +746,8 @@ void ClrEventsParser::OnGCGlobalHeapHistory(std::chrono::nanoseconds timestamp,
694746
timestamp,
695747
gc.gen2Size,
696748
gc.lohSize,
697-
gc.pohSize);
749+
gc.pohSize,
750+
payload.MemPressure);
698751
ResetGC(gc);
699752
}
700753
}

profiler/src/ProfilerEngine/Datadog.Profiler.Native/ClrEventsParser.h

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "IAllocationsListener.h"
2020
#include "IGarbageCollectionsListener.h"
2121
#include "IGCSuspensionsListener.h"
22+
#include "IGCDumpListener.h"
2223

2324
#include "../../../../shared/src/native-src/string.h"
2425
#include "assert.h"
@@ -50,6 +51,9 @@ const int EVENT_GC_PINOBJECTATGCTIME = 33;
5051

5152
const int EVENT_SW_STACK = 82;
5253

54+
// events sent during heap dumps
55+
const int EVENT_GC_BULK_NODE = 18;
56+
const int EVENT_GC_BULK_EDGE = 19;
5357

5458

5559
#define LONG_LENGTH 1024
@@ -233,6 +237,9 @@ struct GCGlobalHeapPayload
233237
uint32_t Gen0ReductionCount;
234238
uint32_t Reason;
235239
uint32_t GlobalMechanisms;
240+
uint16_t ClrInstanceID;
241+
uint32_t PauseMode;
242+
uint32_t MemPressure;
236243
};
237244

238245
struct WaitHandleWaitStartPayload // for .NET 9+
@@ -246,8 +253,10 @@ struct WaitHandleWaitStopPayload // for .NET 9+
246253
{
247254
uint16_t ClrInstanceId; // Unique ID for the instance of CLR.
248255
};
256+
249257
#pragma pack()
250258

259+
251260
class IContentionListener;
252261

253262

@@ -264,6 +273,7 @@ struct GCDetails
264273
uint64_t gen2Size;
265274
uint64_t lohSize;
266275
uint64_t pohSize;
276+
uint32_t memPressure;
267277

268278
// GlobalHeapHistory and HeapStats events are not received in the same order
269279
// between Framework and CoreCLR. So we need to keep track of what has been received
@@ -276,17 +286,18 @@ class ClrEventsParser
276286
public:
277287
static const int64_t KEYWORD_GC = 0x1;
278288
static const int64_t KEYWORD_CONTENTION = 0x4000;
289+
static const int64_t KEYWORD_GCHEAPDUMP = 0x100000; // for gcdump
279290
static const int64_t KEYWORD_WAITHANDLE = 0x40000000000; // .NET 9+ only
280291
static const int64_t KEYWORD_ALLOCATION_SAMPLING = 0x80000000000; // .NET 10+ only
281292

282293
public:
283294
ClrEventsParser(
284295
IAllocationsListener* pAllocationListener,
285296
IContentionListener* pContentionListener,
286-
IGCSuspensionsListener* pGCSuspensionsListener
297+
IGCSuspensionsListener* pGCSuspensionsListener,
298+
IGCDumpListener* pGCDumpListener
287299
);
288300

289-
290301
// the parser is used both for synchronous (ICorProfilerCallback) and
291302
// asynchronous (.NET Framework via the Agent) cases. The timestamp parameter
292303
// is only valid (different from 0) in the asynchronous scenario.
@@ -335,7 +346,8 @@ class ClrEventsParser
335346
std::chrono::nanoseconds endTimestamp,
336347
uint64_t gen2Size,
337348
uint64_t lohSize,
338-
uint64_t pohSize
349+
uint64_t pohSize,
350+
uint32_t memPressure
339351
);
340352
GCDetails& GetCurrentGC();
341353
void InitializeGC(std::chrono::nanoseconds timestamp, GCDetails& gc, GCStartPayload& payload);
@@ -346,6 +358,7 @@ class ClrEventsParser
346358
IContentionListener* _pContentionListener = nullptr;
347359
IGCSuspensionsListener* _pGCSuspensionsListener = nullptr;
348360
std::vector<IGarbageCollectionsListener*> _pGarbageCollectionsListeners;
361+
IGCDumpListener* _pGCDumpListener = nullptr;
349362

350363
template <typename... Args>
351364
void LogGcEvent(Args const&... args);

0 commit comments

Comments
 (0)