Skip to content

Commit ac3141b

Browse files
committed
Fix the ExceptionReplayProbe on single customer frame support.
1 parent 20c29fa commit ac3141b

File tree

1 file changed

+88
-1
lines changed

1 file changed

+88
-1
lines changed

tracer/src/Datadog.Trace/Debugger/ExceptionAutoInstrumentation/ExceptionReplayProbe.cs

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,23 @@
55

66
using System;
77
using System.Collections.Generic;
8+
using System.Diagnostics;
89
using System.Linq;
910
using System.Threading;
1011
using Datadog.Trace.Debugger.Expressions;
1112
using Datadog.Trace.Debugger.Helpers;
13+
using Datadog.Trace.Debugger.PInvoke;
1214
using Datadog.Trace.Debugger.RateLimiting;
1315
using Datadog.Trace.Debugger.Sink.Models;
14-
using Datadog.Trace.Vendors.Serilog;
16+
using Datadog.Trace.Logging;
17+
using Datadog.Trace.Vendors.Serilog.Events;
1518

1619
#nullable enable
1720
namespace Datadog.Trace.Debugger.ExceptionAutoInstrumentation
1821
{
1922
internal class ExceptionReplayProbe
2023
{
24+
private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor<ExceptionReplayProbe>();
2125
private readonly int _hashCode;
2226
private readonly object _locker = new();
2327
private readonly List<ExceptionCase> _exceptionCases = new();
@@ -107,6 +111,8 @@ private void ProcessCase(ExceptionCase @case)
107111

108112
internal void AddExceptionCase(ExceptionCase @case, bool isPartOfCase)
109113
{
114+
var shouldRefreshAfterLock = false;
115+
110116
lock (_locker)
111117
{
112118
if (isPartOfCase && ShouldInstrument())
@@ -134,6 +140,13 @@ internal void AddExceptionCase(ExceptionCase @case, bool isPartOfCase)
134140

135141
_exceptionCases.Add(@case);
136142
ProcessCase(@case);
143+
144+
shouldRefreshAfterLock = @case.Probes?.Length == 1;
145+
}
146+
147+
if (shouldRefreshAfterLock)
148+
{
149+
TryRefreshSingleFrameProbeStatus();
137150
}
138151
}
139152

@@ -166,5 +179,79 @@ public override int GetHashCode()
166179
{
167180
return _hashCode;
168181
}
182+
183+
/// <summary>
/// Performs a one-off, eager poll of the native probe status for an exception case that consists of a
/// single customer frame.
/// <para>
/// With only one frame no parent/child call-path hashes are built, so the regular status polling in
/// <see cref="ExceptionProbeProcessor"/> never runs and the probe would stay in the default
/// <see cref="Status.RECEIVED"/> state (observed with CI Visibility), which prevents snapshots from being
/// captured. Multi-frame cases are intentionally left untouched.
/// </para>
/// <para>
/// The caller invokes this method after releasing the probe lock: the loop below sleeps between attempts
/// (1.5 s after the first attempt, then 250 ms, for at most 20 attempts) while the CLR completes ReJIT, and
/// holding the lock for that long would stall unrelated instrumentation updates.
/// </para>
/// <para>
/// Any exception raised while polling is logged as a warning and swallowed; this method never throws.
/// </para>
/// </summary>
private void TryRefreshSingleFrameProbeStatus()
{
    // Without a probe id there is nothing to ask the native side about.
    if (string.IsNullOrEmpty(ProbeId))
    {
        return;
    }

    // Polling budget. The native tracer usually keeps reporting INSTALLED for ~500 ms after ReJIT is
    // requested, but CI Visibility tests regularly need longer (module load + async offloader), hence the
    // generous first delay followed by shorter retries.
    const int maxAttempts = 20;
    const int firstDelayMs = 1_500;
    const int retryDelayMs = 250;

    try
    {
        var elapsed = Stopwatch.StartNew();
        var attemptNumber = 0;

        while (attemptNumber < maxAttempts)
        {
            attemptNumber++;

            var reported = DebuggerNativeMethods.GetProbesStatuses(new[] { ProbeId });
            if (reported.Length == 0)
            {
                // The native side returned no entry for this probe; leave the current status as-is.
                return;
            }

            var statusBefore = ProbeStatus;
            var latest = reported[0];
            ProbeStatus = latest.Status;
            ErrorMessage = latest.ErrorMessage;

            if (Log.IsEnabled(LogEventLevel.Debug))
            {
                // The message is pre-formatted, so only build it when debug logging is actually enabled.
                var message = $"Eager status refresh for single-frame probe {ProbeId}. Previous={statusBefore}, Current={ProbeStatus}, Attempt={attemptNumber}, ElapsedMs={elapsed.ElapsedMilliseconds}";
                Log.Debug("{Message}", message);
            }

            // INSTRUMENTED is the success state; ERROR and BLOCKED are terminal failures. In all three
            // cases further polling cannot change the outcome.
            if (ProbeStatus == Status.INSTRUMENTED || ProbeStatus == Status.ERROR || ProbeStatus == Status.BLOCKED)
            {
                break;
            }

            // Do not sleep after the final attempt; there is no follow-up poll to wait for.
            if (attemptNumber < maxAttempts)
            {
                var delayMs = attemptNumber == 1 ? firstDelayMs : retryDelayMs;
                Thread.Sleep(delayMs);
            }
        }

        if (ProbeStatus != Status.INSTRUMENTED)
        {
            Log.Warning(
                "Single-frame probe {ProbeId} never reported INSTRUMENTED during eager refresh. FinalStatus={Status}, TotalWaitMs={ElapsedMs}",
                ProbeId,
                ProbeStatus,
                elapsed.ElapsedMilliseconds);
        }
    }
    catch (Exception ex)
    {
        // Best effort only: a failed refresh must not break exception case registration.
        Log.Warning(ex, "Failed to eagerly refresh probe status for {ProbeId}", ProbeId);
    }
}
169256
}
170257
}

0 commit comments

Comments
 (0)