Skip to content

Commit f5ee20e

Browse files
fix: Ensure CUPTI flushes events before stopping CPU profiler; update timestamp handling to use microseconds
1 parent 05ca05a commit f5ee20e

File tree

2 files changed

+118
-42
lines changed

2 files changed

+118
-42
lines changed

src/xpu/flamegraph/gpuperf.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,10 @@ def run_with_trace(self, command, output_trace=None, chrome_trace=None, cpu_prof
184184
print(f"Error running command: {e}", file=sys.stderr)
185185
return_code = 1
186186
finally:
187+
# Give CUPTI time to flush remaining buffered events
188+
# CUPTI may continue recording events after target exits
189+
time.sleep(0.5)
190+
187191
# Stop CPU profiler if running
188192
self.stop_cpu_profiler()
189193

src/xpu/flamegraph/merge_gpu_cpu_trace.py

Lines changed: 114 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -15,26 +15,26 @@
1515

1616

1717
class GPUKernelEvent:
    """A single GPU kernel execution; timestamps kept in microseconds."""

    def __init__(self, name: str, start_us: float, end_us: float, correlation_id: int):
        # Microseconds are the native unit of the GPU trace, so no conversion is done.
        self.name = name
        self.start_us = start_us
        self.end_us = end_us
        self.correlation_id = correlation_id

    def __repr__(self):
        return "GPUKernel({}, {}-{} us, corr={})".format(
            self.name, self.start_us, self.end_us, self.correlation_id
        )
2727

2828

2929
class CudaLaunchEvent:
    """A cudaLaunchKernel runtime API call; timestamps kept in microseconds."""

    def __init__(self, start_us: float, end_us: float, correlation_id: int):
        # Microseconds are the native unit of the GPU trace, so no conversion is done.
        self.start_us = start_us
        self.end_us = end_us
        self.correlation_id = correlation_id

    def __repr__(self):
        return "CudaLaunch({}-{} us, corr={})".format(
            self.start_us, self.end_us, self.correlation_id
        )
3838

3939

4040
class CPUStack:
@@ -138,11 +138,11 @@ def parse_gpu_trace(self, gpu_json_file: str):
138138
duration_us = event.get('dur', 0)
139139

140140
if start_us > 0 and duration_us > 0 and correlation_id > 0:
141-
start_ns = int(start_us * 1000)
142-
end_ns = int((start_us + duration_us) * 1000)
141+
# Keep timestamps in microseconds (native GPU format)
142+
end_us = start_us + duration_us
143143

144144
self.cuda_launches[correlation_id] = CudaLaunchEvent(
145-
start_ns, end_ns, correlation_id
145+
start_us, end_us, correlation_id
146146
)
147147
launch_count += 1
148148

@@ -153,13 +153,13 @@ def parse_gpu_trace(self, gpu_json_file: str):
153153
duration_us = event.get('dur', 0)
154154

155155
if start_us > 0 and duration_us > 0 and correlation_id > 0:
156-
start_ns = int(start_us * 1000)
157-
end_ns = int((start_us + duration_us) * 1000)
156+
# Keep timestamps in microseconds (native GPU format)
157+
end_us = start_us + duration_us
158158

159159
self.gpu_kernels.append(GPUKernelEvent(
160160
kernel_name,
161-
start_ns,
162-
end_ns,
161+
start_us,
162+
end_us,
163163
correlation_id
164164
))
165165
kernel_count += 1
@@ -170,40 +170,112 @@ def parse_gpu_trace(self, gpu_json_file: str):
170170
print(f"Parsed {kernel_count} GPU kernel events")
171171
print(f"Parsed {launch_count} cudaLaunchKernel runtime events")
172172

173+
def calculate_clock_offset(self):
    """
    Calculate the offset between CPU and GPU clocks.

    CPU and GPU use different time bases, so we need to align them.
    Strategy: use the median offset over the earliest events from each side
    to be robust against outliers, and report end-to-end drift to help
    diagnose correlation issues.

    Returns:
        Median (CPU - GPU) offset in microseconds, or 0.0 when either the
        CPU stack list or the CUDA launch map is empty.
    """
    if not self.cpu_stacks or not self.cuda_launches:
        return 0.0

    # Sample up to 100 of the earliest events from each side.
    sample_size = min(100, len(self.cpu_stacks), len(self.cuda_launches))

    # Sort first, THEN slice, so we really get the earliest events.
    # (Slicing before sorting would pick an arbitrary prefix of the input
    # order — the original code did this for the CPU side only.)
    sorted_cpu = sorted(self.cpu_stacks, key=lambda x: x.timestamp_ns)[:sample_size]
    sorted_gpu = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)[:sample_size]

    # Pair events positionally; each pair yields a candidate (CPU - GPU)
    # offset in microseconds.
    offsets = []
    for cpu, gpu in zip(sorted_cpu, sorted_gpu):
        cpu_us = cpu.timestamp_ns / 1000.0
        offsets.append(cpu_us - gpu.start_us)

    # Use the median to be robust against outliers (upper median for even n).
    offsets.sort()
    median_offset = offsets[len(offsets) // 2]

    # Measure drift across the whole trace to warn about correlation issues.
    if len(self.cpu_stacks) > 100 and len(self.cuda_launches) > 100:
        # Compare the offset at the start and at the end of the trace.
        cpu_first = min(self.cpu_stacks, key=lambda x: x.timestamp_ns)
        cpu_last = max(self.cpu_stacks, key=lambda x: x.timestamp_ns)
        gpu_first = min(self.cuda_launches.values(), key=lambda x: x.start_us)
        gpu_last = max(self.cuda_launches.values(), key=lambda x: x.start_us)

        offset_start = cpu_first.timestamp_ns / 1000.0 - gpu_first.start_us
        offset_end = cpu_last.timestamp_ns / 1000.0 - gpu_last.start_us
        drift = offset_end - offset_start  # microseconds

        cpu_duration = (cpu_last.timestamp_ns - cpu_first.timestamp_ns) / 1_000_000  # ms

        print(f"Clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)")
        print(f"Clock drift: {drift / 1000:.3f} ms over {cpu_duration:.1f} ms trace duration")
        if abs(drift) > 1000:  # More than 1 ms of drift (drift is in us)
            # drift is in us and cpu_duration in ms, so convert drift to ms
            # before dividing to get a true ms/ms rate (the original printed
            # us/ms while labelling it ms/ms). Also guard a zero-length trace.
            if cpu_duration > 0:
                print(f"WARNING: Significant clock drift detected ({drift / 1000 / cpu_duration:.3f} ms/ms)")
            else:
                print("WARNING: Significant clock drift detected (zero-length CPU trace)")
            print(f" This may cause timestamp correlation issues")
    else:
        print(f"Calculated clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)")

    return median_offset
223+
173224
def find_matching_kernel(self, cpu_stack: CPUStack) -> Optional[GPUKernelEvent]:
    """
    Find GPU kernel that matches the CPU stack trace.
    Strategy:
    1. Convert CPU nanosecond timestamp to microseconds
    2. Apply clock offset to align CPU and GPU time bases
    3. Use binary search to find cudaLaunchKernel runtime call within timestamp tolerance
    4. Use correlation ID to find actual GPU kernel execution
    """
    import bisect

    # Project the CPU timestamp (ns) onto the GPU time base (us).
    aligned_us = cpu_stack.timestamp_ns / 1000.0 - self.clock_offset_us
    tolerance_us = self.timestamp_tolerance_ns / 1000.0

    # Binary-search the pre-sorted launch timestamps for the insertion point,
    # then examine the neighbouring launches for the closest in-tolerance one.
    pos = bisect.bisect_left(self.launch_timestamps, aligned_us)

    best_launch = None
    best_diff = tolerance_us  # strict upper bound: a match must be closer than this
    for neighbour in (pos - 1, pos, pos + 1):
        if not (0 <= neighbour < len(self.sorted_launches)):
            continue
        launch = self.sorted_launches[neighbour]
        diff = abs(aligned_us - launch.start_us)
        if diff < best_diff:
            best_diff = diff
            best_launch = launch

    if best_launch is None:
        return None

    # Lazily build the correlation-id -> kernel lookup on first use.
    if not hasattr(self, 'corr_to_kernel'):
        self.corr_to_kernel = {k.correlation_id: k for k in self.gpu_kernels}

    return self.corr_to_kernel.get(best_launch.correlation_id)
202267

203268
def merge_traces(self):
204269
"""Correlate CPU stacks with GPU kernels using correlation IDs and timestamps"""
205270
print("Correlating CPU stacks with GPU kernels...")
206271

272+
# Calculate clock offset between CPU and GPU timestamps
273+
self.clock_offset_us = self.calculate_clock_offset()
274+
275+
# Pre-sort GPU launches by timestamp for efficient binary search
276+
self.sorted_launches = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)
277+
self.launch_timestamps = [l.start_us for l in self.sorted_launches]
278+
207279
matched_count = 0
208280
unmatched_count = 0
209281

@@ -218,18 +290,18 @@ def merge_traces(self):
218290
# Add GPU kernel to the top of the stack
219291
merged_stack.append(f"[GPU_Kernel]{gpu_kernel.name}")
220292
matched_count += 1
221-
else:
222-
# Mark as unmatched launch (may happen if kernel hasn't executed yet)
223-
merged_stack.append("[GPU_Launch_Pending]")
224-
unmatched_count += 1
225293

226-
# Create folded stack string
227-
if merged_stack:
294+
# Create folded stack string - only add matched stacks
228295
stack_str = ';'.join(merged_stack)
229296
self.merged_stacks[stack_str] += 1
297+
else:
298+
# Skip unmatched launches - don't add to merged output
299+
unmatched_count += 1
230300

231301
print(f"Matched {matched_count} CPU stacks with GPU kernels")
232-
print(f"Unmatched: {unmatched_count}")
302+
if unmatched_count > 0:
303+
print(f"WARNING: {unmatched_count} CPU stacks could not be correlated with GPU kernels")
304+
print(f" This may indicate profiler timing mismatch or clock drift")
233305
print(f"Total unique stacks: {len(self.merged_stacks)}")
234306

235307
def write_folded_output(self, output_file: str):
@@ -261,7 +333,7 @@ def generate_summary(self):
261333
print(f"\nGPU kernels executed: {len(self.gpu_kernels)}")
262334
print(f"CUDA launch events: {len(self.cuda_launches)}")
263335

264-
total_kernel_time = sum(k.end_ns - k.start_ns for k in self.gpu_kernels) / 1_000_000
336+
total_kernel_time = sum(k.end_us - k.start_us for k in self.gpu_kernels) / 1_000
265337
print(f"Total kernel execution time: {total_kernel_time:.2f} ms")
266338

267339
# Show kernel breakdown

0 commit comments

Comments
 (0)