@@ -15,26 +15,26 @@
 
 
 class GPUKernelEvent:
-    """Represents a GPU kernel execution event"""
-    def __init__(self, name: str, start_ns: int, end_ns: int, correlation_id: int):
+    """Represents a GPU kernel execution event - timestamps kept in microseconds"""
+    def __init__(self, name: str, start_us: float, end_us: float, correlation_id: int):
         self.name = name
-        self.start_ns = start_ns
-        self.end_ns = end_ns
+        self.start_us = start_us  # Keep in microseconds (native GPU format)
+        self.end_us = end_us
         self.correlation_id = correlation_id
 
     def __repr__(self):
-        return f"GPUKernel({self.name}, {self.start_ns}-{self.end_ns}, corr={self.correlation_id})"
+        return f"GPUKernel({self.name}, {self.start_us}-{self.end_us}us, corr={self.correlation_id})"
 
 
 class CudaLaunchEvent:
-    """Represents a cudaLaunchKernel runtime API call"""
-    def __init__(self, start_ns: int, end_ns: int, correlation_id: int):
-        self.start_ns = start_ns
-        self.end_ns = end_ns
+    """Represents a cudaLaunchKernel runtime API call - timestamps kept in microseconds"""
+    def __init__(self, start_us: float, end_us: float, correlation_id: int):
+        self.start_us = start_us  # Keep in microseconds (native GPU format)
+        self.end_us = end_us
         self.correlation_id = correlation_id
 
     def __repr__(self):
-        return f"CudaLaunch({self.start_ns}-{self.end_ns}, corr={self.correlation_id})"
+        return f"CudaLaunch({self.start_us}-{self.end_us}us, corr={self.correlation_id})"
 
 
 class CPUStack:
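
For illustration, a minimal usage sketch of the new microsecond-based event classes (the kernel name and timestamps below are made up):

    # Hypothetical values: a 1.5 ms kernel starting at t=1000.0 us, correlation ID 42
    k = GPUKernelEvent("ampere_sgemm_128x64", 1000.0, 2500.0, 42)
    print(k)  # GPUKernel(ampere_sgemm_128x64, 1000.0-2500.0us, corr=42)
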
@@ -138,11 +138,11 @@ def parse_gpu_trace(self, gpu_json_file: str):
             duration_us = event.get('dur', 0)
 
             if start_us > 0 and duration_us > 0 and correlation_id > 0:
-                start_ns = int(start_us * 1000)
-                end_ns = int((start_us + duration_us) * 1000)
+                # Keep timestamps in microseconds (native GPU format)
+                end_us = start_us + duration_us
 
                 self.cuda_launches[correlation_id] = CudaLaunchEvent(
-                    start_ns, end_ns, correlation_id
+                    start_us, end_us, correlation_id
                 )
                 launch_count += 1
@@ -153,13 +153,13 @@ def parse_gpu_trace(self, gpu_json_file: str):
             duration_us = event.get('dur', 0)
 
             if start_us > 0 and duration_us > 0 and correlation_id > 0:
-                start_ns = int(start_us * 1000)
-                end_ns = int((start_us + duration_us) * 1000)
+                # Keep timestamps in microseconds (native GPU format)
+                end_us = start_us + duration_us
 
                 self.gpu_kernels.append(GPUKernelEvent(
                     kernel_name,
-                    start_ns,
-                    end_ns,
+                    start_us,
+                    end_us,
                     correlation_id
                 ))
                 kernel_count += 1
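
Both hunks drop the same microseconds-to-nanoseconds round-trip. A sketch of the kind of trace event the parser consumes - Chrome trace format records 'ts' and 'dur' in microseconds already, so no conversion is needed (the 'args'/'correlation' field names are assumptions; the lines that extract correlation_id sit outside these hunks):

    event = {
        "name": "ampere_sgemm_128x64_nn",  # hypothetical kernel name
        "ts": 1723456.789,                 # start time, microseconds
        "dur": 1500.0,                     # duration, microseconds
        "args": {"correlation": 42},       # assumed location of the correlation ID
    }
    start_us = event.get('ts', 0)
    end_us = start_us + event.get('dur', 0)  # stays float; the int(us * 1000) truncation is avoided
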
@@ -170,40 +170,112 @@ def parse_gpu_trace(self, gpu_json_file: str):
         print(f"Parsed {kernel_count} GPU kernel events")
         print(f"Parsed {launch_count} cudaLaunchKernel runtime events")
 
+    def calculate_clock_offset(self):
+        """
+        Calculate the offset between CPU and GPU clocks.
+        CPU and GPU use different time bases, so we need to align them.
+
+        Strategy: Use the median offset from the first few events to be robust against outliers.
+        Also report drift to help diagnose correlation issues.
+        """
+        if not self.cpu_stacks or not self.cuda_launches:
+            return 0.0
+
+        # Sample first 100 events from each to calculate offset
+        sample_size = min(100, len(self.cpu_stacks), len(self.cuda_launches))
+
+        sorted_cpu = sorted(self.cpu_stacks[:sample_size], key=lambda x: x.timestamp_ns)
+        sorted_gpu = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)[:sample_size]
+
+        offsets = []
+        for cpu, gpu in zip(sorted_cpu, sorted_gpu):
+            cpu_us = cpu.timestamp_ns / 1000.0
+            offset = cpu_us - gpu.start_us
+            offsets.append(offset)
+
+        # Use median to be robust against outliers
+        offsets.sort()
+        median_offset = offsets[len(offsets) // 2]
+
+        # Calculate drift across entire trace to warn about correlation issues
+        if len(self.cpu_stacks) > 100 and len(self.cuda_launches) > 100:
+            # Sample at start and end
+            cpu_first = min(self.cpu_stacks, key=lambda x: x.timestamp_ns)
+            cpu_last = max(self.cpu_stacks, key=lambda x: x.timestamp_ns)
+            gpu_first = min(self.cuda_launches.values(), key=lambda x: x.start_us)
+            gpu_last = max(self.cuda_launches.values(), key=lambda x: x.start_us)
+
+            offset_start = cpu_first.timestamp_ns / 1000.0 - gpu_first.start_us
+            offset_end = cpu_last.timestamp_ns / 1000.0 - gpu_last.start_us
+            drift = offset_end - offset_start
+
+            cpu_duration = (cpu_last.timestamp_ns - cpu_first.timestamp_ns) / 1_000_000  # ms
+
+            print(f"Clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)")
+            print(f"Clock drift: {drift / 1000:.3f} ms over {cpu_duration:.1f} ms trace duration")
+            if abs(drift) > 1000:  # More than 1 ms of drift
+                print(f"WARNING: Significant clock drift detected ({drift / cpu_duration:.3f} us/ms)")
+                print("  This may cause timestamp correlation issues")
+        else:
+            print(f"Calculated clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)")
+
+        return median_offset
+
     def find_matching_kernel(self, cpu_stack: CPUStack) -> Optional[GPUKernelEvent]:
         """
         Find GPU kernel that matches the CPU stack trace.
         Strategy:
-        1. Find cudaLaunchKernel runtime call within timestamp tolerance
-        2. Use correlation ID to find actual GPU kernel execution
+        1. Convert CPU nanosecond timestamp to microseconds
+        2. Apply clock offset to align CPU and GPU time bases
+        3. Use binary search to find cudaLaunchKernel runtime call within timestamp tolerance
+        4. Use correlation ID to find actual GPU kernel execution
         """
+        import bisect
+
+        # Convert CPU timestamp from nanoseconds to microseconds
+        cpu_timestamp_us = cpu_stack.timestamp_ns / 1000.0
 
-        # Find cudaLaunchKernel runtime event that matches timestamp
-        best_launch = None
-        min_time_diff = self.timestamp_tolerance_ns
+        # Apply clock offset to align CPU and GPU timestamps
+        cpu_timestamp_aligned = cpu_timestamp_us - self.clock_offset_us
 
-        for launch in self.cuda_launches.values():
-            # Check if CPU stack timestamp is close to launch time
-            time_diff = abs(cpu_stack.timestamp_ns - launch.start_ns)
+        tolerance_us = self.timestamp_tolerance_ns / 1000.0
 
-            if time_diff < min_time_diff:
-                min_time_diff = time_diff
-                best_launch = launch
+        # Binary search to find nearest GPU launch timestamp
+        idx = bisect.bisect_left(self.launch_timestamps, cpu_timestamp_aligned)
 
-        if not best_launch:
+        # Check surrounding launches (idx-1, idx, idx+1) for best match
+        candidates = []
+        for i in [idx - 1, idx, idx + 1]:
+            if 0 <= i < len(self.sorted_launches):
+                launch = self.sorted_launches[i]
+                time_diff = abs(cpu_timestamp_aligned - launch.start_us)
+                if time_diff < tolerance_us:
+                    candidates.append((time_diff, launch))
+
+        if not candidates:
             return None
 
-        # Find GPU kernel with matching correlation ID
-        for kernel in self.gpu_kernels:
-            if kernel.correlation_id == best_launch.correlation_id:
-                return kernel
+        # Get launch with smallest time difference
+        candidates.sort(key=lambda x: x[0])
+        best_launch = candidates[0][1]
+
+        # Find GPU kernel with matching correlation ID (using pre-built map)
+        if not hasattr(self, 'corr_to_kernel'):
+            self.corr_to_kernel = {k.correlation_id: k for k in self.gpu_kernels}
 
-        return None
+        return self.corr_to_kernel.get(best_launch.correlation_id)
 
     def merge_traces(self):
         """Correlate CPU stacks with GPU kernels using correlation IDs and timestamps"""
         print("Correlating CPU stacks with GPU kernels...")
 
+        # Calculate clock offset between CPU and GPU timestamps
+        self.clock_offset_us = self.calculate_clock_offset()
+
+        # Pre-sort GPU launches by timestamp for efficient binary search
+        self.sorted_launches = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)
+        self.launch_timestamps = [l.start_us for l in self.sorted_launches]
+
         matched_count = 0
         unmatched_count = 0
 
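
A toy illustration of the median-offset alignment that calculate_clock_offset performs and find_matching_kernel consumes (all values made up):

    cpu_ns = [1_000_000, 2_000_000, 3_000_000]        # CPU stack timestamps, ns
    gpu_us = [400.0, 1400.0, 2400.0]                  # GPU launch timestamps, us
    offsets = sorted(c / 1000.0 - g for c, g in zip(cpu_ns, gpu_us))
    median_offset = offsets[len(offsets) // 2]        # 600.0 us for this data
    aligned = cpu_ns[1] / 1000.0 - median_offset      # 1400.0 us, lines up with gpu_us[1]

With the offset applied, the bisect lookup only needs to probe the neighbors of the insertion point, replacing the old linear scan over every launch per CPU stack with an O(log n) search.
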
@@ -218,18 +290,18 @@ def merge_traces(self):
                 # Add GPU kernel to the top of the stack
                 merged_stack.append(f"[GPU_Kernel]{gpu_kernel.name}")
                 matched_count += 1
-            else:
-                # Mark as unmatched launch (may happen if kernel hasn't executed yet)
-                merged_stack.append("[GPU_Launch_Pending]")
-                unmatched_count += 1
 
-            # Create folded stack string
-            if merged_stack:
+                # Create folded stack string - only add matched stacks
                 stack_str = ';'.join(merged_stack)
                 self.merged_stacks[stack_str] += 1
+            else:
+                # Skip unmatched launches - don't add to merged output
+                unmatched_count += 1
 
         print(f"Matched {matched_count} CPU stacks with GPU kernels")
-        print(f"Unmatched: {unmatched_count}")
+        if unmatched_count > 0:
+            print(f"WARNING: {unmatched_count} CPU stacks could not be correlated with GPU kernels")
+            print("  This may indicate profiler timing mismatch or clock drift")
         print(f"Total unique stacks: {len(self.merged_stacks)}")
 
     def write_folded_output(self, output_file: str):
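
The merged_stacks keys are folded stacks in the format flamegraph.pl consumes: frames joined by ';', one stack per line followed by its count. A hypothetical merged line, with the GPU kernel frame appended on top of the CPU frames (frame names are illustrative):

    main;model_forward;cudaLaunchKernel;[GPU_Kernel]ampere_sgemm_128x64 17
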
@@ -261,7 +333,7 @@ def generate_summary(self):
         print(f"\nGPU kernels executed: {len(self.gpu_kernels)}")
         print(f"CUDA launch events: {len(self.cuda_launches)}")
 
-        total_kernel_time = sum(k.end_ns - k.start_ns for k in self.gpu_kernels) / 1_000_000
+        total_kernel_time = sum(k.end_us - k.start_us for k in self.gpu_kernels) / 1_000
         print(f"Total kernel execution time: {total_kernel_time:.2f} ms")
 
         # Show kernel breakdown
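
A quick unit check on the revised summary math (made-up values): durations are now in microseconds, so dividing the sum by 1_000 yields milliseconds, where the old nanosecond-based code divided by 1_000_000:

    kernels = [GPUKernelEvent("k1", 0.0, 1500.0, 1),
               GPUKernelEvent("k2", 0.0, 500.0, 2)]
    total_ms = sum(k.end_us - k.start_us for k in kernels) / 1_000  # 2.00 ms
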