@@ -35,6 +35,11 @@ def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path):
3535 den = float (f .read ())
3636 return den / num / 1e6
3737
def parse_clock_calibration_info(clock_calibration_file_path):
    """Read a clock calibration file and return its content as a float.

    The entire file is read as text and passed to ``float``, so the file
    must contain exactly one numeric literal. Leading and trailing
    whitespace (such as a final newline) is accepted by ``float``.

    Raises whatever ``open`` raises when the file cannot be read, and
    ``ValueError`` when the content is not a valid number.
    """
    with open(clock_calibration_file_path, 'r') as calibration_file:
        raw_text = calibration_file.read()
    return float(raw_text)
42+
3843def parse_gpu_event (event_bytes ):
3944 return {
4045 'id' : int .from_bytes (event_bytes [0 :1 ], byteorder = 'little' , signed = False ),
@@ -51,11 +56,11 @@ def parse_cpu_event(event_bytes):
5156 'timestamp' : int .from_bytes (event_bytes [8 :16 ], byteorder = 'little' , signed = False )
5257 }
5358
54- def parse_gpu_event_file (npkit_dump_dir , npkit_event_def , rank , buf_idx , gpu_clock_scale , cpu_clock_scale ):
59+ def parse_gpu_event_file (npkit_dump_dir , npkit_event_def , rank , buf_idx , gpu_clock_scale , cpu_clock_scale , gpu_time_cpu , gpu_time_gpu ):
5560 gpu_event_file_path = os .path .join (npkit_dump_dir , 'gpu_events_rank_%d_buf_%d' % (rank , buf_idx ))
5661 raw_event_size = 16
57- curr_cpu_base_time = None
58- curr_gpu_base_time = None
62+ cpu_base_time = gpu_time_cpu / cpu_clock_scale
63+ gpu_base_time = gpu_time_gpu / gpu_clock_scale
5964 gpu_events = []
6065 event_type_to_seq = {}
6166 with open (gpu_event_file_path , 'rb' ) as f :
@@ -64,46 +69,37 @@ def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clo
6469 raw_content_idx = 0
6570 while raw_content_idx < raw_content_size :
6671 parsed_gpu_event = parse_gpu_event (raw_content [raw_content_idx : raw_content_idx + raw_event_size ])
67- if npkit_event_def ['id_to_type' ][parsed_gpu_event ['id' ]] == 'NPKIT_EVENT_TIME_SYNC_CPU' :
68- curr_cpu_base_time = parsed_gpu_event ['timestamp' ] / cpu_clock_scale
69- curr_gpu_base_time = None
70- elif npkit_event_def ['id_to_type' ][parsed_gpu_event ['id' ]] == 'NPKIT_EVENT_TIME_SYNC_GPU' :
71- if curr_gpu_base_time is None :
72- curr_gpu_base_time = parsed_gpu_event ['timestamp' ] / gpu_clock_scale
73- else :
74- if curr_gpu_base_time is None :
75- curr_gpu_base_time = parsed_gpu_event ['timestamp' ] / gpu_clock_scale
76- event_type = npkit_event_def ['id_to_type' ][parsed_gpu_event ['id' ]]
77- phase = 'B' if event_type .endswith ('_ENTRY' ) else 'E'
78- gpu_events .append ({
79- 'ph' : phase ,
80- 'ts' : curr_cpu_base_time + parsed_gpu_event ['timestamp' ] / gpu_clock_scale - curr_gpu_base_time ,
81- 'pid' : rank ,
82- 'tid' : buf_idx + 1
72+ event_type = npkit_event_def ['id_to_type' ][parsed_gpu_event ['id' ]]
73+ phase = 'B' if event_type .endswith ('_ENTRY' ) else 'E'
74+ gpu_events .append ({
75+ 'ph' : phase ,
76+ 'ts' : cpu_base_time + parsed_gpu_event ['timestamp' ] / gpu_clock_scale - gpu_base_time ,
77+ 'pid' : rank ,
78+ 'tid' : buf_idx + 1
79+ })
80+ if phase == 'B' :
81+ if event_type not in event_type_to_seq :
82+ event_type_to_seq [event_type ] = 0
83+ gpu_events [- 1 ].update ({
84+ 'name' : event_type ,
85+ 'cat' : 'GPU' ,
86+ 'args' : {
87+ 'rank' : rank ,
88+ 'buf_idx' : buf_idx ,
89+ 'seq' : event_type_to_seq [event_type ],
90+ 'rsvd_0' : parsed_gpu_event ['rsvd' ],
91+ 'size_0' : parsed_gpu_event ['size' ]
92+ }
8393 })
84- if phase == 'B' :
85- if event_type not in event_type_to_seq :
86- event_type_to_seq [event_type ] = 0
87- gpu_events [- 1 ].update ({
88- 'name' : event_type ,
89- 'cat' : 'GPU' ,
90- 'args' : {
91- 'rank' : rank ,
92- 'buf_idx' : buf_idx ,
93- 'seq' : event_type_to_seq [event_type ],
94- 'rsvd_0' : parsed_gpu_event ['rsvd' ],
95- 'size_0' : parsed_gpu_event ['size' ]
96- }
97- })
98- event_type_to_seq [event_type ] += 1
99- else :
100- gpu_events [- 1 ]['args' ] = {'size' : parsed_gpu_event ['size' ], 'rsvd' : parsed_gpu_event ['rsvd' ]}
101- delta_time = gpu_events [- 1 ]['ts' ] - gpu_events [- 2 ]['ts' ]
102- gpu_events [- 1 ]['args' ]['bw (GB/s)' ] = gpu_events [- 1 ]['args' ]['size' ] / delta_time / 1e3
94+ event_type_to_seq [event_type ] += 1
95+ else :
96+ gpu_events [- 1 ]['args' ] = {'size' : parsed_gpu_event ['size' ], 'rsvd' : parsed_gpu_event ['rsvd' ]}
97+ delta_time = max (0.001 , gpu_events [- 1 ]['ts' ] - gpu_events [- 2 ]['ts' ])
98+ gpu_events [- 1 ]['args' ]['bw (GB/s)' ] = gpu_events [- 1 ]['args' ]['size' ] / delta_time / 1e3
10399 raw_content_idx += raw_event_size
104100 return gpu_events
105101
106- def parse_cpu_event_file (npkit_dump_dir , npkit_event_def , rank , channel , cpu_clock_scale ):
102+ def parse_cpu_event_file (npkit_dump_dir , npkit_event_def , rank , channel , cpu_clock_scale , cpu_time_global , cpu_time_local ):
107103 cpu_event_file_path = os .path .join (npkit_dump_dir , 'cpu_events_rank_%d_channel_%d' % (rank , channel ))
108104 raw_event_size = 16
109105 cpu_events = []
@@ -124,7 +120,7 @@ def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clo
124120 phase = 'B' if event_type .endswith ('_ENTRY' ) else 'E'
125121 cpu_events .append ({
126122 'ph' : phase ,
127- 'ts' : parsed_cpu_event ['timestamp' ] / cpu_clock_scale ,
123+ 'ts' : ( cpu_time_global + ( parsed_cpu_event ['timestamp' ] - cpu_time_local )) / cpu_clock_scale ,
128124 'pid' : rank
129125 })
130126 slot = parsed_cpu_event ['slot' ]
@@ -192,12 +188,17 @@ def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def):
192188 gpu_clock_file_path = os .path .join (npkit_dump_dir , 'gpu_clock_rate_rank_%d' % rank )
193189 gpu_clock_scale = parse_gpu_clock_scale (gpu_clock_file_path )
194190
191+ cpu_time_global = parse_clock_calibration_info (os .path .join (npkit_dump_dir , 'clock_calibration_cpu_global_rank_%d' % rank ))
192+ cpu_time_local = parse_clock_calibration_info (os .path .join (npkit_dump_dir , 'clock_calibration_cpu_local_rank_%d' % rank ))
193+ gpu_time_cpu = parse_clock_calibration_info (os .path .join (npkit_dump_dir , 'clock_calibration_gpu_cpu_rank_%d' % rank ))
194+ gpu_time_gpu = parse_clock_calibration_info (os .path .join (npkit_dump_dir , 'clock_calibration_gpu_gpu_rank_%d' % rank ))
195+
195196 for buf_idx in buf_indices :
196- gpu_events = parse_gpu_event_file (npkit_dump_dir , npkit_event_def , rank , buf_idx , gpu_clock_scale , cpu_clock_scale )
197+ gpu_events = parse_gpu_event_file (npkit_dump_dir , npkit_event_def , rank , buf_idx , gpu_clock_scale , cpu_clock_scale , gpu_time_cpu , gpu_time_gpu )
197198 trace ['traceEvents' ].extend (gpu_events )
198199
199200 for channel in channels :
200- cpu_events = parse_cpu_event_file (npkit_dump_dir , npkit_event_def , rank , channel , cpu_clock_scale )
201+ cpu_events = parse_cpu_event_file (npkit_dump_dir , npkit_event_def , rank , channel , cpu_clock_scale , cpu_time_global , cpu_time_local )
201202 trace ['traceEvents' ].extend (cpu_events )
202203
203204 trace ['traceEvents' ].sort (key = lambda x : x ['ts' ])
0 commit comments