Skip to content

Commit 85afc90

Browse files
authored
Fix zero division error in bandwidth calculation (#26)
1 parent c0bb69d commit 85afc90

File tree

3 files changed

+6
-9
lines changed

3 files changed

+6
-9
lines changed

msccl_samples/npkit_trace_generator.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,7 @@ def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clo
9999
else:
100100
gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']}
101101
delta_time = gpu_events[-1]['ts'] - gpu_events[-2]['ts']
102-
gpu_events[-1]['args']['bw (GB/s)'] = gpu_events[-1]['args']['size'] / delta_time / 1e3
102+
gpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else gpu_events[-1]['args']['size'] / delta_time / 1e3
103103
raw_content_idx += raw_event_size
104104
return gpu_events
105105

@@ -165,8 +165,7 @@ def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clo
165165

166166
delta_time = max(0.001, cpu_events[-1]['ts'] - last_ts)
167167
cpu_events[-1]['args'] = {'size': parsed_cpu_event['size']}
168-
cpu_events[-1]['args']['bw (GB/s)'] = \
169-
cpu_events[-1]['args']['size'] / delta_time / 1e3
168+
cpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else cpu_events[-1]['args']['size'] / delta_time / 1e3
170169

171170
cpu_events[-1]['tid'] = fiber_id + (channel + 1) * channel_shift
172171

nccl_samples/npkit_trace_generator.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,7 @@ def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clo
9999
else:
100100
gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']}
101101
delta_time = gpu_events[-1]['ts'] - gpu_events[-2]['ts']
102-
gpu_events[-1]['args']['bw (GB/s)'] = gpu_events[-1]['args']['size'] / delta_time / 1e3
102+
gpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else gpu_events[-1]['args']['size'] / delta_time / 1e3
103103
raw_content_idx += raw_event_size
104104
return gpu_events
105105

@@ -165,8 +165,7 @@ def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clo
165165

166166
delta_time = max(0.001, cpu_events[-1]['ts'] - last_ts)
167167
cpu_events[-1]['args'] = {'size': parsed_cpu_event['size']}
168-
cpu_events[-1]['args']['bw (GB/s)'] = \
169-
cpu_events[-1]['args']['size'] / delta_time / 1e3
168+
cpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else cpu_events[-1]['args']['size'] / delta_time / 1e3
170169

171170
cpu_events[-1]['tid'] = fiber_id + (channel + 1) * channel_shift
172171

rccl_samples/npkit_trace_generator.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,7 @@ def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clo
9999
else:
100100
gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']}
101101
delta_time = gpu_events[-1]['ts'] - gpu_events[-2]['ts']
102-
gpu_events[-1]['args']['bw (GB/s)'] = gpu_events[-1]['args']['size'] / delta_time / 1e3
102+
gpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else gpu_events[-1]['args']['size'] / delta_time / 1e3
103103
raw_content_idx += raw_event_size
104104
return gpu_events
105105

@@ -165,8 +165,7 @@ def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clo
165165

166166
delta_time = max(0.001, cpu_events[-1]['ts'] - last_ts)
167167
cpu_events[-1]['args'] = {'size': parsed_cpu_event['size']}
168-
cpu_events[-1]['args']['bw (GB/s)'] = \
169-
cpu_events[-1]['args']['size'] / delta_time / 1e3
168+
cpu_events[-1]['args']['bw (GB/s)'] = 0. if delta_time == 0. else cpu_events[-1]['args']['size'] / delta_time / 1e3
170169

171170
cpu_events[-1]['tid'] = fiber_id + (channel + 1) * channel_shift
172171

0 commit comments

Comments (0)