Skip to content

Commit 3f68025

Browse files
authored
[Feature] Align ascend profiler with torch profiler (#895)
* align ascend profiler with torch profiler
* fix cpp formatting & complete directory removal todo
* fix time alignment between aclprof and kineto & add testcase for profiler_ascend & fix format
* fix dangling pointer dump_path_cstring
* update time diff calculate & refactor temp path generate & align HostToDevice event to corresponding acl event
* refactor: add comments, remove unnecessary headers, put aclprofStop in stopTrace(), remove unused member variable
* update ascend profiler test & refactor kineto msprof merge python & fix some comments, symbol name in cpp
1 parent 93d8013 commit 3f68025

File tree

5 files changed

+694
-0
lines changed

5 files changed

+694
-0
lines changed

dipu/tests/python/unittests/test_profiler_vendor.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,83 @@ def fn(x):
6464
self.assertTrue(check_string_in_directory(path, "mulrelu"))
6565
self.assertTrue(check_string_in_directory(path, "softmax"))
6666

67+
@onlyOn("NPU")
68+
def test_profiler_ascend(self):
69+
def fn(x):
70+
y = torch.nn.functional.softmax(x, -1)
71+
y = y * 5
72+
y = torch.relu(y)
73+
return y
74+
75+
input = torch.randn(2, 3).cuda()
76+
with profile(
77+
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
78+
profile_memory=True,
79+
record_shapes=True,
80+
with_modules=True,
81+
with_stack=True,
82+
) as prof:
83+
for _ in range(10):
84+
y = fn(input)
85+
y = y + y
86+
87+
export_path = "./tmp/test_profiler_ascend.json"
88+
prof.export_chrome_trace(export_path)
89+
90+
self.assertTrue(
91+
check_string_in_directory(export_path, "test_profiler_vendor.py")
92+
)
93+
self.assertTrue(check_string_in_directory(export_path, "AscendCL@aclnnSoftmax"))
94+
self.assertTrue(check_string_in_directory(export_path, "aten::mul"))
95+
self.assertTrue(check_string_in_directory(export_path, "[memory]"))
96+
self.assertTrue(check_string_in_directory(export_path, "[Input Dims]"))
97+
self.assertTrue(check_string_in_directory(export_path, "CANN"))
98+
self.assertTrue(check_string_in_directory(export_path, "Ascend Hardware"))
99+
100+
self.assertFalse(check_string_in_directory(export_path, "Node@launch"))
101+
102+
@onlyOn("NPU")
103+
def test_profiler_ascend_schedule(self):
104+
def fn(x):
105+
y = torch.nn.functional.softmax(x, -1)
106+
y = y * 5
107+
y = torch.relu(y)
108+
return y
109+
110+
def export_step_chrome_trace(prof):
111+
export_path = f"./tmp/test_profiler_ascend_schedule_{prof.step_num}.json"
112+
prof.export_chrome_trace(export_path)
113+
114+
self.assertTrue(
115+
check_string_in_directory(export_path, "test_profiler_vendor.py")
116+
)
117+
self.assertTrue(
118+
check_string_in_directory(export_path, "AscendCL@aclnnSoftmax")
119+
)
120+
self.assertTrue(check_string_in_directory(export_path, "aten::mul"))
121+
self.assertTrue(check_string_in_directory(export_path, "[memory]"))
122+
self.assertTrue(check_string_in_directory(export_path, "[Input Dims]"))
123+
self.assertTrue(check_string_in_directory(export_path, "ProfilerStep"))
124+
self.assertTrue(check_string_in_directory(export_path, "CANN"))
125+
self.assertTrue(check_string_in_directory(export_path, "Ascend Hardware"))
126+
127+
self.assertFalse(check_string_in_directory(export_path, "Node@launch"))
128+
129+
input = torch.randn(2, 3).cuda()
130+
with profile(
131+
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
132+
profile_memory=True,
133+
record_shapes=True,
134+
with_modules=True,
135+
with_stack=True,
136+
schedule=torch.profiler.schedule(wait=2, warmup=2, active=2, repeat=2),
137+
on_trace_ready=export_step_chrome_trace,
138+
) as prof:
139+
for _ in range(12):
140+
y = fn(input)
141+
y = y + y
142+
prof.step()
143+
67144
@onlyOn("CUDA")
68145
def test_profiler_cuda(self):
69146
model = models.resnet18().cuda()
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
// Copyright (c) 2024, DeepLink.
2+
#include "AscendDeviceActivity.h"
3+
4+
#include <ctime>
5+
#include <dirent.h>
6+
#include <output_base.h>
7+
#include <sys/stat.h>
8+
#include <unistd.h>
9+
10+
#include "csrc_dipu/base/environ.hpp"
11+
#include "csrc_dipu/utils/Log.h"
12+
13+
namespace dipu {
14+
15+
static const uint64_t kNpuEvents = 431;
16+
static const uint64_t kAicoreMetrics = 1;
17+
18+
using libkineto::GenericTraceActivity;
19+
20+
AscendDeviceActivity::AscendDeviceActivity() {}
21+
22+
AscendDeviceActivity& AscendDeviceActivity::instance() {
23+
static AscendDeviceActivity instance;
24+
return instance;
25+
}
26+
27+
// The hooks below are required overrides of DeviceActivityInterface but are
// no-ops here: collection is driven entirely by aclprof in startTrace() /
// stopTrace(), so there is nothing for these callbacks to do.

// override pure virtual function, do nothing
void AscendDeviceActivity::pushCorrelationID(
    uint64_t id, DeviceActivityInterface::CorrelationFlowType type) {}

// override pure virtual function, do nothing
void AscendDeviceActivity::popCorrelationID(
    DeviceActivityInterface::CorrelationFlowType type) {}

// override pure virtual function, do nothing
void AscendDeviceActivity::enableActivities(
    const std::set<libkineto::ActivityType>& selected_activities) {}

// override pure virtual function, do nothing
void AscendDeviceActivity::disableActivities(
    const std::set<libkineto::ActivityType>& selected_activities) {}

// override pure virtual function, do nothing
void AscendDeviceActivity::clearActivities() {}
45+
46+
// Recursively delete `path` and everything beneath it.
// Returns true on success; returns false as soon as any entry cannot be
// inspected or removed (the tree may then be only partially deleted).
//
// lstat (not stat) is used on purpose: stat follows symbolic links, so a
// symlink to a directory placed inside the dump tree would be recursed into
// and files *outside* the tree deleted. lstat reports the link itself, which
// is then unlinked via remove().
bool AscendDeviceActivity::remove_temp_dump_path_(const std::string& path) {
  // RAII: closedir runs on every exit path.
  std::unique_ptr<DIR, decltype(&closedir)> dir(opendir(path.c_str()),
                                                &closedir);
  if (!dir) {
    return false;
  }

  dirent* entry = nullptr;
  while ((entry = readdir(dir.get())) != nullptr) {
    std::string entry_name = entry->d_name;
    if (entry_name == "." || entry_name == "..") {
      continue;
    }

    std::string entry_path = path + "/" + entry_name;
    struct stat entry_stat;
    // lstat: do not follow symlinks (see function comment).
    if (lstat(entry_path.c_str(), &entry_stat) == -1) {
      return false;
    }

    if (S_ISDIR(entry_stat.st_mode)) {
      if (!remove_temp_dump_path_(entry_path)) {
        return false;
      }
    } else {
      // Regular files and symlinks are unlinked directly.
      if (remove(entry_path.c_str()) != 0) {
        return false;
      }
    }
  }
  return rmdir(path.c_str()) == 0;
}
78+
79+
// Called by kineto after collection stops. Finalizes the aclprof dump, then
// emits two synthetic USER_ANNOTATION events carrying (1) the measured
// torch-vs-aclprof clock offset and (2) the temp dump directory path —
// presumably consumed by the python-side kineto/msprof merge step (TODO
// confirm against that script). Finally removes the previous run's temp dump
// directory.
int32_t AscendDeviceActivity::processActivities(
    libkineto::ActivityLogger& logger,
    std::function<const libkineto::ITraceActivity*(int32_t)> linked_activity,
    int64_t start_time, int64_t end_time) {
  struct timespec tx;
  struct timespec ts;
  struct timespec ty;

  // Finalize first: the clock sandwich below must happen as close as
  // possible to the moment aclprof writes its own end_info timestamps.
  DIPU_CALLACLRT(aclprofFinalize());

  // clang-format off
  // the difference between torch realtime and monotonic_raw,
  // as well as the difference between aclprof realtime and monotonic_raw,
  // are statistically calculated to align the timestamps of aclprof and torch
  // for example: torch's time difference is 10, and aclprof's time difference
  // is 3, we assume that monotonic_raw time is the correct baseline, then,
  // the time diff between torch and aclprof is 10 - 3 = 7
  // aclprof's time difference is in generated file by msprof tool: PROF_XXXX/host/end_info
  // clang-format on

  // REALTIME / MONOTONIC_RAW / REALTIME "sandwich": averaging tx and ty
  // approximates the REALTIME value at the instant ts was read.
  clock_gettime(CLOCK_REALTIME, &tx);
  clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
  clock_gettime(CLOCK_REALTIME, &ty);

  // clang-format off
  // time interval between these two calculation(aclprof and torch) is the smaller the better
  // and aclprof will generate its time difference in start_info and end_info
  // the reason why I use aclprof end_info and put the calculation right after aclprofFinalize is that,
  // after test, this is more precise and less stable than calculate at startTrace() and compare with start_info
  // clang-format on

  const int S_TO_NS_FACTOR = 1000000000;
  auto time_x = tx.tv_sec * S_TO_NS_FACTOR + tx.tv_nsec;
  auto monotonic_raw = ts.tv_sec * S_TO_NS_FACTOR + ts.tv_nsec;
  auto time_y = ty.tv_sec * S_TO_NS_FACTOR + ty.tv_nsec;

  // (x >> 1) + (y >> 1) averages the two REALTIME samples (may drop up to
  // 1 ns of precision, negligible here).
  int64_t diff = (time_x >> 1) + (time_y >> 1) - monotonic_raw;

  // Synthetic annotation carrying torch's realtime-vs-monotonic_raw offset.
  GenericTraceActivity time_diff;
  time_diff.activityName = "torch_time_diff:" + std::to_string(diff);
  time_diff.activityType = libkineto::ActivityType::USER_ANNOTATION;
  time_diff.startTime = start_time;
  time_diff.endTime = end_time;

  // Synthetic annotation carrying this run's aclprof dump directory.
  GenericTraceActivity tmp_path;
  tmp_path.activityName = "random_temp_dir:" + current_dump_path_;
  tmp_path.activityType = libkineto::ActivityType::USER_ANNOTATION;
  tmp_path.startTime = start_time;
  tmp_path.endTime = end_time;
  logger.handleGenericActivity(tmp_path);
  logger.handleGenericActivity(time_diff);

  // Clean up the previous run's dump dir — but only if it actually lives
  // under our temp prefix, so we never delete a user-supplied path.
  std::string temp_path_prefix = "./tmp/aclprof";
  if (last_dump_path_.compare(0, temp_path_prefix.size(), temp_path_prefix) ==
      0) {
    if (remove_temp_dump_path_(last_dump_path_) == false) {
      DIPU_LOGW(
          "remove ascend profiler temp file failed, may need to remove "
          "manually");
    }
  }

  return 0;
}
143+
144+
void AscendDeviceActivity::startTrace(
145+
const std::set<libkineto::ActivityType>& selected_activities) {
146+
if (enable_) {
147+
DIPU_LOGW("ascend profiler has already enabled");
148+
return;
149+
}
150+
enable_ = true;
151+
152+
last_dump_path_ = current_dump_path_;
153+
154+
struct stat st;
155+
if (stat("./tmp", &st) == -1) {
156+
mkdir("./tmp", 0644);
157+
}
158+
if (stat("./tmp/aclprof", &st) == -1) {
159+
mkdir("./tmp/aclprof", 0644);
160+
}
161+
162+
char dump_path_template[] = "./tmp/aclprof/aclprofXXXXXX";
163+
char* dump_path_cstring = mkdtemp(dump_path_template);
164+
165+
if (dump_path_cstring != nullptr) {
166+
current_dump_path_ = dump_path_cstring;
167+
} else {
168+
DIPU_LOGE(
169+
"aclprof random dump path generate failed, the export results may be "
170+
"incorrect");
171+
current_dump_path_ = "./tmp/aclprof/aclprof_error";
172+
}
173+
174+
DIPU_CALLACLRT(
175+
aclprofInit(current_dump_path_.c_str(), current_dump_path_.size()));
176+
DIPU_CALLACLRT(aclrtSynchronizeDevice());
177+
178+
int32_t device_index = 0;
179+
DIPU_CALLACLRT(aclrtGetDevice(&device_index));
180+
181+
std::array<uint32_t, 1> device_ids = {static_cast<uint32_t>(device_index)};
182+
aclprofAicoreEvents* events = nullptr;
183+
config_ = aclprofCreateConfig(
184+
device_ids.data(), device_ids.size(),
185+
static_cast<aclprofAicoreMetrics>(kAicoreMetrics), events, kNpuEvents);
186+
TORCH_CHECK(config_ != nullptr,
187+
"aclprofCreateConfig fail, device_index = ", device_index,
188+
"npu_event = ", kNpuEvents, "aicore_metrics = ", kAicoreMetrics);
189+
190+
DIPU_CALLACLRT(aclprofStart(config_));
191+
}
192+
193+
// Stops the aclprof collection started by startTrace() and releases the
// profiler config. The device is synchronized before stopping — presumably so
// in-flight device work is flushed into the profile; confirm with CANN docs.
void AscendDeviceActivity::stopTrace(
    const std::set<libkineto::ActivityType>& selected_activities) {
  if (!enable_) {
    DIPU_LOGW("ascend profiler has already disabled");
    return;
  }

  DIPU_CALLACLRT(aclrtSynchronizeDevice());
  DIPU_CALLACLRT(aclprofStop(config_));
  DIPU_CALLACLRT(aclprofDestroyConfig(config_))

  // config_ is only valid between startTrace() and stopTrace().
  config_ = nullptr;
  enable_ = false;
}
207+
208+
// override pure virtual function, do nothing
void AscendDeviceActivity::teardownContext() {}

// override pure virtual function, do nothing
void AscendDeviceActivity::setMaxBufferSize(int32_t size) {}

// Static-init hook: registers the singleton as kineto's device activity
// backend at library load time, unless FORCE_USE_DIPU_PROFILER=true —
// presumably keeping the dipu-internal profiler in that case (confirm
// against the dipu profiler code).
// NOLINTNEXTLINE(cppcoreguidelines-interfaces-global-init)
const static int32_t Ascend_device_activity_init = []() {
  if (!dipu::environ::detail::getEnvOrDefault<bool>("FORCE_USE_DIPU_PROFILER",
                                                    false)) {
    libkineto::device_activity_singleton = &AscendDeviceActivity::instance();
    return 1;
  }
  return 0;
}();
223+
224+
} // namespace dipu
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Copyright (c) 2024, DeepLink.
2+
#pragma once
3+
4+
#include <DeviceActivityInterface.h>
5+
#include <GenericTraceActivity.h>
6+
#include <IActivityProfiler.h>
7+
#include <acl/acl.h>
8+
#include <acl/acl_prof.h>
9+
#include <array>
10+
#include <vector>
11+
12+
#include "basecommimpl.hpp"
13+
14+
namespace dipu {

// Kineto DeviceActivityInterface implementation backed by Ascend CANN's
// aclprof. startTrace()/stopTrace() drive aclprof collection into a
// per-run temp dump directory; processActivities() emits alignment
// metadata used to merge the aclprof dump with kineto's trace. Several
// interface hooks are intentionally no-ops (see the .cpp).
class AscendDeviceActivity : public libkineto::DeviceActivityInterface {
 public:
  ~AscendDeviceActivity() override = default;
  // Non-copyable: this type is a process-wide singleton.
  AscendDeviceActivity(const AscendDeviceActivity&) = delete;
  AscendDeviceActivity& operator=(const AscendDeviceActivity&) = delete;

  // AscendDeviceActivity designed as a singleton
  static AscendDeviceActivity& instance();

  // No-op: correlation IDs are not used by this backend.
  void pushCorrelationID(
      uint64_t id,
      libkineto::DeviceActivityInterface::CorrelationFlowType type) override;
  void popCorrelationID(
      libkineto::DeviceActivityInterface::CorrelationFlowType type) override;

  // No-ops: collection is controlled via startTrace()/stopTrace() instead.
  void enableActivities(
      const std::set<libkineto::ActivityType>& selected_activities) override;
  void disableActivities(
      const std::set<libkineto::ActivityType>& selected_activities) override;
  void clearActivities() override;
  // Finalizes the aclprof dump and logs alignment metadata; returns 0.
  int32_t processActivities(
      libkineto::ActivityLogger& logger,
      std::function<const libkineto::ITraceActivity*(int32_t)> linked_activity,
      int64_t start_time, int64_t end_time) override;

  // Start/stop aclprof collection (idempotent: re-entry only logs a warning).
  void startTrace(
      const std::set<libkineto::ActivityType>& selected_activities) override;
  void stopTrace(
      const std::set<libkineto::ActivityType>& selected_activities) override;

  // No-op overrides.
  void teardownContext() override;
  void setMaxBufferSize(int32_t size) override;

 private:
  AscendDeviceActivity();
  // Recursively deletes a temp dump directory; returns true on success.
  bool remove_temp_dump_path_(const std::string& path);
  // aclprof config handle; non-null only between startTrace and stopTrace.
  aclprofConfig* config_ = nullptr;
  // True while a trace is active.
  bool enable_ = false;
  // Dump directory of the current run.
  std::string current_dump_path_;
  // Dump directory of the previous run, removed in processActivities().
  std::string last_dump_path_;
};

}  // namespace dipu

0 commit comments

Comments
 (0)