Skip to content

Commit 26465a2

Browse files
committed
Copy plain-malloc to multi-setup-runtime
1 parent 920229c commit 26465a2

File tree

3 files changed

+372
-0
lines changed

3 files changed

+372
-0
lines changed

examples/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,8 @@ add_subdirectory(
1111
${CMAKE_CURRENT_LIST_DIR}/simple-mallocMC
1212
${CMAKE_BINARY_DIR}/examples/simple-mallocMC
1313
)
14+
15+
# Build the multi-setup-runtime example in its own binary directory.
add_subdirectory(
  "${CMAKE_CURRENT_LIST_DIR}/multi-setup-runtime"
  "${CMAKE_BINARY_DIR}/examples/multi-setup-runtime"
)
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Standalone build description of the multi-setup-runtime example.
cmake_minimum_required(VERSION 3.14...3.22)

# Opt in to the NEW behaviour of policy CMP0167 on CMake versions that know it.
if(POLICY CMP0167)
  cmake_policy(SET CMP0167 NEW)
endif()

project(KitGenBenchExampleMultiSetupRuntime LANGUAGES CXX)

# ---- Import tools ----

include(../../cmake/tools.cmake)

# ---- Dependencies ----

include(../../cmake/CPM.cmake)

CPMAddPackage(
  NAME nlohmann_json
  GITHUB_REPOSITORY nlohmann/json
  VERSION 3.11.3
  NO_TESTS
)

CPMAddPackage(
  NAME alpaka
  GITHUB_REPOSITORY alpaka-group/alpaka
  GIT_TAG 1.2.0
)

# The benchmark library itself is taken from the enclosing source tree.
CPMAddPackage(
  NAME KitGenBench
  SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/../.."
)

# ---- Create standalone executable ----

# Every .cpp file below source/ becomes part of the executable.
file(GLOB sources CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/source/*.cpp")

alpaka_add_executable(${PROJECT_NAME} ${sources})

set_target_properties(
  ${PROJECT_NAME}
  PROPERTIES
    OUTPUT_NAME ${PROJECT_NAME}
    CXX_STANDARD 20
    CXX_STANDARD_REQUIRED ON
    CXX_EXTENSIONS OFF
)

target_link_libraries(
  ${PROJECT_NAME}
  KitGenBench::KitGenBench
  nlohmann_json::nlohmann_json
  alpaka::alpaka
)
Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
#include <kitgenbench/DeviceClock.h>
2+
#include <kitgenbench/kitgenbench.h>
3+
#include <kitgenbench/setup.h>
4+
#include <kitgenbench/version.h>
5+
6+
#include <alpaka/workdiv/WorkDivMembers.hpp>
7+
#include <cstdint>
8+
#include <limits>
9+
#include <tuple>
10+
#include <utility>
11+
#include <variant>
12+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
13+
# include <cuda_runtime.h>
14+
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
15+
16+
#include <alpaka/acc/AccCpuSerial.hpp>
17+
#include <alpaka/acc/Tag.hpp>
18+
#include <alpaka/atomic/Traits.hpp>
19+
#include <alpaka/core/Common.hpp>
20+
#include <alpaka/mem/buf/Traits.hpp>
21+
#include <alpaka/mem/view/Traits.hpp>
22+
#include <cstdlib>
23+
#include <type_traits>
24+
#include <vector>
25+
26+
#include "nlohmann/json_fwd.hpp"
27+
28+
using nlohmann::json;
29+
using namespace kitgenbench;
30+
31+
using Dim = alpaka::DimInt<1>;
32+
using Idx = std::uint32_t;
33+
using AccTag = std::remove_cvref_t<decltype(std::get<0>(alpaka::EnabledAccTags{}))>;
34+
using Acc = alpaka::TagToAcc<AccTag, Dim, Idx>;
35+
36+
// Action identifiers added by this example. Recipes tag their results with them so that the
// logger and the checker can tell an allocation from a deallocation.
namespace kitgenbench::Actions {
  [[maybe_unused]] static constexpr int MALLOC{1};
  [[maybe_unused]] static constexpr int FREE{2};
}  // namespace kitgenbench::Actions
40+
41+
auto makeExecutionDetails() {
42+
auto const platformAcc = alpaka::Platform<Acc>{};
43+
auto const dev = alpaka::getDevByIdx(platformAcc, 0);
44+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
45+
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024U * 1024U * 1024U);
46+
#endif
47+
uint32_t const numThreadsPerBlock = 256U;
48+
uint32_t const numThreads = 4U * numThreadsPerBlock;
49+
auto workdiv = [numThreads, numThreadsPerBlock]() -> alpaka::WorkDivMembers<Dim, Idx> {
50+
if constexpr (std::is_same_v<alpaka::AccToTag<Acc>, alpaka::TagCpuSerial>) {
51+
return {{1U}, {1U}, {numThreads}};
52+
} else {
53+
return alpaka::WorkDivMembers<Dim, Idx>{
54+
{numThreads / numThreadsPerBlock}, {numThreadsPerBlock}, {1U}};
55+
}
56+
}();
57+
return kitgenbench::ExecutionDetails<Acc, decltype(dev)>{workdiv, dev};
58+
}
59+
60+
static constexpr std::uint32_t ALLOCATION_SIZE = 16U;
61+
62+
// Reasons for the check to yield the result it yielded.
63+
// `completed` means that the check completed. The result can still be true/false depending on
64+
// whether the obtained value was actually correct. `notApplicable` means that the checks were
65+
// skipped. `nullpointer` means that a nullpointer was given, so the checks couldn't run at all.
66+
enum class Reason { completed, notApplicable, nullpointer };
67+
using Payload = std::variant<std::span<std::byte, ALLOCATION_SIZE>, std::pair<bool, Reason>>;
68+
69+
template <typename TAccTag> struct SimpleSumLogger {
70+
using Clock = DeviceClock<TAccTag>;
71+
72+
DeviceClock<TAccTag>::DurationType mallocDuration;
73+
std::uint32_t mallocCounter{0U};
74+
75+
DeviceClock<TAccTag>::DurationType freeDuration;
76+
std::uint32_t freeCounter{0U};
77+
78+
std::uint32_t nullpointersObtained{0U};
79+
std::uint32_t failedChecksCounter{0U};
80+
std::uint32_t invalidCheckResults{0U};
81+
82+
template <typename TAcc> ALPAKA_FN_INLINE ALPAKA_FN_ACC auto call(TAcc const& acc, auto func) {
83+
static_assert(
84+
std::is_same_v<alpaka::TagToAcc<TAccTag, alpaka::Dim<Acc>, alpaka::Idx<Acc>>, TAcc>);
85+
auto start = Clock::clock();
86+
auto result = func(acc);
87+
auto end = Clock::clock();
88+
89+
if (std::get<0>(result) == Actions::MALLOC) {
90+
mallocDuration += Clock::duration(start, end);
91+
mallocCounter++;
92+
}
93+
94+
if (std::get<0>(result) == Actions::FREE) {
95+
freeDuration += Clock::duration(start, end);
96+
freeCounter++;
97+
}
98+
99+
if (std::get<0>(result) == Actions::CHECK) {
100+
if (std::holds_alternative<std::pair<bool, Reason>>(std::get<1>(result))) {
101+
auto [passed, reason] = std::get<std::pair<bool, Reason>>(std::get<1>(result));
102+
if (not passed) {
103+
if (reason == Reason::nullpointer) {
104+
nullpointersObtained++;
105+
}
106+
if (reason == Reason::completed) {
107+
failedChecksCounter++;
108+
}
109+
}
110+
} else {
111+
invalidCheckResults++;
112+
}
113+
}
114+
115+
return result;
116+
}
117+
118+
ALPAKA_FN_ACC void accumulate(const auto& acc, const SimpleSumLogger& other) {
119+
alpaka::atomicAdd(acc, &mallocDuration, other.mallocDuration);
120+
alpaka::atomicAdd(acc, &mallocCounter, other.mallocCounter);
121+
alpaka::atomicAdd(acc, &freeDuration, other.freeDuration);
122+
alpaka::atomicAdd(acc, &freeCounter, other.freeCounter);
123+
alpaka::atomicAdd(acc, &nullpointersObtained, other.nullpointersObtained);
124+
alpaka::atomicAdd(acc, &failedChecksCounter, other.failedChecksCounter);
125+
alpaka::atomicAdd(acc, &invalidCheckResults, other.invalidCheckResults);
126+
}
127+
128+
nlohmann::json generateReport() {
129+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
130+
cudaDeviceProp prop;
131+
cudaGetDeviceProperties(&prop, 0);
132+
auto clockRate = prop.clockRate;
133+
#else
134+
auto clockRate = 1;
135+
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
136+
return {
137+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
138+
{"clock rate [1/ms]", clockRate},
139+
#endif
140+
{"allocation total time [ms]", mallocDuration / clockRate},
141+
{"allocation average time [ms]",
142+
mallocDuration / clockRate / (mallocCounter > 0 ? mallocCounter : 1U)},
143+
{"allocation count", mallocCounter},
144+
{"deallocation total time [ms]", freeDuration / clockRate},
145+
{"deallocation average time [ms]",
146+
freeDuration / clockRate / (freeCounter > 0 ? freeCounter : 1U)},
147+
{"deallocation count ", freeCounter},
148+
{"failed checks count", failedChecksCounter},
149+
{"nullpointers count", nullpointersObtained},
150+
{"invalid check results count", invalidCheckResults},
151+
};
152+
}
153+
};
154+
155+
template <template <typename, size_t> typename T, typename TType, size_t TExtent> struct IsSpan {
156+
static constexpr bool value = std::is_same_v<T<TType, TExtent>, std::span<TType, TExtent>>;
157+
};
158+
159+
template <template <typename, size_t> typename T, typename TType, size_t TExtent>
160+
constexpr auto isSpan(T<TType, TExtent>) {
161+
return IsSpan<T, TType, TExtent>{};
162+
}
163+
164+
template <typename TNew, typename TOld, std::size_t TExtent>
165+
constexpr auto convertDataType(std::span<TOld, TExtent>& range) {
166+
return std::span<TNew, TExtent * sizeof(TOld) / sizeof(TNew)>(
167+
reinterpret_cast<TNew*>(range.data()), range.size());
168+
}
169+
170+
struct IotaReductionChecker {
171+
uint32_t currentValue{};
172+
173+
ALPAKA_FN_ACC auto check([[maybe_unused]] const auto& acc, const auto& result) {
174+
if (std::get<0>(result) != Actions::MALLOC) {
175+
return std::make_tuple(Actions::CHECK, Payload(std::make_pair(true, Reason::notApplicable)));
176+
}
177+
auto range = std::get<0>(std::get<1>(result));
178+
if (range.data() == nullptr) {
179+
return std::make_tuple(Actions::CHECK, Payload(std::make_pair(false, Reason::nullpointer)));
180+
}
181+
auto uintRange = convertDataType<uint32_t>(range);
182+
std::iota(std::begin(uintRange), std::end(uintRange), currentValue);
183+
size_t n = uintRange.size();
184+
// The exact formula is using size_t because n is size_t. Casting it down will oftentimes run
185+
// into an overflow that the reduction encounters, too.
186+
auto expected = static_cast<uint32_t>(n * currentValue + n * (n - 1) / 2) ^ currentValue;
187+
currentValue ^= std::reduce(std::cbegin(uintRange), std::cend(uintRange));
188+
return std::make_tuple(+Actions::CHECK,
189+
Payload(std::make_pair(expected == currentValue, Reason::completed)));
190+
}
191+
192+
ALPAKA_FN_ACC auto accumulate(const auto& acc, const auto& other) {
193+
alpaka::atomicXor(acc, &currentValue, other.currentValue);
194+
}
195+
196+
nlohmann::json generateReport() { return {{"final value", currentValue}}; }
197+
};
198+
199+
template <typename T> struct NoStoreProvider {
200+
ALPAKA_FN_ACC T load(auto const) { return {}; }
201+
ALPAKA_FN_ACC void store(auto const&, T&&, auto const) {}
202+
nlohmann::json generateReport() { return {}; }
203+
};
204+
205+
template <typename T> struct AccumulateResultsProvider {
206+
T result{};
207+
ALPAKA_FN_ACC T load(auto const) { return {}; }
208+
ALPAKA_FN_ACC void store(const auto& acc, T&& instance, auto const) {
209+
result.accumulate(acc, instance);
210+
}
211+
nlohmann::json generateReport() { return result.generateReport(); }
212+
};
213+
214+
template <typename T> struct AcumulateChecksProvider {
215+
T result{};
216+
ALPAKA_FN_ACC T load(auto const threadIndex) { return {threadIndex}; }
217+
ALPAKA_FN_ACC void store(const auto& acc, T&& instance, auto const) {
218+
result.accumulate(acc, instance);
219+
}
220+
nlohmann::json generateReport() { return result.generateReport(); }
221+
};
222+
223+
namespace setups {
224+
struct SingleSizeMallocRecipe {
225+
static constexpr std::uint32_t allocationSize{ALLOCATION_SIZE};
226+
static constexpr std::uint32_t numAllocations{256U};
227+
std::array<std::byte*, numAllocations> pointers{{}};
228+
std::uint32_t counter{0U};
229+
230+
ALPAKA_FN_ACC auto next([[maybe_unused]] const auto& acc) {
231+
if (counter >= numAllocations)
232+
return std::make_tuple(+kitgenbench::Actions::STOP,
233+
Payload(std::span<std::byte, allocationSize>{
234+
static_cast<std::byte*>(nullptr), allocationSize}));
235+
pointers[counter] = static_cast<std::byte*>(malloc(allocationSize));
236+
auto result = std::make_tuple(
237+
+kitgenbench::Actions::MALLOC,
238+
Payload(std::span<std::byte, allocationSize>(pointers[counter], allocationSize)));
239+
counter++;
240+
return result;
241+
}
242+
243+
nlohmann::json generateReport() { return {}; }
244+
};
245+
246+
template <typename TAcc, typename TDev> struct InstructionDetails {
247+
struct DevicePackage {
248+
NoStoreProvider<SingleSizeMallocRecipe> recipes{};
249+
AccumulateResultsProvider<SimpleSumLogger<AccTag>> loggers{};
250+
AcumulateChecksProvider<IotaReductionChecker> checkers{};
251+
};
252+
253+
DevicePackage hostData{};
254+
alpaka::Buf<TDev, DevicePackage, alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> devicePackageBuffer;
255+
256+
InstructionDetails(TDev const& device)
257+
: devicePackageBuffer(alpaka::allocBuf<DevicePackage, Idx>(device, 1U)) {};
258+
259+
auto sendTo([[maybe_unused]] TDev const& device, auto& queue) {
260+
alpaka::memset(queue, devicePackageBuffer, 0U);
261+
return reinterpret_cast<DevicePackage*>(alpaka::getPtrNative(devicePackageBuffer));
262+
}
263+
auto retrieveFrom([[maybe_unused]] TDev const& device, auto& queue) {
264+
auto const platformHost = alpaka::PlatformCpu{};
265+
auto const devHost = getDevByIdx(platformHost, 0);
266+
auto view = alpaka::createView(devHost, &hostData, 1U);
267+
alpaka::memcpy(queue, view, devicePackageBuffer);
268+
}
269+
270+
nlohmann::json generateReport() {
271+
return {{"recipes", hostData.recipes.generateReport()},
272+
{"logs", hostData.loggers.generateReport()},
273+
{"checks", hostData.checkers.generateReport()}};
274+
}
275+
};
276+
277+
template <typename TAcc, typename TDev> auto makeInstructionDetails(TDev const& device) {
278+
return InstructionDetails<TAcc, TDev>(device);
279+
}
280+
281+
auto composeSetup() {
282+
auto execution = makeExecutionDetails();
283+
return setup::composeSetup("Non trivial", execution,
284+
makeInstructionDetails<Acc>(execution.device), {});
285+
}
286+
} // namespace setups
287+
288+
/**
289+
* @brief Compose a report from the provided metadata, configuration, and individual reports.
290+
*
291+
* This function takes a json object representing the metadata, a json object
292+
* representing the configuration, and a json object representing the individual
293+
* reports, and composes a report by merging them into a single json object.
294+
* The resulting json object is returned.
295+
*
296+
* @param metadata The json object representing the metadata.
297+
* @param config The json object representing the configuration.
298+
* @param individualReports The json object representing the individual reports.
299+
* @return json The json object representing the composed report.
300+
*/
301+
json composeReport(json const& metadata, json const& benchmarkReports) {
302+
json report{};
303+
report["metadata"] = metadata;
304+
report["benchmarks"] = benchmarkReports;
305+
return report;
306+
}
307+
308+
/**
 * @brief Print the report to standard output, followed by a line break and a flush.
 *
 * @param report The json object to print.
 */
void output(json const& report) {
  std::cout << report << std::endl;
}
309+
310+
auto main() -> int {
311+
auto metadata = gatherMetadata();
312+
auto setup = setups::composeSetup();
313+
auto benchmarkReports = runBenchmarks(setup);
314+
auto report = composeReport(metadata, benchmarkReports);
315+
output(report);
316+
return EXIT_SUCCESS;
317+
}

0 commit comments

Comments
 (0)