Skip to content

Commit 1dd34f2

Browse files
committed
Multi-setup example
1 parent 920229c commit 1dd34f2

File tree

3 files changed

+375
-0
lines changed

3 files changed

+375
-0
lines changed

examples/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,8 @@ add_subdirectory(
1111
${CMAKE_CURRENT_LIST_DIR}/simple-mallocMC
1212
${CMAKE_BINARY_DIR}/examples/simple-mallocMC
1313
)
14+
15+
# Build the multi-setup example; its build tree lives under <build>/examples/multi-setup.
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/multi-setup ${CMAKE_BINARY_DIR}/examples/multi-setup)
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Build script of the KitGenBench "multi-setup" example executable.
cmake_minimum_required(VERSION 3.14...3.22)

# Select the NEW behaviour of policy CMP0167 on CMake versions that know this policy.
if(POLICY CMP0167)
  cmake_policy(SET CMP0167 NEW)
endif()

project(KitGenBenchExampleMultiSetup LANGUAGES CXX)

# ---- Import tools ----

include(../../cmake/tools.cmake)

# ---- Dependencies ----

include(../../cmake/CPM.cmake)

CPMAddPackage(
  NAME nlohmann_json
  GITHUB_REPOSITORY nlohmann/json
  VERSION 3.11.3 NO_TESTS
)

CPMAddPackage(
  NAME alpaka
  GITHUB_REPOSITORY alpaka-group/alpaka
  GIT_TAG 1.2.0
)

# KitGenBench itself is taken from the enclosing source tree (two directories up).
CPMAddPackage(NAME KitGenBench SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)

# ---- Create standalone executable ----

# Every .cpp file below source/ belongs to the executable; CONFIGURE_DEPENDS re-runs the glob at
# build time so that added or removed files are picked up.
file(GLOB sources CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/source/*.cpp)

alpaka_add_executable(${PROJECT_NAME} ${sources})

# Require plain ISO C++20 (no compiler extensions) and name the binary after the project.
set_target_properties(
  ${PROJECT_NAME}
  PROPERTIES
    OUTPUT_NAME ${PROJECT_NAME}
    CXX_STANDARD 20
    CXX_STANDARD_REQUIRED ON
    CXX_EXTENSIONS OFF
)

target_link_libraries(
  ${PROJECT_NAME}
  KitGenBench::KitGenBench
  nlohmann_json::nlohmann_json
  alpaka::alpaka
)
Lines changed: 320 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
#include <kitgenbench/DeviceClock.h>
2+
#include <kitgenbench/kitgenbench.h>
3+
#include <kitgenbench/setup.h>
4+
#include <kitgenbench/version.h>
5+
#include <sys/types.h>
6+
7+
#include <alpaka/workdiv/WorkDivMembers.hpp>
8+
#include <cstdint>
9+
#include <limits>
10+
#include <sstream>
11+
#include <tuple>
12+
#include <utility>
13+
#include <variant>
14+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
15+
# include <cuda_runtime.h>
16+
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
17+
18+
#include <alpaka/acc/AccCpuSerial.hpp>
19+
#include <alpaka/acc/Tag.hpp>
20+
#include <alpaka/atomic/Traits.hpp>
21+
#include <alpaka/core/Common.hpp>
22+
#include <alpaka/mem/buf/Traits.hpp>
23+
#include <alpaka/mem/view/Traits.hpp>
24+
#include <cstdlib>
25+
#include <type_traits>
26+
#include <vector>
27+
28+
#include "nlohmann/json_fwd.hpp"
29+
30+
using nlohmann::json;
31+
using namespace kitgenbench;
32+
33+
// Compile-time configuration of the accelerator used throughout this example: a one-dimensional
// index space with 32-bit unsigned indices, running on the accelerator that belongs to the first
// tag listed in alpaka::EnabledAccTags.
using Dim = alpaka::DimInt<1>;
34+
using Idx = std::uint32_t;
35+
using AccTag = std::remove_cvref_t<decltype(std::get<0>(alpaka::EnabledAccTags{}))>;
36+
using Acc = alpaka::TagToAcc<AccTag, Dim, Idx>;
37+
38+
// Action identifiers used by this example to tag the result of each step.
namespace kitgenbench::Actions {
  /// Identifier of an allocation action.
  [[maybe_unused]] constexpr static int MALLOC{1};
  /// Identifier of a deallocation action.
  [[maybe_unused]] constexpr static int FREE{2};
}  // namespace kitgenbench::Actions
42+
43+
auto makeExecutionDetails() {
44+
auto const platformAcc = alpaka::Platform<Acc>{};
45+
auto const dev = alpaka::getDevByIdx(platformAcc, 0);
46+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
47+
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024U * 1024U * 1024U);
48+
#endif
49+
uint32_t const numThreadsPerBlock = 256U;
50+
uint32_t const numThreads = 4U * numThreadsPerBlock;
51+
auto workdiv = [numThreads, numThreadsPerBlock]() -> alpaka::WorkDivMembers<Dim, Idx> {
52+
if constexpr (std::is_same_v<alpaka::AccToTag<Acc>, alpaka::TagCpuSerial>) {
53+
return {{1U}, {1U}, {numThreads}};
54+
} else {
55+
return alpaka::WorkDivMembers<Dim, Idx>{
56+
{numThreads / numThreadsPerBlock}, {numThreadsPerBlock}, {1U}};
57+
}
58+
}();
59+
return kitgenbench::ExecutionDetails<Acc, decltype(dev)>{workdiv, dev};
60+
}
61+
62+
/// Explains why a check yielded the result it yielded.
enum class Reason {
  /// The check ran to completion. Its result can still be true or false, depending on whether
  /// the obtained value was actually correct.
  completed,
  /// The check was skipped.
  notApplicable,
  /// A nullpointer was given, so the check could not run at all.
  nullpointer
};
67+
// Data attached to the result of an action: either a range of bytes or the (passed, reason)
// verdict of a check.
using Payload = std::variant<std::span<std::byte>, std::pair<bool, Reason>>;
68+
69+
template <typename TAccTag> struct SimpleSumLogger {
70+
using Clock = DeviceClock<TAccTag>;
71+
72+
DeviceClock<TAccTag>::DurationType mallocDuration;
73+
std::uint32_t mallocCounter{0U};
74+
75+
DeviceClock<TAccTag>::DurationType freeDuration;
76+
std::uint32_t freeCounter{0U};
77+
78+
std::uint32_t nullpointersObtained{0U};
79+
std::uint32_t failedChecksCounter{0U};
80+
std::uint32_t invalidCheckResults{0U};
81+
82+
template <typename TAcc> ALPAKA_FN_INLINE ALPAKA_FN_ACC auto call(TAcc const& acc, auto func) {
83+
static_assert(
84+
std::is_same_v<alpaka::TagToAcc<TAccTag, alpaka::Dim<Acc>, alpaka::Idx<Acc>>, TAcc>);
85+
auto start = Clock::clock();
86+
auto result = func(acc);
87+
auto end = Clock::clock();
88+
89+
if (std::get<0>(result) == Actions::MALLOC) {
90+
mallocDuration += Clock::duration(start, end);
91+
mallocCounter++;
92+
}
93+
94+
if (std::get<0>(result) == Actions::FREE) {
95+
freeDuration += Clock::duration(start, end);
96+
freeCounter++;
97+
}
98+
99+
if (std::get<0>(result) == Actions::CHECK) {
100+
if (std::holds_alternative<std::pair<bool, Reason>>(std::get<1>(result))) {
101+
auto [passed, reason] = std::get<std::pair<bool, Reason>>(std::get<1>(result));
102+
if (not passed) {
103+
if (reason == Reason::nullpointer) {
104+
nullpointersObtained++;
105+
}
106+
if (reason == Reason::completed) {
107+
failedChecksCounter++;
108+
}
109+
}
110+
} else {
111+
invalidCheckResults++;
112+
}
113+
}
114+
115+
return result;
116+
}
117+
118+
ALPAKA_FN_ACC void accumulate(const auto& acc, const SimpleSumLogger& other) {
119+
alpaka::atomicAdd(acc, &mallocDuration, other.mallocDuration);
120+
alpaka::atomicAdd(acc, &mallocCounter, other.mallocCounter);
121+
alpaka::atomicAdd(acc, &freeDuration, other.freeDuration);
122+
alpaka::atomicAdd(acc, &freeCounter, other.freeCounter);
123+
alpaka::atomicAdd(acc, &nullpointersObtained, other.nullpointersObtained);
124+
alpaka::atomicAdd(acc, &failedChecksCounter, other.failedChecksCounter);
125+
alpaka::atomicAdd(acc, &invalidCheckResults, other.invalidCheckResults);
126+
}
127+
128+
nlohmann::json generateReport() {
129+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
130+
cudaDeviceProp prop;
131+
cudaGetDeviceProperties(&prop, 0);
132+
auto clockRate = prop.clockRate;
133+
#else
134+
auto clockRate = 1;
135+
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
136+
return {
137+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
138+
{"clock rate [1/ms]", clockRate},
139+
#endif
140+
{"allocation total time [ms]", mallocDuration / clockRate},
141+
{"allocation average time [ms]",
142+
mallocDuration / clockRate / (mallocCounter > 0 ? mallocCounter : 1U)},
143+
{"allocation count", mallocCounter},
144+
{"deallocation total time [ms]", freeDuration / clockRate},
145+
{"deallocation average time [ms]",
146+
freeDuration / clockRate / (freeCounter > 0 ? freeCounter : 1U)},
147+
{"deallocation count ", freeCounter},
148+
{"failed checks count", failedChecksCounter},
149+
{"nullpointers count", nullpointersObtained},
150+
{"invalid check results count", invalidCheckResults},
151+
};
152+
}
153+
};
154+
155+
template <template <typename, size_t> typename T, typename TType, size_t TExtent> struct IsSpan {
156+
static constexpr bool value = std::is_same_v<T<TType, TExtent>, std::span<TType, TExtent>>;
157+
};
158+
159+
template <template <typename, size_t> typename T, typename TType, size_t TExtent>
160+
constexpr auto isSpan(T<TType, TExtent>) {
161+
return IsSpan<T, TType, TExtent>{};
162+
}
163+
164+
/**
 * @brief Reinterpret the memory viewed by `range` as a span of `TNew` elements.
 *
 * The new span starts at the same address; its element count is the byte size of `range` divided
 * by `sizeof(TNew)` (integer division).
 *
 * @tparam TNew Element type of the returned span.
 * @tparam TOld Element type of the given span.
 * @param range The span to reinterpret.
 * @return A `std::span<TNew>` over the same memory.
 */
template <typename TNew, typename TOld> constexpr auto convertDataType(std::span<TOld>& range) {
  auto* const first = reinterpret_cast<TNew*>(range.data());
  auto const count = range.size() * sizeof(TOld) / sizeof(TNew);
  return std::span<TNew>(first, count);
}
168+
169+
struct IotaReductionChecker {
170+
uint32_t currentValue{};
171+
172+
ALPAKA_FN_ACC auto check([[maybe_unused]] const auto& acc, const auto& result) {
173+
if (std::get<0>(result) != Actions::MALLOC) {
174+
return std::make_tuple(Actions::CHECK, Payload(std::make_pair(true, Reason::notApplicable)));
175+
}
176+
auto range = std::get<0>(std::get<1>(result));
177+
if (range.data() == nullptr) {
178+
return std::make_tuple(Actions::CHECK, Payload(std::make_pair(false, Reason::nullpointer)));
179+
}
180+
auto uintRange = convertDataType<uint32_t>(range);
181+
std::iota(std::begin(uintRange), std::end(uintRange), currentValue);
182+
size_t n = uintRange.size();
183+
// The exact formula is using size_t because n is size_t. Casting it down will oftentimes run
184+
// into an overflow that the reduction encounters, too.
185+
auto expected = static_cast<uint32_t>(n * currentValue + n * (n - 1) / 2) ^ currentValue;
186+
currentValue ^= std::reduce(std::cbegin(uintRange), std::cend(uintRange));
187+
return std::make_tuple(+Actions::CHECK,
188+
Payload(std::make_pair(expected == currentValue, Reason::completed)));
189+
}
190+
191+
ALPAKA_FN_ACC auto accumulate(const auto& acc, const auto& other) {
192+
alpaka::atomicXor(acc, &currentValue, other.currentValue);
193+
}
194+
195+
nlohmann::json generateReport() { return {{"final value", currentValue}}; }
196+
};
197+
198+
template <typename T> struct NoStoreProvider {
199+
ALPAKA_FN_ACC T load(auto const) { return {}; }
200+
ALPAKA_FN_ACC void store(auto const&, T&&, auto const) {}
201+
nlohmann::json generateReport() { return {}; }
202+
};
203+
204+
template <typename T> struct AccumulateResultsProvider {
205+
T result{};
206+
ALPAKA_FN_ACC T load(auto const) { return {}; }
207+
ALPAKA_FN_ACC void store(const auto& acc, T&& instance, auto const) {
208+
result.accumulate(acc, instance);
209+
}
210+
nlohmann::json generateReport() { return result.generateReport(); }
211+
};
212+
213+
template <typename T> struct AcumulateChecksProvider {
214+
T result{};
215+
ALPAKA_FN_ACC T load(auto const threadIndex) { return {threadIndex}; }
216+
ALPAKA_FN_ACC void store(const auto& acc, T&& instance, auto const) {
217+
result.accumulate(acc, instance);
218+
}
219+
nlohmann::json generateReport() { return result.generateReport(); }
220+
};
221+
222+
namespace setups {
223+
template <uint32_t size> struct SingleSizeMallocRecipe {
224+
static constexpr std::uint32_t allocationSize{size};
225+
static constexpr std::uint32_t numAllocations{256U};
226+
std::array<std::byte*, numAllocations> pointers{{}};
227+
std::uint32_t counter{0U};
228+
229+
ALPAKA_FN_ACC auto next([[maybe_unused]] const auto& acc) {
230+
if (counter >= numAllocations)
231+
return std::make_tuple(
232+
+kitgenbench::Actions::STOP,
233+
Payload(std::span<std::byte>{static_cast<std::byte*>(nullptr), allocationSize}));
234+
pointers[counter] = static_cast<std::byte*>(malloc(allocationSize));
235+
auto result
236+
= std::make_tuple(+kitgenbench::Actions::MALLOC,
237+
Payload(std::span<std::byte>(pointers[counter], allocationSize)));
238+
counter++;
239+
return result;
240+
}
241+
242+
nlohmann::json generateReport() { return {}; }
243+
};
244+
245+
template <uint32_t size, typename TAcc, typename TDev> struct InstructionDetails {
246+
struct DevicePackage {
247+
NoStoreProvider<SingleSizeMallocRecipe<size>> recipes{};
248+
AccumulateResultsProvider<SimpleSumLogger<AccTag>> loggers{};
249+
AcumulateChecksProvider<IotaReductionChecker> checkers{};
250+
};
251+
252+
DevicePackage hostData{};
253+
alpaka::Buf<TDev, DevicePackage, alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> devicePackageBuffer;
254+
255+
InstructionDetails(TDev const& device)
256+
: devicePackageBuffer(alpaka::allocBuf<DevicePackage, Idx>(device, 1U)) {};
257+
258+
auto sendTo([[maybe_unused]] TDev const& device, auto& queue) {
259+
alpaka::memset(queue, devicePackageBuffer, 0U);
260+
return reinterpret_cast<DevicePackage*>(alpaka::getPtrNative(devicePackageBuffer));
261+
}
262+
auto retrieveFrom([[maybe_unused]] TDev const& device, auto& queue) {
263+
auto const platformHost = alpaka::PlatformCpu{};
264+
auto const devHost = getDevByIdx(platformHost, 0);
265+
auto view = alpaka::createView(devHost, &hostData, 1U);
266+
alpaka::memcpy(queue, view, devicePackageBuffer);
267+
}
268+
269+
nlohmann::json generateReport() {
270+
return {{"recipes", hostData.recipes.generateReport()},
271+
{"logs", hostData.loggers.generateReport()},
272+
{"checks", hostData.checkers.generateReport()}};
273+
}
274+
};
275+
276+
template <typename TAcc, uint32_t size, typename TDev>
277+
auto makeInstructionDetails(TDev const& device) {
278+
return InstructionDetails<size, TAcc, TDev>(device);
279+
}
280+
281+
// The allocation sizes to set up; composeSetups() creates one setup per entry.
using ALLOCATION_SIZES = std::integer_sequence<uint32_t, 16U, 256U>;
282+
283+
template <uint32_t... nums> auto composeSetups(std::integer_sequence<uint32_t, nums...>) {
284+
auto execution = makeExecutionDetails();
285+
return std::make_tuple(
286+
setup::composeSetup((std::stringstream{} << "Single size: " << nums).str(), execution,
287+
makeInstructionDetails<Acc, nums>(execution.device), {})...);
288+
}
289+
} // namespace setups
290+
291+
/**
292+
* @brief Compose a report from the provided metadata, configuration, and individual reports.
293+
*
294+
* This function takes a json object representing the metadata, a json object
295+
* representing the configuration, and a json object representing the individual
296+
* reports, and composes a report by merging them into a single json object.
297+
* The resulting json object is returned.
298+
*
299+
* @param metadata The json object representing the metadata.
300+
* @param config The json object representing the configuration.
301+
* @param individualReports The json object representing the individual reports.
302+
* @return json The json object representing the composed report.
303+
*/
304+
json composeReport(json const& metadata, json const& benchmarkReports) {
305+
json report{};
306+
report["metadata"] = metadata;
307+
report["benchmarks"] = benchmarkReports;
308+
return report;
309+
}
310+
311+
/// Write the report to standard output, followed by a newline, and flush the stream.
void output(json const& report) {
  std::cout << report;
  std::cout << std::endl;
}
312+
313+
auto main() -> int {
314+
auto metadata = gatherMetadata();
315+
auto setup = setups::composeSetups(setups::ALLOCATION_SIZES{});
316+
auto benchmarkReports = std::apply([](auto&&... args) { return runBenchmarks(args...); }, setup);
317+
auto report = composeReport(metadata, benchmarkReports);
318+
output(report);
319+
return EXIT_SUCCESS;
320+
}

0 commit comments

Comments
 (0)