Skip to content

Commit 5c1a1d8

Browse files
Merge pull request #4 from chillenzer/complete-mallocMC-example
Complete mallocMC example
2 parents 920229c + 56c306a commit 5c1a1d8

File tree

3 files changed

+43
-19
lines changed

3 files changed

+43
-19
lines changed

examples/plain-malloc/source/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ auto makeExecutionDetails() {
4545
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024U * 1024U * 1024U);
4646
#endif
4747
uint32_t const numThreadsPerBlock = 256U;
48-
uint32_t const numThreads = 4U * numThreadsPerBlock;
48+
uint32_t const numThreads = 16U * numThreadsPerBlock;
4949
auto workdiv = [numThreads, numThreadsPerBlock]() -> alpaka::WorkDivMembers<Dim, Idx> {
5050
if constexpr (std::is_same_v<alpaka::AccToTag<Acc>, alpaka::TagCpuSerial>) {
5151
return {{1U}, {1U}, {numThreads}};

examples/simple-mallocMC/source/main.cpp

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ using MyAllocator
3939
= mallocMC::Allocator<Acc, CreationPolicies::FlatterScatter<>, DistributionPolicies::Noop,
4040
OOMPolicies::ReturnNull, ReservePoolPolicies::AlpakaBuf<Acc>,
4141
AlignmentPolicies::Shrink<>>;
42+
using MyAllocatorHandle =
43+
typename std::remove_cvref_t<decltype(std::declval<MyAllocator>().getAllocatorHandle())>;
44+
static constexpr std::uint32_t ALLOCATION_SIZE = 16U;
45+
static constexpr std::size_t HEAP_SIZE = 1024U * 1024U * 1024U;
4246

4347
namespace kitgenbench::Actions {
4448
[[maybe_unused]] static constexpr int MALLOC = 1;
@@ -48,11 +52,8 @@ namespace kitgenbench::Actions {
4852
auto makeExecutionDetails() {
4953
auto const platformAcc = alpaka::Platform<Acc>{};
5054
auto const dev = alpaka::getDevByIdx(platformAcc, 0);
51-
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
52-
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024U * 1024U * 1024U);
53-
#endif
5455
uint32_t const numThreadsPerBlock = 256U;
55-
uint32_t const numThreads = 4U * numThreadsPerBlock;
56+
uint32_t const numThreads = 16U * numThreadsPerBlock;
5657
auto workdiv = [numThreads, numThreadsPerBlock]() -> alpaka::WorkDivMembers<Dim, Idx> {
5758
if constexpr (std::is_same_v<alpaka::AccToTag<Acc>, alpaka::TagCpuSerial>) {
5859
return {{1U}, {1U}, {numThreads}};
@@ -64,8 +65,6 @@ auto makeExecutionDetails() {
6465
return kitgenbench::ExecutionDetails<Acc, decltype(dev)>{workdiv, dev};
6566
}
6667

67-
static constexpr std::uint32_t ALLOCATION_SIZE = 16U;
68-
6968
// Reasons for the check to yield the result it yielded.
7069
// `completed` means that the check completed. The result can still be true/false depending on
7170
// whether the obtained value was actually correct. `notApplicable` means that the checks were
@@ -209,6 +208,13 @@ template <typename T> struct NoStoreProvider {
209208
nlohmann::json generateReport() { return {}; }
210209
};
211210

211+
// Provider whose `load` constructs every `T` from one fixed tuple of resources.
// `T` is the type returned by `load`; `T_Resource...` are the element types of `resources`.
template <typename T, typename... T_Resource> struct ResourceProvider {
212+
// Resources passed to each `T` created by `load`; value-initialised unless set by the owner.
std::tuple<T_Resource...> resources{};
213+
// Returns a `T` list-initialised from the whole `resources` tuple; the unnamed argument is ignored.
ALPAKA_FN_ACC T load(auto const) { return {resources}; }
214+
// No-op: all three arguments are unnamed and nothing is stored.
ALPAKA_FN_ACC void store(auto const&, T&&, auto const) {}
215+
// Returns a value-initialised nlohmann::json, i.e. this provider contributes no report data.
nlohmann::json generateReport() { return {}; }
216+
};
217+
212218
template <typename T> struct AccumulateResultsProvider {
213219
T result{};
214220
ALPAKA_FN_ACC T load(auto const) { return {}; }
@@ -229,6 +235,9 @@ template <typename T> struct AcumulateChecksProvider {
229235

230236
namespace setups {
231237
struct SingleSizeMallocRecipe {
238+
ALPAKA_FN_ACC SingleSizeMallocRecipe(std::tuple<MyAllocatorHandle> handleInTuple)
239+
: handle{std::get<0>(handleInTuple)} {}
240+
MyAllocatorHandle handle;
232241
static constexpr std::uint32_t allocationSize{ALLOCATION_SIZE};
233242
static constexpr std::uint32_t numAllocations{256U};
234243
std::array<std::byte*, numAllocations> pointers{{}};
@@ -239,7 +248,7 @@ namespace setups {
239248
return std::make_tuple(+kitgenbench::Actions::STOP,
240249
Payload(std::span<std::byte, allocationSize>{
241250
static_cast<std::byte*>(nullptr), allocationSize}));
242-
pointers[counter] = static_cast<std::byte*>(malloc(allocationSize));
251+
pointers[counter] = static_cast<std::byte*>(handle.malloc(acc, allocationSize));
243252
auto result = std::make_tuple(
244253
+kitgenbench::Actions::MALLOC,
245254
Payload(std::span<std::byte, allocationSize>(pointers[counter], allocationSize)));
@@ -252,19 +261,25 @@ namespace setups {
252261

253262
template <typename TAcc, typename TDev> struct InstructionDetails {
254263
struct DevicePackage {
255-
NoStoreProvider<SingleSizeMallocRecipe> recipes{};
264+
ResourceProvider<SingleSizeMallocRecipe, MyAllocatorHandle> recipes{};
256265
AccumulateResultsProvider<SimpleSumLogger<AccTag>> loggers{};
257266
AcumulateChecksProvider<IotaReductionChecker> checkers{};
267+
268+
DevicePackage(MyAllocatorHandle handle) : recipes{handle} {};
258269
};
259270

260-
DevicePackage hostData{};
271+
DevicePackage hostData;
261272
alpaka::Buf<TDev, DevicePackage, alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> devicePackageBuffer;
262273

263-
InstructionDetails(TDev const& device)
264-
: devicePackageBuffer(alpaka::allocBuf<DevicePackage, Idx>(device, 1U)) {};
274+
InstructionDetails(TDev const& device, auto allocatorHandle)
275+
: hostData(allocatorHandle),
276+
devicePackageBuffer(alpaka::allocBuf<DevicePackage, Idx>(device, 1U)) {};
265277

266278
auto sendTo([[maybe_unused]] TDev const& device, auto& queue) {
267-
alpaka::memset(queue, devicePackageBuffer, 0U);
279+
auto const platformHost = alpaka::PlatformCpu{};
280+
auto const devHost = getDevByIdx(platformHost, 0);
281+
auto view = alpaka::createView(devHost, &hostData, 1U);
282+
alpaka::memcpy(queue, devicePackageBuffer, view);
268283
return reinterpret_cast<DevicePackage*>(alpaka::getPtrNative(devicePackageBuffer));
269284
}
270285
auto retrieveFrom([[maybe_unused]] TDev const& device, auto& queue) {
@@ -281,14 +296,15 @@ namespace setups {
281296
}
282297
};
283298

284-
template <typename TAcc, typename TDev> auto makeInstructionDetails(TDev const& device) {
285-
return InstructionDetails<TAcc, TDev>(device);
299+
template <typename TAcc, typename TDev>
300+
auto makeInstructionDetails(TDev const& device, auto allocatorHandle) {
301+
return InstructionDetails<TAcc, TDev>(device, allocatorHandle);
286302
}
287303

288-
auto composeSetup() {
304+
auto composeSetup(auto allocatorHandle) {
289305
auto execution = makeExecutionDetails();
290306
return setup::composeSetup("Non trivial", execution,
291-
makeInstructionDetails<Acc>(execution.device), {});
307+
makeInstructionDetails<Acc>(execution.device, allocatorHandle), {});
292308
}
293309
} // namespace setups
294310

@@ -314,9 +330,16 @@ json composeReport(json const& metadata, json const& benchmarkReports) {
314330

315331
void output(json const& report) { std::cout << report << std::endl; }
316332

333+
// Creates a MyAllocator for `device`.
// `device` is used both for the temporary queue and for the allocator itself.
// `HEAP_SIZE` is forwarded as the third constructor argument; note that this parameter
// shadows the file-scope constant of the same name inside this function.
// Returns the newly constructed MyAllocator by value.
auto setupAllocator(auto const& device, auto HEAP_SIZE) {
334+
// Blocking queue that lives only for the duration of this function.
// NOTE(review): presumably the allocator needs the queue only during construction — TODO confirm.
auto queue = alpaka::Queue<Acc, alpaka::Blocking>(device);
335+
return MyAllocator(device, queue, HEAP_SIZE);
336+
}
337+
317338
auto main() -> int {
318339
auto metadata = gatherMetadata();
319-
auto setup = setups::composeSetup();
340+
auto execution = makeExecutionDetails();
341+
MyAllocator allocator = setupAllocator(execution.device, HEAP_SIZE);
342+
auto setup = setups::composeSetup(allocator.getAllocatorHandle());
320343
auto benchmarkReports = runBenchmarks(setup);
321344
auto report = composeReport(metadata, benchmarkReports);
322345
output(report);

include/kitgenbench/DeviceClock.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
namespace kitgenbench {
1212
template <typename TAccTag> struct DeviceClock;
1313

14+
#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
1415
template <> struct DeviceClock<alpaka::TagCpuSerial> {
1516
using DurationType = float;
1617
ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto clock() {
@@ -25,7 +26,7 @@ namespace kitgenbench {
2526
}
2627
};
2728

28-
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
29+
#else
2930

3031
template <> struct DeviceClock<alpaka::TagGpuCudaRt> {
3132
using DurationType = float;

0 commit comments

Comments
 (0)