66#include < alpaka/workdiv/WorkDivMembers.hpp>
77#include < cstdint>
88#include < limits>
9+ #include < sstream>
910#include < tuple>
1011#include < utility>
1112#include < variant>
@@ -57,14 +58,12 @@ auto makeExecutionDetails() {
5758 return kitgenbench::ExecutionDetails<Acc, decltype (dev)>{workdiv, dev};
5859}
5960
60- static constexpr std::uint32_t ALLOCATION_SIZE = 16U ;
61-
// Reasons for the check to yield the result it yielded.
//  - `completed`:     the check ran to completion. The result can still be true/false depending
//                     on whether the obtained value was actually correct.
//  - `notApplicable`: the checks were skipped.
//  - `nullpointer`:   a nullpointer was given, so the checks couldn't run at all.
enum class Reason { completed, notApplicable, nullpointer };
67- using Payload = std::variant<std::span<std::byte, ALLOCATION_SIZE >, std::pair<bool , Reason>>;
66+ using Payload = std::variant<std::span<std::byte>, std::pair<bool , Reason>>;
6867
6968template <typename TAccTag> struct SimpleSumLogger {
7069 using Clock = DeviceClock<TAccTag>;
@@ -161,10 +160,9 @@ constexpr auto isSpan(T<TType, TExtent>) {
161160 return IsSpan<T, TType, TExtent>{};
162161}
163162
164- template <typename TNew, typename TOld, std::size_t TExtent>
165- constexpr auto convertDataType (std::span<TOld, TExtent>& range) {
166- return std::span<TNew, TExtent * sizeof (TOld) / sizeof (TNew)>(
167- reinterpret_cast <TNew*>(range.data ()), range.size ());
163+ template <typename TNew, typename TOld> constexpr auto convertDataType (std::span<TOld>& range) {
164+ return std::span<TNew>(reinterpret_cast <TNew*>(range.data ()),
165+ range.size () * sizeof (TOld) / sizeof (TNew));
168166}
169167
170168struct IotaReductionChecker {
@@ -211,6 +209,14 @@ template <typename T> struct AccumulateResultsProvider {
211209 nlohmann::json generateReport () { return result.generateReport (); }
212210};
213211
212+ template <typename T, typename U> struct ArgumentStoringProvider {
213+ U argument{};
214+ ALPAKA_FN_ACC T load ([[maybe_unused]] auto const threadIndex) { return {argument}; }
215+ ALPAKA_FN_ACC void store ([[maybe_unused]] const auto & acc, [[maybe_unused]] T&& instance,
216+ auto const ) {}
217+ nlohmann::json generateReport () { return {}; }
218+ };
219+
214220template <typename T> struct AcumulateChecksProvider {
215221 T result{};
216222 ALPAKA_FN_ACC T load (auto const threadIndex) { return {threadIndex}; }
@@ -222,20 +228,20 @@ template <typename T> struct AcumulateChecksProvider {
222228
223229namespace setups {
224230 struct SingleSizeMallocRecipe {
225- static constexpr std::uint32_t allocationSize{ALLOCATION_SIZE} ;
231+ std::uint32_t allocationSize;
226232 static constexpr std::uint32_t numAllocations{256U };
227233 std::array<std::byte*, numAllocations> pointers{{}};
228234 std::uint32_t counter{0U };
229235
230236 ALPAKA_FN_ACC auto next ([[maybe_unused]] const auto & acc) {
231237 if (counter >= numAllocations)
232- return std::make_tuple (+kitgenbench::Actions::STOP,
233- Payload (std::span<std::byte, allocationSize>{
234- static_cast <std::byte*>(nullptr ), allocationSize}));
238+ return std::make_tuple (
239+ +kitgenbench::Actions::STOP,
240+ Payload (std::span<std::byte>{ static_cast <std::byte*>(nullptr ), allocationSize}));
235241 pointers[counter] = static_cast <std::byte*>(malloc (allocationSize));
236- auto result = std::make_tuple (
237- +kitgenbench::Actions::MALLOC,
238- Payload (std::span<std::byte, allocationSize >(pointers[counter], allocationSize)));
242+ auto result
243+ = std::make_tuple ( +kitgenbench::Actions::MALLOC,
244+ Payload (std::span<std::byte>(pointers[counter], allocationSize)));
239245 counter++;
240246 return result;
241247 }
@@ -245,19 +251,24 @@ namespace setups {
245251
246252 template <typename TAcc, typename TDev> struct InstructionDetails {
247253 struct DevicePackage {
248- NoStoreProvider <SingleSizeMallocRecipe> recipes{};
254+ ArgumentStoringProvider <SingleSizeMallocRecipe, uint32_t > recipes{};
249255 AccumulateResultsProvider<SimpleSumLogger<AccTag>> loggers{};
250256 AcumulateChecksProvider<IotaReductionChecker> checkers{};
257+
258+ DevicePackage (auto size) : recipes{size} {}
251259 };
252260
253261 DevicePackage hostData{};
254262 alpaka::Buf<TDev, DevicePackage, alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> devicePackageBuffer;
255263
256- InstructionDetails (TDev const & device)
257- : devicePackageBuffer(alpaka::allocBuf<DevicePackage, Idx>(device, 1U )) {};
264+ InstructionDetails (TDev const & device, uint32_t size )
265+ : hostData(size), devicePackageBuffer(alpaka::allocBuf<DevicePackage, Idx>(device, 1U )) {};
258266
259267 auto sendTo ([[maybe_unused]] TDev const & device, auto & queue) {
260- alpaka::memset (queue, devicePackageBuffer, 0U );
268+ auto const platformHost = alpaka::PlatformCpu{};
269+ auto const devHost = getDevByIdx (platformHost, 0 );
270+ auto view = alpaka::createView (devHost, &hostData, 1U );
271+ alpaka::memcpy (queue, devicePackageBuffer, view);
261272 return reinterpret_cast <DevicePackage*>(alpaka::getPtrNative (devicePackageBuffer));
262273 }
263274 auto retrieveFrom ([[maybe_unused]] TDev const & device, auto & queue) {
@@ -274,14 +285,16 @@ namespace setups {
274285 }
275286 };
276287
277- template <typename TAcc, typename TDev> auto makeInstructionDetails (TDev const & device) {
278- return InstructionDetails<TAcc, TDev>(device);
288+ template <typename TAcc, typename TDev>
289+ auto makeInstructionDetails (TDev const & device, uint32_t size) {
290+ return InstructionDetails<TAcc, TDev>(device, size);
279291 }
280292
281- auto composeSetup () {
293+ auto composeSetup (uint32_t size ) {
282294 auto execution = makeExecutionDetails ();
283- return setup::composeSetup (" Non trivial" , execution,
284- makeInstructionDetails<Acc>(execution.device ), {});
295+ return setup::composeSetup ((std::stringstream{} << " Allocation size: " << size).str (),
296+ execution, makeInstructionDetails<Acc>(execution.device , size),
297+ {{" allocation size" , size}});
285298 }
286299} // namespace setups
287300
@@ -309,9 +322,16 @@ void output(json const& report) { std::cout << report << std::endl; }
309322
310323auto main () -> int {
311324 auto metadata = gatherMetadata ();
312- auto setup = setups::composeSetup ();
313- auto benchmarkReports = runBenchmarks (setup);
325+ json benchmarkReports = json::object ();
326+ auto allocationSizes = std::to_array ({16U , 32U , 64U , 128U , 256U , 512U , 1024U });
327+ for (auto const size : allocationSizes) {
328+ auto setup = setups::composeSetup (size);
329+ // CAUTION: This overwrites the outermost "total runtime" which will be reported wrongly.
330+ benchmarkReports.merge_patch (runBenchmarks (setup));
331+ }
314332 auto report = composeReport (metadata, benchmarkReports);
333+ // Hot fix: Remove wrongly "merged", i.e. overwritten, "total runtime".
334+ report[" benchmarks" ].erase (" total runtime [ms]" );
315335 output (report);
316336 return EXIT_SUCCESS;
317337}
0 commit comments