From 7ddca649541a249639e6a00d0c60f772e8502a74 Mon Sep 17 00:00:00 2001 From: Chin-En Kuo Date: Tue, 14 Jan 2025 08:27:05 +0100 Subject: [PATCH 1/6] runnable code for alpaka reduction kernel --- example/reduction/CMakeLists.txt | 50 +++++++ example/reduction/src/config.h | 26 ++++ example/reduction/src/reduction.cpp | 213 ++++++++++++++++++++++++++++ 3 files changed, 289 insertions(+) create mode 100644 example/reduction/CMakeLists.txt create mode 100644 example/reduction/src/config.h create mode 100644 example/reduction/src/reduction.cpp diff --git a/example/reduction/CMakeLists.txt b/example/reduction/CMakeLists.txt new file mode 100644 index 000000000..18db8a31d --- /dev/null +++ b/example/reduction/CMakeLists.txt @@ -0,0 +1,50 @@ +# +# Copyright 2023 Benjamin Worpitz, Jan Stephan +# SPDX-License-Identifier: ISC +# + +################################################################################ +# Required CMake version. + +cmake_minimum_required(VERSION 3.25) + +set_property(GLOBAL PROPERTY USE_FOLDERS ON) + +################################################################################ +# Project. + +set(_TARGET_NAME vectorAdd) + +project(tutorail LANGUAGES CXX) + + +#------------------------------------------------------------------------------- +# Find alpaka. + +if (NOT TARGET alpaka::alpaka) + option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF) + + if (alpaka_USE_SOURCE_TREE) + # Don't build the examples recursively + set(alpaka_BUILD_EXAMPLES OFF) + add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka") + else () + find_package(alpaka REQUIRED) + endif () +endif () + +#------------------------------------------------------------------------------- +# Add executable. 
+ +file(GLOB_RECURSE tutorialSource "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp") + +foreach (tutExampleFile ${tutorialSource}) + get_filename_component(tutFileName ${tutExampleFile} NAME) + string(REPLACE ".cpp" "" tutName ${tutFileName}) + alpaka_add_executable(${tutName} ${tutExampleFile}) + target_link_libraries(${tutName} PUBLIC alpaka::alpaka) + set_target_properties(${tutName} PROPERTIES FOLDER reduction) + target_compile_features(${tutName} PRIVATE cxx_std_20) + + add_test(NAME ${tutName} COMMAND ${tutName}) +endforeach () diff --git a/example/reduction/src/config.h b/example/reduction/src/config.h new file mode 100644 index 000000000..8a69e3c84 --- /dev/null +++ b/example/reduction/src/config.h @@ -0,0 +1,26 @@ +/* Copyright 2024 Andrea Bocci + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include +#include + +#include + +// index type +using Idx = uint32_t; +// vectors +template +using Vec = alpaka::Vec; +// zero dimension aka scalar is currently not supported +// using Scalar = Vec; +using Vec1D = Vec<1u>; +using Vec2D = Vec<2u>; +using Vec3D = Vec<3u>; + +// remove NDEBUG to activate asserts +#ifdef NDEBUG +# undef NDEBUG +#endif diff --git a/example/reduction/src/reduction.cpp b/example/reduction/src/reduction.cpp new file mode 100644 index 000000000..396edce67 --- /dev/null +++ b/example/reduction/src/reduction.cpp @@ -0,0 +1,213 @@ +/* Copyright 2024 Andrea Bocci, René Widera + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "config.h" + +#include + +#include +#include +#include + +/** @file + * + * In the previous example we showed how to handle thread indices by hand to iterate over 1 and 3-dimensional data. + * There are very seldom cases where you need this explicit control over threads and blocks. Very often handling thread + * indices by hand will result in performance issues at least on CPU devices. 
+ * + * This example will show how you can iterate with frames, which can be seen as data index chunks without explicitly + * calculating thread indices by hand. The code is easy to write and read and will mostly be faster on CPU and GPU + * devices. + */ + +// sum += in1[]; + +struct ReductionKernel +{ + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, auto const in1, auto out, auto arraySize) const + { + // product() returns a scalar therefore we need the explicit Vec1D type + Vec1D linearNumFrames = acc[alpaka::frame::count].product(); + auto frameExtent = acc[alpaka::frame::extent]; + auto frameDataExtent = linearNumFrames*frameExtent; + Vec1D linearFrameExtent = frameExtent.product(); + auto traverseInFrame = alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::threadsInBlock, alpaka::IdxRange{frameExtent}); + auto traverseOverFrames = alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::blocksInGrid, alpaka::IdxRange{alpaka::CVec{}, frameDataExtent, linearFrameExtent}); + // Shared Mempry + auto sumOnSharedMem = alpaka::onAcc::declareSharedMdArray(acc, frameExtent); + + // Values of these addresses will be used later. + // Sync() is not required. + for(auto [elemIdxInFrame] : traverseInFrame){ + sumOnSharedMem[elemIdxInFrame] = 0; + } + + // init completed + /////////////////////////////////////////////////////////// + // For each frame in frames and for each thread in frame do + // get the sum of a small piece in in1[] on the thread to the sharedMem[]. + + for(auto frameIdx : traverseOverFrames){ + for(auto elemIdx : traverseInFrame){ + for(auto [i]: alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::WorkerGroup{frameIdx + elemIdx, frameDataExtent}, alpaka::IdxRange{arraySize})){ + sumOnSharedMem[elemIdx] += in1[i]; + } + } + } + + alpaka::onAcc::syncBlockThreads(acc); + + // Copying data to sharedMem completed. + /////////////////////////////////////////////////////////// + // For each thread on acc do + // sum up. 
+ + for(auto [elemIdx] : alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::threadsInBlock, alpaka::IdxRange{acc[alpaka::layer::thread].count(), frameExtent})){ + sumOnSharedMem[acc[alpaka::layer::thread].idx()] += sumOnSharedMem[elemIdx]; + } + + // Summing up on each thread completed. + /////////////////////////////////////////////////////////// + // + + auto const [local_i] = acc[alpaka::layer::thread].idx(); + auto const [blockSize] = acc[alpaka::layer::thread].count(); + for(auto stride = blockSize /2; stride > 0; stride /=2){ + alpaka::onAcc::syncBlockThreads(acc); + if(local_i < stride){ + sumOnSharedMem[local_i] += sumOnSharedMem[local_i+stride]; + //sumOnSharedMem[local_i+stride] = 0; + } + } + + if(local_i == 0){ + out[acc[alpaka::layer::block].idx().x()] = sumOnSharedMem[local_i]; + } + + // + /////////////////////////////////////////////////////////// + // + + + } +}; + +void testReductionKernel( + alpaka::onHost::concepts::Device auto host, + alpaka::onHost::concepts::Device auto device, + auto computeExec) +{ + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0.f, 1.f}; + + // buffer size + constexpr uint32_t size = 1024 * 1024; + + // tolerance + //constexpr float epsilon = 0.000001f; + constexpr float epsilon = 0.000001f*size; + + // allocate input and output host buffers in pinned memory accessible by the Platform devices + auto in1_h = alpaka::onHost::alloc(host, Vec1D{size}); + auto out_h = alpaka::onHost::allocMirror(host, in1_h); + + // fill the input buffers with random data, and the output buffer with zeros + for(uint32_t i = 0; i < size; ++i) + { + in1_h[i] = dist(rand); + out_h[i] = 0.; + } + + // run the test on the given device + alpaka::onHost::Queue queue = device.makeQueue(); + + // allocate input and output buffers on the device + auto in1_d = alpaka::onHost::allocMirror(device, in1_h); + auto out_d = 
alpaka::onHost::allocMirror(device, out_h); + + // copy the input data to the device; the size is known from the buffer objects + alpaka::onHost::memcpy(queue, in1_d, in1_h); + + // fill the output buffer with zeros; the size is known from the buffer objects + alpaka::onHost::memset(queue, out_d, 0x00); + + // launch the 1-dimensional kernel + constexpr auto frameExtent = 32u; + auto numFrames = Vec1D{size} / frameExtent; + // The kernel assumes that the problem size is a multiple of the frame size. + assert((numFrames * frameExtent).x() == size); + + auto frameSpec = alpaka::onHost::FrameSpec{numFrames, alpaka::CVec{}}; + + // fill the output buffer with zeros; the size is known from the buffer objects + alpaka::onHost::memset(queue, out_d, 0x00); + + std::cout << "Testing VectorAddKernel with vector indices with a grid of " << frameSpec << "\n"; + queue + .enqueue(computeExec, frameSpec, ReductionKernel{}, in1_d.getMdSpan(), out_d.getMdSpan(), Vec1D(size)); + + // copy the results from the device to the host + alpaka::onHost::memcpy(queue, out_h, out_d); + + // wait for all the operations to complete + alpaka::onHost::wait(queue); + + auto finalSum = std::accumulate( + &out_h[0], + &out_h[size-1], + float(0)); + + float sum = 0; + // check the results + for(uint32_t i = 0; i < size; ++i) + { + sum += in1_h[i]; + } + std::cout << "acc output: " << finalSum << " host answer: " << sum << std::endl; + assert(finalSum < sum + epsilon); + assert(finalSum > sum - epsilon); + std::cout << "success\n"; +} + +int example(auto const cfg) +{ + auto deviceApi = cfg[alpaka::object::api]; + auto computeExec = cfg[alpaka::object::exec]; + + // initialise the accelerator platform + alpaka::onHost::Platform platform = alpaka::onHost::makePlatform(deviceApi); + + // require at least one device + std::size_t n = alpaka::onHost::getDeviceCount(platform); + + if(n == 0) + { + return EXIT_FAILURE; + } + + // use the single host device + alpaka::onHost::Platform host_platform = 
alpaka::onHost::makePlatform(alpaka::api::cpu); + alpaka::onHost::Device host = host_platform.makeDevice(0); + std::cout << "Host: " << alpaka::onHost::getName(host) << "\n\n"; + + // use the first device + alpaka::onHost::Device device = platform.makeDevice(0); + std::cout << "Device: " << alpaka::onHost::getName(device) << "\n\n"; + + testReductionKernel(host, device, computeExec); + + return EXIT_SUCCESS; +} + +auto main() -> int +{ + using namespace alpaka; + // Execute the example once for each enabled API and executor. + return executeForEach( + [=](auto const& tag) { return example(tag); }, + onHost::allExecutorsAndApis(onHost::enabledApis)); +} From 972f73983a8bb2eeaed5c1160ba8e5c78e7a90d8 Mon Sep 17 00:00:00 2001 From: Chin-En Kuo Date: Wed, 15 Jan 2025 09:44:18 +0100 Subject: [PATCH 2/6] added timer for performance evaluation --- example/reduction/src/reduction.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/example/reduction/src/reduction.cpp b/example/reduction/src/reduction.cpp index 396edce67..80ccbf5c6 100644 --- a/example/reduction/src/reduction.cpp +++ b/example/reduction/src/reduction.cpp @@ -10,6 +10,9 @@ #include #include +#include +#include + /** @file * * In the previous example we showed how to handle thread indices by hand to iterate over 1 and 3-dimensional data. 
@@ -147,19 +150,34 @@ void testReductionKernel( alpaka::onHost::memset(queue, out_d, 0x00); std::cout << "Testing VectorAddKernel with vector indices with a grid of " << frameSpec << "\n"; + + auto beginT = std::chrono::high_resolution_clock::now(); + queue .enqueue(computeExec, frameSpec, ReductionKernel{}, in1_d.getMdSpan(), out_d.getMdSpan(), Vec1D(size)); + auto endT = std::chrono::high_resolution_clock::now(); + std::cout << "Time for kernel execution: " << std::chrono::duration(endT - beginT).count() << 's' + << std::endl; + + beginT = std::chrono::high_resolution_clock::now(); // copy the results from the device to the host alpaka::onHost::memcpy(queue, out_h, out_d); // wait for all the operations to complete alpaka::onHost::wait(queue); + endT = std::chrono::high_resolution_clock::now(); + std::cout << "Time for HtoD copy: " << std::chrono::duration(endT - beginT).count() << 's' + << std::endl; + beginT = std::chrono::high_resolution_clock::now(); auto finalSum = std::accumulate( &out_h[0], &out_h[size-1], float(0)); + endT = std::chrono::high_resolution_clock::now(); + std::cout << "Time for partial sum accumulation: " << std::chrono::duration(endT - beginT).count() << 's' + << std::endl; float sum = 0; // check the results From 4a2bd62ab494356949fdc4aa4d017dda116f9be9 Mon Sep 17 00:00:00 2001 From: Chin-En Kuo Date: Tue, 21 Jan 2025 11:06:03 +0100 Subject: [PATCH 3/6] type changed to double from float to reduce precision errors --- example/reduction/src/reduction.cpp | 79 +++++++++++++++++------------ 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/example/reduction/src/reduction.cpp b/example/reduction/src/reduction.cpp index 80ccbf5c6..0793864ae 100644 --- a/example/reduction/src/reduction.cpp +++ b/example/reduction/src/reduction.cpp @@ -13,6 +13,9 @@ #include #include +#include +#include + /** @file * * In the previous example we showed how to handle thread indices by hand to iterate over 1 and 3-dimensional data. 
@@ -39,7 +42,7 @@ struct ReductionKernel auto traverseInFrame = alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::threadsInBlock, alpaka::IdxRange{frameExtent}); auto traverseOverFrames = alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::blocksInGrid, alpaka::IdxRange{alpaka::CVec{}, frameDataExtent, linearFrameExtent}); // Shared Mempry - auto sumOnSharedMem = alpaka::onAcc::declareSharedMdArray(acc, frameExtent); + auto sumOnSharedMem = alpaka::onAcc::declareSharedMdArray(acc, frameExtent); // Values of these addresses will be used later. // Sync() is not required. @@ -81,6 +84,7 @@ struct ReductionKernel alpaka::onAcc::syncBlockThreads(acc); if(local_i < stride){ sumOnSharedMem[local_i] += sumOnSharedMem[local_i+stride]; + //printf("[kernel] %f\n", sumOnSharedMem[local_i]); //sumOnSharedMem[local_i+stride] = 0; } } @@ -100,22 +104,27 @@ struct ReductionKernel void testReductionKernel( alpaka::onHost::concepts::Device auto host, alpaka::onHost::concepts::Device auto device, - auto computeExec) + auto computeExec, auto size) { + + std::cout << "[Host] " << alpaka::onHost::getName(host) << ", "; + std::cout << "[Device] " << alpaka::onHost::getName(device) << ", "; + // random number generator with a gaussian distribution - std::random_device rd{}; - std::default_random_engine rand{rd()}; - std::normal_distribution dist{0.f, 1.f}; + //std::random_device rd{}; + //std::default_random_engine rand{rd()}; + std::default_random_engine rand{}; + std::normal_distribution dist{(double)0.01, (double)1.}; // buffer size - constexpr uint32_t size = 1024 * 1024; + std::cout << "[Problem Size] " << size << ", "; // tolerance - //constexpr float epsilon = 0.000001f; - constexpr float epsilon = 0.000001f*size; + constexpr double epsilon = (double)0.0001; + //constexpr float epsilon = 0.000001f*size; // allocate input and output host buffers in pinned memory accessible by the Platform devices - auto in1_h = alpaka::onHost::alloc(host, Vec1D{size}); + auto in1_h = 
alpaka::onHost::alloc(host, Vec1D{size}); auto out_h = alpaka::onHost::allocMirror(host, in1_h); // fill the input buffers with random data, and the output buffer with zeros @@ -139,56 +148,57 @@ void testReductionKernel( alpaka::onHost::memset(queue, out_d, 0x00); // launch the 1-dimensional kernel - constexpr auto frameExtent = 32u; - auto numFrames = Vec1D{size} / frameExtent; + constexpr auto frameExtent = 256; + auto numFrames = Vec1D{size} / frameExtent /8; // The kernel assumes that the problem size is a multiple of the frame size. - assert((numFrames * frameExtent).x() == size); + assert((numFrames * frameExtent).x() *8 == size); auto frameSpec = alpaka::onHost::FrameSpec{numFrames, alpaka::CVec{}}; // fill the output buffer with zeros; the size is known from the buffer objects alpaka::onHost::memset(queue, out_d, 0x00); - std::cout << "Testing VectorAddKernel with vector indices with a grid of " << frameSpec << "\n"; - - auto beginT = std::chrono::high_resolution_clock::now(); + std::cout << "Grid of " << frameSpec << ", "; + alpaka::onHost::wait(queue); + auto beginT = std::chrono::high_resolution_clock::now(); queue .enqueue(computeExec, frameSpec, ReductionKernel{}, in1_d.getMdSpan(), out_d.getMdSpan(), Vec1D(size)); - + alpaka::onHost::wait(queue); auto endT = std::chrono::high_resolution_clock::now(); - std::cout << "Time for kernel execution: " << std::chrono::duration(endT - beginT).count() << 's' - << std::endl; + std::cout << "[T Kernel Exec] " << std::chrono::duration(endT - beginT).count() << 's' << ", "; + + alpaka::onHost::wait(queue); beginT = std::chrono::high_resolution_clock::now(); // copy the results from the device to the host alpaka::onHost::memcpy(queue, out_h, out_d); - // wait for all the operations to complete alpaka::onHost::wait(queue); endT = std::chrono::high_resolution_clock::now(); - std::cout << "Time for HtoD copy: " << std::chrono::duration(endT - beginT).count() << 's' - << std::endl; - + std::cout << "[T HtoD Copy] " << 
std::chrono::duration(endT - beginT).count() << 's' << ", "; + + alpaka::onHost::wait(queue); beginT = std::chrono::high_resolution_clock::now(); auto finalSum = std::accumulate( &out_h[0], &out_h[size-1], - float(0)); + double(0)); endT = std::chrono::high_resolution_clock::now(); - std::cout << "Time for partial sum accumulation: " << std::chrono::duration(endT - beginT).count() << 's' - << std::endl; + std::cout << "[T Partial Sum Accumulation] " << std::chrono::duration(endT - beginT).count() << 's' + << ", "; - float sum = 0; + double sum = 0; // check the results for(uint32_t i = 0; i < size; ++i) { + //if (i < 5) std::cout << "[num] " << in1_h[i] << std::endl; sum += in1_h[i]; } - std::cout << "acc output: " << finalSum << " host answer: " << sum << std::endl; - assert(finalSum < sum + epsilon); - assert(finalSum > sum - epsilon); - std::cout << "success\n"; + //std::cout << "acc output: " << finalSum << " host answer: " << sum << std::endl; + printf("[Device Output] %f [Host Output] %f, ",finalSum, sum); + assert(pow(finalSum - sum,2) < pow(epsilon,2)); + std::cout << "[Results] " << "success\n"; } int example(auto const cfg) @@ -210,13 +220,15 @@ int example(auto const cfg) // use the single host device alpaka::onHost::Platform host_platform = alpaka::onHost::makePlatform(alpaka::api::cpu); alpaka::onHost::Device host = host_platform.makeDevice(0); - std::cout << "Host: " << alpaka::onHost::getName(host) << "\n\n"; // use the first device alpaka::onHost::Device device = platform.makeDevice(0); - std::cout << "Device: " << alpaka::onHost::getName(device) << "\n\n"; - testReductionKernel(host, device, computeExec); + uint32_t size = 1024 * 1024; + for(int fac = 0; fac < 11; fac++){ + testReductionKernel(host, device, computeExec, size); + size *= 2; + } return EXIT_SUCCESS; } @@ -225,6 +237,7 @@ auto main() -> int { using namespace alpaka; // Execute the example once for each enabled API and executor. 
+ std::srand(std::time(0)); // set time as random seed; rand() after this line will automatically use the same seed return executeForEach( [=](auto const& tag) { return example(tag); }, onHost::allExecutorsAndApis(onHost::enabledApis)); From df1f7c6063b0bb110a2cc551173437c93ce40edb Mon Sep 17 00:00:00 2001 From: Chin-En Kuo Date: Wed, 22 Jan 2025 13:35:28 +0100 Subject: [PATCH 4/6] changed to comma seperated output format --- example/reduction/src/reduction.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/example/reduction/src/reduction.cpp b/example/reduction/src/reduction.cpp index 0793864ae..ef04ee780 100644 --- a/example/reduction/src/reduction.cpp +++ b/example/reduction/src/reduction.cpp @@ -107,8 +107,9 @@ void testReductionKernel( auto computeExec, auto size) { - std::cout << "[Host] " << alpaka::onHost::getName(host) << ", "; - std::cout << "[Device] " << alpaka::onHost::getName(device) << ", "; + //std::cout << "[Host] " << alpaka::onHost::getName(host) << ", "; + //std::cout << "[Device] " << alpaka::onHost::getName(device) << ", "; + std::cout << alpaka::onHost::getName(device) << ", "; // random number generator with a gaussian distribution //std::random_device rd{}; @@ -117,7 +118,8 @@ void testReductionKernel( std::normal_distribution dist{(double)0.01, (double)1.}; // buffer size - std::cout << "[Problem Size] " << size << ", "; + //std::cout << "[Problem Size] " << size << ", "; + std::cout << size << ", "; // tolerance constexpr double epsilon = (double)0.0001; @@ -158,7 +160,7 @@ void testReductionKernel( // fill the output buffer with zeros; the size is known from the buffer objects alpaka::onHost::memset(queue, out_d, 0x00); - std::cout << "Grid of " << frameSpec << ", "; + //std::cout << "Grid of " << frameSpec << ", "; alpaka::onHost::wait(queue); auto beginT = std::chrono::high_resolution_clock::now(); @@ -166,7 +168,8 @@ void testReductionKernel( .enqueue(computeExec, frameSpec, 
ReductionKernel{}, in1_d.getMdSpan(), out_d.getMdSpan(), Vec1D(size)); alpaka::onHost::wait(queue); auto endT = std::chrono::high_resolution_clock::now(); - std::cout << "[T Kernel Exec] " << std::chrono::duration(endT - beginT).count() << 's' << ", "; + //std::cout << "[T Kernel Exec] " << std::chrono::duration(endT - beginT).count() << 's' << ", "; + std::cout << std::chrono::duration(endT - beginT).count() << ", "; alpaka::onHost::wait(queue); @@ -176,7 +179,7 @@ void testReductionKernel( // wait for all the operations to complete alpaka::onHost::wait(queue); endT = std::chrono::high_resolution_clock::now(); - std::cout << "[T HtoD Copy] " << std::chrono::duration(endT - beginT).count() << 's' << ", "; + //std::cout << "[T HtoD Copy] " << std::chrono::duration(endT - beginT).count() << 's' << ", "; alpaka::onHost::wait(queue); beginT = std::chrono::high_resolution_clock::now(); @@ -185,8 +188,7 @@ void testReductionKernel( &out_h[size-1], double(0)); endT = std::chrono::high_resolution_clock::now(); - std::cout << "[T Partial Sum Accumulation] " << std::chrono::duration(endT - beginT).count() << 's' - << ", "; + //std::cout << "[T Partial Sum Accumulation] " << std::chrono::duration(endT - beginT).count() << 's' << ", "; double sum = 0; // check the results @@ -196,9 +198,10 @@ void testReductionKernel( sum += in1_h[i]; } //std::cout << "acc output: " << finalSum << " host answer: " << sum << std::endl; - printf("[Device Output] %f [Host Output] %f, ",finalSum, sum); + //printf("[Device Output] %f [Host Output] %f, ",finalSum, sum); assert(pow(finalSum - sum,2) < pow(epsilon,2)); - std::cout << "[Results] " << "success\n"; + //std::cout << "[Results] " << "success\n"; + std::cout << "success\n"; } int example(auto const cfg) @@ -238,6 +241,7 @@ auto main() -> int using namespace alpaka; // Execute the example once for each enabled API and executor. 
std::srand(std::time(0)); // set time as random seed; rand() after this line will automatically use the same seed + std::cout << "Device, Problem Size, T Kernel Exec (s), Results" << std::endl; return executeForEach( [=](auto const& tag) { return example(tag); }, onHost::allExecutorsAndApis(onHost::enabledApis)); From 473d0284dab3d136d7f4a65dcafe359dbb4f372f Mon Sep 17 00:00:00 2001 From: Chin-En Kuo Date: Wed, 22 Jan 2025 16:19:59 +0100 Subject: [PATCH 5/6] added reduction folder --- example/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index db3ef601b..fb46ab972 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -18,3 +18,5 @@ project("alpakaExamples" LANGUAGES CXX) add_subdirectory("heatEquation2D/") add_subdirectory("vectorAdd/") add_subdirectory("tutorial/") +add_subdirectory("helloWorld/") +add_subdirectory("reduction/") From 79b176ea9b4954acfbf85a126a6963416d573bb8 Mon Sep 17 00:00:00 2001 From: Chin-En Kuo Date: Wed, 29 Jan 2025 07:32:31 +0100 Subject: [PATCH 6/6] changed some declaration to auto and added some std::flush --- example/reduction/src/reduction.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/example/reduction/src/reduction.cpp b/example/reduction/src/reduction.cpp index ef04ee780..8ecea7053 100644 --- a/example/reduction/src/reduction.cpp +++ b/example/reduction/src/reduction.cpp @@ -109,7 +109,7 @@ void testReductionKernel( //std::cout << "[Host] " << alpaka::onHost::getName(host) << ", "; //std::cout << "[Device] " << alpaka::onHost::getName(device) << ", "; - std::cout << alpaka::onHost::getName(device) << ", "; + std::cout << alpaka::onHost::getName(device) << ", " << std::flush; // random number generator with a gaussian distribution //std::random_device rd{}; @@ -119,7 +119,7 @@ void testReductionKernel( // buffer size //std::cout << "[Problem Size] " << size << ", "; - std::cout << size << ", "; + std::cout 
<< size << ", " << std::flush; // tolerance constexpr double epsilon = (double)0.0001; @@ -130,7 +130,7 @@ void testReductionKernel( auto out_h = alpaka::onHost::allocMirror(host, in1_h); // fill the input buffers with random data, and the output buffer with zeros - for(uint32_t i = 0; i < size; ++i) + for(auto i = 0; i < size; ++i) { in1_h[i] = dist(rand); out_h[i] = 0.; @@ -151,9 +151,10 @@ void testReductionKernel( // launch the 1-dimensional kernel constexpr auto frameExtent = 256; + //constexpr auto frameExtent = 1024; auto numFrames = Vec1D{size} / frameExtent /8; // The kernel assumes that the problem size is a multiple of the frame size. - assert((numFrames * frameExtent).x() *8 == size); + //assert((numFrames * frameExtent).x() *8 == size); auto frameSpec = alpaka::onHost::FrameSpec{numFrames, alpaka::CVec{}}; @@ -169,7 +170,7 @@ void testReductionKernel( alpaka::onHost::wait(queue); auto endT = std::chrono::high_resolution_clock::now(); //std::cout << "[T Kernel Exec] " << std::chrono::duration(endT - beginT).count() << 's' << ", "; - std::cout << std::chrono::duration(endT - beginT).count() << ", "; + std::cout << std::chrono::duration(endT - beginT).count() << ", " << std::flush; alpaka::onHost::wait(queue); @@ -192,7 +193,7 @@ void testReductionKernel( double sum = 0; // check the results - for(uint32_t i = 0; i < size; ++i) + for(auto i = 0; i < size; ++i) { //if (i < 5) std::cout << "[num] " << in1_h[i] << std::endl; sum += in1_h[i]; @@ -201,7 +202,7 @@ void testReductionKernel( //printf("[Device Output] %f [Host Output] %f, ",finalSum, sum); assert(pow(finalSum - sum,2) < pow(epsilon,2)); //std::cout << "[Results] " << "success\n"; - std::cout << "success\n"; + std::cout << "success\n" << std::flush; } int example(auto const cfg) @@ -228,9 +229,9 @@ int example(auto const cfg) alpaka::onHost::Device device = platform.makeDevice(0); uint32_t size = 1024 * 1024; - for(int fac = 0; fac < 11; fac++){ + for(int fac = 10; fac < 11; fac++){ + size = 
1024*1024*pow(2,fac); testReductionKernel(host, device, computeExec, size); - size *= 2; } return EXIT_SUCCESS; @@ -240,7 +241,8 @@ auto main() -> int { using namespace alpaka; // Execute the example once for each enabled API and executor. - std::srand(std::time(0)); // set time as random seed; rand() after this line will automatically use the same seed + //std::srand(std::time(0)); // set time as random seed; rand() after this line will automatically use the same seed + std::srand(12345); std::cout << "Device, Problem Size, T Kernel Exec (s), Results" << std::endl; return executeForEach( [=](auto const& tag) { return example(tag); },