From 7ddca649541a249639e6a00d0c60f772e8502a74 Mon Sep 17 00:00:00 2001
From: Chin-En Kuo <ck175439@fwkhal8999.fz-rossendorf.de>
Date: Tue, 14 Jan 2025 08:27:05 +0100
Subject: [PATCH 1/6] runnable code for alpaka reduction kernel

---
 example/reduction/CMakeLists.txt    |  50 +++++++
 example/reduction/src/config.h      |  26 ++++
 example/reduction/src/reduction.cpp | 213 ++++++++++++++++++++++++++++
 3 files changed, 289 insertions(+)
 create mode 100644 example/reduction/CMakeLists.txt
 create mode 100644 example/reduction/src/config.h
 create mode 100644 example/reduction/src/reduction.cpp
diff --git a/example/reduction/CMakeLists.txt b/example/reduction/CMakeLists.txt
new file mode 100644
index 000000000..18db8a31d
--- /dev/null
+++ b/example/reduction/CMakeLists.txt
@@ -0,0 +1,50 @@
+#
+# Copyright 2023 Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.25)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME vectorAdd)
+
+project(tutorail LANGUAGES CXX)
+
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if (NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if (alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else ()
+        find_package(alpaka REQUIRED)
+    endif ()
+endif ()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+file(GLOB_RECURSE tutorialSource "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp")
+
+foreach (tutExampleFile ${tutorialSource})
+    get_filename_component(tutFileName ${tutExampleFile} NAME)
+    string(REPLACE ".cpp" "" tutName ${tutFileName})
+    alpaka_add_executable(${tutName} ${tutExampleFile})
+    target_link_libraries(${tutName} PUBLIC alpaka::alpaka)
+    set_target_properties(${tutName} PROPERTIES FOLDER reduction)
+    target_compile_features(${tutName} PRIVATE cxx_std_20)
+
+    add_test(NAME ${tutName} COMMAND ${tutName})
+endforeach ()
diff --git a/example/reduction/src/config.h b/example/reduction/src/config.h
new file mode 100644
index 000000000..8a69e3c84
--- /dev/null
+++ b/example/reduction/src/config.h
@@ -0,0 +1,26 @@
+/* Copyright 2024 Andrea Bocci
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/executeForEach.hpp>
+#include <alpaka/example/executors.hpp>
+
+#include <cstdint>
+
+// index type
+using Idx = uint32_t;
+// vectors
+template<uint32_t T_dim>
+using Vec = alpaka::Vec<uint32_t, T_dim>;
+// zero dimension aka scalar is currently not supported
+// using Scalar = Vec<Dim0D>;
+using Vec1D = Vec<1u>;
+using Vec2D = Vec<2u>;
+using Vec3D = Vec<3u>;
+
+// remove NDEBUG to activate asserts
+#ifdef NDEBUG
+#    undef NDEBUG
+#endif
diff --git a/example/reduction/src/reduction.cpp b/example/reduction/src/reduction.cpp
new file mode 100644
index 000000000..396edce67
--- /dev/null
+++ b/example/reduction/src/reduction.cpp
@@ -0,0 +1,213 @@
+/* Copyright 2024 Andrea Bocci, René Widera
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "config.h"
+
+#include <alpaka/alpaka.hpp>
+
+#include <cassert>
+#include <cstdio>
+#include <random>
+
+/** @file
+ *
+ * In the previous example we showed how to handle thread indices by hand to iterate over 1 and 3-dimensional data.
+ * There are very seldom cases where you need this explict control over threads and blocks. Very often handling thread
+ * indices by hand will result in performance issues at least on CPU devices.
+ *
+ * This example will show how you can iterate with frames, which can be seen as data index chunks without explicit
+ * calculate thread indices by hand. The code is is easy to write and read and will mostly be faster on CPU and GPU
+ * devices.
+ */
+
+// sum += A[]*B[];
+
+struct ReductionKernel
+{
+    template<typename TAcc>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, auto const in1, auto out, auto arraySize) const
+    {
+        // product() returns a scalar therefore we need the explicit Vec1D type
+        Vec1D linearNumFrames = acc[alpaka::frame::count].product();
+        auto frameExtent = acc[alpaka::frame::extent];
+        auto frameDataExtent = linearNumFrames*frameExtent;
+        Vec1D linearFrameExtent = frameExtent.product();
+        auto traverseInFrame = alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::threadsInBlock, alpaka::IdxRange{frameExtent});
+        auto traverseOverFrames = alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::blocksInGrid, alpaka::IdxRange{alpaka::CVec<uint32_t, 0u>{}, frameDataExtent, linearFrameExtent});
+        // Shared Mempry
+        auto sumOnSharedMem = alpaka::onAcc::declareSharedMdArray<float>(acc, frameExtent);
+
+        // Values of these addresses will be used later. 
+        // Sync() is not required.
+        for(auto [elemIdxInFrame] : traverseInFrame){
+            sumOnSharedMem[elemIdxInFrame] = 0;
+        }
+
+        // init completed
+        ///////////////////////////////////////////////////////////
+        // For each frame in frames and for each thread in frame do
+        // get the sum of a small piece in in1[] on the thread to the sharedMem[].
+
+        for(auto frameIdx : traverseOverFrames){
+            for(auto elemIdx : traverseInFrame){
+                for(auto [i]: alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::WorkerGroup{frameIdx + elemIdx, frameDataExtent}, alpaka::IdxRange{arraySize})){
+                    sumOnSharedMem[elemIdx] += in1[i];
+                }
+            }
+        }
+
+        alpaka::onAcc::syncBlockThreads(acc);
+
+        // Copying data to sharedMem completed.
+        ///////////////////////////////////////////////////////////
+        // For each thread on acc do
+        // sum up.
+
+        for(auto [elemIdx] : alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::threadsInBlock, alpaka::IdxRange{acc[alpaka::layer::thread].count(), frameExtent})){
+            sumOnSharedMem[acc[alpaka::layer::thread].idx()] += sumOnSharedMem[elemIdx];
+        }
+
+        // Suming up on each thread completed.
+        ///////////////////////////////////////////////////////////
+        // 
+
+        auto const [local_i] = acc[alpaka::layer::thread].idx();
+        auto const [blockSize] = acc[alpaka::layer::thread].count();
+        for(auto stride = blockSize /2; stride > 0; stride /=2){
+            alpaka::onAcc::syncBlockThreads(acc);
+            if(local_i < stride){
+                sumOnSharedMem[local_i] += sumOnSharedMem[local_i+stride];
+                //sumOnSharedMem[local_i+stride] = 0;
+            }
+        }
+
+        if(local_i == 0){
+            out[acc[alpaka::layer::block].idx().x()] = sumOnSharedMem[local_i];
+        }
+
+        // 
+        ///////////////////////////////////////////////////////////
+        // 
+
+
+    }
+};
+
+void testReductionKernel(
+    alpaka::onHost::concepts::Device auto host,
+    alpaka::onHost::concepts::Device auto device,
+    auto computeExec)
+{
+    // random number generator with a gaussian distribution
+    std::random_device rd{};
+    std::default_random_engine rand{rd()};
+    std::normal_distribution<float> dist{0.f, 1.f};
+
+    // buffer size
+    constexpr uint32_t size = 1024 * 1024;
+
+    // tolerance
+    //constexpr float epsilon = 0.000001f;
+    constexpr float epsilon = 0.000001f*size;
+
+    // allocate input and output host buffers in pinned memory accessible by the Platform devices
+    auto in1_h = alpaka::onHost::alloc<float>(host, Vec1D{size});
+    auto out_h = alpaka::onHost::allocMirror(host, in1_h);
+
+    // fill the input buffers with random data, and the output buffer with zeros
+    for(uint32_t i = 0; i < size; ++i)
+    {
+        in1_h[i] = dist(rand);
+        out_h[i] = 0.;
+    }
+
+    // run the test the given device
+    alpaka::onHost::Queue queue = device.makeQueue();
+
+    // allocate input and output buffers on the device
+    auto in1_d = alpaka::onHost::allocMirror(device, in1_h);
+    auto out_d = alpaka::onHost::allocMirror(device, out_h);
+
+    // copy the input data to the device; the size is known from the buffer objects
+    alpaka::onHost::memcpy(queue, in1_d, in1_h);
+
+    // fill the output buffer with zeros; the size is known from the buffer objects
+    alpaka::onHost::memset(queue, out_d, 0x00);
+
+    // launch the 1-dimensional kernel
+    constexpr auto frameExtent = 32u;
+    auto numFrames = Vec1D{size} / frameExtent;
+    // The kernel assumes that the problem size is a multiple of the frame size.
+    assert((numFrames * frameExtent).x() == size);
+
+    auto frameSpec = alpaka::onHost::FrameSpec{numFrames, alpaka::CVec<uint32_t, frameExtent>{}};
+
+    // fill the output buffer with zeros; the size is known from the buffer objects
+    alpaka::onHost::memset(queue, out_d, 0x00);
+
+    std::cout << "Testing VectorAddKernel with vector indices with a grid of " << frameSpec << "\n";
+    queue
+        .enqueue(computeExec, frameSpec, ReductionKernel{}, in1_d.getMdSpan(), out_d.getMdSpan(), Vec1D(size));
+
+    // copy the results from the device to the host
+    alpaka::onHost::memcpy(queue, out_h, out_d);
+
+    // wait for all the operations to complete
+    alpaka::onHost::wait(queue);
+    
+    auto finalSum = std::accumulate(
+        &out_h[0],
+        &out_h[size-1],
+        float(0));
+
+    float sum = 0;
+    // check the results
+    for(uint32_t i = 0; i < size; ++i)
+    {
+        sum += in1_h[i];     
+    }
+    std::cout << "acc output: " << finalSum << " host answer: " << sum << std::endl;
+    assert(finalSum < sum + epsilon);
+    assert(finalSum > sum - epsilon);
+    std::cout << "success\n";
+}
+
+int example(auto const cfg)
+{
+    auto deviceApi = cfg[alpaka::object::api];
+    auto computeExec = cfg[alpaka::object::exec];
+
+    // initialise the accelerator platform
+    alpaka::onHost::Platform platform = alpaka::onHost::makePlatform(deviceApi);
+
+    // require at least one device
+    std::size_t n = alpaka::onHost::getDeviceCount(platform);
+
+    if(n == 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    // use the single host device
+    alpaka::onHost::Platform host_platform = alpaka::onHost::makePlatform(alpaka::api::cpu);
+    alpaka::onHost::Device host = host_platform.makeDevice(0);
+    std::cout << "Host:   " << alpaka::onHost::getName(host) << "\n\n";
+
+    // use the first device
+    alpaka::onHost::Device device = platform.makeDevice(0);
+    std::cout << "Device: " << alpaka::onHost::getName(device) << "\n\n";
+
+    testReductionKernel(host, device, computeExec);
+
+    return EXIT_SUCCESS;
+}
+
+auto main() -> int
+{
+    using namespace alpaka;
+    // Execute the example once for each enabled API and executor.
+    return executeForEach(
+        [=](auto const& tag) { return example(tag); },
+        onHost::allExecutorsAndApis(onHost::enabledApis));
+}

From 972f73983a8bb2eeaed5c1160ba8e5c78e7a90d8 Mon Sep 17 00:00:00 2001
From: Chin-En Kuo <ck175439@fwkhal8999.fz-rossendorf.de>
Date: Wed, 15 Jan 2025 09:44:18 +0100
Subject: [PATCH 2/6] added timer for performance evaluation

---
 example/reduction/src/reduction.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/example/reduction/src/reduction.cpp b/example/reduction/src/reduction.cpp
index 396edce67..80ccbf5c6 100644
--- a/example/reduction/src/reduction.cpp
+++ b/example/reduction/src/reduction.cpp
@@ -10,6 +10,9 @@
 #include <cstdio>
 #include <random>
 
+#include <chrono>
+#include <iostream>
+
 /** @file
  *
  * In the previous example we showed how to handle thread indices by hand to iterate over 1 and 3-dimensional data.
@@ -147,19 +150,34 @@ void testReductionKernel(
     alpaka::onHost::memset(queue, out_d, 0x00);
 
     std::cout << "Testing VectorAddKernel with vector indices with a grid of " << frameSpec << "\n";
+    
+    auto beginT = std::chrono::high_resolution_clock::now();
+
     queue
         .enqueue(computeExec, frameSpec, ReductionKernel{}, in1_d.getMdSpan(), out_d.getMdSpan(), Vec1D(size));
 
+    auto endT = std::chrono::high_resolution_clock::now();
+    std::cout << "Time for kernel execution: " << std::chrono::duration<double>(endT - beginT).count() << 's'
+                  << std::endl;
+
+    beginT = std::chrono::high_resolution_clock::now();
     // copy the results from the device to the host
     alpaka::onHost::memcpy(queue, out_h, out_d);
 
     // wait for all the operations to complete
     alpaka::onHost::wait(queue);
+    endT = std::chrono::high_resolution_clock::now();
+    std::cout << "Time for HtoD copy: " << std::chrono::duration<double>(endT - beginT).count() << 's'
+                  << std::endl;
     
+    beginT = std::chrono::high_resolution_clock::now();
     auto finalSum = std::accumulate(
         &out_h[0],
         &out_h[size-1],
         float(0));
+    endT = std::chrono::high_resolution_clock::now();
+    std::cout << "Time for partial sum accumulation: " << std::chrono::duration<double>(endT - beginT).count() << 's'
+                  << std::endl;
 
     float sum = 0;
     // check the results

From 4a2bd62ab494356949fdc4aa4d017dda116f9be9 Mon Sep 17 00:00:00 2001
From: Chin-En Kuo <ck175439@fwkhal8999.fz-rossendorf.de>
Date: Tue, 21 Jan 2025 11:06:03 +0100
Subject: [PATCH 3/6] type changed to double from float to reduce precision
 errors

---
 example/reduction/src/reduction.cpp | 79 +++++++++++++++++------------
 1 file changed, 46 insertions(+), 33 deletions(-)

diff --git a/example/reduction/src/reduction.cpp b/example/reduction/src/reduction.cpp
index 80ccbf5c6..0793864ae 100644
--- a/example/reduction/src/reduction.cpp
+++ b/example/reduction/src/reduction.cpp
@@ -13,6 +13,9 @@
 #include <chrono>
 #include <iostream>
 
+#include <cstdlib>
+#include <ctime>
+
 /** @file
  *
  * In the previous example we showed how to handle thread indices by hand to iterate over 1 and 3-dimensional data.
@@ -39,7 +42,7 @@ struct ReductionKernel
         auto traverseInFrame = alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::threadsInBlock, alpaka::IdxRange{frameExtent});
         auto traverseOverFrames = alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::blocksInGrid, alpaka::IdxRange{alpaka::CVec<uint32_t, 0u>{}, frameDataExtent, linearFrameExtent});
         // Shared Mempry
-        auto sumOnSharedMem = alpaka::onAcc::declareSharedMdArray<float>(acc, frameExtent);
+        auto sumOnSharedMem = alpaka::onAcc::declareSharedMdArray<double>(acc, frameExtent);
 
         // Values of these addresses will be used later. 
         // Sync() is not required.
@@ -81,6 +84,7 @@ struct ReductionKernel
             alpaka::onAcc::syncBlockThreads(acc);
             if(local_i < stride){
                 sumOnSharedMem[local_i] += sumOnSharedMem[local_i+stride];
+                //printf("[kernel] %f\n", sumOnSharedMem[local_i]);
                 //sumOnSharedMem[local_i+stride] = 0;
             }
         }
@@ -100,22 +104,27 @@ struct ReductionKernel
 void testReductionKernel(
     alpaka::onHost::concepts::Device auto host,
     alpaka::onHost::concepts::Device auto device,
-    auto computeExec)
+    auto computeExec, auto size)
 {
+
+    std::cout << "[Host] " << alpaka::onHost::getName(host) << ", ";
+    std::cout << "[Device] " << alpaka::onHost::getName(device) << ", ";
+
     // random number generator with a gaussian distribution
-    std::random_device rd{};
-    std::default_random_engine rand{rd()};
-    std::normal_distribution<float> dist{0.f, 1.f};
+    //std::random_device rd{};
+    //std::default_random_engine rand{rd()};
+    std::default_random_engine rand{};
+    std::normal_distribution<double> dist{(double)0.01, (double)1.};
 
     // buffer size
-    constexpr uint32_t size = 1024 * 1024;
+    std::cout << "[Problem Size] " << size << ", ";
 
     // tolerance
-    //constexpr float epsilon = 0.000001f;
-    constexpr float epsilon = 0.000001f*size;
+    constexpr double epsilon = (double)0.0001;
+    //constexpr float epsilon = 0.000001f*size;
 
     // allocate input and output host buffers in pinned memory accessible by the Platform devices
-    auto in1_h = alpaka::onHost::alloc<float>(host, Vec1D{size});
+    auto in1_h = alpaka::onHost::alloc<double>(host, Vec1D{size});
     auto out_h = alpaka::onHost::allocMirror(host, in1_h);
 
     // fill the input buffers with random data, and the output buffer with zeros
@@ -139,56 +148,57 @@ void testReductionKernel(
     alpaka::onHost::memset(queue, out_d, 0x00);
 
     // launch the 1-dimensional kernel
-    constexpr auto frameExtent = 32u;
-    auto numFrames = Vec1D{size} / frameExtent;
+    constexpr auto frameExtent = 256;
+    auto numFrames = Vec1D{size} / frameExtent /8;
     // The kernel assumes that the problem size is a multiple of the frame size.
-    assert((numFrames * frameExtent).x() == size);
+    assert((numFrames * frameExtent).x() *8 == size);
 
     auto frameSpec = alpaka::onHost::FrameSpec{numFrames, alpaka::CVec<uint32_t, frameExtent>{}};
 
     // fill the output buffer with zeros; the size is known from the buffer objects
     alpaka::onHost::memset(queue, out_d, 0x00);
 
-    std::cout << "Testing VectorAddKernel with vector indices with a grid of " << frameSpec << "\n";
-    
-    auto beginT = std::chrono::high_resolution_clock::now();
+    std::cout << "Grid of " << frameSpec << ", ";
 
+    alpaka::onHost::wait(queue); 
+    auto beginT = std::chrono::high_resolution_clock::now();
     queue
         .enqueue(computeExec, frameSpec, ReductionKernel{}, in1_d.getMdSpan(), out_d.getMdSpan(), Vec1D(size));
-
+    alpaka::onHost::wait(queue);
     auto endT = std::chrono::high_resolution_clock::now();
-    std::cout << "Time for kernel execution: " << std::chrono::duration<double>(endT - beginT).count() << 's'
-                  << std::endl;
+    std::cout << "[T Kernel Exec] " << std::chrono::duration<double>(endT - beginT).count() << 's' << ", ";
+
 
+    alpaka::onHost::wait(queue); 
     beginT = std::chrono::high_resolution_clock::now();
     // copy the results from the device to the host
     alpaka::onHost::memcpy(queue, out_h, out_d);
-
     // wait for all the operations to complete
     alpaka::onHost::wait(queue);
     endT = std::chrono::high_resolution_clock::now();
-    std::cout << "Time for HtoD copy: " << std::chrono::duration<double>(endT - beginT).count() << 's'
-                  << std::endl;
-    
+    std::cout << "[T HtoD Copy] " << std::chrono::duration<double>(endT - beginT).count() << 's' << ", ";
+
+    alpaka::onHost::wait(queue); 
     beginT = std::chrono::high_resolution_clock::now();
     auto finalSum = std::accumulate(
         &out_h[0],
         &out_h[size-1],
-        float(0));
+        double(0));
     endT = std::chrono::high_resolution_clock::now();
-    std::cout << "Time for partial sum accumulation: " << std::chrono::duration<double>(endT - beginT).count() << 's'
-                  << std::endl;
+    std::cout << "[T Partial Sum Accumulation] " << std::chrono::duration<double>(endT - beginT).count() << 's'
+                  << ", ";
 
-    float sum = 0;
+    double sum = 0;
     // check the results
     for(uint32_t i = 0; i < size; ++i)
     {
+        //if (i < 5) std::cout << "[num] " << in1_h[i] << std::endl;
         sum += in1_h[i];     
     }
-    std::cout << "acc output: " << finalSum << " host answer: " << sum << std::endl;
-    assert(finalSum < sum + epsilon);
-    assert(finalSum > sum - epsilon);
-    std::cout << "success\n";
+    //std::cout << "acc output: " << finalSum << " host answer: " << sum << std::endl;
+    printf("[Device Output] %f [Host Output] %f, ",finalSum, sum);
+    assert(pow(finalSum - sum,2) < pow(epsilon,2));
+    std::cout << "[Results] " << "success\n";
 }
 
 int example(auto const cfg)
@@ -210,13 +220,15 @@ int example(auto const cfg)
     // use the single host device
     alpaka::onHost::Platform host_platform = alpaka::onHost::makePlatform(alpaka::api::cpu);
     alpaka::onHost::Device host = host_platform.makeDevice(0);
-    std::cout << "Host:   " << alpaka::onHost::getName(host) << "\n\n";
 
     // use the first device
     alpaka::onHost::Device device = platform.makeDevice(0);
-    std::cout << "Device: " << alpaka::onHost::getName(device) << "\n\n";
 
-    testReductionKernel(host, device, computeExec);
+    uint32_t size = 1024 * 1024;
+    for(int fac = 0; fac < 11; fac++){
+        testReductionKernel(host, device, computeExec, size);
+        size *= 2;
+    }
 
     return EXIT_SUCCESS;
 }
@@ -225,6 +237,7 @@ auto main() -> int
 {
     using namespace alpaka;
     // Execute the example once for each enabled API and executor.
+    std::srand(std::time(0)); // set time as random seed; rand() after this line will automatically use the same seed
     return executeForEach(
         [=](auto const& tag) { return example(tag); },
         onHost::allExecutorsAndApis(onHost::enabledApis));

From df1f7c6063b0bb110a2cc551173437c93ce40edb Mon Sep 17 00:00:00 2001
From: Chin-En Kuo <ck175439@fwkhal8999.fz-rossendorf.de>
Date: Wed, 22 Jan 2025 13:35:28 +0100
Subject: [PATCH 4/6] changed to comma seperated output format

---
 example/reduction/src/reduction.cpp | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/example/reduction/src/reduction.cpp b/example/reduction/src/reduction.cpp
index 0793864ae..ef04ee780 100644
--- a/example/reduction/src/reduction.cpp
+++ b/example/reduction/src/reduction.cpp
@@ -107,8 +107,9 @@ void testReductionKernel(
     auto computeExec, auto size)
 {
 
-    std::cout << "[Host] " << alpaka::onHost::getName(host) << ", ";
-    std::cout << "[Device] " << alpaka::onHost::getName(device) << ", ";
+    //std::cout << "[Host] " << alpaka::onHost::getName(host) << ", ";
+    //std::cout << "[Device] " << alpaka::onHost::getName(device) << ", ";
+    std::cout << alpaka::onHost::getName(device) << ", ";
 
     // random number generator with a gaussian distribution
     //std::random_device rd{};
@@ -117,7 +118,8 @@ void testReductionKernel(
     std::normal_distribution<double> dist{(double)0.01, (double)1.};
 
     // buffer size
-    std::cout << "[Problem Size] " << size << ", ";
+    //std::cout << "[Problem Size] " << size << ", ";
+    std::cout << size << ", ";
 
     // tolerance
     constexpr double epsilon = (double)0.0001;
@@ -158,7 +160,7 @@ void testReductionKernel(
     // fill the output buffer with zeros; the size is known from the buffer objects
     alpaka::onHost::memset(queue, out_d, 0x00);
 
-    std::cout << "Grid of " << frameSpec << ", ";
+    //std::cout << "Grid of " << frameSpec << ", ";
 
     alpaka::onHost::wait(queue); 
     auto beginT = std::chrono::high_resolution_clock::now();
@@ -166,7 +168,8 @@ void testReductionKernel(
         .enqueue(computeExec, frameSpec, ReductionKernel{}, in1_d.getMdSpan(), out_d.getMdSpan(), Vec1D(size));
     alpaka::onHost::wait(queue);
     auto endT = std::chrono::high_resolution_clock::now();
-    std::cout << "[T Kernel Exec] " << std::chrono::duration<double>(endT - beginT).count() << 's' << ", ";
+    //std::cout << "[T Kernel Exec] " << std::chrono::duration<double>(endT - beginT).count() << 's' << ", ";
+    std::cout << std::chrono::duration<double>(endT - beginT).count() << ", ";
 
 
     alpaka::onHost::wait(queue); 
@@ -176,7 +179,7 @@ void testReductionKernel(
     // wait for all the operations to complete
     alpaka::onHost::wait(queue);
     endT = std::chrono::high_resolution_clock::now();
-    std::cout << "[T HtoD Copy] " << std::chrono::duration<double>(endT - beginT).count() << 's' << ", ";
+    //std::cout << "[T HtoD Copy] " << std::chrono::duration<double>(endT - beginT).count() << 's' << ", ";
 
     alpaka::onHost::wait(queue); 
     beginT = std::chrono::high_resolution_clock::now();
@@ -185,8 +188,7 @@ void testReductionKernel(
         &out_h[size-1],
         double(0));
     endT = std::chrono::high_resolution_clock::now();
-    std::cout << "[T Partial Sum Accumulation] " << std::chrono::duration<double>(endT - beginT).count() << 's'
-                  << ", ";
+    //std::cout << "[T Partial Sum Accumulation] " << std::chrono::duration<double>(endT - beginT).count() << 's' << ", ";
 
     double sum = 0;
     // check the results
@@ -196,9 +198,10 @@ void testReductionKernel(
         sum += in1_h[i];     
     }
     //std::cout << "acc output: " << finalSum << " host answer: " << sum << std::endl;
-    printf("[Device Output] %f [Host Output] %f, ",finalSum, sum);
+    //printf("[Device Output] %f [Host Output] %f, ",finalSum, sum);
     assert(pow(finalSum - sum,2) < pow(epsilon,2));
-    std::cout << "[Results] " << "success\n";
+    //std::cout << "[Results] " << "success\n";
+    std::cout << "success\n";
 }
 
 int example(auto const cfg)
@@ -238,6 +241,7 @@ auto main() -> int
     using namespace alpaka;
     // Execute the example once for each enabled API and executor.
     std::srand(std::time(0)); // set time as random seed; rand() after this line will automatically use the same seed
+    std::cout << "Device, Problem Size, T Kernel Exec (s), Results" << std::endl;
     return executeForEach(
         [=](auto const& tag) { return example(tag); },
         onHost::allExecutorsAndApis(onHost::enabledApis));

From 473d0284dab3d136d7f4a65dcafe359dbb4f372f Mon Sep 17 00:00:00 2001
From: Chin-En Kuo <ck175439@fwkhal8999.fz-rossendorf.de>
Date: Wed, 22 Jan 2025 16:19:59 +0100
Subject: [PATCH 5/6] added reduction folder

---
 example/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index db3ef601b..fb46ab972 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -18,3 +18,5 @@ project("alpakaExamples" LANGUAGES CXX)
 add_subdirectory("heatEquation2D/")
 add_subdirectory("vectorAdd/")
 add_subdirectory("tutorial/")
+add_subdirectory("helloWorld/")
+add_subdirectory("reduction/")

From 79b176ea9b4954acfbf85a126a6963416d573bb8 Mon Sep 17 00:00:00 2001
From: Chin-En Kuo <ck175439@fwkhal8999.fz-rossendorf.de>
Date: Wed, 29 Jan 2025 07:32:31 +0100
Subject: [PATCH 6/6] changed some declaration to auto and added some
 std::flush

---
 example/reduction/src/reduction.cpp | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/example/reduction/src/reduction.cpp b/example/reduction/src/reduction.cpp
index ef04ee780..8ecea7053 100644
--- a/example/reduction/src/reduction.cpp
+++ b/example/reduction/src/reduction.cpp
@@ -109,7 +109,7 @@ void testReductionKernel(
 
     //std::cout << "[Host] " << alpaka::onHost::getName(host) << ", ";
     //std::cout << "[Device] " << alpaka::onHost::getName(device) << ", ";
-    std::cout << alpaka::onHost::getName(device) << ", ";
+    std::cout << alpaka::onHost::getName(device) << ", " << std::flush;
 
     // random number generator with a gaussian distribution
     //std::random_device rd{};
@@ -119,7 +119,7 @@ void testReductionKernel(
 
     // buffer size
     //std::cout << "[Problem Size] " << size << ", ";
-    std::cout << size << ", ";
+    std::cout << size << ", " << std::flush;
 
     // tolerance
     constexpr double epsilon = (double)0.0001;
@@ -130,7 +130,7 @@ void testReductionKernel(
     auto out_h = alpaka::onHost::allocMirror(host, in1_h);
 
     // fill the input buffers with random data, and the output buffer with zeros
-    for(uint32_t i = 0; i < size; ++i)
+    for(auto i = 0; i < size; ++i)
     {
         in1_h[i] = dist(rand);
         out_h[i] = 0.;
@@ -151,9 +151,10 @@ void testReductionKernel(
 
     // launch the 1-dimensional kernel
     constexpr auto frameExtent = 256;
+    //constexpr auto frameExtent = 1024;
     auto numFrames = Vec1D{size} / frameExtent /8;
     // The kernel assumes that the problem size is a multiple of the frame size.
-    assert((numFrames * frameExtent).x() *8 == size);
+    //assert((numFrames * frameExtent).x() *8 == size);
 
     auto frameSpec = alpaka::onHost::FrameSpec{numFrames, alpaka::CVec<uint32_t, frameExtent>{}};
 
@@ -169,7 +170,7 @@ void testReductionKernel(
     alpaka::onHost::wait(queue);
     auto endT = std::chrono::high_resolution_clock::now();
     //std::cout << "[T Kernel Exec] " << std::chrono::duration<double>(endT - beginT).count() << 's' << ", ";
-    std::cout << std::chrono::duration<double>(endT - beginT).count() << ", ";
+    std::cout << std::chrono::duration<double>(endT - beginT).count() << ", " << std::flush;
 
 
     alpaka::onHost::wait(queue); 
@@ -192,7 +193,7 @@ void testReductionKernel(
 
     double sum = 0;
     // check the results
-    for(uint32_t i = 0; i < size; ++i)
+    for(auto i = 0; i < size; ++i)
     {
         //if (i < 5) std::cout << "[num] " << in1_h[i] << std::endl;
         sum += in1_h[i];     
@@ -201,7 +202,7 @@ void testReductionKernel(
     //printf("[Device Output] %f [Host Output] %f, ",finalSum, sum);
     assert(pow(finalSum - sum,2) < pow(epsilon,2));
     //std::cout << "[Results] " << "success\n";
-    std::cout << "success\n";
+    std::cout << "success\n" << std::flush;
 }
 
 int example(auto const cfg)
@@ -228,9 +229,9 @@ int example(auto const cfg)
     alpaka::onHost::Device device = platform.makeDevice(0);
 
     uint32_t size = 1024 * 1024;
-    for(int fac = 0; fac < 11; fac++){
+    for(int fac = 10; fac < 11; fac++){
+        size = 1024*1024*pow(2,fac);
         testReductionKernel(host, device, computeExec, size);
-        size *= 2;
     }
 
     return EXIT_SUCCESS;
@@ -240,7 +241,8 @@ auto main() -> int
 {
     using namespace alpaka;
     // Execute the example once for each enabled API and executor.
-    std::srand(std::time(0)); // set time as random seed; rand() after this line will automatically use the same seed
+    //std::srand(std::time(0)); // set time as random seed; rand() after this line will automatically use the same seed
+    std::srand(12345);
     std::cout << "Device, Problem Size, T Kernel Exec (s), Results" << std::endl;
     return executeForEach(
         [=](auto const& tag) { return example(tag); },