Refactor grid-stride loop

WeiqunZhang · WeiqunZhang · commit 168b2c40e3a6 · 2024-10-12T11:25:48.000-07:00
Move the grid-stride loop out of GPU kernels. @ashesh2512 noticed performance issues with grid-stride loops on AMD GPUs in PelePhysics's large kernels. Thanks to @AlexanderSinn for the suggestion implemented in this PR.
diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H
@@ -21,6 +21,7 @@
 #include <AMReX_RandomEngine.H>
 #include <AMReX_Algorithm.H>
 #include <AMReX_Math.H>
+#include <AMReX_Vector.H>
 #include <cstddef>
 #include <limits>
 #include <algorithm>
@@ -176,6 +177,45 @@ namespace Gpu {
     {
         return makeExecutionConfig<MT>(box.numPts());
     }
+
+    struct ExecConfig // Size of one kernel launch, as filled in by makeNExecutionConfigs
+    {
+        Long ntotalthreads; // total # of threads in the launch (nblocks * MT)
+        int nblocks; // # of blocks in the launch
+    };
+
+    template <int MT>
+    Vector<ExecConfig> makeNExecutionConfigs (Long N) noexcept
+    {
+        // A single launch is limited to INT_MAX blocks of MT threads each.
+        constexpr int max_blocks = std::numeric_limits<int>::max();
+        constexpr Long max_threads = Long(MT) * max_blocks;
+        // Split N into ceil(N/max_threads) launches so that every element
+        // gets its own thread and kernels need no grid-stride loop.
+        const auto nlaunches = int((N + max_threads - 1) / max_threads);
+        Vector<ExecConfig> configs(nlaunches);
+        Long nleft = N;
+        for (auto& ec : configs) {
+            // Every launch but the last one is completely full; the last
+            // one gets just enough blocks to cover the remaining elements.
+            if (nleft > max_threads) {
+                ec.nblocks = max_blocks;
+                nleft -= max_threads;
+            } else {
+                ec.nblocks = int((nleft + MT - 1) / MT);
+            }
+            // Threads launched; this may exceed the elements covered
+            // because the last block is rounded up to a full MT threads.
+            ec.ntotalthreads = Long(ec.nblocks) * MT;
+        }
+        return configs;
+    }
+
+    template <int MT, int dim>
+    Vector<ExecConfig> makeNExecutionConfigs (BoxND<dim> const& box) noexcept
+    {   // Box overload: sizes the launches for box.numPts() elements.
+        return makeNExecutionConfigs<MT>(box.numPts());
+    }
 #endif
 
 }
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -747,17 +747,47 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe
     launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
 }
 
-template<int MT, typename T, typename L>
+// Call f(i) once for each i in [0,n), one GPU thread per i, split over as many launches as needed.
+template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
 void launch (T const& n, L const& f) noexcept
 {
     if (amrex::isEmpty(n)) { return; }
-    const auto ec = Gpu::makeExecutionConfig<MT>(n);
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (auto const i : Gpu::Range(n)) {
-            f(i);
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
+    T ndone = 0; // # of elements covered by the launches issued so far
+    for (auto const& ec : nec) {
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            // 64-bit index: MT*blockIdx.x by itself is 32-bit unsigned and wraps for huge n.
+            const auto tid = std::uint64_t(MT)*blockIdx.x+threadIdx.x;
+            if (tid < std::uint64_t(n-ndone)) { f(T(tid)+ndone); } // in range, so T(tid) is exact
+        });
+        // ndone is not read after the last launch, so skip the update there.
+        if (&ec != &nec.back()) {
+            ndone += T(ec.ntotalthreads);
         }
-    });
+    }
+    AMREX_GPU_ERROR_CHECK();
+}
+
+// Call f(BoxND<dim>(iv,iv,type)) for every cell index iv of box, one GPU thread per cell.
+template<int MT, int dim, typename L>
+void launch (BoxND<dim> const& box, L const& f) noexcept
+{
+    if (box.isEmpty()) { return; }
+    const auto ixtype = box.ixType();
+    const BoxIndexerND<dim> indexer(box);
+    std::uint64_t offset = 0; // # of threads issued by earlier launches
+    for (auto const& cfg : Gpu::makeNExecutionConfigs<MT>(box)) {
+        AMREX_LAUNCH_KERNEL(MT, cfg.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + offset;
+            // Threads whose index is past the last cell have nothing to do.
+            if (icell >= indexer.numPts()) { return; }
+            auto iv = indexer.intVect(icell);
+            f(BoxND<dim>(iv,iv,ixtype));
+        });
+        offset += cfg.ntotalthreads;
+    }
     AMREX_GPU_ERROR_CHECK();
 }
 
@@ -766,16 +796,22 @@ std::enable_if_t<MaybeDeviceRunnable<L>::value>
 ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
 {
     if (amrex::isEmpty(n)) { return; }
-    const auto ec = Gpu::makeExecutionConfig<MT>(n);
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x;
-             i < Long(n); i += stride) {
-            detail::call_f_scalar_handler(f, T(i),
-                Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
-                (std::uint64_t)blockDim.x)));
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
+    T ndone = 0; // # of elements covered by the launches issued so far
+    for (auto const& ec : nec) {
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            // 64-bit index: MT*blockIdx.x by itself is 32-bit unsigned and wraps for huge n.
+            const auto tid = std::uint64_t(MT)*blockIdx.x+threadIdx.x;
+            if (tid < std::uint64_t(n-ndone)) {
+                const T i = T(tid)+ndone; // global index; tid is in range, so T(tid) is exact
+                detail::call_f_scalar_handler(f, i,
+                    Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
+                    (std::uint64_t)blockDim.x)));
+            }
+        });
+        if (&ec != &nec.back()) { ndone += T(ec.ntotalthreads); } // ndone is unused after the last launch
+    }
     AMREX_GPU_ERROR_CHECK();
 }
 
@@ -785,18 +821,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
     const BoxIndexerND<dim> indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
-             icell < indexer.numPts(); icell += stride)
-        {
-            auto iv = indexer.intVect(icell);
-            detail::call_f_intvect_handler(f, iv,
-                Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                (std::uint64_t)blockDim.x)));
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
+    std::uint64_t ndone = 0;
+    for (auto const& ec : nec) {
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
+            if (icell < indexer.numPts()) {
+                auto iv = indexer.intVect(icell);
+                detail::call_f_intvect_handler(f, iv,
+                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
+                    (std::uint64_t)blockDim.x)));
+            }
+        });
+        ndone += ec.ntotalthreads;
+    }
     AMREX_GPU_ERROR_CHECK();
 }
 
@@ -806,17 +845,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
 {
     if (amrex::isEmpty(box)) { return; }
     const BoxIndexerND<dim> indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
-             icell < indexer.numPts(); icell += stride) {
-            auto iv = indexer.intVect(icell);
-            detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
-                Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                (std::uint64_t)blockDim.x)));
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
+    std::uint64_t ndone = 0;
+    for (auto const& ec : nec) {
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
+            if (icell < indexer.numPts()) {
+                auto iv = indexer.intVect(icell);
+                detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
+                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
+                    (std::uint64_t)blockDim.x)));
+            }
+        });
+        ndone += ec.ntotalthreads;
+    }
     AMREX_GPU_ERROR_CHECK();
 }