From efda45f83054145095be83b495c5a539e0ee1f56 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Thu, 10 Oct 2024 17:30:50 -0700 Subject: [PATCH 1/8] Refactor grid-stride loop Move grid-stride loop out of GPU kernels. @ashesh2512 noticed performance issues with grid-stride loops on AMD GPUs in PelePhysics's large kernels. Thanks to @AlexanderSinn for the suggestion implemented in this PR. --- Src/Base/AMReX_GpuLaunch.H | 40 ++++++++++ Src/Base/AMReX_GpuLaunchFunctsG.H | 126 +++++++++++++++++++++--------- 2 files changed, 127 insertions(+), 39 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index 435a11f342b..72480d9a79e 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -176,6 +177,45 @@ namespace Gpu { { return makeExecutionConfig(box.numPts()); } + + struct ExecConfig + { + Long ntotalthreads; + int nblocks; + }; + + template + Vector makeNExecutionConfigs (Long N) noexcept + { + // Max # of blocks in a kernel launch + int numblocks_max = std::numeric_limits::max(); + // Max # of threads in a kernel launch + Long nmax = Long(MT) * numblocks_max; + // # of launches needed for N elements without using grid-stride + // loops inside GPU kernels.
+ auto nlaunches = int((N+nmax-1)/nmax); + Vector r(nlaunches); + for (int i = 0; i < nlaunches; ++i) { + int nblocks; + if (N > nmax) { + nblocks = numblocks_max; + N -= nmax; + } else { + nblocks = int((N+MT-1)/MT); + } + // Total # of threads in this launch + r[i].ntotalthreads = Long(nblocks) * MT; + // # of blocks in this launch + r[i].nblocks = nblocks; + } + return r; + } + + template + Vector makeNExecutionConfigs (BoxND const& box) noexcept + { + return makeNExecutionConfigs(box.numPts()); + } #endif } diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 7955410f8ba..b9fb432fd10 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -747,17 +747,49 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe launch(nblocks, nthreads_per_block, 0, stream, std::forward(f)); } -template +template,int> FOO = 0> void launch (T const& n, L const& f) noexcept { + static_assert(sizeof(T) >= 2); if (amrex::isEmpty(n)) { return; } - const auto ec = Gpu::makeExecutionConfig(n); - AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (auto const i : Gpu::Range(n)) { - f(i); + const auto& nec = Gpu::makeNExecutionConfigs(n); + T ndone = 0; + for (auto const& ec : nec) { + T nleft = n - ndone; + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + // This will not overflow, even though nblocks*MT might. 
+ auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x); + if (tid < nleft) { + f(tid+ndone); + } + }); + if (nleft > ec.ntotalthreads) { + ndone += T(ec.ntotalthreads); } - }); + } + AMREX_GPU_ERROR_CHECK(); +} + +template +void launch (BoxND const& box, L const& f) noexcept +{ + if (box.isEmpty()) { return; } + const auto& nec = Gpu::makeNExecutionConfigs(box); + const BoxIndexerND indexer(box); + const auto type = box.ixType(); + std::uint64_t ndone = 0; + for (auto const& ec : nec) { + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone; + if (icell < indexer.numPts()) { + auto iv = indexer.intVect(icell); + f(BoxND(iv,iv,type)); + } + }); + ndone += ec.ntotalthreads; + } AMREX_GPU_ERROR_CHECK(); } @@ -765,17 +797,26 @@ template ::value> ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept { + static_assert(sizeof(T) >= 2); if (amrex::isEmpty(n)) { return; } - const auto ec = Gpu::makeExecutionConfig(n); - AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x; - i < Long(n); i += stride) { - detail::call_f_scalar_handler(f, T(i), - Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x), - (std::uint64_t)blockDim.x))); + const auto& nec = Gpu::makeNExecutionConfigs(n); + T ndone = 0; + for (auto const& ec : nec) { + T nleft = n - ndone; + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + // This will not overflow, even though nblocks*MT might. 
+ auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x); + if (tid < nleft) { + detail::call_f_scalar_handler(f, tid+ndone, + Gpu::Handler(amrex::min((std::uint64_t(nleft)-tid+(std::uint64_t)threadIdx.x), + (std::uint64_t)blockDim.x))); + } + }); + if (nleft > ec.ntotalthreads) { + ndone += ec.ntotalthreads; } - }); + } AMREX_GPU_ERROR_CHECK(); } @@ -785,18 +826,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } const BoxIndexerND indexer(box); - const auto ec = Gpu::makeExecutionConfig(box.numPts()); - AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; - icell < indexer.numPts(); icell += stride) - { - auto iv = indexer.intVect(icell); - detail::call_f_intvect_handler(f, iv, - Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), - (std::uint64_t)blockDim.x))); - } - }); + const auto& nec = Gpu::makeNExecutionConfigs(box); + std::uint64_t ndone = 0; + for (auto const& ec : nec) { + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone; + if (icell < indexer.numPts()) { + auto iv = indexer.intVect(icell); + detail::call_f_intvect_handler(f, iv, + Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), + (std::uint64_t)blockDim.x))); + } + }); + ndone += ec.ntotalthreads; + } AMREX_GPU_ERROR_CHECK(); } @@ -806,17 +850,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L const& f) { if (amrex::isEmpty(box)) { return; } const BoxIndexerND indexer(box); - const auto ec = Gpu::makeExecutionConfig(box.numPts()); - AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (std::uint64_t icell = 
std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; - icell < indexer.numPts(); icell += stride) { - auto iv = indexer.intVect(icell); - detail::call_f_intvect_ncomp_handler(f, iv, ncomp, - Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), - (std::uint64_t)blockDim.x))); - } - }); + const auto& nec = Gpu::makeNExecutionConfigs(box); + std::uint64_t ndone = 0; + for (auto const& ec : nec) { + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone; + if (icell < indexer.numPts()) { + auto iv = indexer.intVect(icell); + detail::call_f_intvect_ncomp_handler(f, iv, ncomp, + Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), + (std::uint64_t)blockDim.x))); + } + }); + ndone += ec.ntotalthreads; + } AMREX_GPU_ERROR_CHECK(); } From 3bbf233668ad9026288c38bd34ff1dd00b7429fd Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Sat, 12 Oct 2024 12:18:13 -0700 Subject: [PATCH 2/8] Fix Warning --- Src/Base/AMReX_GpuLaunchFunctsG.H | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index b9fb432fd10..ff1d9d29000 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -764,7 +764,7 @@ void launch (T const& n, L const& f) noexcept f(tid+ndone); } }); - if (nleft > ec.ntotalthreads) { + if (Long(nleft) > ec.ntotalthreads) { ndone += T(ec.ntotalthreads); } } @@ -813,7 +813,7 @@ ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept (std::uint64_t)blockDim.x))); } }); - if (nleft > ec.ntotalthreads) { + if (Long(nleft) > ec.ntotalthreads) { ndone += ec.ntotalthreads; } } From 89d45c5e82e8cbd7017880090401e3f0216dd45d Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Sat, 12 Oct 2024 23:21:04 -0700 Subject: [PATCH 3/8] Work around a nvcc bug --- 
Src/EB/AMReX_algoim.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Src/EB/AMReX_algoim.cpp b/Src/EB/AMReX_algoim.cpp index 254e15dab0f..864ec626a0a 100644 --- a/Src/EB/AMReX_algoim.cpp +++ b/Src/EB/AMReX_algoim.cpp @@ -66,8 +66,16 @@ compute_integrals (MultiFab& intgmf, IntVect nghost) if (Gpu::inLaunchRegion()) { +#if defined(AMREX_USE_CUDA) + // It appears that there is a nvcc bug. We have to use the + // 4D ParallelFor here, even though ncomp is 1. + int ncomp = fg.nComp(); + amrex::ParallelFor(bx, ncomp, + [=] AMREX_GPU_DEVICE (int i, int j, int k, int) noexcept +#else amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept +#endif { const auto ebflag = fg(i,j,k); if (ebflag.isRegular()) { From a2c2201deccf27db319fb86835db5519689e4d1f Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 16 Oct 2024 10:44:03 -0700 Subject: [PATCH 4/8] Update Src/Base/AMReX_GpuLaunchFunctsG.H Co-authored-by: Alexander Sinn <64009254+AlexanderSinn@users.noreply.github.com> --- Src/Base/AMReX_GpuLaunchFunctsG.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index ff1d9d29000..ddd2286cd70 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -809,7 +809,7 @@ ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x); if (tid < nleft) { detail::call_f_scalar_handler(f, tid+ndone, - Gpu::Handler(amrex::min((std::uint64_t(nleft)-tid+(std::uint64_t)threadIdx.x), + Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x), (std::uint64_t)blockDim.x))); } }); From bca9c1b341b866630e196202e1ff435895859f8f Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 16 Oct 2024 15:19:48 -0700 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: Alexander Sinn <64009254+AlexanderSinn@users.noreply.github.com> --- Src/Base/AMReX_GpuLaunch.H | 8 +++++--- 
Src/Base/AMReX_GpuLaunchFunctsG.H | 9 +++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index 72480d9a79e..5f1e61e0084 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -180,7 +180,7 @@ namespace Gpu { struct ExecConfig { - Long ntotalthreads; + Long start_idx; int nblocks; }; @@ -195,6 +195,7 @@ namespace Gpu { // loops inside GPU kernels. auto nlaunches = int((N+nmax-1)/nmax); Vector r(nlaunches); + Long ndone = 0; for (int i = 0; i < nlaunches; ++i) { int nblocks; if (N > nmax) { @@ -203,8 +204,9 @@ namespace Gpu { } else { nblocks = int((N+MT-1)/MT); } - // Total # of threads in this launch - r[i].ntotalthreads = Long(nblocks) * MT; + // At which element ID the kernel should start + r[i].start_idx = ndone; + ndone += Long(nblocks) * MT; // # of blocks in this launch r[i].nblocks = nblocks; } diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index ddd2286cd70..4f9b4227746 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -800,22 +800,19 @@ ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept static_assert(sizeof(T) >= 2); if (amrex::isEmpty(n)) { return; } const auto& nec = Gpu::makeNExecutionConfigs(n); - T ndone = 0; for (auto const& ec : nec) { - T nleft = n - ndone; + const T start_idx = T(ec.start_idx); + const T nleft = n - start_idx; AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { // This will not overflow, even though nblocks*MT might. 
auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x); if (tid < nleft) { - detail::call_f_scalar_handler(f, tid+ndone, + detail::call_f_scalar_handler(f, tid+start_idx, Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x), (std::uint64_t)blockDim.x))); } }); - if (Long(nleft) > ec.ntotalthreads) { - ndone += ec.ntotalthreads; - } } AMREX_GPU_ERROR_CHECK(); } From 7510c31e50debfd6304e20ba5fb900b22afbf1d9 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 16 Oct 2024 15:25:34 -0700 Subject: [PATCH 6/8] Apply suggestions to more places --- Src/Base/AMReX_GpuLaunchFunctsG.H | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 4f9b4227746..cc071bae13b 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -753,21 +753,18 @@ void launch (T const& n, L const& f) noexcept static_assert(sizeof(T) >= 2); if (amrex::isEmpty(n)) { return; } const auto& nec = Gpu::makeNExecutionConfigs(n); - T ndone = 0; for (auto const& ec : nec) { - T nleft = n - ndone; + const T start_idx = T(ec.start_idx); + const T nleft = n - start_idx; AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { // This will not overflow, even though nblocks*MT might. 
auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x); if (tid < nleft) { - f(tid+ndone); + f(tid+start_idx); } }); - if (Long(nleft) > ec.ntotalthreads) { - ndone += T(ec.ntotalthreads); - } - } + AMREX_GPU_ERROR_CHECK(); } @@ -778,17 +775,16 @@ void launch (BoxND const& box, L const& f) noexcept const auto& nec = Gpu::makeNExecutionConfigs(box); const BoxIndexerND indexer(box); const auto type = box.ixType(); - std::uint64_t ndone = 0; for (auto const& ec : nec) { + const T start_idx = T(ec.start_idx); AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone; + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; if (icell < indexer.numPts()) { auto iv = indexer.intVect(icell); f(BoxND(iv,iv,type)); } }); - ndone += ec.ntotalthreads; } AMREX_GPU_ERROR_CHECK(); } @@ -824,11 +820,11 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, L const& f) noexcept if (amrex::isEmpty(box)) { return; } const BoxIndexerND indexer(box); const auto& nec = Gpu::makeNExecutionConfigs(box); - std::uint64_t ndone = 0; for (auto const& ec : nec) { + const T start_idx = T(ec.start_idx); AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone; + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; if (icell < indexer.numPts()) { auto iv = indexer.intVect(icell); detail::call_f_intvect_handler(f, iv, @@ -836,7 +832,6 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, L const& f) noexcept (std::uint64_t)blockDim.x))); } }); - ndone += ec.ntotalthreads; } AMREX_GPU_ERROR_CHECK(); } @@ -848,11 +843,11 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L const& f) if (amrex::isEmpty(box)) { return; } const BoxIndexerND indexer(box); const auto& nec = Gpu::makeNExecutionConfigs(box); - std::uint64_t ndone = 0; for (auto const& ec : nec) { + 
const T start_idx = T(ec.start_idx); AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone; + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; if (icell < indexer.numPts()) { auto iv = indexer.intVect(icell); detail::call_f_intvect_ncomp_handler(f, iv, ncomp, @@ -860,7 +855,6 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L const& f) (std::uint64_t)blockDim.x))); } }); - ndone += ec.ntotalthreads; } AMREX_GPU_ERROR_CHECK(); } From 660a69da24832018be03aee905b0be0819c1e360 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 16 Oct 2024 16:02:59 -0700 Subject: [PATCH 7/8] Update Src/Base/AMReX_GpuLaunchFunctsG.H Co-authored-by: Alexander Sinn <64009254+AlexanderSinn@users.noreply.github.com> --- Src/Base/AMReX_GpuLaunchFunctsG.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index cc071bae13b..deeb4225dbd 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -764,7 +764,7 @@ void launch (T const& n, L const& f) noexcept f(tid+start_idx); } }); - + } AMREX_GPU_ERROR_CHECK(); } From 064bd3e0096c95cc3ecce40c9ccbbed4b79a5f75 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 16 Oct 2024 16:10:37 -0700 Subject: [PATCH 8/8] Fix --- Src/Base/AMReX_GpuLaunchFunctsG.H | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index deeb4225dbd..56a95dbc5bb 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -776,7 +776,7 @@ void launch (BoxND const& box, L const& f) noexcept const BoxIndexerND indexer(box); const auto type = box.ixType(); for (auto const& ec : nec) { - const T start_idx = T(ec.start_idx); + const auto start_idx = std::uint64_t(ec.start_idx); AMREX_LAUNCH_KERNEL(MT, 
ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; @@ -821,7 +821,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, L const& f) noexcept const BoxIndexerND indexer(box); const auto& nec = Gpu::makeNExecutionConfigs(box); for (auto const& ec : nec) { - const T start_idx = T(ec.start_idx); + const auto start_idx = std::uint64_t(ec.start_idx); AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; @@ -844,7 +844,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L const& f) const BoxIndexerND indexer(box); const auto& nec = Gpu::makeNExecutionConfigs(box); for (auto const& ec : nec) { - const T start_idx = T(ec.start_idx); + const auto start_idx = std::uint64_t(ec.start_idx); AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;