From efda45f83054145095be83b495c5a539e0ee1f56 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Thu, 10 Oct 2024 17:30:50 -0700 Subject: [PATCH 1/8] Refactor grid-stride loop Move grid-stride loop out of GPU kernels. @ashesh2512 noticed performance issues with grid-stride loops on AMD GPUs in PelePhysics's large kernels. Thanks to @AlexanderSinn for the suggestion implemented in this PR. --- Src/Base/AMReX_GpuLaunch.H | 40 ++++++++++ Src/Base/AMReX_GpuLaunchFunctsG.H | 126 +++++++++++++++++++++--------- 2 files changed, 127 insertions(+), 39 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index 435a11f342b..72480d9a79e 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -176,6 +177,45 @@ namespace Gpu { { return makeExecutionConfig(box.numPts()); } + + struct ExecConfig + { + Long ntotalthreads; + int nblocks; + }; + + template + Vector makeNExecutionConfigs (Long N) noexcept + { + // Max # of blocks in a kernel launch + int numblocks_max = std::numeric_limits::max(); + // Max # of threads in a kernel launch + Long nmax = Long(MT) * numblocks_max; + // # of launches needed for N elements without using grid-stride + // loops inside GPU kernels.
+ auto nlaunches = int((N+nmax-1)/nmax); + Vector r(nlaunches); + for (int i = 0; i < nlaunches; ++i) { + int nblocks; + if (N > nmax) { + nblocks = numblocks_max; + N -= nmax; + } else { + nblocks = int((N+MT-1)/MT); + } + // Total # of threads in this launch + r[i].ntotalthreads = Long(nblocks) * MT; + // # of blocks in this launch + r[i].nblocks = nblocks; + } + return r; + } + + template + Vector makeNExecutionConfigs (BoxND const& box) noexcept + { + return makeNExecutionConfigs(box.numPts()); + } #endif } diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 7955410f8ba..b9fb432fd10 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -747,17 +747,49 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe launch(nblocks, nthreads_per_block, 0, stream, std::forward(f)); } -template +template,int> FOO = 0> void launch (T const& n, L const& f) noexcept { + static_assert(sizeof(T) >= 2); if (amrex::isEmpty(n)) { return; } - const auto ec = Gpu::makeExecutionConfig(n); - AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (auto const i : Gpu::Range(n)) { - f(i); + const auto& nec = Gpu::makeNExecutionConfigs(n); + T ndone = 0; + for (auto const& ec : nec) { + T nleft = n - ndone; + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + // This will not overflow, even though nblocks*MT might. 
+ auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x); + if (tid < nleft) { + f(tid+ndone); + } + }); + if (nleft > ec.ntotalthreads) { + ndone += T(ec.ntotalthreads); } - }); + } + AMREX_GPU_ERROR_CHECK(); +} + +template +void launch (BoxND const& box, L const& f) noexcept +{ + if (box.isEmpty()) { return; } + const auto& nec = Gpu::makeNExecutionConfigs(box); + const BoxIndexerND indexer(box); + const auto type = box.ixType(); + std::uint64_t ndone = 0; + for (auto const& ec : nec) { + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone; + if (icell < indexer.numPts()) { + auto iv = indexer.intVect(icell); + f(BoxND(iv,iv,type)); + } + }); + ndone += ec.ntotalthreads; + } AMREX_GPU_ERROR_CHECK(); } @@ -765,17 +797,26 @@ template ::value> ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept { + static_assert(sizeof(T) >= 2); if (amrex::isEmpty(n)) { return; } - const auto ec = Gpu::makeExecutionConfig(n); - AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x; - i < Long(n); i += stride) { - detail::call_f_scalar_handler(f, T(i), - Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x), - (std::uint64_t)blockDim.x))); + const auto& nec = Gpu::makeNExecutionConfigs(n); + T ndone = 0; + for (auto const& ec : nec) { + T nleft = n - ndone; + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + // This will not overflow, even though nblocks*MT might. 
+ auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x); + if (tid < nleft) { + detail::call_f_scalar_handler(f, tid+ndone, + Gpu::Handler(amrex::min((std::uint64_t(nleft)-tid+(std::uint64_t)threadIdx.x), + (std::uint64_t)blockDim.x))); + } + }); + if (nleft > ec.ntotalthreads) { + ndone += ec.ntotalthreads; } - }); + } AMREX_GPU_ERROR_CHECK(); } @@ -785,18 +826,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } const BoxIndexerND indexer(box); - const auto ec = Gpu::makeExecutionConfig(box.numPts()); - AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; - icell < indexer.numPts(); icell += stride) - { - auto iv = indexer.intVect(icell); - detail::call_f_intvect_handler(f, iv, - Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), - (std::uint64_t)blockDim.x))); - } - }); + const auto& nec = Gpu::makeNExecutionConfigs(box); + std::uint64_t ndone = 0; + for (auto const& ec : nec) { + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone; + if (icell < indexer.numPts()) { + auto iv = indexer.intVect(icell); + detail::call_f_intvect_handler(f, iv, + Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), + (std::uint64_t)blockDim.x))); + } + }); + ndone += ec.ntotalthreads; + } AMREX_GPU_ERROR_CHECK(); } @@ -806,17 +850,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L const& f) { if (amrex::isEmpty(box)) { return; } const BoxIndexerND indexer(box); - const auto ec = Gpu::makeExecutionConfig(box.numPts()); - AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (std::uint64_t icell = 
std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; - icell < indexer.numPts(); icell += stride) { - auto iv = indexer.intVect(icell); - detail::call_f_intvect_ncomp_handler(f, iv, ncomp, - Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), - (std::uint64_t)blockDim.x))); - } - }); + const auto& nec = Gpu::makeNExecutionConfigs(box); + std::uint64_t ndone = 0; + for (auto const& ec : nec) { + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone; + if (icell < indexer.numPts()) { + auto iv = indexer.intVect(icell); + detail::call_f_intvect_ncomp_handler(f, iv, ncomp, + Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), + (std::uint64_t)blockDim.x))); + } + }); + ndone += ec.ntotalthreads; + } AMREX_GPU_ERROR_CHECK(); } From 3bbf233668ad9026288c38bd34ff1dd00b7429fd Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Sat, 12 Oct 2024 12:18:13 -0700 Subject: [PATCH 2/8] Fix Warning --- Src/Base/AMReX_GpuLaunchFunctsG.H | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index b9fb432fd10..ff1d9d29000 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -764,7 +764,7 @@ void launch (T const& n, L const& f) noexcept f(tid+ndone); } }); - if (nleft > ec.ntotalthreads) { + if (Long(nleft) > ec.ntotalthreads) { ndone += T(ec.ntotalthreads); } } @@ -813,7 +813,7 @@ ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept (std::uint64_t)blockDim.x))); } }); - if (nleft > ec.ntotalthreads) { + if (Long(nleft) > ec.ntotalthreads) { ndone += ec.ntotalthreads; } } From 89d45c5e82e8cbd7017880090401e3f0216dd45d Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Sat, 12 Oct 2024 23:21:04 -0700 Subject: [PATCH 3/8] Work around a nvcc bug --- 
Src/EB/AMReX_algoim.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Src/EB/AMReX_algoim.cpp b/Src/EB/AMReX_algoim.cpp index 254e15dab0f..864ec626a0a 100644 --- a/Src/EB/AMReX_algoim.cpp +++ b/Src/EB/AMReX_algoim.cpp @@ -66,8 +66,16 @@ compute_integrals (MultiFab& intgmf, IntVect nghost) if (Gpu::inLaunchRegion()) { +#if defined(AMREX_USE_CUDA) + // It appears that there is a nvcc bug. We have to use the + // 4D ParallelFor here, even though ncomp is 1. + int ncomp = fg.nComp(); + amrex::ParallelFor(bx, ncomp, + [=] AMREX_GPU_DEVICE (int i, int j, int k, int) noexcept +#else amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept +#endif { const auto ebflag = fg(i,j,k); if (ebflag.isRegular()) { From a2c2201deccf27db319fb86835db5519689e4d1f Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 16 Oct 2024 10:44:03 -0700 Subject: [PATCH 4/8] Update Src/Base/AMReX_GpuLaunchFunctsG.H Co-authored-by: Alexander Sinn <64009254+AlexanderSinn@users.noreply.github.com> --- Src/Base/AMReX_GpuLaunchFunctsG.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index ff1d9d29000..ddd2286cd70 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -809,7 +809,7 @@ ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x); if (tid < nleft) { detail::call_f_scalar_handler(f, tid+ndone, - Gpu::Handler(amrex::min((std::uint64_t(nleft)-tid+(std::uint64_t)threadIdx.x), + Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x), (std::uint64_t)blockDim.x))); } }); From bca9c1b341b866630e196202e1ff435895859f8f Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 16 Oct 2024 15:19:48 -0700 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: Alexander Sinn <64009254+AlexanderSinn@users.noreply.github.com> --- Src/Base/AMReX_GpuLaunch.H | 8 +++++--- 
Src/Base/AMReX_GpuLaunchFunctsG.H | 9 +++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index 72480d9a79e..5f1e61e0084 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -180,7 +180,7 @@ namespace Gpu { struct ExecConfig { - Long ntotalthreads; + Long start_idx; int nblocks; }; @@ -195,6 +195,7 @@ namespace Gpu { // loops inside GPU kernels. auto nlaunches = int((N+nmax-1)/nmax); Vector r(nlaunches); + Long ndone = 0; for (int i = 0; i < nlaunches; ++i) { int nblocks; if (N > nmax) { @@ -203,8 +204,9 @@ namespace Gpu { } else { nblocks = int((N+MT-1)/MT); } - // Total # of threads in this launch - r[i].ntotalthreads = Long(nblocks) * MT; + // At which element ID the kernel should start + r[i].start_idx = ndone; + ndone += Long(nblocks) * MT; // # of blocks in this launch r[i].nblocks = nblocks; } diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index ddd2286cd70..4f9b4227746 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -800,22 +800,19 @@ ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept static_assert(sizeof(T) >= 2); if (amrex::isEmpty(n)) { return; } const auto& nec = Gpu::makeNExecutionConfigs(n); - T ndone = 0; for (auto const& ec : nec) { - T nleft = n - ndone; + const T start_idx = T(ec.start_idx); + const T nleft = n - start_idx; AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { // This will not overflow, even though nblocks*MT might. 
auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x); if (tid < nleft) { - detail::call_f_scalar_handler(f, tid+ndone, + detail::call_f_scalar_handler(f, tid+start_idx, Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x), (std::uint64_t)blockDim.x))); } }); - if (Long(nleft) > ec.ntotalthreads) { - ndone += ec.ntotalthreads; - } } AMREX_GPU_ERROR_CHECK(); } From 7510c31e50debfd6304e20ba5fb900b22afbf1d9 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 16 Oct 2024 15:25:34 -0700 Subject: [PATCH 6/8] Apply suggestions to more places --- Src/Base/AMReX_GpuLaunchFunctsG.H | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 4f9b4227746..cc071bae13b 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -753,21 +753,18 @@ void launch (T const& n, L const& f) noexcept static_assert(sizeof(T) >= 2); if (amrex::isEmpty(n)) { return; } const auto& nec = Gpu::makeNExecutionConfigs(n); - T ndone = 0; for (auto const& ec : nec) { - T nleft = n - ndone; + const T start_idx = T(ec.start_idx); + const T nleft = n - start_idx; AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { // This will not overflow, even though nblocks*MT might. 
auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x); if (tid < nleft) { - f(tid+ndone); + f(tid+start_idx); } }); - if (Long(nleft) > ec.ntotalthreads) { - ndone += T(ec.ntotalthreads); - } - } + AMREX_GPU_ERROR_CHECK(); } @@ -778,17 +775,16 @@ void launch (BoxND const& box, L const& f) noexcept const auto& nec = Gpu::makeNExecutionConfigs(box); const BoxIndexerND indexer(box); const auto type = box.ixType(); - std::uint64_t ndone = 0; for (auto const& ec : nec) { + const T start_idx = T(ec.start_idx); AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone; + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; if (icell < indexer.numPts()) { auto iv = indexer.intVect(icell); f(BoxND(iv,iv,type)); } }); - ndone += ec.ntotalthreads; } AMREX_GPU_ERROR_CHECK(); } @@ -824,11 +820,11 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, L const& f) noexcept if (amrex::isEmpty(box)) { return; } const BoxIndexerND indexer(box); const auto& nec = Gpu::makeNExecutionConfigs(box); - std::uint64_t ndone = 0; for (auto const& ec : nec) { + const T start_idx = T(ec.start_idx); AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone; + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; if (icell < indexer.numPts()) { auto iv = indexer.intVect(icell); detail::call_f_intvect_handler(f, iv, @@ -836,7 +832,6 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, L const& f) noexcept (std::uint64_t)blockDim.x))); } }); - ndone += ec.ntotalthreads; } AMREX_GPU_ERROR_CHECK(); } @@ -848,11 +843,11 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L const& f) if (amrex::isEmpty(box)) { return; } const BoxIndexerND indexer(box); const auto& nec = Gpu::makeNExecutionConfigs(box); - std::uint64_t ndone = 0; for (auto const& ec : nec) { + 
const T start_idx = T(ec.start_idx); AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone; + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; if (icell < indexer.numPts()) { auto iv = indexer.intVect(icell); detail::call_f_intvect_ncomp_handler(f, iv, ncomp, @@ -860,7 +855,6 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L const& f) (std::uint64_t)blockDim.x))); } }); - ndone += ec.ntotalthreads; } AMREX_GPU_ERROR_CHECK(); } From 660a69da24832018be03aee905b0be0819c1e360 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 16 Oct 2024 16:02:59 -0700 Subject: [PATCH 7/8] Update Src/Base/AMReX_GpuLaunchFunctsG.H Co-authored-by: Alexander Sinn <64009254+AlexanderSinn@users.noreply.github.com> --- Src/Base/AMReX_GpuLaunchFunctsG.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index cc071bae13b..deeb4225dbd 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -764,7 +764,7 @@ void launch (T const& n, L const& f) noexcept f(tid+start_idx); } }); - + } AMREX_GPU_ERROR_CHECK(); } From 064bd3e0096c95cc3ecce40c9ccbbed4b79a5f75 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 16 Oct 2024 16:10:37 -0700 Subject: [PATCH 8/8] Fix --- Src/Base/AMReX_GpuLaunchFunctsG.H | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index deeb4225dbd..56a95dbc5bb 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -776,7 +776,7 @@ void launch (BoxND const& box, L const& f) noexcept const BoxIndexerND indexer(box); const auto type = box.ixType(); for (auto const& ec : nec) { - const T start_idx = T(ec.start_idx); + const auto start_idx = std::uint64_t(ec.start_idx); AMREX_LAUNCH_KERNEL(MT, 
ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; @@ -821,7 +821,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, L const& f) noexcept const BoxIndexerND indexer(box); const auto& nec = Gpu::makeNExecutionConfigs(box); for (auto const& ec : nec) { - const T start_idx = T(ec.start_idx); + const auto start_idx = std::uint64_t(ec.start_idx); AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; @@ -844,7 +844,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L const& f) const BoxIndexerND indexer(box); const auto& nec = Gpu::makeNExecutionConfigs(box); for (auto const& ec : nec) { - const T start_idx = T(ec.start_idx); + const auto start_idx = std::uint64_t(ec.start_idx); AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;