diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H
index 435a11f342b..5f1e61e0084 100644
--- a/Src/Base/AMReX_GpuLaunch.H
+++ b/Src/Base/AMReX_GpuLaunch.H
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <AMReX_Vector.H>
 #include
 #include
 #include
@@ -176,6 +177,47 @@ namespace Gpu {
     {
         return makeExecutionConfig<MT>(box.numPts());
     }
+
+    struct ExecConfig
+    {
+        Long start_idx;
+        int nblocks;
+    };
+
+    template <int MT>
+    Vector<ExecConfig> makeNExecutionConfigs (Long N) noexcept
+    {
+        // Max # of blocks in a kernel launch
+        int numblocks_max = std::numeric_limits<int>::max();
+        // Max # of threads in a kernel launch
+        Long nmax = Long(MT) * numblocks_max;
+        // # of launches needed for N elements without using grid-stride
+        // loops inside GPU kernels.
+        auto nlaunches = int((N+nmax-1)/nmax);
+        Vector<ExecConfig> r(nlaunches);
+        Long ndone = 0;
+        for (int i = 0; i < nlaunches; ++i) {
+            int nblocks;
+            if (N > nmax) {
+                nblocks = numblocks_max;
+                N -= nmax;
+            } else {
+                nblocks = int((N+MT-1)/MT);
+            }
+            // At which element ID the kernel should start
+            r[i].start_idx = ndone;
+            ndone += Long(nblocks) * MT;
+            // # of blocks in this launch
+            r[i].nblocks = nblocks;
+        }
+        return r;
+    }
+
+    template <int MT, int dim>
+    Vector<ExecConfig> makeNExecutionConfigs (BoxND<dim> const& box) noexcept
+    {
+        return makeNExecutionConfigs<MT>(box.numPts());
+    }

 #endif
 }
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index 7955410f8ba..56a95dbc5bb 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -747,17 +747,45 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe
     launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
 }

-template <int MT, typename T, typename L>
+template <int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
 void launch (T const& n, L const& f) noexcept
 {
+    static_assert(sizeof(T) >= 2);
     if (amrex::isEmpty(n)) { return; }
-    const auto ec = Gpu::makeExecutionConfig<MT>(n);
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (auto const i : Gpu::Range(n)) {
-            f(i);
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
+    for (auto const& ec : nec) {
+        const T start_idx = T(ec.start_idx);
+        const T nleft = n - start_idx;
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            // This will not overflow, even though nblocks*MT might.
+            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
+            if (tid < nleft) {
+                f(tid+start_idx);
+            }
+        });
+    }
+    AMREX_GPU_ERROR_CHECK();
+}
+
+template <int MT, int dim, typename L>
+void launch (BoxND<dim> const& box, L const& f) noexcept
+{
+    if (box.isEmpty()) { return; }
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
+    const BoxIndexerND<dim> indexer(box);
+    const auto type = box.ixType();
+    for (auto const& ec : nec) {
+        const auto start_idx = std::uint64_t(ec.start_idx);
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
+            if (icell < indexer.numPts()) {
+                auto iv = indexer.intVect(icell);
+                f(BoxND<dim>(iv,iv,type));
+            }
+        });
+    }
     AMREX_GPU_ERROR_CHECK();
 }

@@ -765,17 +793,23 @@ template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_in
 std::enable_if_t<MaybeDeviceRunnable<L>::value>
 ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
 {
+    static_assert(sizeof(T) >= 2);
     if (amrex::isEmpty(n)) { return; }
-    const auto ec = Gpu::makeExecutionConfig<MT>(n);
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x;
-             i < Long(n); i += stride) {
-            detail::call_f_scalar_handler(f, T(i),
-                Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
-                                        (std::uint64_t)blockDim.x)));
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
+    for (auto const& ec : nec) {
+        const T start_idx = T(ec.start_idx);
+        const T nleft = n - start_idx;
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            // This will not overflow, even though nblocks*MT might.
+            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
+            if (tid < nleft) {
+                detail::call_f_scalar_handler(f, tid+start_idx,
+                    Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
+                                            (std::uint64_t)blockDim.x)));
+            }
+        });
+    }
     AMREX_GPU_ERROR_CHECK();
 }

@@ -785,18 +819,20 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
     const BoxIndexerND<dim> indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
-             icell < indexer.numPts(); icell += stride)
-        {
-            auto iv = indexer.intVect(icell);
-            detail::call_f_intvect_handler(f, iv,
-                Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                                        (std::uint64_t)blockDim.x)));
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
+    for (auto const& ec : nec) {
+        const auto start_idx = std::uint64_t(ec.start_idx);
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
+            if (icell < indexer.numPts()) {
+                auto iv = indexer.intVect(icell);
+                detail::call_f_intvect_handler(f, iv,
+                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
+                                            (std::uint64_t)blockDim.x)));
+            }
+        });
+    }
     AMREX_GPU_ERROR_CHECK();
 }

@@ -806,17 +842,20 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
 {
     if (amrex::isEmpty(box)) { return; }
     const BoxIndexerND<dim> indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
-             icell < indexer.numPts(); icell += stride) {
-            auto iv = indexer.intVect(icell);
-            detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
-                Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                                        (std::uint64_t)blockDim.x)));
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
+    for (auto const& ec : nec) {
+        const auto start_idx = std::uint64_t(ec.start_idx);
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
+            if (icell < indexer.numPts()) {
+                auto iv = indexer.intVect(icell);
+                detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
+                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
+                                            (std::uint64_t)blockDim.x)));
+            }
+        });
+    }
     AMREX_GPU_ERROR_CHECK();
 }
diff --git a/Src/EB/AMReX_algoim.cpp b/Src/EB/AMReX_algoim.cpp
index 254e15dab0f..864ec626a0a 100644
--- a/Src/EB/AMReX_algoim.cpp
+++ b/Src/EB/AMReX_algoim.cpp
@@ -66,8 +66,16 @@ compute_integrals (MultiFab& intgmf, IntVect nghost)

         if (Gpu::inLaunchRegion())
         {
+#if defined(AMREX_USE_CUDA)
+            // It appears that there is a nvcc bug. We have to use the
+            // 4D ParallelFor here, even though ncomp is 1.
+            int ncomp = fg.nComp();
+            amrex::ParallelFor(bx, ncomp,
+            [=] AMREX_GPU_DEVICE (int i, int j, int k, int) noexcept
+#else
             amrex::ParallelFor(bx,
             [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+#endif
             {
                 const auto ebflag = fg(i,j,k);
                 if (ebflag.isRegular()) {
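Note (not part of the patch): the sketch below mirrors the launch-splitting arithmetic that makeNExecutionConfigs introduces above, but in plain standard C++ so it can be compiled stand-alone. The names ExecCfg and split_launches are hypothetical, std::vector stands in for amrex::Vector, and MT=256 is only a stand-in for the thread-block size; the point is that each launch gets at most INT_MAX blocks of MT threads, and start_idx records where in [0, N) the launch begins so device code only computes tid + start_idx and never relies on nblocks*MT fitting in an int.

```cpp
// Standalone illustration only -- not part of the patch. Mirrors the splitting
// logic of Gpu::makeNExecutionConfigs<MT> using std:: types; ExecCfg and
// split_launches are hypothetical names.
#include <cassert>
#include <cstdint>
#include <limits>
#include <vector>

struct ExecCfg { std::int64_t start_idx; int nblocks; };

template <int MT>
std::vector<ExecCfg> split_launches (std::int64_t N)
{
    constexpr int numblocks_max = std::numeric_limits<int>::max();
    constexpr std::int64_t nmax = std::int64_t(MT) * numblocks_max; // max elements per launch
    auto nlaunches = int((N + nmax - 1) / nmax);
    std::vector<ExecCfg> r(nlaunches);
    std::int64_t ndone = 0;
    for (int i = 0; i < nlaunches; ++i) {
        int nblocks;
        if (N > nmax) { nblocks = numblocks_max; N -= nmax; }
        else          { nblocks = int((N + MT - 1) / MT);  }
        r[i].start_idx = ndone;                 // first element handled by this launch
        ndone += std::int64_t(nblocks) * MT;
        r[i].nblocks = nblocks;
    }
    return r;
}

int main ()
{
    constexpr int MT = 256; // stand-in for the block size used by AMREX_LAUNCH_KERNEL
    std::int64_t N = 3 * (std::int64_t(MT) * std::numeric_limits<int>::max()) + 12345;
    auto cfgs = split_launches<MT>(N);
    assert(cfgs.size() == 4);            // three full-size launches plus one partial launch
    assert(cfgs.back().start_idx < N);   // the last launch still starts inside [0, N)
    // In the kernels above, each thread computes tid = MT*blockIdx.x + threadIdx.x
    // (bounded per launch) and calls f(tid + start_idx) only when tid < N - start_idx,
    // so no per-thread index overflows even without a grid-stride loop.
    return 0;
}
```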