@@ -747,17 +747,47 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe
747747 launch (nblocks, nthreads_per_block, 0 , stream, std::forward<L>(f));
748748}
749749
750- template <int MT, typename T, typename L>
750+ template <int MT, typename T, typename L, std:: enable_if_t <std::is_integral_v<T>, int > FOO = 0 >
751751void launch (T const & n, L const & f) noexcept
752752{
753753 if (amrex::isEmpty (n)) { return ; }
754- const auto ec = Gpu::makeExecutionConfig<MT>(n);
755- AMREX_LAUNCH_KERNEL (MT, ec.numBlocks , ec.numThreads , 0 , Gpu::gpuStream (),
756- [=] AMREX_GPU_DEVICE () noexcept {
757- for (auto const i : Gpu::Range (n)) {
758- f (i);
754+ const auto & nec = Gpu::makeNExecutionConfigs<MT>(n);
755+ T ndone = 0 ;
756+ for (auto const & ec : nec) {
757+ AMREX_LAUNCH_KERNEL (MT, ec.nblocks , MT, 0 , Gpu::gpuStream (),
758+ [=] AMREX_GPU_DEVICE () noexcept {
759+ // This will not overflow, even though nblocks*MT might.
760+ auto tid = T (MT*blockIdx.x +threadIdx.x );
761+ if (tid < n-ndone) {
762+ f (tid+ndone);
763+ }
764+ });
765+ if (&ec != &nec.back ()) {
766+ ndone += T (ec.ntotalthreads );
759767 }
760- });
768+ }
769+ AMREX_GPU_ERROR_CHECK ();
770+ }
771+
772+ template <int MT, int dim, typename L>
773+ void launch (BoxND<dim> const & box, L const & f) noexcept
774+ {
775+ if (box.isEmpty ()) { return ; }
776+ const auto & nec = Gpu::makeNExecutionConfigs<MT>(box);
777+ const BoxIndexerND<dim> indexer (box);
778+ const auto type = box.ixType ();
779+ std::uint64_t ndone = 0 ;
780+ for (auto const & ec : nec) {
781+ AMREX_LAUNCH_KERNEL (MT, ec.nblocks , MT, 0 , Gpu::gpuStream (),
782+ [=] AMREX_GPU_DEVICE () noexcept {
783+ auto icell = std::uint64_t (MT)*blockIdx.x +threadIdx.x + ndone;
784+ if (icell < indexer.numPts ()) {
785+ auto iv = indexer.intVect (icell);
786+ f (BoxND<dim>(iv,iv,type));
787+ }
788+ });
789+ ndone += ec.ntotalthreads ;
790+ }
761791 AMREX_GPU_ERROR_CHECK ();
762792}
763793
@@ -766,16 +796,22 @@ std::enable_if_t<MaybeDeviceRunnable<L>::value>
766796ParallelFor (Gpu::KernelInfo const &, T n, L const & f) noexcept
767797{
768798 if (amrex::isEmpty (n)) { return ; }
769- const auto ec = Gpu::makeExecutionConfig<MT>(n);
770- AMREX_LAUNCH_KERNEL (MT, ec.numBlocks , ec.numThreads , 0 , Gpu::gpuStream (),
771- [=] AMREX_GPU_DEVICE () noexcept {
772- for (Long i = Long (blockDim.x )*blockIdx.x +threadIdx.x , stride = Long (blockDim.x )*gridDim.x ;
773- i < Long (n); i += stride) {
774- detail::call_f_scalar_handler (f, T (i),
775- Gpu::Handler (amrex::min ((std::uint64_t (n)-i+(std::uint64_t )threadIdx.x ),
776- (std::uint64_t )blockDim.x )));
777- }
778- });
799+ const auto & nec = Gpu::makeNExecutionConfigs<MT>(n);
800+ T ndone = 0 ;
801+ for (auto const & ec : nec) {
802+ AMREX_LAUNCH_KERNEL (MT, ec.nblocks , MT, 0 , Gpu::gpuStream (),
803+ [=] AMREX_GPU_DEVICE () noexcept {
804+ // This will not overflow, even though nblocks*MT might.
805+ auto tid = T (MT*blockIdx.x +threadIdx.x );
806+ if (tid < n-ndone) {
807+ tid += ndone;
808+ detail::call_f_scalar_handler (f, tid,
809+ Gpu::Handler (amrex::min ((std::uint64_t (n)-tid+(std::uint64_t )threadIdx.x ),
810+ (std::uint64_t )blockDim.x )));
811+ }
812+ });
813+ ndone += ec.ntotalthreads ;
814+ }
779815 AMREX_GPU_ERROR_CHECK ();
780816}
781817
@@ -785,18 +821,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
785821{
786822 if (amrex::isEmpty (box)) { return ; }
787823 const BoxIndexerND<dim> indexer (box);
788- const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts ());
789- AMREX_LAUNCH_KERNEL (MT, ec.numBlocks , ec.numThreads , 0 , Gpu::gpuStream (),
790- [=] AMREX_GPU_DEVICE () noexcept {
791- for (std::uint64_t icell = std::uint64_t (blockDim.x )*blockIdx.x +threadIdx.x , stride = std::uint64_t (blockDim.x )*gridDim.x ;
792- icell < indexer.numPts (); icell += stride)
793- {
794- auto iv = indexer.intVect (icell);
795- detail::call_f_intvect_handler (f, iv,
796- Gpu::Handler (amrex::min ((indexer.numPts ()-icell+(std::uint64_t )threadIdx.x ),
797- (std::uint64_t )blockDim.x )));
798- }
799- });
824+ const auto & nec = Gpu::makeNExecutionConfigs<MT>(box);
825+ std::uint64_t ndone = 0 ;
826+ for (auto const & ec : nec) {
827+ AMREX_LAUNCH_KERNEL (MT, ec.nblocks , MT, 0 , Gpu::gpuStream (),
828+ [=] AMREX_GPU_DEVICE () noexcept {
829+ auto icell = std::uint64_t (MT)*blockIdx.x +threadIdx.x + ndone;
830+ if (icell < indexer.numPts ()) {
831+ auto iv = indexer.intVect (icell);
832+ detail::call_f_intvect_handler (f, iv,
833+ Gpu::Handler (amrex::min ((indexer.numPts ()-icell+(std::uint64_t )threadIdx.x ),
834+ (std::uint64_t )blockDim.x )));
835+ }
836+ });
837+ ndone += ec.ntotalthreads ;
838+ }
800839 AMREX_GPU_ERROR_CHECK ();
801840}
802841
@@ -806,17 +845,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
806845{
807846 if (amrex::isEmpty (box)) { return ; }
808847 const BoxIndexerND<dim> indexer (box);
809- const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts ());
810- AMREX_LAUNCH_KERNEL (MT, ec.numBlocks , ec.numThreads , 0 , Gpu::gpuStream (),
811- [=] AMREX_GPU_DEVICE () noexcept {
812- for (std::uint64_t icell = std::uint64_t (blockDim.x )*blockIdx.x +threadIdx.x , stride = std::uint64_t (blockDim.x )*gridDim.x ;
813- icell < indexer.numPts (); icell += stride) {
814- auto iv = indexer.intVect (icell);
815- detail::call_f_intvect_ncomp_handler (f, iv, ncomp,
816- Gpu::Handler (amrex::min ((indexer.numPts ()-icell+(std::uint64_t )threadIdx.x ),
817- (std::uint64_t )blockDim.x )));
818- }
819- });
848+ const auto & nec = Gpu::makeNExecutionConfigs<MT>(box);
849+ std::uint64_t ndone = 0 ;
850+ for (auto const & ec : nec) {
851+ AMREX_LAUNCH_KERNEL (MT, ec.nblocks , MT, 0 , Gpu::gpuStream (),
852+ [=] AMREX_GPU_DEVICE () noexcept {
853+ auto icell = std::uint64_t (MT)*blockIdx.x +threadIdx.x + ndone;
854+ if (icell < indexer.numPts ()) {
855+ auto iv = indexer.intVect (icell);
856+ detail::call_f_intvect_ncomp_handler (f, iv, ncomp,
857+ Gpu::Handler (amrex::min ((indexer.numPts ()-icell+(std::uint64_t )threadIdx.x ),
858+ (std::uint64_t )blockDim.x )));
859+ }
860+ });
861+ ndone += ec.ntotalthreads ;
862+ }
820863 AMREX_GPU_ERROR_CHECK ();
821864}
822865
0 commit comments