@@ -753,21 +753,18 @@ void launch (T const& n, L const& f) noexcept
753753 static_assert (sizeof (T) >= 2 );
754754 if (amrex::isEmpty (n)) { return ; }
755755 const auto & nec = Gpu::makeNExecutionConfigs<MT>(n);
756- T ndone = 0 ;
757756 for (auto const & ec : nec) {
758- T nleft = n - ndone;
757+ const T start_idx = T (ec.start_idx );
758+ const T nleft = n - start_idx;
759759 AMREX_LAUNCH_KERNEL (MT, ec.nblocks , MT, 0 , Gpu::gpuStream (),
760760 [=] AMREX_GPU_DEVICE () noexcept {
761761 // This will not overflow, even though nblocks*MT might.
762762 auto tid = T (MT)*T (blockIdx.x )+T (threadIdx.x );
763763 if (tid < nleft) {
764- f (tid+ndone );
764+ f (tid+start_idx );
765765 }
766766 });
767- if (Long (nleft) > ec.ntotalthreads ) {
768- ndone += T (ec.ntotalthreads );
769- }
770- }
 767+    }
771768 AMREX_GPU_ERROR_CHECK ();
772769}
773770
@@ -778,17 +775,16 @@ void launch (BoxND<dim> const& box, L const& f) noexcept
778775 const auto & nec = Gpu::makeNExecutionConfigs<MT>(box);
779776 const BoxIndexerND<dim> indexer (box);
780777 const auto type = box.ixType ();
781- std::uint64_t ndone = 0 ;
782778 for (auto const & ec : nec) {
779+ const T start_idx = T (ec.start_idx );
783780 AMREX_LAUNCH_KERNEL (MT, ec.nblocks , MT, 0 , Gpu::gpuStream (),
784781 [=] AMREX_GPU_DEVICE () noexcept {
785- auto icell = std::uint64_t (MT)*blockIdx.x +threadIdx.x + ndone ;
782+ auto icell = std::uint64_t (MT)*blockIdx.x +threadIdx.x + start_idx ;
786783 if (icell < indexer.numPts ()) {
787784 auto iv = indexer.intVect (icell);
788785 f (BoxND<dim>(iv,iv,type));
789786 }
790787 });
791- ndone += ec.ntotalthreads ;
792788 }
793789 AMREX_GPU_ERROR_CHECK ();
794790}
@@ -824,19 +820,18 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
824820 if (amrex::isEmpty (box)) { return ; }
825821 const BoxIndexerND<dim> indexer (box);
826822 const auto & nec = Gpu::makeNExecutionConfigs<MT>(box);
827- std::uint64_t ndone = 0 ;
828823 for (auto const & ec : nec) {
824+ const T start_idx = T (ec.start_idx );
829825 AMREX_LAUNCH_KERNEL (MT, ec.nblocks , MT, 0 , Gpu::gpuStream (),
830826 [=] AMREX_GPU_DEVICE () noexcept {
831- auto icell = std::uint64_t (MT)*blockIdx.x +threadIdx.x + ndone ;
827+ auto icell = std::uint64_t (MT)*blockIdx.x +threadIdx.x + start_idx ;
832828 if (icell < indexer.numPts ()) {
833829 auto iv = indexer.intVect (icell);
834830 detail::call_f_intvect_handler (f, iv,
835831 Gpu::Handler (amrex::min ((indexer.numPts ()-icell+(std::uint64_t )threadIdx.x ),
836832 (std::uint64_t )blockDim.x )));
837833 }
838834 });
839- ndone += ec.ntotalthreads ;
840835 }
841836 AMREX_GPU_ERROR_CHECK ();
842837}
@@ -848,19 +843,18 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
848843 if (amrex::isEmpty (box)) { return ; }
849844 const BoxIndexerND<dim> indexer (box);
850845 const auto & nec = Gpu::makeNExecutionConfigs<MT>(box);
851- std::uint64_t ndone = 0 ;
852846 for (auto const & ec : nec) {
847+ const T start_idx = T (ec.start_idx );
853848 AMREX_LAUNCH_KERNEL (MT, ec.nblocks , MT, 0 , Gpu::gpuStream (),
854849 [=] AMREX_GPU_DEVICE () noexcept {
855- auto icell = std::uint64_t (MT)*blockIdx.x +threadIdx.x + ndone ;
850+ auto icell = std::uint64_t (MT)*blockIdx.x +threadIdx.x + start_idx ;
856851 if (icell < indexer.numPts ()) {
857852 auto iv = indexer.intVect (icell);
858853 detail::call_f_intvect_ncomp_handler (f, iv, ncomp,
859854 Gpu::Handler (amrex::min ((indexer.numPts ()-icell+(std::uint64_t )threadIdx.x ),
860855 (std::uint64_t )blockDim.x )));
861856 }
862857 });
863- ndone += ec.ntotalthreads ;
864858 }
865859 AMREX_GPU_ERROR_CHECK ();
866860}
0 commit comments