@@ -806,7 +806,7 @@ ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
         if (tid < nleft) {
             detail::call_f_scalar_handler(f, tid+start_idx,
                 Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
-                                        (std::uint64_t)blockDim.x)));
+                                        (std::uint64_t)MT)));
         }
     });
 }
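Every hunk in this commit makes the same substitution: these kernels are launched through `AMREX_LAUNCH_KERNEL(MT, ...)` with exactly `MT` threads per block, so the runtime register read `blockDim.x` can be replaced by the compile-time constant `MT` (or by the `AMREX_GPU_MAX_THREADS` macro in the non-templated RNG launches below), letting the compiler fold the index arithmetic. A minimal standalone sketch of the pattern, with hypothetical names; the substitution is only valid because the launch configuration pins the block size:

```cpp
#include <cstdint>

// Sketch: a kernel templated on its block size MT. The launch below always
// passes MT as the block dimension, so blockDim.x == MT holds at runtime and
// the compile-time constant can stand in for the special-register read.
template <int MT, typename F>
__global__ __launch_bounds__(MT)
void grid_stride_kernel (std::uint64_t n, F f)
{
    for (std::uint64_t i = std::uint64_t(MT)*blockIdx.x + threadIdx.x,
                       stride = std::uint64_t(MT)*gridDim.x;
         i < n; i += stride) {
        f(i);
    }
}

// Hypothetical usage: the <<<...>>> block size must equal MT, otherwise
// the MT-for-blockDim.x substitution would index the wrong elements.
// grid_stride_kernel<256><<<nblocks, 256, 0, stream>>>(n, f);
```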
@@ -829,7 +829,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
             auto iv = indexer.intVect(icell);
             detail::call_f_intvect_handler(f, iv,
                 Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                                        (std::uint64_t)blockDim.x)));
+                                        (std::uint64_t)MT)));
         }
     });
 }
@@ -852,7 +852,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
             auto iv = indexer.intVect(icell);
             detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                 Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                                        (std::uint64_t)blockDim.x)));
+                                        (std::uint64_t)MT)));
         }
     });
 }
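In these `Gpu::Handler` calls, the `amrex::min(...)` expression computes how many lanes of the current block still have valid work, which the handler needs in order to restrict block/warp-synchronous code to the active lanes. Worked through with illustrative numbers (not taken from the source):

```cpp
// nleft = 1000 remaining elements, MT = 256 threads per block.
// The last block starts at global index 768, so a thread there has
// tid = 768 + threadIdx.x and
//     nleft - tid + threadIdx.x = 1000 - 768 = 232,
// independent of threadIdx.x. amrex::min(232, MT) = 232 active lanes;
// for every full block the same expression clamps to MT. Using MT instead
// of blockDim.x in the clamp is safe because the launch uses MT-thread blocks.
```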
@@ -870,9 +870,9 @@ ParallelForRNG (T n, L const& f) noexcept
                         amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        Long tid = Long(blockDim.x)*blockIdx.x+threadIdx.x;
+        Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (Long i = tid, stride = Long(blockDim.x)*gridDim.x; i < Long(n); i += stride) {
+        for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
             f(T(i),engine);
         }
     });
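The `ParallelForRNG` overloads use the `AMREX_GPU_MAX_THREADS` macro rather than a template parameter: these launches always use that many threads per block and cap the grid at `Gpu::Device::maxBlocksPerLaunch()`, so every thread owns a fixed slot in `rand_state` for the whole grid-stride launch. A sketch of that persistent-state pattern under those assumptions, with stand-in names and a toy generator:

```cpp
#include <cstdint>

struct RngState { std::uint64_t s; };

// Toy xorshift64 standing in for the real per-thread engine; state must be nonzero.
__device__ std::uint64_t next_random (RngState* st)
{
    st->s ^= st->s << 13; st->s ^= st->s >> 7; st->s ^= st->s << 17;
    return st->s;
}

template <int MaxThreads>
__global__ __launch_bounds__(MaxThreads)
void rng_kernel (RngState* states, std::uint64_t n)
{
    // tid indexes the persistent state array. Because the grid is capped,
    // tid never exceeds the number of preallocated states, and each thread
    // reuses its own state while grid-striding over all n work items.
    auto tid = std::uint64_t(MaxThreads)*blockIdx.x + threadIdx.x;
    RngState* st = &states[tid];
    for (std::uint64_t i = tid, stride = std::uint64_t(MaxThreads)*gridDim.x;
         i < n; i += stride) {
        (void) next_random(st); // one draw per work item, for illustration
    }
}
```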
@@ -892,9 +892,9 @@ ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
                         amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
+        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
+        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
             auto iv = indexer.intVect(icell);
             detail::call_f_intvect_engine(f, iv, engine);
         }
@@ -915,9 +915,9 @@ ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
                         amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
+        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
+        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
             auto iv = indexer.intVect(icell);
             detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
         }
@@ -938,7 +938,7 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
+        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < indexer1.numPts()) {
                 auto iv = indexer1.intVect(icell);
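The fused overloads in this and the following hunks run a single grid-stride loop up to the largest of the index spaces and guard each box with its own bounds check, so one launch covers boxes of different sizes. Roughly, under the same block-size assumption (hypothetical, simplified signature):

```cpp
#include <algorithm>
#include <cstdint>

// Sketch of the fused-loop shape in these hunks: stride over the largest
// index space; each box participates only while icell is within its extent.
template <int MT, typename Idx, typename F1, typename F2>
__global__ __launch_bounds__(MT)
void fused_kernel (Idx indexer1, Idx indexer2, F1 f1, F2 f2)
{
    auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
    for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x + threadIdx.x,
                       stride = std::uint64_t(MT)*gridDim.x;
         icell < ncells; icell += stride) {
        if (icell < indexer1.numPts()) { f1(indexer1.intVect(icell)); }
        if (icell < indexer2.numPts()) { f2(indexer2.intVect(icell)); }
    }
}
```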
@@ -967,7 +967,7 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
+        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < indexer1.numPts()) {
                 auto iv = indexer1.intVect(icell);
@@ -1001,7 +1001,7 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
+        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < indexer1.numPts()) {
                 auto iv = indexer1.intVect(icell);
@@ -1034,7 +1034,7 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
+        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < indexer1.numPts()) {
                 auto iv = indexer1.intVect(icell);