Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions Src/Base/AMReX_Scan.H
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,8 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
constexpr int nchunks = 12;
constexpr int nelms_per_block = nthreads * nchunks;
AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(std::numeric_limits<int>::max())*nelms_per_block);
AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
std::numeric_limits<int>::max())*nelms_per_block);
int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block);
auto stream = Gpu::gpuStream();
Expand Down Expand Up @@ -228,7 +229,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
T* shared2 = shared + Gpu::Device::warp_size;

// Each block processes [ibegin,iend).
N ibegin = nelms_per_block * blockIdxx;
N ibegin = static_cast<N>(nelms_per_block) * blockIdxx;
N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);

// Each block is responsible for nchunks chunks of data,
Expand Down Expand Up @@ -366,7 +367,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
int blockDimx = gh.item->get_local_range(0);

// Each block processes [ibegin,iend).
N ibegin = nelms_per_block * blockIdxx;
N ibegin = static_cast<N>(nelms_per_block) * blockIdxx;
N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
T prev_sum = (blockIdxx == 0) ? 0 : blocksum_p[blockIdxx-1];
for (N offset = ibegin + threadIdxx; offset < iend; offset += blockDimx) {
Expand Down Expand Up @@ -398,7 +399,8 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
constexpr int nchunks = 12;
constexpr int nelms_per_block = nthreads * nchunks;
AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(std::numeric_limits<int>::max())*nelms_per_block);
AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
std::numeric_limits<int>::max())*nelms_per_block);
int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;

#ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
Expand Down Expand Up @@ -462,7 +464,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
}

// Each block processes [ibegin,iend).
N ibegin = nelms_per_block * virtual_block_id;
N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
BlockStatusT& block_status = block_status_p[virtual_block_id];

Expand Down Expand Up @@ -637,6 +639,8 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size; // # of threads per block
constexpr int nelms_per_thread = sizeof(T) >= 8 ? 8 : 16;
constexpr int nelms_per_block = nthreads * nelms_per_thread;
AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
std::numeric_limits<int>::max())*nelms_per_block);
int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
std::size_t sm = 0;
auto stream = Gpu::gpuStream();
Expand Down Expand Up @@ -713,7 +717,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
auto const virtual_block_id = scan_bid.get(threadIdx.x, temp_storage.ordered_bid);

// Each block processes [ibegin,iend).
N ibegin = nelms_per_block * virtual_block_id;
N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);

auto input_begin = rocprim::make_transform_iterator(
Expand Down Expand Up @@ -800,6 +804,8 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size; // # of threads per block
constexpr int nelms_per_thread = sizeof(T) >= 8 ? 4 : 8;
constexpr int nelms_per_block = nthreads * nelms_per_thread;
AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
std::numeric_limits<int>::max())*nelms_per_block);
int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
std::size_t sm = 0;
auto stream = Gpu::gpuStream();
Expand Down Expand Up @@ -854,7 +860,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
int virtual_block_id = blockIdx.x;

// Each block processes [ibegin,iend).
N ibegin = nelms_per_block * virtual_block_id;
N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);

auto input_lambda = [&] (N i) -> T { return fin(i+ibegin); };
Expand Down Expand Up @@ -944,7 +950,8 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
constexpr int nchunks = 12;
constexpr int nelms_per_block = nthreads * nchunks;
AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(std::numeric_limits<int>::max())*nelms_per_block);
AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
std::numeric_limits<int>::max())*nelms_per_block);
int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int);
auto stream = Gpu::gpuStream();
Expand Down Expand Up @@ -997,7 +1004,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
}

// Each block processes [ibegin,iend).
N ibegin = nelms_per_block * virtual_block_id;
N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
BlockStatusT& block_status = block_status_p[virtual_block_id];

Expand Down
Loading