Skip to content

Commit 83e4220

Browse files
committed
Make transposeCtoF asynchronous
It's now up to the user to call Gpu::streamSynchronize() if necessary.
1 parent de2510c commit 83e4220

File tree

1 file changed

+8
-5
lines changed

1 file changed

+8
-5
lines changed

Src/Base/AMReX_BaseFabUtility.H

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,10 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
 97  97
 98  98   //! Transpose 3D array (nx,ny,nz) from row-major (i.e. C order) to
 99  99   //! column-major (Fortran order). The input's unit stride direction is z,
100     - //! whereas the output's unit stride direction is x.
    100 + //! whereas the output's unit stride direction is x. Note that for GPU
    101 + //! builds, the kernel runs on the current GPU stream asynchronously with
    102 + //! respect to the host. If synchronization is needed, it's up to the user
    103 + //! to call `amrex::Gpu::streamSynchronize()`.
101 104   template <typename T>
102 105   void transposeCtoF (T const* pi, T* po, int nx, int ny, int nz)
103 106   {
@@ -148,8 +151,6 @@ void transposeCtoF (T const* pi, T* po, int nx, int ny, int nz)
148 151   }
149 152   }
150 153   });
151     - AMREX_GPU_ERROR_CHECK();
152     - Gpu::streamSynchronize();
153 154
154 155   #elif defined(AMREX_USE_SYCL)
155 156
@@ -214,7 +215,6 @@ void transposeCtoF (T const* pi, T* po, int nx, int ny, int nz)
214 215   } catch (sycl::exception const& ex) {
215 216   amrex::Abort(std::string("transposeCtoF: ")+ex.what()+"!!!!!");
216 217   }
217     - Gpu::streamSynchronize();
218 218
219 219   #else
220 220
@@ -249,7 +249,10 @@ void transposeCtoF (T const* pi, T* po, int nx, int ny, int nz)
249 249
250 250   //! Transpose 2D array (nx,ny) from row-major (i.e. C order) to column-major
251 251   //! (Fortran order). The input's unit stride direction is y, whereas the
252     - //! output's unit stride direction is x.
    252 + //! output's unit stride direction is x. Note that for GPU builds, the
    253 + //! kernel runs on the current GPU stream asynchronously with respect to the
    254 + //! host. If synchronization is needed, it's up to the user to call
    255 + //! `amrex::Gpu::streamSynchronize()`.
253 256   template <typename T>
254 257   void transposeCtoF (T const* pi, T* po, int nx, int ny)
255 258   {

0 commit comments

Comments
 (0)