@@ -97,7 +97,10 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
9797
9898// ! Transpose 3D array (nx,ny,nz) from row-major (i.e. C order) to
9999// ! column-major (Fortran order). The input's unit stride direction is z,
100- // ! whereas the output's unit stride direction is x.
100+ // ! whereas the output's unit stride direction is x. Note that for GPU
101+ // ! builds, the kernel runs on the current GPU stream asynchronously with
102+ // ! respect to the host. This function does not synchronize; a caller that
103+ // ! needs the result to be complete must call `amrex::Gpu::streamSynchronize()`.
101104template <typename T>
102105void transposeCtoF (T const * pi, T* po, int nx, int ny, int nz)
103106{
@@ -148,8 +151,6 @@ void transposeCtoF (T const* pi, T* po, int nx, int ny, int nz)
148151 }
149152 }
150153 });
151- AMREX_GPU_ERROR_CHECK ();
152- Gpu::streamSynchronize ();
153154
154155#elif defined(AMREX_USE_SYCL)
155156
@@ -214,7 +215,6 @@ void transposeCtoF (T const* pi, T* po, int nx, int ny, int nz)
214215 } catch (sycl::exception const & ex) {
215216 amrex::Abort (std::string (" transposeCtoF: " )+ex.what ()+" !!!!!" );
216217 }
217- Gpu::streamSynchronize ();
218218
219219#else
220220
@@ -249,7 +249,10 @@ void transposeCtoF (T const* pi, T* po, int nx, int ny, int nz)
249249
250250// ! Transpose 2D array (nx,ny) from row-major (i.e. C order) to column-major
251251// ! (Fortran order). The input's unit stride direction is y, whereas the
252- // ! output's unit stride direction is x.
252+ // ! output's unit stride direction is x. Note that for GPU builds, the
253+ // ! kernel runs on the current GPU stream asynchronously with respect to the
254+ // ! host. This function does not synchronize; a caller that needs the result
255+ // ! to be complete must call `amrex::Gpu::streamSynchronize()`.
253256template <typename T>
254257void transposeCtoF (T const * pi, T* po, int nx, int ny)
255258{
0 commit comments