@@ -202,9 +202,6 @@ public:
202202 template <typename FA, typename TRIA, typename TRIC>
203203 void solve_z (FA& spmf, TRIA const & tria, TRIC const & tric);
204204
205- template <typename FA>
206- void solve_z0 (FA& spmf);
207-
208205 [[nodiscard]] std::pair<BoxArray,DistributionMapping> getSpectralDataLayout () const ;
209206
210207private:
@@ -526,36 +523,18 @@ void PoissonHybrid<MF>::solve (MF& a_soln, MF const& a_rhs, TRIA const& tria,
526523 if (m_r2c)
527524 {
528525 m_r2c->forward (*rhs, m_spmf_c);
529- if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
530- std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
531- solve_z0 (m_spmf_c);
532- amrex::ignore_unused (tria,tric);
533- } else {
534- solve_z (m_spmf_c, tria, tric);
535- }
526+ solve_z (m_spmf_c, tria, tric);
536527 m_r2c->backward_doit (m_spmf_c, *soln, ng, m_geom.periodicity ());
537528 }
538529 else
539530 {
540531 if (m_r2x->m_cy .empty ()) { // spectral data is real
541532 m_r2x->forward (*rhs, m_spmf_r);
542- if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
543- std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
544- solve_z0 (m_spmf_r);
545- amrex::ignore_unused (tria,tric);
546- } else {
547- solve_z (m_spmf_r, tria, tric);
548- }
533+ solve_z (m_spmf_r, tria, tric);
549534 m_r2x->backward (m_spmf_r, *soln, ng, m_geom.periodicity ());
550535 } else { // spectral data is complex.
551536 m_r2x->forward (*rhs, m_spmf_c);
552- if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
553- std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
554- solve_z0 (m_spmf_c);
555- amrex::ignore_unused (tria,tric);
556- } else {
557- solve_z (m_spmf_c, tria, tric);
558- }
537+ solve_z (m_spmf_c, tria, tric);
559538 m_r2x->backward (m_spmf_c, *soln, ng, m_geom.periodicity ());
560539 }
561540 }
@@ -602,212 +581,175 @@ void PoissonHybrid<MF>::solve_z (FA& spmf, TRIA const& tria, TRIC const& tric)
602581 }
603582 }
604583
605- bool zlo_neumann = m_bc[2 ].first == Boundary::even;
606- bool zhi_neumann = m_bc[2 ].second == Boundary::even;
607- bool is_singular = (offset[0 ] == T (0 )) && (offset[1 ] == T (0 ))
608- && zlo_neumann && zhi_neumann;
584+ if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
585+ std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
586+ #if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
587+ #pragma omp parallel
588+ #endif
589+ for (MFIter mfi (spmf,TilingIfNotGPU ()); mfi.isValid (); ++mfi)
590+ {
591+ auto const & spectral = spmf.array (mfi);
592+ auto const & box = mfi.validbox ();
593+ amrex::ParallelFor (box, [=] AMREX_GPU_DEVICE (int i, int j, int k)
594+ {
595+ T a = facx*(i+offset[0 ]);
596+ T b = facy*(j+offset[1 ]);
597+ T k2 = dxfac * (std::cos (a)-T (1 ))
598+ + dyfac * (std::cos (b)-T (1 ));
599+ if (k2 != T (0 )) {
600+ spectral (i,j,k) /= k2;
601+ }
602+ spectral (i,j,k) *= scale;
603+ });
604+ }
605+ } else {
606+ bool zlo_neumann = m_bc[2 ].first == Boundary::even;
607+ bool zhi_neumann = m_bc[2 ].second == Boundary::even;
608+ bool is_singular = (offset[0 ] == T (0 )) && (offset[1 ] == T (0 ))
609+ && zlo_neumann && zhi_neumann;
609610
610- auto nz = m_geom.Domain ().length (2 );
611+ auto nz = m_geom.Domain ().length (2 );
611612
612613#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
613614#pragma omp parallel
614615#endif
615- for (MFIter mfi (spmf); mfi.isValid (); ++mfi)
616- {
617- auto const & spectral = spmf.array (mfi);
618- auto const & box = mfi.validbox ();
619- auto const & xybox = amrex::makeSlab (box, 2 , 0 );
616+ for (MFIter mfi (spmf); mfi.isValid (); ++mfi)
617+ {
618+ auto const & spectral = spmf.array (mfi);
619+ auto const & box = mfi.validbox ();
620+ auto const & xybox = amrex::makeSlab (box, 2 , 0 );
620621
621622#ifdef AMREX_USE_GPU
622- // xxxxx TODO: We need to explore how to optimize this
623- // function. Maybe we can use cusparse. Maybe we should make
624- // z-direction to be the unit stride direction.
623+ // xxxxx TODO: We need to explore how to optimize this
624+ // function. Maybe we can use cusparse. Maybe we should make
625+ // z-direction to be the unit stride direction.
625626
626- FArrayBox tridiag_workspace (box,4 );
627- auto const & ald = tridiag_workspace.array (0 );
628- auto const & bd = tridiag_workspace.array (1 );
629- auto const & cud = tridiag_workspace.array (2 );
630- auto const & scratch = tridiag_workspace.array (3 );
627+ FArrayBox tridiag_workspace (box,4 );
628+ auto const & ald = tridiag_workspace.array (0 );
629+ auto const & bd = tridiag_workspace.array (1 );
630+ auto const & cud = tridiag_workspace.array (2 );
631+ auto const & scratch = tridiag_workspace.array (3 );
631632
632- amrex::ParallelFor (xybox, [=] AMREX_GPU_DEVICE (int i, int j, int )
633- {
634- T a = facx*(i+offset[0 ]);
635- T b = facy*(j+offset[1 ]);
636- T k2 = dxfac * (std::cos (a)-T (1 ))
637- + dyfac * (std::cos (b)-T (1 ));
638-
639- // Tridiagonal solve
640- for (int k=0 ; k < nz; k++) {
641- if (k==0 ) {
642- ald (i,j,k) = T (0 .);
643- cud (i,j,k) = tric (i,j,k);
644- if (zlo_neumann) {
645- bd (i,j,k) = k2 - cud (i,j,k);
646- } else {
647- bd (i,j,k) = k2 - cud (i,j,k) - T (2.0 )*tria (i,j,k);
648- }
649- } else if (k == nz-1 ) {
650- ald (i,j,k) = tria (i,j,k);
651- cud (i,j,k) = T (0 .);
652- if (zhi_neumann) {
653- bd (i,j,k) = k2 - ald (i,j,k);
654- if (i == 0 && j == 0 && is_singular) {
655- bd (i,j,k) *= T (2.0 );
633+ amrex::ParallelFor (xybox, [=] AMREX_GPU_DEVICE (int i, int j, int )
634+ {
635+ T a = facx*(i+offset[0 ]);
636+ T b = facy*(j+offset[1 ]);
637+ T k2 = dxfac * (std::cos (a)-T (1 ))
638+ + dyfac * (std::cos (b)-T (1 ));
639+
640+ // Tridiagonal solve
641+ for (int k=0 ; k < nz; k++) {
642+ if (k==0 ) {
643+ ald (i,j,k) = T (0 .);
644+ cud (i,j,k) = tric (i,j,k);
645+ if (zlo_neumann) {
646+ bd (i,j,k) = k2 - cud (i,j,k);
647+ } else {
648+ bd (i,j,k) = k2 - cud (i,j,k) - T (2.0 )*tria (i,j,k);
649+ }
650+ } else if (k == nz-1 ) {
651+ ald (i,j,k) = tria (i,j,k);
652+ cud (i,j,k) = T (0 .);
653+ if (zhi_neumann) {
654+ bd (i,j,k) = k2 - ald (i,j,k);
655+ if (i == 0 && j == 0 && is_singular) {
656+ bd (i,j,k) *= T (2.0 );
657+ }
658+ } else {
659+ bd (i,j,k) = k2 - ald (i,j,k) - T (2.0 )*tric (i,j,k);
656660 }
657661 } else {
658- bd (i,j,k) = k2 - ald (i,j,k) - T (2.0 )*tric (i,j,k);
662+ ald (i,j,k) = tria (i,j,k);
663+ cud (i,j,k) = tric (i,j,k);
664+ bd (i,j,k) = k2 -ald (i,j,k)-cud (i,j,k);
659665 }
660- } else {
661- ald (i,j,k) = tria (i,j,k);
662- cud (i,j,k) = tric (i,j,k);
663- bd (i,j,k) = k2 -ald (i,j,k)-cud (i,j,k);
664666 }
665- }
666667
667- scratch (i,j,0 ) = cud (i,j,0 )/bd (i,j,0 );
668- spectral (i,j,0 ) = spectral (i,j,0 )/bd (i,j,0 );
668+ scratch (i,j,0 ) = cud (i,j,0 )/bd (i,j,0 );
669+ spectral (i,j,0 ) = spectral (i,j,0 )/bd (i,j,0 );
669670
670- for (int k = 1 ; k < nz; k++) {
671- if (k < nz-1 ) {
672- scratch (i,j,k) = cud (i,j,k) / (bd (i,j,k) - ald (i,j,k) * scratch (i,j,k-1 ));
671+ for (int k = 1 ; k < nz; k++) {
672+ if (k < nz-1 ) {
673+ scratch (i,j,k) = cud (i,j,k) / (bd (i,j,k) - ald (i,j,k) * scratch (i,j,k-1 ));
674+ }
675+ spectral (i,j,k) = (spectral (i,j,k) - ald (i,j,k) * spectral (i,j,k - 1 ))
676+ / (bd (i,j,k) - ald (i,j,k) * scratch (i,j,k-1 ));
673677 }
674- spectral (i,j,k) = (spectral (i,j,k) - ald (i,j,k) * spectral (i,j,k - 1 ))
675- / (bd (i,j,k) - ald (i,j,k) * scratch (i,j,k-1 ));
676- }
677678
678- for (int k = nz - 2 ; k >= 0 ; k--) {
679- spectral (i,j,k) -= scratch (i,j,k) * spectral (i,j,k + 1 );
680- }
679+ for (int k = nz - 2 ; k >= 0 ; k--) {
680+ spectral (i,j,k) -= scratch (i,j,k) * spectral (i,j,k + 1 );
681+ }
681682
682- for (int k = 0 ; k < nz; ++k) {
683- spectral (i,j,k) *= scale;
684- }
685- });
686- Gpu::streamSynchronize ();
683+ for (int k = 0 ; k < nz; ++k) {
684+ spectral (i,j,k) *= scale;
685+ }
686+ });
687+ Gpu::streamSynchronize ();
687688
688689#else
689690
690- Gpu::DeviceVector<T> ald (nz);
691- Gpu::DeviceVector<T> bd (nz);
692- Gpu::DeviceVector<T> cud (nz);
693- Gpu::DeviceVector<T> scratch (nz);
691+ Gpu::DeviceVector<T> ald (nz);
692+ Gpu::DeviceVector<T> bd (nz);
693+ Gpu::DeviceVector<T> cud (nz);
694+ Gpu::DeviceVector<T> scratch (nz);
694695
695- amrex::LoopOnCpu (xybox, [&] (int i, int j, int )
696- {
697- T a = facx*(i+offset[0 ]);
698- T b = facy*(j+offset[1 ]);
699- T k2 = dxfac * (std::cos (a)-T (1 ))
700- + dyfac * (std::cos (b)-T (1 ));
701-
702- // Tridiagonal solve
703- for (int k=0 ; k < nz; k++) {
704- if (k==0 ) {
705- ald[k] = T (0 .);
706- cud[k] = tric (i,j,k);
707- if (zlo_neumann) {
708- bd[k] = k2 - cud[k];
709- } else {
710- bd[k] = k2 - cud[k] - T (2.0 )*tria (i,j,k);
711- }
712- } else if (k == nz-1 ) {
713- ald[k] = tria (i,j,k);
714- cud[k] = T (0 .);
715- if (zhi_neumann) {
716- bd[k] = k2 - ald[k];
717- if (i == 0 && j == 0 && is_singular) {
718- bd[k] *= T (2.0 );
696+ amrex::LoopOnCpu (xybox, [&] (int i, int j, int )
697+ {
698+ T a = facx*(i+offset[0 ]);
699+ T b = facy*(j+offset[1 ]);
700+ T k2 = dxfac * (std::cos (a)-T (1 ))
701+ + dyfac * (std::cos (b)-T (1 ));
702+
703+ // Tridiagonal solve
704+ for (int k=0 ; k < nz; k++) {
705+ if (k==0 ) {
706+ ald[k] = T (0 .);
707+ cud[k] = tric (i,j,k);
708+ if (zlo_neumann) {
709+ bd[k] = k2 - cud[k];
710+ } else {
711+ bd[k] = k2 - cud[k] - T (2.0 )*tria (i,j,k);
712+ }
713+ } else if (k == nz-1 ) {
714+ ald[k] = tria (i,j,k);
715+ cud[k] = T (0 .);
716+ if (zhi_neumann) {
717+ bd[k] = k2 - ald[k];
718+ if (i == 0 && j == 0 && is_singular) {
719+ bd[k] *= T (2.0 );
720+ }
721+ } else {
722+ bd[k] = k2 - ald[k] - T (2.0 )*tric (i,j,k);
719723 }
720724 } else {
721- bd[k] = k2 - ald[k] - T (2.0 )*tric (i,j,k);
725+ ald[k] = tria (i,j,k);
726+ cud[k] = tric (i,j,k);
727+ bd[k] = k2 -ald[k]-cud[k];
722728 }
723- } else {
724- ald[k] = tria (i,j,k);
725- cud[k] = tric (i,j,k);
726- bd[k] = k2 -ald[k]-cud[k];
727729 }
728- }
729730
730- scratch[0 ] = cud[0 ]/bd[0 ];
731- spectral (i,j,0 ) = spectral (i,j,0 )/bd[0 ];
731+ scratch[0 ] = cud[0 ]/bd[0 ];
732+ spectral (i,j,0 ) = spectral (i,j,0 )/bd[0 ];
732733
733- for (int k = 1 ; k < nz; k++) {
734- if (k < nz-1 ) {
735- scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1 ]);
734+ for (int k = 1 ; k < nz; k++) {
735+ if (k < nz-1 ) {
736+ scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1 ]);
737+ }
738+ spectral (i,j,k) = (spectral (i,j,k) - ald[k] * spectral (i,j,k - 1 ))
739+ / (bd[k] - ald[k] * scratch[k-1 ]);
736740 }
737- spectral (i,j,k) = (spectral (i,j,k) - ald[k] * spectral (i,j,k - 1 ))
738- / (bd[k] - ald[k] * scratch[k-1 ]);
739- }
740741
741- for (int k = nz - 2 ; k >= 0 ; k--) {
742- spectral (i,j,k) -= scratch[k] * spectral (i,j,k + 1 );
743- }
742+ for (int k = nz - 2 ; k >= 0 ; k--) {
743+ spectral (i,j,k) -= scratch[k] * spectral (i,j,k + 1 );
744+ }
744745
745- for (int k = 0 ; k < nz; ++k) {
746- spectral (i,j,k) *= scale;
747- }
748- });
749- #endif
750- }
746+ for (int k = 0 ; k < nz; ++k) {
747+ spectral (i,j,k) *= scale;
748+ }
749+ });
751750#endif
752- }
753-
754- template <typename MF>
755- template <typename FA>
756- void PoissonHybrid<MF>::solve_z0 (FA& spmf)
757- {
758- BL_PROFILE (" PoissonHybrid::solve_z0" );
759-
760- #if (AMREX_SPACEDIM < 3)
761- amrex::ignore_unused (spmf);
762- #else
763-
764- // This function cannot be merged with solve_z because of CUDA's
765- // limitation with constexpr if on Windows.
766- auto facx = Math::pi<T>()/T (m_geom.Domain ().length (0 ));
767- auto facy = Math::pi<T>()/T (m_geom.Domain ().length (1 ));
768- if (m_bc[0 ].first == Boundary::periodic) { facx *= T (2 ); }
769- if (m_bc[1 ].first == Boundary::periodic) { facy *= T (2 ); }
770- auto dxfac = T (2 )/T (m_geom.CellSize (0 )*m_geom.CellSize (0 ));
771- auto dyfac = T (2 )/T (m_geom.CellSize (1 )*m_geom.CellSize (1 ));
772- auto scale = (m_r2x) ? m_r2x->scalingFactor () : m_r2c->scalingFactor ();
773-
774- if (m_geom.Domain ().length (0 ) == 1 ) { dxfac = 0 ; }
775- if (m_geom.Domain ().length (1 ) == 1 ) { dyfac = 0 ; }
776-
777- GpuArray<T,AMREX_SPACEDIM-1 > offset{T (0 ),T (0 )};
778- for (int idim = 0 ; idim < AMREX_SPACEDIM-1 ; ++idim) {
779- if (m_geom.Domain ().length (idim) > 1 ) {
780- if (m_bc[idim].first == Boundary::odd &&
781- m_bc[idim].second == Boundary::odd)
782- {
783- offset[idim] = T (1 );
784- }
785- else if ((m_bc[idim].first == Boundary::odd &&
786- m_bc[idim].second == Boundary::even) ||
787- (m_bc[idim].first == Boundary::even &&
788- m_bc[idim].second == Boundary::odd))
789- {
790- offset[idim] = T (0.5 );
791- }
792751 }
793752 }
794-
795- for (MFIter mfi (spmf,TilingIfNotGPU ()); mfi.isValid (); ++mfi)
796- {
797- auto const & spectral = spmf.array (mfi);
798- auto const & box = mfi.validbox ();
799- amrex::ParallelFor (box, [=] AMREX_GPU_DEVICE (int i, int j, int k)
800- {
801- T a = facx*(i+offset[0 ]);
802- T b = facy*(j+offset[1 ]);
803- T k2 = dxfac * (std::cos (a)-T (1 ))
804- + dyfac * (std::cos (b)-T (1 ));
805- if (k2 != T (0 )) {
806- spectral (i,j,k) /= k2;
807- }
808- spectral (i,j,k) *= scale;
809- });
810- }
811753#endif
812754}
813755
0 commit comments