@@ -202,6 +202,9 @@ public:
202202 template <typename FA, typename TRIA, typename TRIC>
203203 void solve_z (FA& spmf, TRIA const & tria, TRIC const & tric);
204204
205+ template <typename FA>
206+ void solve_z0 (FA& spmf);
207+
205208 [[nodiscard]] std::pair<BoxArray,DistributionMapping> getSpectralDataLayout () const ;
206209
207210private:
@@ -523,18 +526,36 @@ void PoissonHybrid<MF>::solve (MF& a_soln, MF const& a_rhs, TRIA const& tria,
523526 if (m_r2c)
524527 {
525528 m_r2c->forward (*rhs, m_spmf_c);
526- solve_z (m_spmf_c, tria, tric);
529+ if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
530+ std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
531+ solve_z0 (m_spmf_c);
532+ amrex::ignore_unused (tria,tric);
533+ } else {
534+ solve_z (m_spmf_c, tria, tric);
535+ }
527536 m_r2c->backward_doit (m_spmf_c, *soln, ng, m_geom.periodicity ());
528537 }
529538 else
530539 {
531540 if (m_r2x->m_cy .empty ()) { // spectral data is real
532541 m_r2x->forward (*rhs, m_spmf_r);
533- solve_z (m_spmf_r, tria, tric);
542+ if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
543+ std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
544+ solve_z0 (m_spmf_r);
545+ amrex::ignore_unused (tria,tric);
546+ } else {
547+ solve_z (m_spmf_r, tria, tric);
548+ }
534549 m_r2x->backward (m_spmf_r, *soln, ng, m_geom.periodicity ());
535550 } else { // spectral data is complex.
536551 m_r2x->forward (*rhs, m_spmf_c);
537- solve_z (m_spmf_c, tria, tric);
552+ if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
553+ std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
554+ solve_z0 (m_spmf_c);
555+ amrex::ignore_unused (tria,tric);
556+ } else {
557+ solve_z (m_spmf_c, tria, tric);
558+ }
538559 m_r2x->backward (m_spmf_c, *soln, ng, m_geom.periodicity ());
539560 }
540561 }
@@ -581,175 +602,212 @@ void PoissonHybrid<MF>::solve_z (FA& spmf, TRIA const& tria, TRIC const& tric)
581602 }
582603 }
583604
584- if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
585- std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
586- #if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
587- #pragma omp parallel
588- #endif
589- for (MFIter mfi (spmf,TilingIfNotGPU ()); mfi.isValid (); ++mfi)
590- {
591- auto const & spectral = spmf.array (mfi);
592- auto const & box = mfi.validbox ();
593- amrex::ParallelFor (box, [=] AMREX_GPU_DEVICE (int i, int j, int k)
594- {
595- T a = facx*(i+offset[0 ]);
596- T b = facy*(j+offset[1 ]);
597- T k2 = dxfac * (std::cos (a)-T (1 ))
598- + dyfac * (std::cos (b)-T (1 ));
599- if (k2 != T (0 )) {
600- spectral (i,j,k) /= k2;
601- }
602- spectral (i,j,k) *= scale;
603- });
604- }
605- } else {
606- bool zlo_neumann = m_bc[2 ].first == Boundary::even;
607- bool zhi_neumann = m_bc[2 ].second == Boundary::even;
608- bool is_singular = (offset[0 ] == T (0 )) && (offset[1 ] == T (0 ))
609- && zlo_neumann && zhi_neumann;
605+ bool zlo_neumann = m_bc[2 ].first == Boundary::even;
606+ bool zhi_neumann = m_bc[2 ].second == Boundary::even;
607+ bool is_singular = (offset[0 ] == T (0 )) && (offset[1 ] == T (0 ))
608+ && zlo_neumann && zhi_neumann;
610609
611- auto nz = m_geom.Domain ().length (2 );
610+ auto nz = m_geom.Domain ().length (2 );
612611
613612#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
614613#pragma omp parallel
615614#endif
616- for (MFIter mfi (spmf); mfi.isValid (); ++mfi)
617- {
618- auto const & spectral = spmf.array (mfi);
619- auto const & box = mfi.validbox ();
620- auto const & xybox = amrex::makeSlab (box, 2 , 0 );
615+ for (MFIter mfi (spmf); mfi.isValid (); ++mfi)
616+ {
617+ auto const & spectral = spmf.array (mfi);
618+ auto const & box = mfi.validbox ();
619+ auto const & xybox = amrex::makeSlab (box, 2 , 0 );
621620
622621#ifdef AMREX_USE_GPU
623- // xxxxx TODO: We need to explore how to optimize this
624- // function. Maybe we can use cusparse. Maybe we should make
625- // z-direction to be the unit stride direction.
622+ // xxxxx TODO: We need to explore how to optimize this
623+ // function. Maybe we can use cusparse. Maybe we should make
624+ // z-direction to be the unit stride direction.
626625
627- FArrayBox tridiag_workspace (box,4 );
628- auto const & ald = tridiag_workspace.array (0 );
629- auto const & bd = tridiag_workspace.array (1 );
630- auto const & cud = tridiag_workspace.array (2 );
631- auto const & scratch = tridiag_workspace.array (3 );
626+ FArrayBox tridiag_workspace (box,4 );
627+ auto const & ald = tridiag_workspace.array (0 );
628+ auto const & bd = tridiag_workspace.array (1 );
629+ auto const & cud = tridiag_workspace.array (2 );
630+ auto const & scratch = tridiag_workspace.array (3 );
632631
633- amrex::ParallelFor (xybox, [=] AMREX_GPU_DEVICE (int i, int j, int )
634- {
635- T a = facx*(i+offset[0 ]);
636- T b = facy*(j+offset[1 ]);
637- T k2 = dxfac * (std::cos (a)-T (1 ))
638- + dyfac * (std::cos (b)-T (1 ));
639-
640- // Tridiagonal solve
641- for (int k=0 ; k < nz; k++) {
642- if (k==0 ) {
643- ald (i,j,k) = T (0 .);
644- cud (i,j,k) = tric (i,j,k);
645- if (zlo_neumann) {
646- bd (i,j,k) = k2 - cud (i,j,k);
647- } else {
648- bd (i,j,k) = k2 - cud (i,j,k) - T (2.0 )*tria (i,j,k);
649- }
650- } else if (k == nz-1 ) {
651- ald (i,j,k) = tria (i,j,k);
652- cud (i,j,k) = T (0 .);
653- if (zhi_neumann) {
654- bd (i,j,k) = k2 - ald (i,j,k);
655- if (i == 0 && j == 0 && is_singular) {
656- bd (i,j,k) *= T (2.0 );
657- }
658- } else {
659- bd (i,j,k) = k2 - ald (i,j,k) - T (2.0 )*tric (i,j,k);
632+ amrex::ParallelFor (xybox, [=] AMREX_GPU_DEVICE (int i, int j, int )
633+ {
634+ T a = facx*(i+offset[0 ]);
635+ T b = facy*(j+offset[1 ]);
636+ T k2 = dxfac * (std::cos (a)-T (1 ))
637+ + dyfac * (std::cos (b)-T (1 ));
638+
639+ // Tridiagonal solve
640+ for (int k=0 ; k < nz; k++) {
641+ if (k==0 ) {
642+ ald (i,j,k) = T (0 .);
643+ cud (i,j,k) = tric (i,j,k);
644+ if (zlo_neumann) {
645+ bd (i,j,k) = k2 - cud (i,j,k);
646+ } else {
647+ bd (i,j,k) = k2 - cud (i,j,k) - T (2.0 )*tria (i,j,k);
648+ }
649+ } else if (k == nz-1 ) {
650+ ald (i,j,k) = tria (i,j,k);
651+ cud (i,j,k) = T (0 .);
652+ if (zhi_neumann) {
653+ bd (i,j,k) = k2 - ald (i,j,k);
654+ if (i == 0 && j == 0 && is_singular) {
655+ bd (i,j,k) *= T (2.0 );
660656 }
661657 } else {
662- ald (i,j,k) = tria (i,j,k);
663- cud (i,j,k) = tric (i,j,k);
664- bd (i,j,k) = k2 -ald (i,j,k)-cud (i,j,k);
658+ bd (i,j,k) = k2 - ald (i,j,k) - T (2.0 )*tric (i,j,k);
665659 }
660+ } else {
661+ ald (i,j,k) = tria (i,j,k);
662+ cud (i,j,k) = tric (i,j,k);
663+ bd (i,j,k) = k2 -ald (i,j,k)-cud (i,j,k);
666664 }
665+ }
667666
668- scratch (i,j,0 ) = cud (i,j,0 )/bd (i,j,0 );
669- spectral (i,j,0 ) = spectral (i,j,0 )/bd (i,j,0 );
667+ scratch (i,j,0 ) = cud (i,j,0 )/bd (i,j,0 );
668+ spectral (i,j,0 ) = spectral (i,j,0 )/bd (i,j,0 );
670669
671- for (int k = 1 ; k < nz; k++) {
672- if (k < nz-1 ) {
673- scratch (i,j,k) = cud (i,j,k) / (bd (i,j,k) - ald (i,j,k) * scratch (i,j,k-1 ));
674- }
675- spectral (i,j,k) = (spectral (i,j,k) - ald (i,j,k) * spectral (i,j,k - 1 ))
676- / (bd (i,j,k) - ald (i,j,k) * scratch (i,j,k-1 ));
670+ for (int k = 1 ; k < nz; k++) {
671+ if (k < nz-1 ) {
672+ scratch (i,j,k) = cud (i,j,k) / (bd (i,j,k) - ald (i,j,k) * scratch (i,j,k-1 ));
677673 }
674+ spectral (i,j,k) = (spectral (i,j,k) - ald (i,j,k) * spectral (i,j,k - 1 ))
675+ / (bd (i,j,k) - ald (i,j,k) * scratch (i,j,k-1 ));
676+ }
678677
679- for (int k = nz - 2 ; k >= 0 ; k--) {
680- spectral (i,j,k) -= scratch (i,j,k) * spectral (i,j,k + 1 );
681- }
678+ for (int k = nz - 2 ; k >= 0 ; k--) {
679+ spectral (i,j,k) -= scratch (i,j,k) * spectral (i,j,k + 1 );
680+ }
682681
683- for (int k = 0 ; k < nz; ++k) {
684- spectral (i,j,k) *= scale;
685- }
686- });
687- Gpu::streamSynchronize ();
682+ for (int k = 0 ; k < nz; ++k) {
683+ spectral (i,j,k) *= scale;
684+ }
685+ });
686+ Gpu::streamSynchronize ();
688687
689688#else
690689
691- Gpu::DeviceVector<T> ald (nz);
692- Gpu::DeviceVector<T> bd (nz);
693- Gpu::DeviceVector<T> cud (nz);
694- Gpu::DeviceVector<T> scratch (nz);
690+ Gpu::DeviceVector<T> ald (nz);
691+ Gpu::DeviceVector<T> bd (nz);
692+ Gpu::DeviceVector<T> cud (nz);
693+ Gpu::DeviceVector<T> scratch (nz);
695694
696- amrex::LoopOnCpu (xybox, [&] (int i, int j, int )
697- {
698- T a = facx*(i+offset[0 ]);
699- T b = facy*(j+offset[1 ]);
700- T k2 = dxfac * (std::cos (a)-T (1 ))
701- + dyfac * (std::cos (b)-T (1 ));
702-
703- // Tridiagonal solve
704- for (int k=0 ; k < nz; k++) {
705- if (k==0 ) {
706- ald[k] = T (0 .);
707- cud[k] = tric (i,j,k);
708- if (zlo_neumann) {
709- bd[k] = k2 - cud[k];
710- } else {
711- bd[k] = k2 - cud[k] - T (2.0 )*tria (i,j,k);
712- }
713- } else if (k == nz-1 ) {
714- ald[k] = tria (i,j,k);
715- cud[k] = T (0 .);
716- if (zhi_neumann) {
717- bd[k] = k2 - ald[k];
718- if (i == 0 && j == 0 && is_singular) {
719- bd[k] *= T (2.0 );
720- }
721- } else {
722- bd[k] = k2 - ald[k] - T (2.0 )*tric (i,j,k);
695+ amrex::LoopOnCpu (xybox, [&] (int i, int j, int )
696+ {
697+ T a = facx*(i+offset[0 ]);
698+ T b = facy*(j+offset[1 ]);
699+ T k2 = dxfac * (std::cos (a)-T (1 ))
700+ + dyfac * (std::cos (b)-T (1 ));
701+
702+ // Tridiagonal solve
703+ for (int k=0 ; k < nz; k++) {
704+ if (k==0 ) {
705+ ald[k] = T (0 .);
706+ cud[k] = tric (i,j,k);
707+ if (zlo_neumann) {
708+ bd[k] = k2 - cud[k];
709+ } else {
710+ bd[k] = k2 - cud[k] - T (2.0 )*tria (i,j,k);
711+ }
712+ } else if (k == nz-1 ) {
713+ ald[k] = tria (i,j,k);
714+ cud[k] = T (0 .);
715+ if (zhi_neumann) {
716+ bd[k] = k2 - ald[k];
717+ if (i == 0 && j == 0 && is_singular) {
718+ bd[k] *= T (2.0 );
723719 }
724720 } else {
725- ald[k] = tria (i,j,k);
726- cud[k] = tric (i,j,k);
727- bd[k] = k2 -ald[k]-cud[k];
721+ bd[k] = k2 - ald[k] - T (2.0 )*tric (i,j,k);
728722 }
723+ } else {
724+ ald[k] = tria (i,j,k);
725+ cud[k] = tric (i,j,k);
726+ bd[k] = k2 -ald[k]-cud[k];
729727 }
728+ }
730729
731- scratch[0 ] = cud[0 ]/bd[0 ];
732- spectral (i,j,0 ) = spectral (i,j,0 )/bd[0 ];
730+ scratch[0 ] = cud[0 ]/bd[0 ];
731+ spectral (i,j,0 ) = spectral (i,j,0 )/bd[0 ];
733732
734- for (int k = 1 ; k < nz; k++) {
735- if (k < nz-1 ) {
736- scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1 ]);
737- }
738- spectral (i,j,k) = (spectral (i,j,k) - ald[k] * spectral (i,j,k - 1 ))
739- / (bd[k] - ald[k] * scratch[k-1 ]);
733+ for (int k = 1 ; k < nz; k++) {
734+ if (k < nz-1 ) {
735+ scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1 ]);
740736 }
737+ spectral (i,j,k) = (spectral (i,j,k) - ald[k] * spectral (i,j,k - 1 ))
738+ / (bd[k] - ald[k] * scratch[k-1 ]);
739+ }
741740
742- for (int k = nz - 2 ; k >= 0 ; k--) {
743- spectral (i,j,k) -= scratch[k] * spectral (i,j,k + 1 );
744- }
741+ for (int k = nz - 2 ; k >= 0 ; k--) {
742+ spectral (i,j,k) -= scratch[k] * spectral (i,j,k + 1 );
743+ }
745744
746- for (int k = 0 ; k < nz; ++k) {
747- spectral (i,j,k) *= scale;
748- }
749- });
745+ for (int k = 0 ; k < nz; ++k) {
746+ spectral (i,j,k) *= scale;
747+ }
748+ });
750749#endif
750+ }
751+ #endif
752+ }
753+
754+ template <typename MF>
755+ template <typename FA>
756+ void PoissonHybrid<MF>::solve_z0 (FA& spmf)
757+ {
758+ BL_PROFILE (" PoissonHybrid::solve_z0" );
759+
760+ #if (AMREX_SPACEDIM < 3)
761+ amrex::ignore_unused (spmf);
762+ #else
763+
764+ // This function cannot be merged with solve_z because of CUDA's
765+ // limitation with constexpr if on Windows.
766+ auto facx = Math::pi<T>()/T (m_geom.Domain ().length (0 ));
767+ auto facy = Math::pi<T>()/T (m_geom.Domain ().length (1 ));
768+ if (m_bc[0 ].first == Boundary::periodic) { facx *= T (2 ); }
769+ if (m_bc[1 ].first == Boundary::periodic) { facy *= T (2 ); }
770+ auto dxfac = T (2 )/T (m_geom.CellSize (0 )*m_geom.CellSize (0 ));
771+ auto dyfac = T (2 )/T (m_geom.CellSize (1 )*m_geom.CellSize (1 ));
772+ auto scale = (m_r2x) ? m_r2x->scalingFactor () : m_r2c->scalingFactor ();
773+
774+ if (m_geom.Domain ().length (0 ) == 1 ) { dxfac = 0 ; }
775+ if (m_geom.Domain ().length (1 ) == 1 ) { dyfac = 0 ; }
776+
777+ GpuArray<T,AMREX_SPACEDIM-1 > offset{T (0 ),T (0 )};
778+ for (int idim = 0 ; idim < AMREX_SPACEDIM-1 ; ++idim) {
779+ if (m_geom.Domain ().length (idim) > 1 ) {
780+ if (m_bc[idim].first == Boundary::odd &&
781+ m_bc[idim].second == Boundary::odd)
782+ {
783+ offset[idim] = T (1 );
784+ }
785+ else if ((m_bc[idim].first == Boundary::odd &&
786+ m_bc[idim].second == Boundary::even) ||
787+ (m_bc[idim].first == Boundary::even &&
788+ m_bc[idim].second == Boundary::odd))
789+ {
790+ offset[idim] = T (0.5 );
791+ }
751792 }
752793 }
794+
795+ for (MFIter mfi (spmf,TilingIfNotGPU ()); mfi.isValid (); ++mfi)
796+ {
797+ auto const & spectral = spmf.array (mfi);
798+ auto const & box = mfi.validbox ();
799+ amrex::ParallelFor (box, [=] AMREX_GPU_DEVICE (int i, int j, int k)
800+ {
801+ T a = facx*(i+offset[0 ]);
802+ T b = facy*(j+offset[1 ]);
803+ T k2 = dxfac * (std::cos (a)-T (1 ))
804+ + dyfac * (std::cos (b)-T (1 ));
805+ if (k2 != T (0 )) {
806+ spectral (i,j,k) /= k2;
807+ }
808+ spectral (i,j,k) *= scale;
809+ });
810+ }
753811#endif
754812}
755813
0 commit comments