Skip to content

Commit 27835ca

Browse files
committed
Revert "Fix for windows"
This reverts commit 81c004f.
1 parent 81c004f commit 27835ca

File tree

1 file changed

+138
-196
lines changed

1 file changed

+138
-196
lines changed

Src/FFT/AMReX_FFT_Poisson.H

Lines changed: 138 additions & 196 deletions
Original file line numberDiff line numberDiff line change
@@ -202,9 +202,6 @@ public:
202202
template <typename FA, typename TRIA, typename TRIC>
203203
void solve_z (FA& spmf, TRIA const& tria, TRIC const& tric);
204204

205-
template <typename FA>
206-
void solve_z0 (FA& spmf);
207-
208205
[[nodiscard]] std::pair<BoxArray,DistributionMapping> getSpectralDataLayout () const;
209206

210207
private:
@@ -526,36 +523,18 @@ void PoissonHybrid<MF>::solve (MF& a_soln, MF const& a_rhs, TRIA const& tria,
526523
if (m_r2c)
527524
{
528525
m_r2c->forward(*rhs, m_spmf_c);
529-
if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
530-
std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
531-
solve_z0(m_spmf_c);
532-
amrex::ignore_unused(tria,tric);
533-
} else {
534-
solve_z(m_spmf_c, tria, tric);
535-
}
526+
solve_z(m_spmf_c, tria, tric);
536527
m_r2c->backward_doit(m_spmf_c, *soln, ng, m_geom.periodicity());
537528
}
538529
else
539530
{
540531
if (m_r2x->m_cy.empty()) { // spectral data is real
541532
m_r2x->forward(*rhs, m_spmf_r);
542-
if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
543-
std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
544-
solve_z0(m_spmf_r);
545-
amrex::ignore_unused(tria,tric);
546-
} else {
547-
solve_z(m_spmf_r, tria, tric);
548-
}
533+
solve_z(m_spmf_r, tria, tric);
549534
m_r2x->backward(m_spmf_r, *soln, ng, m_geom.periodicity());
550535
} else { // spectral data is complex.
551536
m_r2x->forward(*rhs, m_spmf_c);
552-
if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
553-
std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
554-
solve_z0(m_spmf_c);
555-
amrex::ignore_unused(tria,tric);
556-
} else {
557-
solve_z(m_spmf_c, tria, tric);
558-
}
537+
solve_z(m_spmf_c, tria, tric);
559538
m_r2x->backward(m_spmf_c, *soln, ng, m_geom.periodicity());
560539
}
561540
}
@@ -602,212 +581,175 @@ void PoissonHybrid<MF>::solve_z (FA& spmf, TRIA const& tria, TRIC const& tric)
602581
}
603582
}
604583

605-
bool zlo_neumann = m_bc[2].first == Boundary::even;
606-
bool zhi_neumann = m_bc[2].second == Boundary::even;
607-
bool is_singular = (offset[0] == T(0)) && (offset[1] == T(0))
608-
&& zlo_neumann && zhi_neumann;
584+
if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
585+
std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
586+
#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
587+
#pragma omp parallel
588+
#endif
589+
for (MFIter mfi(spmf,TilingIfNotGPU()); mfi.isValid(); ++mfi)
590+
{
591+
auto const& spectral = spmf.array(mfi);
592+
auto const& box = mfi.validbox();
593+
amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k)
594+
{
595+
T a = facx*(i+offset[0]);
596+
T b = facy*(j+offset[1]);
597+
T k2 = dxfac * (std::cos(a)-T(1))
598+
+ dyfac * (std::cos(b)-T(1));
599+
if (k2 != T(0)) {
600+
spectral(i,j,k) /= k2;
601+
}
602+
spectral(i,j,k) *= scale;
603+
});
604+
}
605+
} else {
606+
bool zlo_neumann = m_bc[2].first == Boundary::even;
607+
bool zhi_neumann = m_bc[2].second == Boundary::even;
608+
bool is_singular = (offset[0] == T(0)) && (offset[1] == T(0))
609+
&& zlo_neumann && zhi_neumann;
609610

610-
auto nz = m_geom.Domain().length(2);
611+
auto nz = m_geom.Domain().length(2);
611612

612613
#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
613614
#pragma omp parallel
614615
#endif
615-
for (MFIter mfi(spmf); mfi.isValid(); ++mfi)
616-
{
617-
auto const& spectral = spmf.array(mfi);
618-
auto const& box = mfi.validbox();
619-
auto const& xybox = amrex::makeSlab(box, 2, 0);
616+
for (MFIter mfi(spmf); mfi.isValid(); ++mfi)
617+
{
618+
auto const& spectral = spmf.array(mfi);
619+
auto const& box = mfi.validbox();
620+
auto const& xybox = amrex::makeSlab(box, 2, 0);
620621

621622
#ifdef AMREX_USE_GPU
622-
// xxxxx TODO: We need to explore how to optimize this
623-
// function. Maybe we can use cusparse. Maybe we should make
624-
// z-direction to be the unit stride direction.
623+
// xxxxx TODO: We need to explore how to optimize this
624+
// function. Maybe we can use cusparse. Maybe we should make
625+
// z-direction to be the unit stride direction.
625626

626-
FArrayBox tridiag_workspace(box,4);
627-
auto const& ald = tridiag_workspace.array(0);
628-
auto const& bd = tridiag_workspace.array(1);
629-
auto const& cud = tridiag_workspace.array(2);
630-
auto const& scratch = tridiag_workspace.array(3);
627+
FArrayBox tridiag_workspace(box,4);
628+
auto const& ald = tridiag_workspace.array(0);
629+
auto const& bd = tridiag_workspace.array(1);
630+
auto const& cud = tridiag_workspace.array(2);
631+
auto const& scratch = tridiag_workspace.array(3);
631632

632-
amrex::ParallelFor(xybox, [=] AMREX_GPU_DEVICE (int i, int j, int)
633-
{
634-
T a = facx*(i+offset[0]);
635-
T b = facy*(j+offset[1]);
636-
T k2 = dxfac * (std::cos(a)-T(1))
637-
+ dyfac * (std::cos(b)-T(1));
638-
639-
// Tridiagonal solve
640-
for(int k=0; k < nz; k++) {
641-
if(k==0) {
642-
ald(i,j,k) = T(0.);
643-
cud(i,j,k) = tric(i,j,k);
644-
if (zlo_neumann) {
645-
bd(i,j,k) = k2 - cud(i,j,k);
646-
} else {
647-
bd(i,j,k) = k2 - cud(i,j,k) - T(2.0)*tria(i,j,k);
648-
}
649-
} else if (k == nz-1) {
650-
ald(i,j,k) = tria(i,j,k);
651-
cud(i,j,k) = T(0.);
652-
if (zhi_neumann) {
653-
bd(i,j,k) = k2 - ald(i,j,k);
654-
if (i == 0 && j == 0 && is_singular) {
655-
bd(i,j,k) *= T(2.0);
633+
amrex::ParallelFor(xybox, [=] AMREX_GPU_DEVICE (int i, int j, int)
634+
{
635+
T a = facx*(i+offset[0]);
636+
T b = facy*(j+offset[1]);
637+
T k2 = dxfac * (std::cos(a)-T(1))
638+
+ dyfac * (std::cos(b)-T(1));
639+
640+
// Tridiagonal solve
641+
for(int k=0; k < nz; k++) {
642+
if(k==0) {
643+
ald(i,j,k) = T(0.);
644+
cud(i,j,k) = tric(i,j,k);
645+
if (zlo_neumann) {
646+
bd(i,j,k) = k2 - cud(i,j,k);
647+
} else {
648+
bd(i,j,k) = k2 - cud(i,j,k) - T(2.0)*tria(i,j,k);
649+
}
650+
} else if (k == nz-1) {
651+
ald(i,j,k) = tria(i,j,k);
652+
cud(i,j,k) = T(0.);
653+
if (zhi_neumann) {
654+
bd(i,j,k) = k2 - ald(i,j,k);
655+
if (i == 0 && j == 0 && is_singular) {
656+
bd(i,j,k) *= T(2.0);
657+
}
658+
} else {
659+
bd(i,j,k) = k2 - ald(i,j,k) - T(2.0)*tric(i,j,k);
656660
}
657661
} else {
658-
bd(i,j,k) = k2 - ald(i,j,k) - T(2.0)*tric(i,j,k);
662+
ald(i,j,k) = tria(i,j,k);
663+
cud(i,j,k) = tric(i,j,k);
664+
bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k);
659665
}
660-
} else {
661-
ald(i,j,k) = tria(i,j,k);
662-
cud(i,j,k) = tric(i,j,k);
663-
bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k);
664666
}
665-
}
666667

667-
scratch(i,j,0) = cud(i,j,0)/bd(i,j,0);
668-
spectral(i,j,0) = spectral(i,j,0)/bd(i,j,0);
668+
scratch(i,j,0) = cud(i,j,0)/bd(i,j,0);
669+
spectral(i,j,0) = spectral(i,j,0)/bd(i,j,0);
669670

670-
for (int k = 1; k < nz; k++) {
671-
if (k < nz-1) {
672-
scratch(i,j,k) = cud(i,j,k) / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
671+
for (int k = 1; k < nz; k++) {
672+
if (k < nz-1) {
673+
scratch(i,j,k) = cud(i,j,k) / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
674+
}
675+
spectral(i,j,k) = (spectral(i,j,k) - ald(i,j,k) * spectral(i,j,k - 1))
676+
/ (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
673677
}
674-
spectral(i,j,k) = (spectral(i,j,k) - ald(i,j,k) * spectral(i,j,k - 1))
675-
/ (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
676-
}
677678

678-
for (int k = nz - 2; k >= 0; k--) {
679-
spectral(i,j,k) -= scratch(i,j,k) * spectral(i,j,k + 1);
680-
}
679+
for (int k = nz - 2; k >= 0; k--) {
680+
spectral(i,j,k) -= scratch(i,j,k) * spectral(i,j,k + 1);
681+
}
681682

682-
for (int k = 0; k < nz; ++k) {
683-
spectral(i,j,k) *= scale;
684-
}
685-
});
686-
Gpu::streamSynchronize();
683+
for (int k = 0; k < nz; ++k) {
684+
spectral(i,j,k) *= scale;
685+
}
686+
});
687+
Gpu::streamSynchronize();
687688

688689
#else
689690

690-
Gpu::DeviceVector<T> ald(nz);
691-
Gpu::DeviceVector<T> bd(nz);
692-
Gpu::DeviceVector<T> cud(nz);
693-
Gpu::DeviceVector<T> scratch(nz);
691+
Gpu::DeviceVector<T> ald(nz);
692+
Gpu::DeviceVector<T> bd(nz);
693+
Gpu::DeviceVector<T> cud(nz);
694+
Gpu::DeviceVector<T> scratch(nz);
694695

695-
amrex::LoopOnCpu(xybox, [&] (int i, int j, int)
696-
{
697-
T a = facx*(i+offset[0]);
698-
T b = facy*(j+offset[1]);
699-
T k2 = dxfac * (std::cos(a)-T(1))
700-
+ dyfac * (std::cos(b)-T(1));
701-
702-
// Tridiagonal solve
703-
for(int k=0; k < nz; k++) {
704-
if(k==0) {
705-
ald[k] = T(0.);
706-
cud[k] = tric(i,j,k);
707-
if (zlo_neumann) {
708-
bd[k] = k2 - cud[k];
709-
} else {
710-
bd[k] = k2 - cud[k] - T(2.0)*tria(i,j,k);
711-
}
712-
} else if (k == nz-1) {
713-
ald[k] = tria(i,j,k);
714-
cud[k] = T(0.);
715-
if (zhi_neumann) {
716-
bd[k] = k2 - ald[k];
717-
if (i == 0 && j == 0 && is_singular) {
718-
bd[k] *= T(2.0);
696+
amrex::LoopOnCpu(xybox, [&] (int i, int j, int)
697+
{
698+
T a = facx*(i+offset[0]);
699+
T b = facy*(j+offset[1]);
700+
T k2 = dxfac * (std::cos(a)-T(1))
701+
+ dyfac * (std::cos(b)-T(1));
702+
703+
// Tridiagonal solve
704+
for(int k=0; k < nz; k++) {
705+
if(k==0) {
706+
ald[k] = T(0.);
707+
cud[k] = tric(i,j,k);
708+
if (zlo_neumann) {
709+
bd[k] = k2 - cud[k];
710+
} else {
711+
bd[k] = k2 - cud[k] - T(2.0)*tria(i,j,k);
712+
}
713+
} else if (k == nz-1) {
714+
ald[k] = tria(i,j,k);
715+
cud[k] = T(0.);
716+
if (zhi_neumann) {
717+
bd[k] = k2 - ald[k];
718+
if (i == 0 && j == 0 && is_singular) {
719+
bd[k] *= T(2.0);
720+
}
721+
} else {
722+
bd[k] = k2 - ald[k] - T(2.0)*tric(i,j,k);
719723
}
720724
} else {
721-
bd[k] = k2 - ald[k] - T(2.0)*tric(i,j,k);
725+
ald[k] = tria(i,j,k);
726+
cud[k] = tric(i,j,k);
727+
bd[k] = k2 -ald[k]-cud[k];
722728
}
723-
} else {
724-
ald[k] = tria(i,j,k);
725-
cud[k] = tric(i,j,k);
726-
bd[k] = k2 -ald[k]-cud[k];
727729
}
728-
}
729730

730-
scratch[0] = cud[0]/bd[0];
731-
spectral(i,j,0) = spectral(i,j,0)/bd[0];
731+
scratch[0] = cud[0]/bd[0];
732+
spectral(i,j,0) = spectral(i,j,0)/bd[0];
732733

733-
for (int k = 1; k < nz; k++) {
734-
if (k < nz-1) {
735-
scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1]);
734+
for (int k = 1; k < nz; k++) {
735+
if (k < nz-1) {
736+
scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1]);
737+
}
738+
spectral(i,j,k) = (spectral(i,j,k) - ald[k] * spectral(i,j,k - 1))
739+
/ (bd[k] - ald[k] * scratch[k-1]);
736740
}
737-
spectral(i,j,k) = (spectral(i,j,k) - ald[k] * spectral(i,j,k - 1))
738-
/ (bd[k] - ald[k] * scratch[k-1]);
739-
}
740741

741-
for (int k = nz - 2; k >= 0; k--) {
742-
spectral(i,j,k) -= scratch[k] * spectral(i,j,k + 1);
743-
}
742+
for (int k = nz - 2; k >= 0; k--) {
743+
spectral(i,j,k) -= scratch[k] * spectral(i,j,k + 1);
744+
}
744745

745-
for (int k = 0; k < nz; ++k) {
746-
spectral(i,j,k) *= scale;
747-
}
748-
});
749-
#endif
750-
}
746+
for (int k = 0; k < nz; ++k) {
747+
spectral(i,j,k) *= scale;
748+
}
749+
});
751750
#endif
752-
}
753-
754-
template <typename MF>
755-
template <typename FA>
756-
void PoissonHybrid<MF>::solve_z0 (FA& spmf)
757-
{
758-
BL_PROFILE("PoissonHybrid::solve_z0");
759-
760-
#if (AMREX_SPACEDIM < 3)
761-
amrex::ignore_unused(spmf);
762-
#else
763-
764-
// This function cannot be merged with solve_z because of CUDA's
765-
// limitation with constexpr if on Windows.
766-
auto facx = Math::pi<T>()/T(m_geom.Domain().length(0));
767-
auto facy = Math::pi<T>()/T(m_geom.Domain().length(1));
768-
if (m_bc[0].first == Boundary::periodic) { facx *= T(2); }
769-
if (m_bc[1].first == Boundary::periodic) { facy *= T(2); }
770-
auto dxfac = T(2)/T(m_geom.CellSize(0)*m_geom.CellSize(0));
771-
auto dyfac = T(2)/T(m_geom.CellSize(1)*m_geom.CellSize(1));
772-
auto scale = (m_r2x) ? m_r2x->scalingFactor() : m_r2c->scalingFactor();
773-
774-
if (m_geom.Domain().length(0) == 1) { dxfac = 0; }
775-
if (m_geom.Domain().length(1) == 1) { dyfac = 0; }
776-
777-
GpuArray<T,AMREX_SPACEDIM-1> offset{T(0),T(0)};
778-
for (int idim = 0; idim < AMREX_SPACEDIM-1; ++idim) {
779-
if (m_geom.Domain().length(idim) > 1) {
780-
if (m_bc[idim].first == Boundary::odd &&
781-
m_bc[idim].second == Boundary::odd)
782-
{
783-
offset[idim] = T(1);
784-
}
785-
else if ((m_bc[idim].first == Boundary::odd &&
786-
m_bc[idim].second == Boundary::even) ||
787-
(m_bc[idim].first == Boundary::even &&
788-
m_bc[idim].second == Boundary::odd))
789-
{
790-
offset[idim] = T(0.5);
791-
}
792751
}
793752
}
794-
795-
for (MFIter mfi(spmf,TilingIfNotGPU()); mfi.isValid(); ++mfi)
796-
{
797-
auto const& spectral = spmf.array(mfi);
798-
auto const& box = mfi.validbox();
799-
amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k)
800-
{
801-
T a = facx*(i+offset[0]);
802-
T b = facy*(j+offset[1]);
803-
T k2 = dxfac * (std::cos(a)-T(1))
804-
+ dyfac * (std::cos(b)-T(1));
805-
if (k2 != T(0)) {
806-
spectral(i,j,k) /= k2;
807-
}
808-
spectral(i,j,k) *= scale;
809-
});
810-
}
811753
#endif
812754
}
813755

0 commit comments

Comments
 (0)