Skip to content

Commit 81c004f

Browse files
committed
Fix for windows
1 parent 7d7c574 commit 81c004f

File tree

1 file changed

+196
-138
lines changed

1 file changed

+196
-138
lines changed

Src/FFT/AMReX_FFT_Poisson.H

Lines changed: 196 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,9 @@ public:
202202
template <typename FA, typename TRIA, typename TRIC>
203203
void solve_z (FA& spmf, TRIA const& tria, TRIC const& tric);
204204

205+
template <typename FA>
206+
void solve_z0 (FA& spmf);
207+
205208
[[nodiscard]] std::pair<BoxArray,DistributionMapping> getSpectralDataLayout () const;
206209

207210
private:
@@ -523,18 +526,36 @@ void PoissonHybrid<MF>::solve (MF& a_soln, MF const& a_rhs, TRIA const& tria,
523526
if (m_r2c)
524527
{
525528
m_r2c->forward(*rhs, m_spmf_c);
526-
solve_z(m_spmf_c, tria, tric);
529+
if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
530+
std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
531+
solve_z0(m_spmf_c);
532+
amrex::ignore_unused(tria,tric);
533+
} else {
534+
solve_z(m_spmf_c, tria, tric);
535+
}
527536
m_r2c->backward_doit(m_spmf_c, *soln, ng, m_geom.periodicity());
528537
}
529538
else
530539
{
531540
if (m_r2x->m_cy.empty()) { // spectral data is real
532541
m_r2x->forward(*rhs, m_spmf_r);
533-
solve_z(m_spmf_r, tria, tric);
542+
if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
543+
std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
544+
solve_z0(m_spmf_r);
545+
amrex::ignore_unused(tria,tric);
546+
} else {
547+
solve_z(m_spmf_r, tria, tric);
548+
}
534549
m_r2x->backward(m_spmf_r, *soln, ng, m_geom.periodicity());
535550
} else { // spectral data is complex.
536551
m_r2x->forward(*rhs, m_spmf_c);
537-
solve_z(m_spmf_c, tria, tric);
552+
if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
553+
std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
554+
solve_z0(m_spmf_c);
555+
amrex::ignore_unused(tria,tric);
556+
} else {
557+
solve_z(m_spmf_c, tria, tric);
558+
}
538559
m_r2x->backward(m_spmf_c, *soln, ng, m_geom.periodicity());
539560
}
540561
}
@@ -581,175 +602,212 @@ void PoissonHybrid<MF>::solve_z (FA& spmf, TRIA const& tria, TRIC const& tric)
581602
}
582603
}
583604

584-
if constexpr (std::is_same_v<TRIA,fft_poisson_detail::Tri_Zero<T>> &&
585-
std::is_same_v<TRIC,fft_poisson_detail::Tri_Zero<T>>) {
586-
#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
587-
#pragma omp parallel
588-
#endif
589-
for (MFIter mfi(spmf,TilingIfNotGPU()); mfi.isValid(); ++mfi)
590-
{
591-
auto const& spectral = spmf.array(mfi);
592-
auto const& box = mfi.validbox();
593-
amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k)
594-
{
595-
T a = facx*(i+offset[0]);
596-
T b = facy*(j+offset[1]);
597-
T k2 = dxfac * (std::cos(a)-T(1))
598-
+ dyfac * (std::cos(b)-T(1));
599-
if (k2 != T(0)) {
600-
spectral(i,j,k) /= k2;
601-
}
602-
spectral(i,j,k) *= scale;
603-
});
604-
}
605-
} else {
606-
bool zlo_neumann = m_bc[2].first == Boundary::even;
607-
bool zhi_neumann = m_bc[2].second == Boundary::even;
608-
bool is_singular = (offset[0] == T(0)) && (offset[1] == T(0))
609-
&& zlo_neumann && zhi_neumann;
605+
bool zlo_neumann = m_bc[2].first == Boundary::even;
606+
bool zhi_neumann = m_bc[2].second == Boundary::even;
607+
bool is_singular = (offset[0] == T(0)) && (offset[1] == T(0))
608+
&& zlo_neumann && zhi_neumann;
610609

611-
auto nz = m_geom.Domain().length(2);
610+
auto nz = m_geom.Domain().length(2);
612611

613612
#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
614613
#pragma omp parallel
615614
#endif
616-
for (MFIter mfi(spmf); mfi.isValid(); ++mfi)
617-
{
618-
auto const& spectral = spmf.array(mfi);
619-
auto const& box = mfi.validbox();
620-
auto const& xybox = amrex::makeSlab(box, 2, 0);
615+
for (MFIter mfi(spmf); mfi.isValid(); ++mfi)
616+
{
617+
auto const& spectral = spmf.array(mfi);
618+
auto const& box = mfi.validbox();
619+
auto const& xybox = amrex::makeSlab(box, 2, 0);
621620

622621
#ifdef AMREX_USE_GPU
623-
// xxxxx TODO: We need to explore how to optimize this
624-
// function. Maybe we can use cusparse. Maybe we should make
625-
// z-direction to be the unit stride direction.
622+
// xxxxx TODO: We need to explore how to optimize this
623+
// function. Maybe we can use cusparse. Maybe we should make
624+
// z-direction to be the unit stride direction.
626625

627-
FArrayBox tridiag_workspace(box,4);
628-
auto const& ald = tridiag_workspace.array(0);
629-
auto const& bd = tridiag_workspace.array(1);
630-
auto const& cud = tridiag_workspace.array(2);
631-
auto const& scratch = tridiag_workspace.array(3);
626+
FArrayBox tridiag_workspace(box,4);
627+
auto const& ald = tridiag_workspace.array(0);
628+
auto const& bd = tridiag_workspace.array(1);
629+
auto const& cud = tridiag_workspace.array(2);
630+
auto const& scratch = tridiag_workspace.array(3);
632631

633-
amrex::ParallelFor(xybox, [=] AMREX_GPU_DEVICE (int i, int j, int)
634-
{
635-
T a = facx*(i+offset[0]);
636-
T b = facy*(j+offset[1]);
637-
T k2 = dxfac * (std::cos(a)-T(1))
638-
+ dyfac * (std::cos(b)-T(1));
639-
640-
// Tridiagonal solve
641-
for(int k=0; k < nz; k++) {
642-
if(k==0) {
643-
ald(i,j,k) = T(0.);
644-
cud(i,j,k) = tric(i,j,k);
645-
if (zlo_neumann) {
646-
bd(i,j,k) = k2 - cud(i,j,k);
647-
} else {
648-
bd(i,j,k) = k2 - cud(i,j,k) - T(2.0)*tria(i,j,k);
649-
}
650-
} else if (k == nz-1) {
651-
ald(i,j,k) = tria(i,j,k);
652-
cud(i,j,k) = T(0.);
653-
if (zhi_neumann) {
654-
bd(i,j,k) = k2 - ald(i,j,k);
655-
if (i == 0 && j == 0 && is_singular) {
656-
bd(i,j,k) *= T(2.0);
657-
}
658-
} else {
659-
bd(i,j,k) = k2 - ald(i,j,k) - T(2.0)*tric(i,j,k);
632+
amrex::ParallelFor(xybox, [=] AMREX_GPU_DEVICE (int i, int j, int)
633+
{
634+
T a = facx*(i+offset[0]);
635+
T b = facy*(j+offset[1]);
636+
T k2 = dxfac * (std::cos(a)-T(1))
637+
+ dyfac * (std::cos(b)-T(1));
638+
639+
// Tridiagonal solve
640+
for(int k=0; k < nz; k++) {
641+
if(k==0) {
642+
ald(i,j,k) = T(0.);
643+
cud(i,j,k) = tric(i,j,k);
644+
if (zlo_neumann) {
645+
bd(i,j,k) = k2 - cud(i,j,k);
646+
} else {
647+
bd(i,j,k) = k2 - cud(i,j,k) - T(2.0)*tria(i,j,k);
648+
}
649+
} else if (k == nz-1) {
650+
ald(i,j,k) = tria(i,j,k);
651+
cud(i,j,k) = T(0.);
652+
if (zhi_neumann) {
653+
bd(i,j,k) = k2 - ald(i,j,k);
654+
if (i == 0 && j == 0 && is_singular) {
655+
bd(i,j,k) *= T(2.0);
660656
}
661657
} else {
662-
ald(i,j,k) = tria(i,j,k);
663-
cud(i,j,k) = tric(i,j,k);
664-
bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k);
658+
bd(i,j,k) = k2 - ald(i,j,k) - T(2.0)*tric(i,j,k);
665659
}
660+
} else {
661+
ald(i,j,k) = tria(i,j,k);
662+
cud(i,j,k) = tric(i,j,k);
663+
bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k);
666664
}
665+
}
667666

668-
scratch(i,j,0) = cud(i,j,0)/bd(i,j,0);
669-
spectral(i,j,0) = spectral(i,j,0)/bd(i,j,0);
667+
scratch(i,j,0) = cud(i,j,0)/bd(i,j,0);
668+
spectral(i,j,0) = spectral(i,j,0)/bd(i,j,0);
670669

671-
for (int k = 1; k < nz; k++) {
672-
if (k < nz-1) {
673-
scratch(i,j,k) = cud(i,j,k) / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
674-
}
675-
spectral(i,j,k) = (spectral(i,j,k) - ald(i,j,k) * spectral(i,j,k - 1))
676-
/ (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
670+
for (int k = 1; k < nz; k++) {
671+
if (k < nz-1) {
672+
scratch(i,j,k) = cud(i,j,k) / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
677673
}
674+
spectral(i,j,k) = (spectral(i,j,k) - ald(i,j,k) * spectral(i,j,k - 1))
675+
/ (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
676+
}
678677

679-
for (int k = nz - 2; k >= 0; k--) {
680-
spectral(i,j,k) -= scratch(i,j,k) * spectral(i,j,k + 1);
681-
}
678+
for (int k = nz - 2; k >= 0; k--) {
679+
spectral(i,j,k) -= scratch(i,j,k) * spectral(i,j,k + 1);
680+
}
682681

683-
for (int k = 0; k < nz; ++k) {
684-
spectral(i,j,k) *= scale;
685-
}
686-
});
687-
Gpu::streamSynchronize();
682+
for (int k = 0; k < nz; ++k) {
683+
spectral(i,j,k) *= scale;
684+
}
685+
});
686+
Gpu::streamSynchronize();
688687

689688
#else
690689

691-
Gpu::DeviceVector<T> ald(nz);
692-
Gpu::DeviceVector<T> bd(nz);
693-
Gpu::DeviceVector<T> cud(nz);
694-
Gpu::DeviceVector<T> scratch(nz);
690+
Gpu::DeviceVector<T> ald(nz);
691+
Gpu::DeviceVector<T> bd(nz);
692+
Gpu::DeviceVector<T> cud(nz);
693+
Gpu::DeviceVector<T> scratch(nz);
695694

696-
amrex::LoopOnCpu(xybox, [&] (int i, int j, int)
697-
{
698-
T a = facx*(i+offset[0]);
699-
T b = facy*(j+offset[1]);
700-
T k2 = dxfac * (std::cos(a)-T(1))
701-
+ dyfac * (std::cos(b)-T(1));
702-
703-
// Tridiagonal solve
704-
for(int k=0; k < nz; k++) {
705-
if(k==0) {
706-
ald[k] = T(0.);
707-
cud[k] = tric(i,j,k);
708-
if (zlo_neumann) {
709-
bd[k] = k2 - cud[k];
710-
} else {
711-
bd[k] = k2 - cud[k] - T(2.0)*tria(i,j,k);
712-
}
713-
} else if (k == nz-1) {
714-
ald[k] = tria(i,j,k);
715-
cud[k] = T(0.);
716-
if (zhi_neumann) {
717-
bd[k] = k2 - ald[k];
718-
if (i == 0 && j == 0 && is_singular) {
719-
bd[k] *= T(2.0);
720-
}
721-
} else {
722-
bd[k] = k2 - ald[k] - T(2.0)*tric(i,j,k);
695+
amrex::LoopOnCpu(xybox, [&] (int i, int j, int)
696+
{
697+
T a = facx*(i+offset[0]);
698+
T b = facy*(j+offset[1]);
699+
T k2 = dxfac * (std::cos(a)-T(1))
700+
+ dyfac * (std::cos(b)-T(1));
701+
702+
// Tridiagonal solve
703+
for(int k=0; k < nz; k++) {
704+
if(k==0) {
705+
ald[k] = T(0.);
706+
cud[k] = tric(i,j,k);
707+
if (zlo_neumann) {
708+
bd[k] = k2 - cud[k];
709+
} else {
710+
bd[k] = k2 - cud[k] - T(2.0)*tria(i,j,k);
711+
}
712+
} else if (k == nz-1) {
713+
ald[k] = tria(i,j,k);
714+
cud[k] = T(0.);
715+
if (zhi_neumann) {
716+
bd[k] = k2 - ald[k];
717+
if (i == 0 && j == 0 && is_singular) {
718+
bd[k] *= T(2.0);
723719
}
724720
} else {
725-
ald[k] = tria(i,j,k);
726-
cud[k] = tric(i,j,k);
727-
bd[k] = k2 -ald[k]-cud[k];
721+
bd[k] = k2 - ald[k] - T(2.0)*tric(i,j,k);
728722
}
723+
} else {
724+
ald[k] = tria(i,j,k);
725+
cud[k] = tric(i,j,k);
726+
bd[k] = k2 -ald[k]-cud[k];
729727
}
728+
}
730729

731-
scratch[0] = cud[0]/bd[0];
732-
spectral(i,j,0) = spectral(i,j,0)/bd[0];
730+
scratch[0] = cud[0]/bd[0];
731+
spectral(i,j,0) = spectral(i,j,0)/bd[0];
733732

734-
for (int k = 1; k < nz; k++) {
735-
if (k < nz-1) {
736-
scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1]);
737-
}
738-
spectral(i,j,k) = (spectral(i,j,k) - ald[k] * spectral(i,j,k - 1))
739-
/ (bd[k] - ald[k] * scratch[k-1]);
733+
for (int k = 1; k < nz; k++) {
734+
if (k < nz-1) {
735+
scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1]);
740736
}
737+
spectral(i,j,k) = (spectral(i,j,k) - ald[k] * spectral(i,j,k - 1))
738+
/ (bd[k] - ald[k] * scratch[k-1]);
739+
}
741740

742-
for (int k = nz - 2; k >= 0; k--) {
743-
spectral(i,j,k) -= scratch[k] * spectral(i,j,k + 1);
744-
}
741+
for (int k = nz - 2; k >= 0; k--) {
742+
spectral(i,j,k) -= scratch[k] * spectral(i,j,k + 1);
743+
}
745744

746-
for (int k = 0; k < nz; ++k) {
747-
spectral(i,j,k) *= scale;
748-
}
749-
});
745+
for (int k = 0; k < nz; ++k) {
746+
spectral(i,j,k) *= scale;
747+
}
748+
});
750749
#endif
750+
}
751+
#endif
752+
}
753+
754+
template <typename MF>
755+
template <typename FA>
756+
void PoissonHybrid<MF>::solve_z0 (FA& spmf)
757+
{
758+
BL_PROFILE("PoissonHybrid::solve_z0");
759+
760+
#if (AMREX_SPACEDIM < 3)
761+
amrex::ignore_unused(spmf);
762+
#else
763+
764+
// This function cannot be merged with solve_z because of CUDA's
765+
// limitation with constexpr if on Windows.
766+
auto facx = Math::pi<T>()/T(m_geom.Domain().length(0));
767+
auto facy = Math::pi<T>()/T(m_geom.Domain().length(1));
768+
if (m_bc[0].first == Boundary::periodic) { facx *= T(2); }
769+
if (m_bc[1].first == Boundary::periodic) { facy *= T(2); }
770+
auto dxfac = T(2)/T(m_geom.CellSize(0)*m_geom.CellSize(0));
771+
auto dyfac = T(2)/T(m_geom.CellSize(1)*m_geom.CellSize(1));
772+
auto scale = (m_r2x) ? m_r2x->scalingFactor() : m_r2c->scalingFactor();
773+
774+
if (m_geom.Domain().length(0) == 1) { dxfac = 0; }
775+
if (m_geom.Domain().length(1) == 1) { dyfac = 0; }
776+
777+
GpuArray<T,AMREX_SPACEDIM-1> offset{T(0),T(0)};
778+
for (int idim = 0; idim < AMREX_SPACEDIM-1; ++idim) {
779+
if (m_geom.Domain().length(idim) > 1) {
780+
if (m_bc[idim].first == Boundary::odd &&
781+
m_bc[idim].second == Boundary::odd)
782+
{
783+
offset[idim] = T(1);
784+
}
785+
else if ((m_bc[idim].first == Boundary::odd &&
786+
m_bc[idim].second == Boundary::even) ||
787+
(m_bc[idim].first == Boundary::even &&
788+
m_bc[idim].second == Boundary::odd))
789+
{
790+
offset[idim] = T(0.5);
791+
}
751792
}
752793
}
794+
795+
for (MFIter mfi(spmf,TilingIfNotGPU()); mfi.isValid(); ++mfi)
796+
{
797+
auto const& spectral = spmf.array(mfi);
798+
auto const& box = mfi.validbox();
799+
amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k)
800+
{
801+
T a = facx*(i+offset[0]);
802+
T b = facy*(j+offset[1]);
803+
T k2 = dxfac * (std::cos(a)-T(1))
804+
+ dyfac * (std::cos(b)-T(1));
805+
if (k2 != T(0)) {
806+
spectral(i,j,k) /= k2;
807+
}
808+
spectral(i,j,k) *= scale;
809+
});
810+
}
753811
#endif
754812
}
755813

0 commit comments

Comments
 (0)