diff --git a/.gitignore b/.gitignore
index 46ef541ee9..0e3cb19964 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,8 @@ swift
swift_mpi
fof
fof_mpi
+swift_cuda
+swift_mpicuda
src/version_string.h
swift*.tar.gz
diff --git a/Makefile.am b/Makefile.am
index b5ede6fd97..51f34ac1ed 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -74,6 +74,23 @@ bin_PROGRAMS += fof_mpi
endif
endif
+# BUILD CUDA versions as well?
+if HAVECUDA
+bin_PROGRAMS += swift_cuda
+if HAVEMPI
+bin_PROGRAMS += swift_mpicuda
+endif
+endif
+
+
+# BUILD HIP versions as well?
+if HAVEHIP
+bin_PROGRAMS += swift_hip
+if HAVEMPI
+bin_PROGRAMS += swift_mpihip
+endif
+endif
+
# engine_policy_setaffinity is available?
if HAVESETAFFINITY
ENGINE_POLICY_SETAFFINITY=| engine_policy_setaffinity
@@ -91,6 +108,28 @@ swift_mpi_SOURCES = swift.c
swift_mpi_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)"
swift_mpi_LDADD = src/libswiftsim_mpi.la argparse/libargparse.la $(MPI_LIBS) $(VELOCIRAPTOR_MPI_LIBS) $(EXTRA_LIBS) $(LD_CSDS)
+# Sources for swift_cuda
+swift_cuda_SOURCES = swift.c dummy.C
+swift_cuda_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(CUDA_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_CUDA
+swift_cuda_LDADD = src/.libs/libswiftsim_cuda.a src/cuda/.libs/libswiftCUDA.a $(EXTRA_LIBS) $(CUDA_LIBS) -lcudart argparse/.libs/libargparse.a src/.libs/libgrav.la
+
+# Sources for swift_hip
+swift_hip_SOURCES = swift.c dummy.C
+swift_hip_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(HIP_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_HIP
+swift_hip_LDADD = src/.libs/libswiftsim_hip.a src/hip/.libs/libswiftHIP.a $(EXTRA_LIBS) $(HIP_LIBS) -lamdhip64 -L/opt/rocm-5.1.0/lib -lhsa-runtime64 -L/opt/rocm-5.1.0/lib64 -lamd_comgr argparse/.libs/libargparse.a src/.libs/libgrav.la
+
+# Sources for swift_mpicuda, do we need an affinity policy for MPI?
+swift_mpicuda_SOURCES = swift.c dummy.C
+swift_mpicuda_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_CUDA
+swift_mpicuda_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_CUDA
+swift_mpicuda_LDADD = src/.libs/libswiftsim_mpicuda.a argparse/.libs/libargparse.a src/.libs/libgrav.la src/cuda/.libs/libswiftCUDA.a $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -lcudart
+
+# Sources for swift_mpihip, do we need an affinity policy for MPI?
+swift_mpihip_SOURCES = swift.c dummy.C
+swift_mpihip_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(HIP_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_HIP
+swift_mpihip_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(HIP_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_HIP
+swift_mpihip_LDADD = src/.libs/libswiftsim_mpihip.a argparse/.libs/libargparse.a src/.libs/libgrav.la src/hip/.libs/libswiftHIP.a $(MPI_LIBS) $(EXTRA_LIBS) $(HIP_LIBS) -lamdhip64
+
# Sources for fof
fof_SOURCES = swift_fof.c
fof_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)"
diff --git a/configure.ac b/configure.ac
index b0173c6954..59fc40aba5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -41,6 +41,10 @@ AC_USE_SYSTEM_EXTENSIONS
AC_PROG_CC
AM_PROG_CC_C_O
+# Find and test the C++ compiler.
+AC_PROG_CXX
+AC_PROG_CXX_C_O
+
# We need this for compilation hints and possibly FFTW.
AX_OPENMP
@@ -995,6 +999,78 @@ AH_VERBATIM([__STDC_FORMAT_MACROS],
#define __STDC_FORMAT_MACROS 1
#endif])
+
+
+# Check for CUDA
+have_cuda="no"
+AC_ARG_WITH([cuda],
+ [AS_HELP_STRING([--with-cuda=PATH],
+ [root directory where CUDA is installed @<:@yes/no@:>@]
+ )],
+ [],
+ [with_cuda="no"]
+)
+if test "x$with_cuda" != "xno"; then
+ if test "x$with_cuda" != "xyes"; then
+ CUDA_CFLAGS="-I$with_cuda/include"
+ CUDA_LIBS="-L$with_cuda/lib -L$with_cuda/lib64 -lcudart"
+ NVCC="$with_cuda/bin/nvcc"
+ have_cuda="yes"
+ else
+ AC_PATH_PROG([NVCC],[nvcc])
+ echo "Found nvcc = $NVCC"
+ if test -n "$NVCC"; then
+ CUDA_ROOT="`dirname $NVCC`/.."
+ CUDA_CFLAGS="-I${CUDA_ROOT}/include"
+ CUDA_LIBS="-L${CUDA_ROOT}/lib -L${CUDA_ROOT}/lib64 -lcudart"
+ have_cuda="yes"
+ fi
+ fi
+ if test "x$have_cuda" != "xno"; then
+ AC_DEFINE([HAVE_CUDA], 1, [The CUDA compiler is installed.])
+ fi
+ CFLAGS="${CFLAGS} "
+fi
+AC_SUBST(CUDA_CFLAGS)
+AC_SUBST(CUDA_LIBS)
+AC_SUBST(NVCC)
+AM_CONDITIONAL([HAVECUDA],[test -n "$NVCC"])
+
+# Check for HIP
+have_hip="no"
+AC_ARG_WITH([hip],
+ [AS_HELP_STRING([--with-hip=PATH],
+ [root directory where HIP is installed @<:@yes/no@:>@]
+ )],
+ [],
+ [with_hip="no"]
+)
+if test "x$with_hip" != "xno"; then
+ if test "x$with_hip" != "xyes"; then
+ HIP_CFLAGS="-I$with_hip/include"
+ HIP_LIBS="-L$with_hip/lib -L$with_hip/lib64"
+ HIPCC="$with_hip/bin/hipcc"
+ have_hip="yes"
+ else
+ AC_PATH_PROG([HIPCC],[hipcc])
+ echo "Found hipcc = $HIPCC"
+ if test -n "$HIPCC"; then
+ HIP_ROOT="`dirname $HIPCC`/.."
+ HIP_CFLAGS="-I${HIP_ROOT}/include"
+ HIP_LIBS="-L${HIP_ROOT}/lib -L${HIP_ROOT}/lib64"
+ have_hip="yes"
+ fi
+ fi
+ if test "x$have_hip" != "xno"; then
+ AC_DEFINE([HAVE_HIP], 1, [The HIP compiler is installed.])
+ fi
+ CFLAGS="${CFLAGS} "
+fi
+AC_SUBST(HIP_CFLAGS)
+AC_SUBST(HIP_LIBS)
+AC_SUBST(HIPCC)
+AM_CONDITIONAL([HAVEHIP],[test -n "$HIPCC"])
+
# Check for FFTW. We test for this in the standard directories by default,
# and only disable if using --with-fftw=no or --without-fftw. When a value
# is given FFTW must be found.
@@ -3246,6 +3322,10 @@ AC_CONFIG_FILES([tests/testSelectOutput.sh], [chmod +x tests/testSelectOutput.sh
AC_CONFIG_FILES([tests/testFormat.sh], [chmod +x tests/testFormat.sh])
AC_CONFIG_FILES([tests/testNeutrinoCosmology.sh], [chmod +x tests/testNeutrinoCosmology.sh])
AC_CONFIG_FILES([tests/output_list_params.yml])
+# cuda .in file
+AC_CONFIG_FILES([src/cuda/Makefile])
+# hip .in file
+AC_CONFIG_FILES([src/hip/Makefile])
# Save the compilation options
AC_DEFINE_UNQUOTED([SWIFT_CONFIG_FLAGS],["$swift_config_flags"],[Flags passed to configure])
@@ -3276,6 +3356,8 @@ AC_MSG_RESULT([
HDF5 enabled : $with_hdf5
- parallel : $have_parallel_hdf5
METIS/ParMETIS : $have_metis / $have_parmetis
+ CUDA enabled : $have_cuda
+ HIP enabled : $have_hip
FFTW3 enabled : $have_fftw
- threaded/openmp : $have_threaded_fftw / $have_openmp_fftw
- MPI : $have_mpi_fftw
diff --git a/cudalt.py b/cudalt.py
new file mode 100755
index 0000000000..e8643cd1e6
--- /dev/null
+++ b/cudalt.py
@@ -0,0 +1,80 @@
+#!/usr/bin/python3
+# libtoolish hack: compile a .cu file like libtool does
+import sys
+import os
+
+lo_filepath = sys.argv[1]
+o_filepath = lo_filepath.replace(".lo", ".o")
+
+try:
+ i = o_filepath.rindex("/")
+ lo_dir = o_filepath[0:i+1]
+ o_filename = o_filepath[i+1:]
+
+except ValueError:
+ lo_dir = ""
+ o_filename = o_filepath
+
+local_pic_dir = ".libs/"
+local_npic_dir = ""
+pic_dir = lo_dir + local_pic_dir
+npic_dir = lo_dir + local_npic_dir
+
+pic_filepath = pic_dir + o_filename
+npic_filepath = npic_dir + o_filename
+local_pic_filepath = local_pic_dir + o_filename
+local_npic_filepath = local_npic_dir + o_filename
+
+# Make lib dir
+try:
+ os.mkdir(pic_dir)
+except OSError:
+ pass
+
+# generate the command to compile the .cu for the shared library
+args = sys.argv[2:]
+# position-independent code for the PIC object
+args.extend(["-Xcompiler", "-fPIC"])
+args.append("-o")
+args.append(pic_filepath)
+command = " ".join(args)
+print(command)
+
+# compile the .cu
+rv = os.system(command)
+if rv != 0:
+ sys.exit(1)
+
+# generate the command to compile the .cu for static library
+args = sys.argv[2:]
+args.append("-o")
+args.append(npic_filepath)
+command = " ".join(args)
+print(command)
+
+# compile the .cu
+rv = os.system(command)
+if rv != 0:
+ sys.exit(1)
+
+# get libtool version
+fd = os.popen("libtool --version")
+libtool_version = fd.readline().strip()
+fd.close()
+
+# generate the .lo file
+f = open(lo_filepath, "w")
+f.write("# " + lo_filepath + " - a libtool object file\n")
+f.write("# Generated by " + libtool_version + "\n")
+f.write("#\n")
+f.write("# Please DO NOT delete this file!\n")
+f.write("# It is necessary for linking the library.\n\n")
+
+f.write("# Name of the PIC object.\n")
+f.write("pic_object='" + local_pic_filepath + "'\n\n")
+
+f.write("# Name of the non-PIC object.\n")
+f.write("non_pic_object='" + local_npic_filepath + "'\n")
+f.close()
+
+sys.exit(0)
diff --git a/dummy.C b/dummy.C
new file mode 100755
index 0000000000..bbf68f8cea
--- /dev/null
+++ b/dummy.C
@@ -0,0 +1,3 @@
+void dummy(){
+
+}
diff --git a/examples/HydroTests/GreshoVortex_3D/getGlass.sh b/examples/HydroTests/GreshoVortex_3D/getGlass.sh
index d5c5f590ac..068986fc10 100755
--- a/examples/HydroTests/GreshoVortex_3D/getGlass.sh
+++ b/examples/HydroTests/GreshoVortex_3D/getGlass.sh
@@ -1,2 +1,2 @@
#!/bin/bash
-wget http://virgodb.cosma.dur.ac.uk/swift-webstorage/ICs/glassCube_64.hdf5
+wget http://virgodb.cosma.dur.ac.uk/swift-webstorage/ICs/glassCube_128.hdf5
diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml
index a95a0eae32..6c945e7473 100644
--- a/examples/HydroTests/GreshoVortex_3D/gresho.yml
+++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml
@@ -7,21 +7,25 @@ InternalUnitSystem:
UnitTemp_in_cgs: 1 # Kelvin
Scheduler:
- max_top_level_cells: 15
-
+ max_top_level_cells: 8
+ tasks_per_cell: 200
+ # deadlock_waiting_time_s: 10
+ # cell_split_size: 100
+ # cell_sub_size_pair_hydro: 10000 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value).
+ # cell_sub_size_self_hydro: 100 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks
# Parameters governing the time integration
TimeIntegration:
time_begin: 0. # The starting time of the simulation (in internal units).
time_end: 1. # The end time of the simulation (in internal units).
dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units).
- dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units).
+ dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units).
# Parameters governing the snapshots
Snapshots:
basename: gresho # Common part of the name of output files
time_first: 0. # Time of the first output (in internal units)
- delta_time: 1e-1 # Time difference between consecutive outputs (in internal units)
- compression: 1
+ delta_time: 1e-3 # Time difference between consecutive outputs (in internal units)
+ # compression: 1
# Parameters governing the conserved quantities statistics
Statistics:
@@ -29,10 +33,11 @@ Statistics:
# Parameters for the hydrodynamics scheme
SPH:
- resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
+ resolution_eta: 1.9 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration.
# Parameters related to the initial conditions
InitialConditions:
- file_name: ./greshoVortex.hdf5 # The file to read
- periodic: 1
\ No newline at end of file
+ file_name: greshoVortex.hdf5
+ periodic: 1
+ # replicate: 2
diff --git a/examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml b/examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml
new file mode 100644
index 0000000000..3105787d75
--- /dev/null
+++ b/examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml
@@ -0,0 +1,42 @@
+# Define the system of units to use internally.
+InternalUnitSystem:
+ UnitMass_in_cgs: 1 # Grams
+ UnitLength_in_cgs: 1 # Centimeters
+ UnitVelocity_in_cgs: 1 # Centimeters per second
+ UnitCurrent_in_cgs: 1 # Amperes
+ UnitTemp_in_cgs: 1 # Kelvin
+
+Scheduler:
+ max_top_level_cells: 16
+ tasks_per_cell: 200
+ cell_split_size: 700
+ cell_sub_size_pair_hydro: 49000 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value).
+ cell_sub_size_self_hydro: 700 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks
+# Parameters governing the time integration
+TimeIntegration:
+ time_begin: 0. # The starting time of the simulation (in internal units).
+ time_end: 1. # The end time of the simulation (in internal units).
+ dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units).
+ dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units).
+
+# Parameters governing the snapshots
+Snapshots:
+ basename: gresho # Common part of the name of output files
+ time_first: 0. # Time of the first output (in internal units)
+ delta_time: 1e-3 # Time difference between consecutive outputs (in internal units)
+ # compression: 1
+
+# Parameters governing the conserved quantities statistics
+Statistics:
+ delta_time: 1e-2 # Time between statistics output
+
+# Parameters for the hydrodynamics scheme
+SPH:
+ resolution_eta: 1.9 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
+ CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration.
+
+# Parameters related to the initial conditions
+InitialConditions:
+ file_name: greshoVortex.hdf5
+ periodic: 1
+ replicate: 8
diff --git a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml
index 8717af63bd..bcabd810dd 100644
--- a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml
+++ b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml
@@ -10,6 +10,13 @@ InternalUnitSystem:
UnitCurrent_in_cgs: 1 # Amperes
UnitTemp_in_cgs: 1 # Kelvin
+
+
+
+
+
+
+
# Parameters for the self-gravity scheme
Gravity:
eta: 0.025 # Constant dimensionless multiplier for time integration.
@@ -24,7 +31,7 @@ TimeIntegration:
time_begin: 0. # The starting time of the simulation (in internal units).
time_end: 0.1 # The end time of the simulation (in internal units).
dt_min: 1e-9 # The minimal time-step size of the simulation (in internal units).
- dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units).
+ dt_max: 1e-6 # The maximal time-step size of the simulation (in internal units).
# Parameters governing the snapshots
Snapshots:
diff --git a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh
index 6931897b2c..6a2fa4d897 100755
--- a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh
+++ b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh
@@ -30,7 +30,7 @@ then
./getEaglePhotometryTable.sh
fi
-../../../swift --threads=16 --feedback --external-gravity --self-gravity --stars --star-formation --cooling --hydro --limiter --sync isolated_galaxy.yml 2>&1 | tee output.log
+../../../swift_mpicuda --threads=16 --feedback --external-gravity --self-gravity --stars --star-formation --cooling --hydro --limiter --sync isolated_galaxy.yml 2>&1 | tee output.log
# Kennicutt-Schmidt law plot
python3 plotSolution.py 100
diff --git a/src/Makefile.am b/src/Makefile.am
index 8099524651..99092acde4 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -16,7 +16,10 @@
# along with this program. If not, see .
# Add the non-standard paths to the included library headers
-AM_CFLAGS = $(HDF5_CPPFLAGS) $(GSL_INCS) $(FFTW_INCS) $(NUMA_INCS) $(GRACKLE_INCS) $(SUNDIALS_INCS) $(CHEALPIX_CFLAGS)
+AM_CFLAGS = $(HDF5_CPPFLAGS) $(GSL_INCS) $(FFTW_INCS) $(NUMA_INCS) $(GRACKLE_INCS) $(SUNDIALS_INCS) $(CHEALPIX_CFLAGS) -O0
+
+# Define the HIP platform macro (AMD) for the HIP build
+AM_CFLAGS += -D__HIP_PLATFORM_AMD__
# Assign a "safe" version number
AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS)
@@ -40,6 +43,22 @@ lib_LTLIBRARIES += libswiftsim_mpi.la
noinst_LTLIBRARIES += libgrav_mpi.la
endif
+# Build a cuda version too?
+if HAVECUDA
+lib_LTLIBRARIES += libswiftsim_cuda.la
+if HAVEMPI
+lib_LTLIBRARIES += libswiftsim_mpicuda.la
+endif
+endif
+
+# Build a hip version too?
+if HAVEHIP
+lib_LTLIBRARIES += libswiftsim_hip.la
+if HAVEMPI
+lib_LTLIBRARIES += libswiftsim_mpihip.la
+endif
+endif
+
# List required headers
include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h
include_HEADERS += cell_hydro.h cell_stars.h cell_grav.h cell_sinks.h cell_black_holes.h cell_rt.h cell_grid.h
@@ -161,7 +180,7 @@ endif
AM_SOURCES = space.c space_rebuild.c space_regrid.c space_unique_id.c
AM_SOURCES += space_sort.c space_split.c space_extras.c space_first_init.c space_init.c
AM_SOURCES += space_cell_index.c space_recycle.c
-AM_SOURCES += runner_main.c runner_doiact_hydro.c runner_doiact_limiter.c
+AM_SOURCES += runner_main.c runner_doiact_hydro.c runner_doiact_limiter.c runner_gpu_pack_functions.c
AM_SOURCES += runner_doiact_stars.c runner_doiact_black_holes.c runner_ghost.c
AM_SOURCES += runner_recv.c runner_pack.c
AM_SOURCES += runner_sort.c runner_drift.c runner_black_holes.c runner_time_integration.c
@@ -208,7 +227,7 @@ AM_SOURCES += $(SPHM1RT_RT_SOURCES)
AM_SOURCES += $(GEAR_RT_SOURCES)
# Include files for distribution, not installation.
-nobase_noinst_HEADERS = align.h approx_math.h atomic.h barrier.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h
+nobase_noinst_HEADERS = align.h approx_math.h atomic.h barrier.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h runner_gpu_pack_functions.h
nobase_noinst_HEADERS += gravity_iact.h kernel_long_gravity.h vector.h accumulate.h cache.h exp.h log.h
nobase_noinst_HEADERS += runner_doiact_nosort.h runner_doiact_hydro.h runner_doiact_stars.h runner_doiact_black_holes.h runner_doiact_grav.h
nobase_noinst_HEADERS += runner_doiact_functions_hydro.h runner_doiact_functions_stars.h runner_doiact_functions_black_holes.h
@@ -526,6 +545,33 @@ libswiftsim_mpi_la_LDFLAGS = $(AM_LDFLAGS) $(MPI_LIBS) $(EXTRA_LIBS) -version-in
libswiftsim_mpi_la_SHORTNAME = mpi
libswiftsim_mpi_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav_mpi.la
+# Sources and flags for regular CUDA library
+libswiftsim_cuda_la_SOURCES = $(AM_SOURCES)
+libswiftsim_cuda_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) -DWITH_CUDA
+libswiftsim_cuda_la_CXXFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) -DWITH_CUDA
+libswiftsim_cuda_la_LDFLAGS = $(AM_LDFLAGS) $(EXTRA_LIBS) $(CUDA_LIBS)
+libswiftsim_cuda_la_SHORTNAME = cuda
+libswiftsim_cuda_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav.la
+
+# Sources and flags for regular HIP library
+libswiftsim_hip_la_SOURCES = $(AM_SOURCES)
+libswiftsim_hip_la_CFLAGS = $(AM_CFLAGS) $(HIP_CFLAGS) -DWITH_HIP
+libswiftsim_hip_la_LDFLAGS = $(AM_LDFLAGS) $(EXTRA_LIBS) $(HIP_LIBS) -lamdhip64
+libswiftsim_hip_la_SHORTNAME = hip
+libswiftsim_hip_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav.la
+
+# Sources and flags for MPI CUDA library
+libswiftsim_mpicuda_la_SOURCES = $(AM_SOURCES)
+libswiftsim_mpicuda_la_CFLAGS = $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DWITH_CUDA
+libswiftsim_mpicuda_la_CXXFLAGS = $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DWITH_CUDA
+libswiftsim_mpicuda_la_LDFLAGS = $(AM_LDFLAGS) $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS)
+libswiftsim_mpicuda_la_SHORTNAME = mpicuda
+libswiftsim_mpicuda_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav_mpi.la
+
+# Recurse into the GPU subdirectories
+SUBDIRS = . cuda hip
+
# Versioning. If any sources change then update the version_string.h file with
# the current git revision and package version.
# May have a checkout without a version_string.h file and no git command (tar/zip
diff --git a/src/cell.h b/src/cell.h
index cac5c49878..1d2aa0d7e1 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -360,6 +360,39 @@ enum cell_flags {
*/
struct cell {
+ /*Marks a cell for GPU execution A. Nasar */
+ bool is_gpu_cell;
+
+ int unpacker_cell;
+
+ /*Has this cell's density pack task run? 0 -> no, 1 -> yes*/
+ int pack_done;
+ /*Has this cell's gradient pack task run? 0 -> no, 1 -> yes*/
+ int pack_done_g;
+ /*Has this cell's force pack task run? 0 -> no, 1 -> yes*/
+ int pack_done_f;
+
+ /*Has the density self task run on the GPU? 0 -> no, 1 -> yes*/
+ int gpu_done;
+ /*Has the gradient self task run on the GPU? 0 -> no, 1 -> yes*/
+ int gpu_done_g;
+ /*Has the force self task run on the GPU? 0 -> no, 1 -> yes*/
+ int gpu_done_f;
+
+ /*Has this cell's density unpack task run? 0 -> no, 1 -> yes*/
+ int unpack_done;
+ /*Has this cell's gradient unpack task run? 0 -> no, 1 -> yes*/
+ int unpack_done_g;
+ /*Has this cell's force unpack task run? 0 -> no, 1 -> yes*/
+ int unpack_done_f;
+
+ /*Has the density pair task run on the GPU? 0 -> no, 1 -> yes*/
+ int gpu_done_pair;
+ /*Has the gradient pair task run on the GPU? 0 -> no, 1 -> yes*/
+ int gpu_done_pair_g;
+ /*Has the force pair task run on the GPU? 0 -> no, 1 -> yes*/
+ int gpu_done_pair_f;
+
/*! The cell location on the grid (corner nearest to the origin). */
double loc[3];
diff --git a/src/cell_hydro.h b/src/cell_hydro.h
index 39db7bc219..14b37dcd6d 100644
--- a/src/cell_hydro.h
+++ b/src/cell_hydro.h
@@ -61,6 +61,25 @@ struct cell_hydro {
/*! Linked list of the tasks computing this cell's hydro density. */
struct link *density;
+ /*! Linked lists of the tasks packing/unpacking this cell's hydro density
+ * data for the GPU. A. Nasar */
+ struct link *density_pack;
+ struct link *density_unpack;
+ /*! Linked lists of the tasks packing/unpacking this cell's hydro force data. */
+ struct link *force_pack;
+ struct link *force_unpack;
+ /*! Linked lists of the tasks packing/unpacking this cell's hydro gradient data. */
+ struct link *gradient_pack;
+ struct link *gradient_unpack;
+
+ /*! This cell's density, gradient and force pack tasks. */
+ struct task *d_pack;
+ struct task *g_pack;
+ struct task *f_pack;
+
+ /*! This cell's density, gradient and force unpack tasks. */
+ struct task *d_unpack;
+ struct task *g_unpack;
+ struct task *f_unpack;
+
/* Linked list of the tasks computing this cell's hydro gradients. */
struct link *gradient;
diff --git a/src/cell_unskip.c b/src/cell_unskip.c
index 6ad14a3560..a9572ea3bc 100644
--- a/src/cell_unskip.c
+++ b/src/cell_unskip.c
@@ -884,7 +884,7 @@ void cell_activate_subcell_hydro_tasks(struct cell *ci, struct cell *cj,
cell_activate_hydro_sorts(ci, sid, s);
cell_activate_hydro_sorts(cj, sid, s);
}
- } /* Otherwise, pair interation */
+ } /* Otherwise, pair interaction */
}
/**
@@ -1657,7 +1657,6 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) {
if ((ci_active && ci_nodeID == nodeID) ||
(cj_active && cj_nodeID == nodeID)) {
scheduler_activate(s, t);
-
/* Activate hydro drift */
if (t->type == task_type_self) {
if (ci_nodeID == nodeID) cell_activate_drift_part(ci, s);
@@ -1903,19 +1902,94 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) {
#endif
}
}
-
/* Unskip all the other task types. */
int c_active = cell_is_active_hydro(c, e);
if (c->nodeID == nodeID && c_active) {
+ for (struct link *l = c->hydro.density_pack; l != NULL;
+ l = l->next) { /* A. Nasar */
+ if (l->t->type == task_type_self && l->t->ci->hydro.count > 0)
+ scheduler_activate(s, l->t);
+ else if (l->t->type == task_type_pair && l->t->ci->hydro.count > 0 && l->t->cj->hydro.count > 0)
+ scheduler_activate(s, l->t);
+#ifdef SWIFT_DEBUG_CHECKS
+ if (l->t->ci != NULL) {
+ l->t->ci->pack_done = 0;
+ l->t->ci->gpu_done = 0;
+ l->t->ci->unpack_done = 0;
+ }
+ if (l->t->cj != NULL) {
+ l->t->cj->pack_done = 0;
+ l->t->cj->gpu_done = 0;
+ l->t->cj->unpack_done = 0;
+ }
+#endif
+ }
+ for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) {
+ scheduler_activate(s, l->t);
+#ifdef SWIFT_DEBUG_CHECKS
+ l->t->gpu_done = 0;
+#endif
+ }
for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) {
scheduler_activate(s, l->t);
}
for (struct link *l = c->hydro.force; l != NULL; l = l->next) {
scheduler_activate(s, l->t);
}
-
for (struct link *l = c->hydro.limiter; l != NULL; l = l->next)
scheduler_activate(s, l->t);
+ // A. Nasar activate force and gradient packing tasks
+ for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) {
+ if (l->t->type == task_type_self && l->t->ci->hydro.count > 0)
+ scheduler_activate(s, l->t);
+ else if (l->t->type == task_type_pair && l->t->ci->hydro.count > 0 && l->t->cj->hydro.count > 0)
+ scheduler_activate(s, l->t);
+#ifdef SWIFT_DEBUG_CHECKS
+ if (l->t->ci != NULL) {
+ l->t->ci->pack_done_f = 0;
+ l->t->ci->gpu_done_f = 0;
+ l->t->ci->unpack_done_f = 0;
+ }
+ if (l->t->cj != NULL) {
+ l->t->cj->pack_done_f = 0;
+ l->t->cj->gpu_done_f = 0;
+ l->t->cj->unpack_done_f = 0;
+ }
+#endif
+ }
+ for (struct link *l = c->hydro.force_unpack; l != NULL; l = l->next) {
+ scheduler_activate(s, l->t);
+#ifdef SWIFT_DEBUG_CHECKS
+ l->t->gpu_done = 0;
+#endif
+ }
+
+#ifdef EXTRA_HYDRO_LOOP
+ for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) {
+ if (l->t->type == task_type_self && l->t->ci->hydro.count > 0)
+ scheduler_activate(s, l->t);
+ else if (l->t->type == task_type_pair && l->t->ci->hydro.count > 0 && l->t->cj->hydro.count > 0)
+ scheduler_activate(s, l->t);
+#ifdef SWIFT_DEBUG_CHECKS
+ if (l->t->ci != NULL) {
+ l->t->ci->pack_done_g = 0;
+ l->t->ci->gpu_done_g = 0;
+ l->t->ci->unpack_done_g = 0;
+ }
+ if (l->t->cj != NULL) {
+ l->t->cj->pack_done_g = 0;
+ l->t->cj->gpu_done_g = 0;
+ l->t->cj->unpack_done_g = 0;
+ }
+#endif
+ }
+ for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) {
+ scheduler_activate(s, l->t);
+#ifdef SWIFT_DEBUG_CHECKS
+ l->t->gpu_done = 0;
+#endif
+ }
+#endif
if (c->hydro.extra_ghost != NULL)
scheduler_activate(s, c->hydro.extra_ghost);
diff --git a/src/clocks.h b/src/clocks.h
index e39d8e8195..4cc7cdaac7 100644
--- a/src/clocks.h
+++ b/src/clocks.h
@@ -20,8 +20,11 @@
#define SWIFT_CLOCKS_H
/* Config parameters. */
+#ifdef WITH_CUDA
+#include "../config.h"
+#else
#include <config.h>
-
+#endif
/* System includes. */
#include
diff --git a/src/cuda/BLOCK_SIZE.h b/src/cuda/BLOCK_SIZE.h
new file mode 100644
index 0000000000..2d5dda1af2
--- /dev/null
+++ b/src/cuda/BLOCK_SIZE.h
@@ -0,0 +1,12 @@
+#ifndef BLOCK_SIZE_H
+#define BLOCK_SIZE_H
+
+#define BLOCK_SIZE 64
+#define N_TASKS_PER_PACK_SELF 8
+#define N_TASKS_BUNDLE_SELF 2
+
+#define BLOCK_SIZE_PAIR 64
+#define N_TASKS_PER_PACK_PAIR 4
+#define N_TASKS_BUNDLE_PAIR 1
+
+#endif // BLOCK_SIZE_H
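The constants above control the GPU work decomposition: BLOCK_SIZE is the number of CUDA threads per block, while N_TASKS_PER_PACK_SELF and N_TASKS_PER_PACK_PAIR set how many self or pair tasks are packed into one bundle before a kernel launch. The kernels later in this patch index the bundled tasks through blockIdx.y (task_id = bundle_first_task + blockIdx.y), so a bundle maps naturally onto a 2-D grid. The helper below is only a sketch of that mapping; its name and arguments are hypothetical and not part of the patch.

/* Hypothetical sketch: size a 2-D launch grid for one bundle of self tasks.
 * BLOCK_SIZE and N_TASKS_PER_PACK_SELF come from BLOCK_SIZE.h; the rest is
 * illustrative only. */
#include <cuda_runtime.h>
#include "BLOCK_SIZE.h"

static dim3 self_bundle_grid(const int max_parts_per_cell,
                             const int tasks_in_bundle) {
  /* Enough BLOCK_SIZE-wide blocks in x to cover the largest packed cell. */
  const int blocks_x = (max_parts_per_cell + BLOCK_SIZE - 1) / BLOCK_SIZE;
  /* One y slot per task in the bundle (at most N_TASKS_PER_PACK_SELF). */
  const int blocks_y = tasks_in_bundle;
  return dim3(blocks_x, blocks_y);
}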
diff --git a/src/cuda/GPU_runner_functions.cu b/src/cuda/GPU_runner_functions.cu
new file mode 100644
index 0000000000..d3c08c10ae
--- /dev/null
+++ b/src/cuda/GPU_runner_functions.cu
@@ -0,0 +1,4323 @@
+/*******************************************************************************
+ * This file contains functions used to set up and execute GPU tasks from
+ * within runner_main.c. Consider it a translator that allows .cu-based
+ * functions to be called from within runner_main.c.
+ ******************************************************************************/
+
+/* Hacky method to make c++ compilers not die. */
+#ifdef WITH_CUDA
+#ifndef static
+#define static
+#endif
+#ifndef restrict
+#define restrict __restrict__
+#endif
+#endif
+
+/* Required header files */
+#include
+/* The WITH_CUDA guard wraps declarations in extern "C" to prevent name
+ mangling: C code then sees the exact function names rather than the
+ mangled names produced by the C++ compiler. */
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+
+#include "../../config.h"
+
+#ifndef BLOCK_SIZE_H
+#include "BLOCK_SIZE.h"
+#endif
+
+#include "GPU_runner_functions.h"
+#include "device_functions.h"
+#include "part_gpu.h"
+
+#include
+
+#ifdef WITH_CUDA
+}
+#endif
+
+/* Function to initialise the GPU and print out its name. */
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void Initialise_GPU() {
+ int devId = 0;
+ // find and print device name
+ cudaDeviceProp prop;
+ cudaGetDeviceProperties(&prop, devId);
+ printf("Device : %s\n", prop.name);
+ cudaSetDevice(devId);
+ // cuda
+}
+#ifdef WITH_CUDA
+}
+#endif
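Initialise_GPU() above discards the cudaError_t values returned by the runtime. A minimal sketch of the same initialisation with basic error checking (the function name is hypothetical; only the two runtime calls are taken from the code above):

/* Sketch only: Initialise_GPU() with the return codes checked. */
#include <stdio.h>
#include <cuda_runtime.h>

void Initialise_GPU_checked(void) {
  const int devId = 0;
  cudaDeviceProp prop;
  cudaError_t err = cudaGetDeviceProperties(&prop, devId);
  if (err != cudaSuccess) {
    printf("cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
    return;
  }
  printf("Device : %s\n", prop.name);
  err = cudaSetDevice(devId);
  if (err != cudaSuccess)
    printf("cudaSetDevice failed: %s\n", cudaGetErrorString(err));
}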
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void tester(struct part_soa parts_soa, int *d_task_first_part,
+ int *d_task_last_part, float d_a, float d_H, int bid,
+ int tid, int count_tasks, int tasksperbundle,
+ int nBlocks_per_task, int bundle_first_task,
+ int max_parts, int time_bin_inhibited) {
+ extern __shared__ float vars[];
+ __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ __shared__ int first_part_in_task_blocks, last_part_in_task_blocks;
+ first_part_in_task_blocks = d_task_first_part[task_id],
+ last_part_in_task_blocks = d_task_last_part[task_id];
+ __syncthreads();
+ const int pid = threadid + first_part_in_task_blocks;
+
+ if (pid < last_part_in_task_blocks) {
+ parts_soa.tid_p[pid] = 1;
+ }
+ // if(parts_soa.tid_p[pid] == 1 && pid < last_part_in_task_blocks)
+ // printf("tid %i last_part_in_blocks %i\n", parts_soa.tid_p[pid],
+ // last_part_in_task_blocks);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_self_density_GPU(
+ struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part,
+ float d_a, float d_H, int count_tasks, int tasksperbundle,
+ int nBlocks_per_task, int bundle_first_task, int max_parts) {
+ extern __shared__ float vars[];
+ __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks;
+ int first_part_in_task_blocks, last_part_in_task_blocks;
+ first_part_in_task_blocks = d_task_first_part[task_id],
+ last_part_in_task_blocks = d_task_last_part[task_id];
+ // __syncthreads();
+ const int pid = threadid + first_part_in_task_blocks;
+
+ int ttid = 0;
+ int first_part = 0;
+ int count = 0;
+ int last_part = 0;
+ float cellx = 0.0, celly = 0.0, cellz = 0.0;
+ float hi = 0.0, hig2 = hi * hi * kernel_gamma2;
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ float pix = 0.0;
+ float piy = 0.0;
+ float piz = 0.0;
+ float rhoi = 0.0;
+ float rho_dhi = 0.0;
+ float wcounti = 0.0;
+ float wcount_dhi = 0.0;
+ float div_vi = 0.0;
+ float rot_uxi = 0.0;
+ float rot_uyi = 0.0;
+ float rot_uzi = 0.0;
+ int Found_neighbours = 0;
+ // if(pid (0.01f / 128.f) * (0.01f / 128.f)) {
+ // if (r2 < hig2 && r2 > (0.01f/256.f)*(0.01f/256.f)) {
+ Found_neighbours = 1;
+ const float r = sqrt(r2);
+ /* Recover some data */
+ const float mj = mass_tmp[j_block];
+ /* Get the kernel for hi. */
+ if (hi < 1.f / 256.f) printf("h < dx\n");
+ // if(hi<1.f/256.f)printf("h < dx\n");
+ const float h_inv = 1.f / hi;
+ const float ui = r * h_inv;
+ float wi, wi_dx;
+
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ rhoi += mj * wi;
+ rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx);
+
+ wcounti += wi;
+ wcount_dhi -= (hydro_dimension * wi + ui * wi_dx);
+
+ const float r_inv = 1.f / r;
+ const float faci = mj * wi_dx * r_inv;
+
+ /* Compute dv dot r */
+ float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block],
+ dvz = uzi - uz_tmp[j_block];
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+
+ div_vi -= faci * dvdr;
+
+ /* Compute dv cross r */
+ float curlvrx = dvy * zij - dvz * yij;
+ float curlvry = dvz * xij - dvx * zij;
+ float curlvrz = dvx * yij - dvy * xij;
+
+ rot_uxi += faci * curlvrx;
+ rot_uyi += faci * curlvry;
+ rot_uzi += faci * curlvrz;
+ }
+ }
+ }
+ __syncthreads();
+ }
+ if (pid < last_part_in_task_blocks) {
+ // float wi, wi_dx;
+ // d_kernel_deval(0.f, &wi, &wi_dx);
+ // printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi);
+ // if(Found_neighbours == 0) printf("Not sure what's going on but no
+ // neighbours found in GPU loop\n");
+ parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi;
+ parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi;
+ parts_soa.div_v[pid] = div_vi;
+ parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi,
+ parts_soa.rot_uz[pid] = rot_uzi;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void DOSELF_GPU_AOS(struct part_aos *parts_aos,
+ int *d_task_first_part, int *d_task_last_part,
+ float d_a, float d_H, int count_tasks,
+ int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, int max_parts,
+ double *d_cell_x, double *d_cell_y,
+ double *d_cell_z) {
+ extern __shared__ float vars[];
+ __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks;
+ int first_part_in_task_blocks, last_part_in_task_blocks;
+ first_part_in_task_blocks = d_task_first_part[task_id],
+ last_part_in_task_blocks = d_task_last_part[task_id];
+ // __syncthreads();
+ const int pid = threadid + first_part_in_task_blocks;
+
+ int ttid = 0;
+ int first_part = 0;
+ int count = 0;
+ int last_part = 0;
+ float cellx = 0.0, celly = 0.0, cellz = 0.0;
+ float hi = 0.0, hig2 = hi * hi * kernel_gamma2;
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ float pix = 0.0;
+ float piy = 0.0;
+ float piz = 0.0;
+ float rhoi = 0.0;
+ float rho_dhi = 0.0;
+ float wcounti = 0.0;
+ float wcount_dhi = 0.0;
+ float div_vi = 0.0;
+ float rot_uxi = 0.0;
+ float rot_uyi = 0.0;
+ float rot_uzi = 0.0;
+ int Found_neighbours = 0;
+ struct part_aos ipart = parts_aos[pid];
+ // if(pid (0.01f / 128.f) * (0.01f / 128.f)) {
+ Found_neighbours = 1;
+ const float r = sqrt(r2);
+ /* Recover some data */
+ const float mj = mass_tmp[j_block];
+ /* Get the kernel for hi. */
+ const float h_inv = 1.f / hi;
+ const float ui = r * h_inv;
+ float wi, wi_dx;
+
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ rhoi += mj * wi;
+ rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx);
+
+ wcounti += wi;
+ wcount_dhi -= (hydro_dimension * wi + ui * wi_dx);
+
+ const float r_inv = 1.f / r;
+ const float faci = mj * wi_dx * r_inv;
+
+ /* Compute dv dot r */
+ float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block],
+ dvz = uzi - uz_tmp[j_block];
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+
+ div_vi -= faci * dvdr;
+
+ /* Compute dv cross r */
+ float curlvrx = dvy * zij - dvz * yij;
+ float curlvry = dvz * xij - dvx * zij;
+ float curlvrz = dvx * yij - dvy * xij;
+
+ rot_uxi += faci * curlvrx;
+ rot_uyi += faci * curlvry;
+ rot_uzi += faci * curlvrz;
+ }
+ }
+ }
+ __syncthreads();
+ }
+ if (pid < last_part_in_task_blocks) {
+ // float wi, wi_dx;
+ // d_kernel_deval(0.f, &wi, &wi_dx);
+ // printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi);
+ // if(Found_neighbours == 0) printf("Not sure what's going on but no
+ // neighbours found in GPU loop\n");
+ parts_aos[pid].rho = rhoi, parts_aos[pid].rho_dh = rho_dhi;
+ parts_aos[pid].wcount = wcounti, parts_aos[pid].wcount_dh = wcount_dhi;
+ parts_aos[pid].div_v = div_vi;
+ parts_aos[pid].rot_ux = rot_uxi, parts_aos[pid].rot_uy = rot_uyi,
+ parts_aos[pid].rot_uz = rot_uzi;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+// template
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+// #include
+__global__ void DOSELF_GPU_AOS_F4(
+ struct part_aos_f4_send *__restrict__ parts_send,
+ struct part_aos_f4_recv *__restrict__ parts_recv, const float d_a,
+ const float d_H, const int bundle_first_task,
+ const int2 *__restrict__ d_task_first_part_f4) {
+
+ extern __shared__ float4 vars_f4[];
+
+ // auto group = cooperative_groups::this_thread_block();
+ __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+ // cuda::barrier bar;
+
+ int first_part_in_task_blocks, last_part_in_task_blocks;
+ int2 first_last_parts = d_task_first_part_f4[task_id];
+ first_part_in_task_blocks = first_last_parts.x;
+ last_part_in_task_blocks = first_last_parts.y;
+
+ const int pid = threadid + first_part_in_task_blocks;
+
+ float4 res_rho = {0.0, 0.0, 0.0, 0.0};
+ float4 res_rot = {0.0, 0.0, 0.0, 0.0};
+ const part_aos_f4_send pi = parts_send[pid];
+ const float4 x_pi = pi.x_p_h;
+ const float4 ux_pi = pi.ux_m;
+ const float hi = x_pi.w, hig2 = hi * hi * kernel_gamma2;
+ int n_neighbours = 0;
+ /*Here we use different pointers "x_p_tmp", etc. to point to different regions
+ * of the single shared memory space "vars" which we allocate in kernel
+ * invocation*/
+ float4 *__restrict__ x_p_h_tmp = (float4 *)&vars_f4[0];
+ float4 *__restrict__ ux_m_tmp = (float4 *)&vars_f4[BLOCK_SIZE];
+ /*Particles copied in blocks to shared memory*/
+ for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks;
+ b += BLOCK_SIZE) {
+ int j = b + threadIdx.x;
+ struct part_aos_f4_send pj = parts_send[j];
+ x_p_h_tmp[threadIdx.x] = pj.x_p_h;
+ ux_m_tmp[threadIdx.x] = pj.ux_m;
+ __syncthreads();
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ j = j_block + b;
+ if (j < last_part_in_task_blocks) {
+ /* Compute the pairwise distance. */
+ const float4 x_p_h_j = x_p_h_tmp[j_block];
+ const float4 ux_m_j = ux_m_tmp[j_block];
+ const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y,
+ zij = x_pi.z - x_p_h_j.z;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) {
+ const float r = sqrtf(r2);
+ /* Recover some data */
+ const float mj = ux_m_j.w;
+ /* Get the kernel for hi. */
+ const float h_inv = 1.f / hi;
+ const float ui = r * h_inv;
+ float wi, wi_dx;
+
+ d_kernel_deval(ui, &wi, &wi_dx);
+ /*Add to sums of rho, rho_dh, wcount and wcount_dh*/
+ res_rho.x += mj * wi;
+ res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx);
+ res_rho.z += wi;
+ res_rho.w -= (hydro_dimension * wi + ui * wi_dx);
+
+ const float r_inv = 1.f / r;
+ const float faci = mj * wi_dx * r_inv;
+
+ /* Compute dv dot r */
+ const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y,
+ dvz = ux_pi.z - ux_m_j.z;
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+
+ /* Compute dv cross r */
+ const float curlvrx = dvy * zij - dvz * yij;
+ const float curlvry = dvz * xij - dvx * zij;
+ const float curlvrz = dvx * yij - dvy * xij;
+ /*Add to sums of rot_u and div_v*/
+ res_rot.x += faci * curlvrx;
+ res_rot.y += faci * curlvry;
+ res_rot.z += faci * curlvrz;
+ res_rot.w -= faci * dvdr;
+ }
+ }
+ }
+ __syncthreads();
+ }
+ if (pid < last_part_in_task_blocks) {
+ parts_recv[pid].rho_dh_wcount = res_rho;
+ parts_recv[pid].rot_ux_div_v = res_rot;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
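DOSELF_GPU_AOS_F4 above reads each particle as two float4 loads (x_p_h carrying the position plus smoothing length, ux_m the velocity plus mass) and writes two float4 results per particle, presumably to keep the global-memory traffic to a few coalesced 16-byte transactions. The structs below are only an inferred sketch of that layout, reconstructed from how the kernel uses the fields; the real definitions live in part_gpu.h and may differ in detail.

/* Inferred layout only -- see src/cuda/part_gpu.h for the real structs. */
struct part_aos_f4_send_sketch {
  float4 x_p_h; /* .x/.y/.z = position, .w = smoothing length h */
  float4 ux_m;  /* .x/.y/.z = velocity,  .w = mass */
};

struct part_aos_f4_recv_sketch {
  float4 rho_dh_wcount; /* .x = rho, .y = rho_dh, .z = wcount, .w = wcount_dh */
  float4 rot_ux_div_v;  /* .x/.y/.z = rot_u, .w = div_v */
};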
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void launch_density_aos(struct part_aos *parts_aos, int *d_task_first_part,
+ int *d_task_last_part, float d_a, float d_H,
+ const char *loop_type, cudaStream_t stream,
+ int block_size, int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y, int bundle_first_task,
+ int max_parts, double *d_cell_x, double *d_cell_y,
+ double *d_cell_z) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+ DOSELF_GPU_AOS<<>>(parts_aos, d_task_first_part, d_task_last_part,
+ d_a, d_H, count_tasks, tasksperbundle,
+ nBlocks_per_task, bundle_first_task, max_parts,
+ d_cell_x, d_cell_y, d_cell_z);
+ // runner_do_self_density_GPU_naive<<>>(
+ // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid,
+ // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+ // max_parts, time_bin_inhibited);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+struct first_part {
+ int list[32];
+};
+void launch_density_aos_f4(struct part_aos_f4_send *parts_send,
+ struct part_aos_f4_recv *parts_recv, float d_a,
+ float d_H, cudaStream_t stream, int numBlocks_x,
+ int numBlocks_y, int bundle_first_task,
+ int2 *d_task_first_part_f4) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+ DOSELF_GPU_AOS_F4<<>>(parts_send, parts_recv, d_a, d_H,
+ bundle_first_task, d_task_first_part_f4);
+ // runner_do_self_density_GPU_naive<<>>(
+ // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid,
+ // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+ // max_parts, time_bin_inhibited);
+}
+#ifdef WITH_CUDA
+}
+#endif
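launch_density_aos_f4 wraps the kernel launch for one bundle of packed self-density tasks. Since DOSELF_GPU_AOS_F4 carves its dynamic shared memory into two float4 tiles of BLOCK_SIZE entries each, any caller has to request at least 2 * BLOCK_SIZE * sizeof(float4) bytes per block. The host-side sketch below shows one way the launch configuration could look; the grid sizing and function name are assumptions, not the patch's actual code.

/* Illustrative launch for one bundle of self-density tasks; only the kernel
 * name, argument order and shared-memory need are taken from the code above. */
void launch_density_bundle_sketch(struct part_aos_f4_send *d_send,
                                  struct part_aos_f4_recv *d_recv,
                                  const int2 *d_first_last_parts,
                                  const float a, const float H,
                                  const int max_parts_per_cell,
                                  const int tasks_in_bundle,
                                  const int bundle_first_task,
                                  cudaStream_t stream) {
  const int blocks_x = (max_parts_per_cell + BLOCK_SIZE - 1) / BLOCK_SIZE;
  const dim3 grid(blocks_x, tasks_in_bundle);
  /* Two BLOCK_SIZE-long float4 tiles: positions+h and velocities+mass. */
  const size_t shmem = 2 * BLOCK_SIZE * sizeof(float4);
  DOSELF_GPU_AOS_F4<<<grid, BLOCK_SIZE, shmem, stream>>>(
      d_send, d_recv, a, H, bundle_first_task, d_first_last_parts);
}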
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void DOSELF_GPU_AOS_G(struct part_aos_g *parts_aos,
+ int *d_task_first_part, int *d_task_last_part,
+ float d_a, float d_H, int count_tasks,
+ int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, int max_parts,
+ double *d_cell_x, double *d_cell_y,
+ double *d_cell_z) {
+ extern __shared__ float varsg[];
+ __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks;
+ int first_part_in_task_blocks, last_part_in_task_blocks;
+ first_part_in_task_blocks = d_task_first_part[task_id],
+ last_part_in_task_blocks = d_task_last_part[task_id];
+ // __syncthreads();
+ const int pid = threadid + first_part_in_task_blocks;
+
+ int ttid = 0;
+ int first_part = 0;
+ int count = 0;
+ int last_part = 0;
+ float cellx = 0.0, celly = 0.0, cellz = 0.0;
+ float ci = 0.0, cj = 0.0;
+ float hi = 0.0, hig2 = hi * hi * kernel_gamma2;
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ float pix = 0.0;
+ float piy = 0.0;
+ float piz = 0.0;
+ float rhoi = 0.0;
+ float div_vi = 0.0;
+ int Found_neighbours = 0;
+ float v_sig;
+ float u = 0.f;
+ float laplace_u = 0.0;
+ float alpha_visc_max_ngb = 0.0;
+ if (pid < last_part_in_task_blocks) {
+ ttid = task_id;
+ first_part = d_task_first_part[ttid];
+ last_part = d_task_last_part[ttid];
+ count = last_part - first_part;
+ cellx = d_cell_x[ttid], celly = d_cell_y[ttid], cellz = d_cell_z[ttid];
+ hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2;
+ mi = parts_aos[pid].mass;
+ uxi = parts_aos[pid].ux;
+ uyi = parts_aos[pid].uy;
+ uzi = parts_aos[pid].uz;
+ pix = parts_aos[pid].x_p - cellx;
+ piy = parts_aos[pid].y_p - celly;
+ piz = parts_aos[pid].z_p - cellz;
+ ci = parts_aos[pid].soundspeed;
+ v_sig = parts_aos[pid].v_sig;
+ u = parts_aos[pid].u;
+ laplace_u = parts_aos[pid].laplace_u;
+ alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb;
+ }
+ // if (threadIdx.x == 0) {
+ // first_part_tid_0 = first_part;
+ // last_part_tid_0 = last_part;
+ // }
+ // __syncthreads();
+ int n_neighbours = 0;
+ /*Here we use different pointers "x_p_tmp", etc. to point to different regions
+ * of the single shared memory space "vars" which we allocate in kernel
+ * invocation*/
+ float *x_p_tmp = (float *)&varsg[0];
+ float *y_p_tmp = (float *)&varsg[BLOCK_SIZE];
+ float *z_p_tmp = (float *)&varsg[BLOCK_SIZE * 2];
+ float *h_tmp = (float *)&varsg[BLOCK_SIZE * 3];
+ float *mass_tmp = (float *)&varsg[BLOCK_SIZE * 4];
+ float *ux_tmp = (float *)&varsg[BLOCK_SIZE * 5];
+ float *uy_tmp = (float *)&varsg[BLOCK_SIZE * 6];
+ float *uz_tmp = (float *)&varsg[BLOCK_SIZE * 7];
+ float *cj_tmp = (float *)&varsg[BLOCK_SIZE * 8];
+ float *alpha_tmp = (float *)&varsg[BLOCK_SIZE * 9];
+ float *u_tmp = (float *)&varsg[BLOCK_SIZE * 10];
+ float *rho_tmp = (float *)&varsg[BLOCK_SIZE * 11];
+ int *timebin = (int *)&varsg[BLOCK_SIZE * 12];
+ /*Particles copied in blocks to shared memory*/
+ for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks;
+ b += BLOCK_SIZE) {
+ int j = b + threadIdx.x;
+ x_p_tmp[threadIdx.x] = parts_aos[j].x_p;
+ y_p_tmp[threadIdx.x] = parts_aos[j].y_p;
+ z_p_tmp[threadIdx.x] = parts_aos[j].z_p;
+ h_tmp[threadIdx.x] = parts_aos[j].h;
+ mass_tmp[threadIdx.x] = parts_aos[j].mass;
+ ux_tmp[threadIdx.x] = parts_aos[j].ux;
+ uy_tmp[threadIdx.x] = parts_aos[j].uy;
+ uz_tmp[threadIdx.x] = parts_aos[j].uz;
+ timebin[threadIdx.x] = parts_aos[j].time_bin;
+ cj_tmp[threadIdx.x] = parts_aos[j].soundspeed;
+ alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha;
+ u_tmp[threadIdx.x] = parts_aos[j].u;
+ rho_tmp[threadIdx.x] = parts_aos[j].rho;
+ __syncthreads();
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ j = j_block + b;
+ // if ((j != pid) && (j < last_part_in_task_blocks) &&
+ // timebin[j_block] != time_bin_inhibited) {
+ // if ((j < last_part_in_task_blocks) &&
+ // timebin[j_block] != time_bin_inhibited) {
+ if (j < last_part_in_task_blocks) {
+ /* Compute the pairwise distance. */
+ const float pjx = x_p_tmp[j_block] - cellx;
+ const float pjy = y_p_tmp[j_block] - celly;
+ const float pjz = z_p_tmp[j_block] - cellz;
+ const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) {
+ Found_neighbours = 1;
+ const float r = sqrt(r2);
+ const float r_inv = 1.f / r;
+ /* Recover some data */
+ const float mj = mass_tmp[j_block];
+ /* Get the kernel for hi. */
+ const float h_inv = 1.f / hi;
+ float wi, wi_dx;
+ /* Cosmology terms for the signal velocity */
+ const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a);
+ const float a2_Hubble = d_a * d_a * d_H;
+ /* Compute dv dot r */
+ float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block],
+ dvz = uzi - uz_tmp[j_block];
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ /* Add Hubble flow */
+ const float dvdr_Hubble = dvdr + a2_Hubble * r2;
+ /* Are the particles moving towards each others ? */
+ const float omega_ij = min(dvdr_Hubble, 0.f);
+ const float mu_ij =
+ fac_mu * r_inv * omega_ij; /* This is 0 or negative */
+
+ /* Signal velocity */
+ const float new_v_sig =
+ ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij;
+ /* Update if we need to */
+ v_sig = max(v_sig, new_v_sig);
+ /* Calculate Del^2 u for the thermal diffusion coefficient. */
+ /* Need to get some kernel values F_ij = wi_dx */
+ const float ui = r * h_inv;
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ const float delta_u_factor = (u - u_tmp[j_block]) * r_inv;
+ laplace_u += mj * delta_u_factor * wi_dx / rho_tmp[j_block];
+
+ /* Set the maximal alpha from the previous step over the neighbours
+ * (this is used to limit the diffusion in hydro_prepare_force) */
+ const float alpha_j = alpha_tmp[j_block];
+ alpha_visc_max_ngb = max(alpha_visc_max_ngb, alpha_j);
+ }
+ }
+ }
+ __syncthreads();
+ }
+ if (pid < last_part_in_task_blocks) {
+ parts_aos[pid].v_sig = v_sig, parts_aos[pid].laplace_u = laplace_u;
+ parts_aos[pid].alpha_visc_max_ngb = alpha_visc_max_ngb;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void DOSELF_GPU_AOS_F4_G(
+ struct part_aos_f4_g_send *__restrict__ parts_send,
+ struct part_aos_f4_g_recv *__restrict__ parts_recv, const float d_a,
+ const float d_H, const int bundle_first_task,
+ const int2 *__restrict__ d_task_first_part_f4) {
+
+ extern __shared__ float4 varsf4_g[];
+
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks;
+ int2 first_last_parts = d_task_first_part_f4[task_id];
+ int first_part_in_task_blocks = first_last_parts.x;
+ int last_part_in_task_blocks = first_last_parts.y;
+ // __syncthreads();
+ const int pid = threadid + first_part_in_task_blocks;
+
+ /*Keep this*/
+ float v_sig = 0.f;
+ float alpha_visc_max_ngb = 0.f;
+ /////////////
+
+ struct part_aos_f4_g_send pi = parts_send[pid];
+ float4 x_h_i = pi.x_h;
+ float4 ux_m_i = pi.ux_m;
+ float4 rho_avisc_u_c_i = pi.rho_avisc_u_c;
+ float3 vsig_lapu_aviscmax_i = {0.f, 0.f, 0.f};
+
+ const float hi = x_h_i.w, hig2 = hi * hi * kernel_gamma2;
+
+ int n_neighbours = 0;
+ /*Here we use different pointers "x_p_tmp", etc. to point to different regions
+ * of the single shared memory space "vars" which we allocate in kernel
+ * invocation*/
+ float4 *__restrict__ x_h_tmp = (float4 *)&varsf4_g[0];
+ float4 *__restrict__ ux_m_tmp = (float4 *)&varsf4_g[BLOCK_SIZE];
+ float4 *__restrict__ rho_avisc_u_c_tmp = (float4 *)&varsf4_g[BLOCK_SIZE * 2];
+
+ /*Particles copied in blocks to shared memory*/
+ for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks;
+ b += BLOCK_SIZE) {
+
+ int j = b + threadIdx.x;
+
+ struct part_aos_f4_g_send pj = parts_send[j];
+ x_h_tmp[threadIdx.x] = pj.x_h;
+ ux_m_tmp[threadIdx.x] = pj.ux_m;
+ rho_avisc_u_c_tmp[threadIdx.x] = pj.rho_avisc_u_c;
+
+ __syncthreads();
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ j = j_block + b;
+ if (j < last_part_in_task_blocks) {
+ float4 x_h_j = x_h_tmp[j_block];
+ float4 ux_m_j = ux_m_tmp[j_block];
+ float4 rho_avisc_u_c_j = rho_avisc_u_c_tmp[j_block];
+ /* Compute the pairwise distance. */
+ const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y,
+ zij = x_h_i.z - x_h_j.z;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+
+ if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) {
+ const float r = sqrt(r2);
+ const float r_inv = 1.f / r;
+ /* Recover some data */
+ const float mj = ux_m_j.w;
+ /* Get the kernel for hi. */
+ const float h_inv = 1.f / hi;
+ float wi, wi_dx;
+ /* Cosmology terms for the signal velocity */
+ const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a);
+ const float a2_Hubble = d_a * d_a * d_H;
+ /* Compute dv dot r */
+ float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y,
+ dvz = ux_m_i.z - ux_m_j.z;
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ /* Add Hubble flow */
+ const float dvdr_Hubble = dvdr + a2_Hubble * r2;
+ /* Are the particles moving towards each others ? */
+ const float omega_ij = min(dvdr_Hubble, 0.f);
+ const float mu_ij =
+ fac_mu * r_inv * omega_ij; /* This is 0 or negative */
+
+ /* Signal velocity */
+ const float new_v_sig = rho_avisc_u_c_i.w + rho_avisc_u_c_j.w -
+ const_viscosity_beta * mu_ij;
+ /* Update if we need to */
+ vsig_lapu_aviscmax_i.x = fmaxf(vsig_lapu_aviscmax_i.x, new_v_sig);
+ /* Calculate Del^2 u for the thermal diffusion coefficient. */
+ /* Need to get some kernel values F_ij = wi_dx */
+ const float ui = r * h_inv;
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ const float delta_u_factor =
+ (rho_avisc_u_c_i.z - rho_avisc_u_c_j.z) * r_inv;
+ vsig_lapu_aviscmax_i.y +=
+ mj * delta_u_factor * wi_dx / rho_avisc_u_c_j.x;
+
+ /* Set the maximal alpha from the previous step over the neighbours
+ * (this is used to limit the diffusion in hydro_prepare_force) */
+ const float alpha_j = rho_avisc_u_c_j.y;
+ vsig_lapu_aviscmax_i.z = fmaxf(vsig_lapu_aviscmax_i.z, alpha_j);
+ }
+ }
+ }
+ __syncthreads();
+ }
+ if (pid < last_part_in_task_blocks) {
+ // printf("v %f lap %f maxvisc %f\n", vsig_lapu_aviscmax_empty_i.x,
+ // vsig_lapu_aviscmax_empty_i.y, vsig_lapu_aviscmax_empty_i.z);
+ parts_recv[pid].vsig_lapu_aviscmax = vsig_lapu_aviscmax_i;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void DOSELF_GPU_AOS_F(struct part_aos_f *parts_aos,
+ int *d_task_first_part, int *d_task_last_part,
+ float d_a, float d_H, int count_tasks,
+ int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, int max_parts,
+ double *d_cell_x, double *d_cell_y,
+ double *d_cell_z) {
+ extern __shared__ float varsf[];
+ __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ int first_part_in_task_blocks, last_part_in_task_blocks;
+ first_part_in_task_blocks = d_task_first_part[task_id],
+ last_part_in_task_blocks = d_task_last_part[task_id];
+
+ const int pid = threadid + first_part_in_task_blocks;
+
+ int ttid = 0;
+ int first_part = 0;
+ int count = 0;
+ int last_part = 0;
+ float cellx = 0.0, celly = 0.0, cellz = 0.0;
+ float ci = 0.0, cj = 0.0;
+ float hi = 0.0, hig2 = 0.0;
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ float pix = 0.0;
+ float piy = 0.0;
+ float piz = 0.0;
+ float rhoi = 0.0;
+ float div_vi = 0.0;
+ int Found_neighbours = 0;
+ float v_sigi;
+ float ui = 0.f;
+ float u_dti = 0.f;
+ float laplace_ui = 0.0;
+ float alpha_visc_max_ngb = 0.0;
+ float pressurei = 0.0;
+ float alphavisci = 0.0;
+ float alphadiffi = 0.0;
+ float fi = 0.0;
+ float balsarai = 0.0;
+ float ahydroxi = 0.0;
+ float ahydroyi = 0.0;
+ float ahydrozi = 0.0;
+ float h_dti = 0.0;
+ int min_ngb_time_bin = 0;
+ if (pid < last_part_in_task_blocks) {
+ ttid = task_id;
+ first_part = d_task_first_part[ttid];
+ last_part = d_task_last_part[ttid];
+ count = last_part - first_part;
+ cellx = d_cell_x[ttid], celly = d_cell_y[ttid], cellz = d_cell_z[ttid];
+ hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2;
+ mi = parts_aos[pid].mass;
+ uxi = parts_aos[pid].ux;
+ uyi = parts_aos[pid].uy;
+ uzi = parts_aos[pid].uz;
+ pix = parts_aos[pid].x_p - cellx;
+ piy = parts_aos[pid].y_p - celly;
+ piz = parts_aos[pid].z_p - cellz;
+ ci = parts_aos[pid].soundspeed;
+ fi = parts_aos[pid].f;
+ v_sigi = parts_aos[pid].v_sig;
+ ui = parts_aos[pid].u;
+ rhoi = parts_aos[pid].rho;
+ pressurei = parts_aos[pid].pressure;
+ balsarai = parts_aos[pid].balsara;
+ alphavisci = parts_aos[pid].alpha_visc;
+ alphadiffi = parts_aos[pid].alpha_diff;
+ min_ngb_time_bin = parts_aos[pid].min_ngb_time_bin;
+ // laplace_u = parts_aos[pid].laplace_u;
+ // alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb;
+ }
+ // if (threadIdx.x == 0) {
+ // first_part_tid_0 = first_part;
+ // last_part_tid_0 = last_part;
+ // }
+ // __syncthreads();
+ int n_neighbours = 0;
+ /*Here we use different pointers "x_p_tmp", etc. to point to different regions
+ * of the single shared memory space "vars" which we allocate in kernel
+ * invocation*/
+ float *x_p_tmp = (float *)&varsf[0];
+ float *y_p_tmp = (float *)&varsf[BLOCK_SIZE];
+ float *z_p_tmp = (float *)&varsf[BLOCK_SIZE * 2];
+ float *h_tmp = (float *)&varsf[BLOCK_SIZE * 3];
+ float *mass_tmp = (float *)&varsf[BLOCK_SIZE * 4];
+ float *ux_tmp = (float *)&varsf[BLOCK_SIZE * 5];
+ float *uy_tmp = (float *)&varsf[BLOCK_SIZE * 6];
+ float *uz_tmp = (float *)&varsf[BLOCK_SIZE * 7];
+ float *cj_tmp = (float *)&varsf[BLOCK_SIZE * 8];
+ float *alphavisc_tmp = (float *)&varsf[BLOCK_SIZE * 9];
+ float *alphadiff_tmp = (float *)&varsf[BLOCK_SIZE * 10];
+ float *u_tmp = (float *)&varsf[BLOCK_SIZE * 11];
+ float *rho_tmp = (float *)&varsf[BLOCK_SIZE * 12];
+ float *pressure_tmp = (float *)&varsf[BLOCK_SIZE * 13];
+ float *f_tmp = (float *)&varsf[BLOCK_SIZE * 14];
+ float *balsara_tmp = (float *)&varsf[BLOCK_SIZE * 15];
+ int *timebin = (int *)&varsf[BLOCK_SIZE * 16];
+ /*Particles copied in blocks to shared memory*/
+ for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks;
+ b += BLOCK_SIZE) {
+ int j = b + threadIdx.x;
+ x_p_tmp[threadIdx.x] = parts_aos[j].x_p;
+ y_p_tmp[threadIdx.x] = parts_aos[j].y_p;
+ z_p_tmp[threadIdx.x] = parts_aos[j].z_p;
+ h_tmp[threadIdx.x] = parts_aos[j].h;
+ mass_tmp[threadIdx.x] = parts_aos[j].mass;
+ ux_tmp[threadIdx.x] = parts_aos[j].ux;
+ uy_tmp[threadIdx.x] = parts_aos[j].uy;
+ uz_tmp[threadIdx.x] = parts_aos[j].uz;
+ timebin[threadIdx.x] = parts_aos[j].time_bin;
+ cj_tmp[threadIdx.x] = parts_aos[j].soundspeed;
+ // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha;
+ u_tmp[threadIdx.x] = parts_aos[j].u;
+ rho_tmp[threadIdx.x] = parts_aos[j].rho;
+ alphavisc_tmp[threadIdx.x] = parts_aos[j].alpha_visc;
+ alphadiff_tmp[threadIdx.x] = parts_aos[j].alpha_diff;
+ pressure_tmp[threadIdx.x] = parts_aos[j].pressure;
+ f_tmp[threadIdx.x] = parts_aos[j].f;
+ balsara_tmp[threadIdx.x] = parts_aos[j].balsara;
+ __syncthreads();
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ j = j_block + b;
+ if (j < last_part_in_task_blocks) {
+ /* Compute the pairwise distance. */
+ const float pjx = x_p_tmp[j_block] - cellx;
+ const float pjy = y_p_tmp[j_block] - celly;
+ const float pjz = z_p_tmp[j_block] - cellz;
+ const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) {
+
+ // /* Cosmology terms for the signal velocity */
+ const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a);
+ const float a2_Hubble = d_a * d_a * d_H;
+ const float r = sqrt(r2);
+ const float r_inv = 1.f / r;
+ // /* Recover some data */
+ const float mj = mass_tmp[j_block];
+ // /* Get the kernel for hi. */
+ const float hi_inv = 1.f / hi;
+ const float hid_inv =
+ d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */
+ const float xi = r * hi_inv;
+ float wi, wi_dx;
+ d_kernel_deval(xi, &wi, &wi_dx);
+ const float wi_dr = hid_inv * wi_dx;
+ /* Get the kernel for hj. */
+ const float hj = h_tmp[j_block];
+ const float hj_inv = 1.0f / hj;
+ const float hjd_inv =
+ d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */
+ const float xj = r * hj_inv;
+ float wj, wj_dx;
+ d_kernel_deval(xj, &wj, &wj_dx);
+ const float wj_dr = hjd_inv * wj_dx;
+ // /* Compute dv dot r */
+ float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block],
+ dvz = uzi - uz_tmp[j_block];
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ // /* Add Hubble flow */
+ const float dvdr_Hubble = dvdr + a2_Hubble * r2;
+          // /* Are the particles moving towards each other? */
+ const float omega_ij = min(dvdr_Hubble, 0.f);
+ const float mu_ij =
+ fac_mu * r_inv * omega_ij; /* This is 0 or negative */
+ //
+ // /* Signal velocity */
+ const float v_sig =
+ ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij;
+
+ /* Variable smoothing length term */
+ const float f_ij = 1.f - fi / mj;
+ const float f_ji = 1.f - f_tmp[j_block] / mi;
+
+ /* Balsara term */
+ const float balsaraj = balsara_tmp[j_block];
+ /* Construct the full viscosity term */
+ const float rhoj = rho_tmp[j_block];
+ const float pressurej = pressure_tmp[j_block];
+ const float rho_ij = rhoi + rhoj;
+ const float alpha = alphavisci + alphavisc_tmp[j_block];
+ const float visc =
+ -0.25f * alpha * v_sig * mu_ij * (balsarai + balsaraj) / rho_ij;
+ /* Convolve with the kernel */
+ const float visc_acc_term =
+ 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv;
+ /* Compute gradient terms */
+ const float P_over_rho2_i = pressurei / (rhoi * rhoi) * f_ij;
+ const float P_over_rho2_j = pressurej / (rhoj * rhoj) * f_ji;
+
+ /* SPH acceleration term */
+ const float sph_acc_term =
+ (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv;
+
+ /* Assemble the acceleration */
+ const float acc = sph_acc_term + visc_acc_term;
+ /* Use the force Luke ! */
+ ahydroxi -= mj * acc * xij;
+ ahydroyi -= mj * acc * yij;
+ ahydrozi -= mj * acc * zij;
+ // if(rhoi == 0 || rhoj == 0 || pressurei == 0 || pressurej
+ // == 0)printf("ri %f rj %f pi %f pj %f\n", rhoi, rhoj,
+ // pressurei, pressurej);
+ /* Get the time derivative for u. */
+ const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr;
+
+ /* Viscosity term */
+ const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble;
+ const float press_sum = pressurei + pressurej;
+ /* Diffusion term */
+ /* Combine the alpha_diff into a pressure-based switch -- this allows
+ * the alpha from the highest pressure particle to dominate, so that
+ * the diffusion limited particles always take precedence - another
+ * trick to allow the scheme to work with thermal feedback. */
+          float alpha_diff = 0.f;
+          if (fabsf(press_sum) > 1e-10f)
+            alpha_diff = (pressurei * alphadiffi +
+                          pressurej * alphadiff_tmp[j_block]) /
+                         press_sum;
+ const float v_diff =
+ alpha_diff * 0.5f *
+ (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) +
+ fabsf(fac_mu * r_inv * dvdr_Hubble));
+ /* wi_dx + wj_dx / 2 is F_ij */
+ const float diff_du_term =
+ v_diff * (ui - u_tmp[j_block]) *
+ (f_ij * wi_dr / rhoi + f_ji * wj_dr / rhoj);
+
+ /* Assemble the energy equation term */
+ const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term;
+
+ /* Internal energy time derivative */
+ u_dti += du_dt_i * mj;
+ if (mj == 0.f) printf("zero mass mj %f\n", mj);
+
+ /* Get the time derivative for h. */
+ h_dti -= mj * dvdr * r_inv / rhoj * wi_dr;
+
+ /* Update if we need to; this should be guaranteed by the gradient
+ * loop but due to some possible synchronisation problems this is here
+ * as a _quick fix_. Added: 14th August 2019. To be removed by 1st Jan
+ * 2020. (JB) */
+ v_sigi = max(v_sigi, v_sig);
+ int time_bin_j = timebin[j_block];
+ if (time_bin_j > 0)
+ min_ngb_time_bin = min(min_ngb_time_bin, time_bin_j);
+ // printf("Got in\n");
+ }
+ }
+ }
+ __syncthreads();
+ }
+ if (pid < last_part_in_task_blocks) {
+ parts_aos[pid].v_sig = v_sigi;
+ parts_aos[pid].h_dt = h_dti;
+ parts_aos[pid].u_dt = u_dti;
+ parts_aos[pid].a_hydrox = ahydroxi;
+ parts_aos[pid].a_hydroy = ahydroyi;
+ parts_aos[pid].a_hydroz = ahydrozi;
+ parts_aos[pid].min_ngb_time_bin = min_ngb_time_bin;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
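+/* Self force interaction kernel for the float4-packed AoS particle data:
+ * one thread per particle, with blockIdx.y selecting the task within the
+ * bundle. Neighbour data are staged through shared memory in BLOCK_SIZE
+ * tiles and the hydro acceleration, u_dt, h_dt, signal velocity and minimal
+ * neighbour time-bin are accumulated into the recv buffer. */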
+__global__ void DOSELF_GPU_AOS_F4_F(
+ struct part_aos_f4_f_send *__restrict__ parts_send,
+ struct part_aos_f4_f_recv *__restrict__ parts_recv, const float d_a,
+ const float d_H, const int bundle_first_task,
+ const int2 *__restrict__ d_task_first_part_f4) {
+
+ extern __shared__ float4 varsf4_f[];
+
+ __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ int first_part_in_task_blocks, last_part_in_task_blocks;
+ // first_part_in_task_blocks = d_task_first_part[task_id],
+ // last_part_in_task_blocks = d_task_last_part[task_id];
+ int2 first_last_parts = d_task_first_part_f4[task_id];
+ first_part_in_task_blocks = first_last_parts.x;
+ last_part_in_task_blocks = first_last_parts.y;
+
+ const int pid = threadid + first_part_in_task_blocks;
+
+ int ttid = 0;
+ int first_part = 0;
+ int count = 0;
+ int last_part = 0;
+ const part_aos_f4_f_send pi = parts_send[pid];
+ float4 x_h_i = pi.x_h;
+ float4 ux_m_i = pi.ux_m;
+ float4 f_b_t_mintbinngb_i = pi.f_bals_timebin_mintimebin_ngb;
+ float4 rho_p_c_vsig_i = pi.rho_p_c_vsigi;
+ float3 u_avisc_adiff_i = pi.u_alphavisc_alphadiff;
+
+ const float mi = ux_m_i.w;
+ int Found_neighbours = 0;
+ float pressurei = rho_p_c_vsig_i.y;
+ const float ci = rho_p_c_vsig_i.z;
+ float3 ahydro = {0.0, 0.0, 0.0};
+ float4 udt_hdt_vsig_mintbinngb = {0.0, 0.0, 0.0, 0.0};
+ udt_hdt_vsig_mintbinngb.z = rho_p_c_vsig_i.w;
+ udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w;
+
+ float hi = x_h_i.w;
+ float hig2 = hi * hi * kernel_gamma2;
+
+  /* The pointers below ("x_h_tmp", etc.) address separate regions of the
+   * single shared-memory buffer "varsf4_f" allocated at kernel launch. */
+ float4 *__restrict__ x_h_tmp = (float4 *)&varsf4_f[0];
+ float4 *__restrict__ ux_m_tmp = (float4 *)&varsf4_f[BLOCK_SIZE];
+ float4 *__restrict__ f_b_t_mintbinngb_tmp =
+ (float4 *)&varsf4_f[BLOCK_SIZE * 2];
+ float4 *__restrict__ rho_p_c_vsig_tmp = (float4 *)&varsf4_f[BLOCK_SIZE * 3];
+ float3 *__restrict__ u_avisc_adiff_tmp = (float3 *)&varsf4_f[BLOCK_SIZE * 4];
+ /*Particles copied in blocks to shared memory*/
+ for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks;
+ b += BLOCK_SIZE) {
+ int j = b + threadIdx.x;
+ struct part_aos_f4_f_send pj = parts_send[j];
+ x_h_tmp[threadIdx.x] = pj.x_h;
+ ux_m_tmp[threadIdx.x] = pj.ux_m;
+ f_b_t_mintbinngb_tmp[threadIdx.x] = pj.f_bals_timebin_mintimebin_ngb;
+ rho_p_c_vsig_tmp[threadIdx.x] = pj.rho_p_c_vsigi;
+ // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha;
+ u_avisc_adiff_tmp[threadIdx.x] = pj.u_alphavisc_alphadiff;
+ __syncthreads();
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ j = j_block + b;
+ if (j < last_part_in_task_blocks) {
+ /* Compute the pairwise distance. */
+ float4 x_h_j = x_h_tmp[j_block];
+ float4 ux_m_j = ux_m_tmp[j_block];
+ float4 f_b_t_mintbinngb_j = f_b_t_mintbinngb_tmp[j_block];
+ float4 rho_p_c_vsig_j = rho_p_c_vsig_tmp[j_block];
+ float3 u_avisc_adiff_j = u_avisc_adiff_tmp[j_block];
+ const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y,
+ zij = x_h_i.z - x_h_j.z;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) {
+ // /* Cosmology terms for the signal velocity */
+ const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a);
+ const float a2_Hubble = d_a * d_a * d_H;
+ const float r = sqrt(r2);
+ const float r_inv = 1.f / r;
+ // /* Recover some data */
+ const float mj = ux_m_j.w;
+ // /* Get the kernel for hi. */
+ const float hi_inv = 1.f / hi;
+ const float hid_inv =
+ d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */
+ const float xi = r * hi_inv;
+ float wi, wi_dx;
+ d_kernel_deval(xi, &wi, &wi_dx);
+ const float wi_dr = hid_inv * wi_dx;
+ /* Get the kernel for hj. */
+ const float hj = x_h_j.w;
+ const float hj_inv = 1.0f / hj;
+ const float hjd_inv =
+ d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */
+ const float xj = r * hj_inv;
+ float wj, wj_dx;
+ d_kernel_deval(xj, &wj, &wj_dx);
+ const float wj_dr = hjd_inv * wj_dx;
+ // /* Compute dv dot r */
+ float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y,
+ dvz = ux_m_i.z - ux_m_j.z;
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ // /* Add Hubble flow */
+ const float dvdr_Hubble = dvdr + a2_Hubble * r2;
+          // /* Are the particles moving towards each other? */
+ const float omega_ij = min(dvdr_Hubble, 0.f);
+ const float mu_ij =
+ fac_mu * r_inv * omega_ij; /* This is 0 or negative */
+ //
+ // /* Signal velocity */
+ const float cj = rho_p_c_vsig_j.z;
+ const float v_sig = ci + cj - const_viscosity_beta * mu_ij;
+
+ /* Variable smoothing length term */
+ const float f_ij = 1.f - f_b_t_mintbinngb_i.x / mj;
+ const float f_ji = 1.f - f_b_t_mintbinngb_j.x / mi;
+
+ /* Construct the full viscosity term */
+ const float pressurej = rho_p_c_vsig_j.y;
+ const float rho_ij = rho_p_c_vsig_i.x + rho_p_c_vsig_j.x;
+ const float alpha = u_avisc_adiff_i.y + u_avisc_adiff_j.y;
+ const float visc = -0.25f * alpha * v_sig * mu_ij *
+ (f_b_t_mintbinngb_i.y + f_b_t_mintbinngb_j.y) /
+ rho_ij;
+ /* Convolve with the kernel */
+ const float visc_acc_term =
+ 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv;
+ /* Compute gradient terms */
+ const float rhoi2 = rho_p_c_vsig_i.x * rho_p_c_vsig_i.x;
+ const float rhoj2 = rho_p_c_vsig_j.x * rho_p_c_vsig_j.x;
+          const float P_over_rho2_i = pressurei / rhoi2 * f_ij;
+          const float P_over_rho2_j = pressurej / rhoj2 * f_ji;
+
+ /* SPH acceleration term */
+ const float sph_acc_term =
+ (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv;
+
+ /* Assemble the acceleration */
+ const float acc = sph_acc_term + visc_acc_term;
+ /* Use the force Luke ! */
+ ahydro.x -= mj * acc * xij;
+ ahydro.y -= mj * acc * yij;
+ ahydro.z -= mj * acc * zij;
+ /* Get the time derivative for u. */
+ const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr;
+
+ /* Viscosity term */
+ const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble;
+ /* Diffusion term */
+ /* Combine the alpha_diff into a pressure-based switch -- this allows
+ * the alpha from the highest pressure particle to dominate, so that
+ * the diffusion limited particles always take precedence - another
+ * trick to allow the scheme to work with thermal feedback. */
+          const float press_sum = pressurei + pressurej;
+          float alpha_diff = 0.f;
+          if (fabsf(press_sum) > 1e-10f)
+            alpha_diff = (pressurei * u_avisc_adiff_i.z +
+                          pressurej * u_avisc_adiff_j.z) /
+                         press_sum;
+ const float v_diff =
+ alpha_diff * 0.5f *
+ (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) +
+ fabsf(fac_mu * r_inv * dvdr_Hubble));
+ /* wi_dx + wj_dx / 2 is F_ij */
+ const float diff_du_term = v_diff *
+ (u_avisc_adiff_i.x - u_avisc_adiff_j.x) *
+ (f_ij * wi_dr / rho_p_c_vsig_i.x +
+ f_ji * wj_dr / rho_p_c_vsig_j.x);
+
+ /* Assemble the energy equation term */
+ const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term;
+
+ /* Internal energy time derivative */
+ udt_hdt_vsig_mintbinngb.x += du_dt_i * mj;
+
+ /* Get the time derivative for h. */
+ udt_hdt_vsig_mintbinngb.y -=
+ mj * dvdr * r_inv / rho_p_c_vsig_j.x * wi_dr;
+
+ /* Update if we need to; this should be guaranteed by the gradient
+ * loop but due to some possible synchronisation problems this is here
+ * as a _quick fix_. Added: 14th August 2019. To be removed by 1st Jan
+ * 2020. (JB) */
+ udt_hdt_vsig_mintbinngb.z = fmaxf(udt_hdt_vsig_mintbinngb.z, v_sig);
+ unsigned int time_bin_j = (f_b_t_mintbinngb_j.z + 0.5f);
+ unsigned int min_tb_i = (f_b_t_mintbinngb_i.w + 0.5f);
+ if (time_bin_j > 0) f_b_t_mintbinngb_i.w = min(min_tb_i, time_bin_j);
+ // printf("Got in\n");
+ }
+ }
+ }
+ __syncthreads();
+ }
+ if (pid < last_part_in_task_blocks) {
+ udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w;
+ parts_recv[pid].udt_hdt_vsig_mintimebin_ngb = udt_hdt_vsig_mintbinngb;
+ parts_recv[pid].a_hydro = ahydro;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
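+/* Naive pair density kernel for the SoA particle layout: one thread per
+ * particle of cell ci, with blockIdx.y selecting the task within the bundle.
+ * The density sums (rho, rho_dh, wcount, wcount_dh, div_v, rot_u) are
+ * accumulated over the particles of cell cj, staged through shared memory in
+ * BLOCK_SIZE tiles. */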
+__global__ void runner_do_pair_density_GPU_naive(
+ struct part_soa parts_soa_ci, struct part_soa parts_soa_cj,
+ int *d_task_first_part_ci, int *d_task_first_part_cj,
+ int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H,
+ int bid, int tid, int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, int time_bin_inhibited) {
+
+ extern __shared__ float vars[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ __shared__ int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ __shared__ int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+
+ first_part_in_task_blocks_ci = d_task_first_part_ci[task_id];
+ last_part_in_task_blocks_ci = d_task_last_part_ci[task_id];
+ first_part_in_task_blocks_cj = d_task_first_part_cj[task_id];
+ last_part_in_task_blocks_cj = d_task_last_part_cj[task_id];
+
+ __syncthreads();
+ // Now we start calculations for particles in cell i
+ const int pid = threadid + first_part_in_task_blocks_ci;
+
+ float dx =
+ 1.f / 64.f; // Value used to avoid interacting parts with themselves
+ int ttid = 0;
+ int first_part = 0;
+ int count = 0;
+ int last_part = 0;
+ float cellx = 0.0, celly = 0.0, cellz = 0.0;
+ float hi = 0.0, hig2 = hi * hi * kernel_gamma2;
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ float pix = 0.0;
+ float piy = 0.0;
+ float piz = 0.0;
+ float rhoi = 0.0;
+ float rho_dhi = 0.0;
+ float wcounti = 0.0;
+ float wcount_dhi = 0.0;
+ float div_vi = 0.0;
+ float rot_uxi = 0.0;
+ float rot_uyi = 0.0;
+ float rot_uzi = 0.0;
+  int Found_neighbours = 0;
+
+  if (pid < last_part_in_task_blocks_ci) {
+    cellx = parts_soa_ci.locx[pid], celly = parts_soa_ci.locy[pid],
+    cellz = parts_soa_ci.locz[pid];
+    hi = parts_soa_ci.h[pid], hig2 = hi * hi * kernel_gamma2;
+    mi = parts_soa_ci.mass[pid];
+    uxi = parts_soa_ci.ux[pid];
+    uyi = parts_soa_ci.uy[pid];
+    uzi = parts_soa_ci.uz[pid];
+    pix = parts_soa_ci.x_p[pid] - cellx;
+    piy = parts_soa_ci.y_p[pid] - celly;
+    piz = parts_soa_ci.z_p[pid] - cellz;
+  }
+
+  /* The pointers below ("x_p_tmp", etc.) address separate regions of the
+   * single shared-memory buffer "vars" allocated at kernel launch. */
+  float *x_p_tmp = (float *)&vars[0];
+  float *y_p_tmp = (float *)&vars[BLOCK_SIZE];
+  float *z_p_tmp = (float *)&vars[BLOCK_SIZE * 2];
+  float *h_tmp = (float *)&vars[BLOCK_SIZE * 3];
+  float *mass_tmp = (float *)&vars[BLOCK_SIZE * 4];
+  float *ux_tmp = (float *)&vars[BLOCK_SIZE * 5];
+  float *uy_tmp = (float *)&vars[BLOCK_SIZE * 6];
+  float *uz_tmp = (float *)&vars[BLOCK_SIZE * 7];
+  /*Particles of cell cj copied in blocks to shared memory*/
+  for (int b = first_part_in_task_blocks_cj; b < last_part_in_task_blocks_cj;
+       b += BLOCK_SIZE) {
+    int j = b + threadIdx.x;
+    x_p_tmp[threadIdx.x] = parts_soa_cj.x_p[j];
+    y_p_tmp[threadIdx.x] = parts_soa_cj.y_p[j];
+    z_p_tmp[threadIdx.x] = parts_soa_cj.z_p[j];
+    h_tmp[threadIdx.x] = parts_soa_cj.h[j];
+    mass_tmp[threadIdx.x] = parts_soa_cj.mass[j];
+    ux_tmp[threadIdx.x] = parts_soa_cj.ux[j];
+    uy_tmp[threadIdx.x] = parts_soa_cj.uy[j];
+    uz_tmp[threadIdx.x] = parts_soa_cj.uz[j];
+    __syncthreads();
+    for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+      j = j_block + b;
+      if (j < last_part_in_task_blocks_cj) {
+        /* Compute the pairwise distance. */
+        const float pjx = x_p_tmp[j_block] - cellx;
+        const float pjy = y_p_tmp[j_block] - celly;
+        const float pjz = z_p_tmp[j_block] - cellz;
+        const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz;
+        const float r2 = xij * xij + yij * yij + zij * zij;
+        if (r2 < hig2 && r2 > (0.01f / dx) * (0.01f / dx)) {
+ Found_neighbours = 1;
+ const float r = sqrt(r2);
+ /* Recover some data */
+ const float mj = mass_tmp[j_block];
+ /* Get the kernel for hi. */
+ if (hi < 1.f / dx) printf("h < dx\n");
+ // if(hi<1.f/256.f)printf("h < dx\n");
+ const float h_inv = 1.f / hi;
+ const float ui = r * h_inv;
+ float wi, wi_dx;
+
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ rhoi += mj * wi;
+ rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx);
+
+ wcounti += wi;
+ wcount_dhi -= (hydro_dimension * wi + ui * wi_dx);
+
+ const float r_inv = 1.f / r;
+ const float faci = mj * wi_dx * r_inv;
+
+ /* Compute dv dot r */
+ float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block],
+ dvz = uzi - uz_tmp[j_block];
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+
+ div_vi -= faci * dvdr;
+
+ /* Compute dv cross r */
+ float curlvrx = dvy * zij - dvz * yij;
+ float curlvry = dvz * xij - dvx * zij;
+ float curlvrz = dvx * yij - dvy * xij;
+
+ rot_uxi += faci * curlvrx;
+ rot_uyi += faci * curlvry;
+ rot_uzi += faci * curlvrz;
+ }
+ }
+ }
+ __syncthreads();
+ }
+ if (pid < last_part_in_task_blocks_ci) {
+ parts_soa_ci.rho[pid] = rhoi, parts_soa_ci.rho_dh[pid] = rho_dhi;
+ parts_soa_ci.wcount[pid] = wcounti,
+ parts_soa_ci.wcount_dh[pid] = wcount_dhi;
+ parts_soa_ci.div_v[pid] = div_vi;
+ parts_soa_ci.rot_ux[pid] = rot_uxi, parts_soa_ci.rot_uy[pid] = rot_uyi;
+ parts_soa_ci.rot_uz[pid] = rot_uzi;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
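+/* Host-side launcher for the naive pair density kernel: the kernel is run
+ * twice on the given stream, once updating the particles of ci against cj
+ * and once with the two cells swapped. */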
+void launch_density_pair_two_kernels(
+ struct part_soa parts_soa_ci, struct part_soa parts_soa_cj,
+ int *d_task_first_part_ci, int *d_task_first_part_cj,
+ int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H,
+ const char *loop_type, cudaStream_t stream, int bid, int block_size,
+ int count_tasks, int tasksperbundle, int max_parts_i, int max_parts_j,
+ int numBlocks_y, int tid, int offset, int bundle_first_task,
+ int time_bin_inhibited) {
+
+ int max_parts = max(max_parts_j, max_parts_i);
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+  /* Dynamic shared-memory size per block: eight float tile buffers of
+   * BLOCK_SIZE entries each (assumed to match the kernel's shared-memory
+   * layout). */
+  const size_t shared_mem_bytes = 8 * BLOCK_SIZE * sizeof(float);
+
+  /*Do ci*/
+  runner_do_pair_density_GPU_naive<<<gridShape, BLOCK_SIZE, shared_mem_bytes,
+                                     stream>>>(
+ parts_soa_ci, parts_soa_cj, d_task_first_part_ci, d_task_first_part_cj,
+ d_task_last_part_ci, d_task_last_part_cj, d_a, d_H, bid, tid, count_tasks,
+ tasksperbundle, nBlocks_per_task, bundle_first_task, time_bin_inhibited);
+
+ // numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ // gridShape = dim3(numBlocks_x, numBlocks_y);
+ // nBlocks_per_task = numBlocks_x;
+ /*Now do cj*/
+  runner_do_pair_density_GPU_naive<<<gridShape, BLOCK_SIZE, shared_mem_bytes,
+                                     stream>>>(
+ parts_soa_cj, parts_soa_ci, d_task_first_part_cj, d_task_first_part_ci,
+ d_task_last_part_cj, d_task_last_part_ci, d_a, d_H, bid, tid, count_tasks,
+ tasksperbundle, nBlocks_per_task, bundle_first_task, time_bin_inhibited);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
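+/* Device helper for the pair density interaction (SoA layout): accumulates
+ * the density sums for particle pid of cell ci over the cell-j index range
+ * [first_part_in_task_blocks_cj, last_part_in_task_blocks_cj), staging the
+ * cell-j particles in BLOCK_SIZE tiles of the shared buffer "vars". */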
+__device__ void DOPAIRGPU(struct part_soa parts_soa, int pid,
+ int last_part_in_task_blocks_ci,
+ int first_part_in_task_blocks_cj,
+ int last_part_in_task_blocks_cj, float d_a, float d_H,
+ int time_bin_inhibited, float *vars) {
+
+ float dx =
+ 1.f / 64.f; // Value used to avoid interacting parts with themselves
+
+ float cellx = 0.0, celly = 0.0, cellz = 0.0;
+ float hi = 0.0, hig2 = hi * hi * kernel_gamma2;
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ float pix = 0.0;
+ float piy = 0.0;
+ float piz = 0.0;
+ float rhoi = 0.0;
+ float rho_dhi = 0.0;
+ float wcounti = 0.0;
+ float wcount_dhi = 0.0;
+ float div_vi = 0.0;
+ float rot_uxi = 0.0;
+ float rot_uyi = 0.0;
+ float rot_uzi = 0.0;
+ int Found_neighbours = 0;
+
+ if (pid < last_part_in_task_blocks_ci) {
+ cellx = parts_soa.locx[pid], celly = parts_soa.locy[pid],
+ cellz = parts_soa.locz[pid];
+ hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2;
+ mi = parts_soa.mass[pid];
+ uxi = parts_soa.ux[pid];
+ uyi = parts_soa.uy[pid];
+ uzi = parts_soa.uz[pid];
+ pix = parts_soa.x_p[pid] - cellx;
+ piy = parts_soa.y_p[pid] - celly;
+ piz = parts_soa.z_p[pid] - cellz;
+ }
+
+ int n_neighbours = 0;
+  /* The pointers below ("x_p_tmp", etc.) address separate regions of the
+   * single shared-memory buffer "vars" allocated at kernel launch. */
+ float *x_p_tmp = (float *)&vars[0];
+ float *y_p_tmp = (float *)&vars[BLOCK_SIZE];
+ float *z_p_tmp = (float *)&vars[BLOCK_SIZE * 2];
+ float *h_tmp = (float *)&vars[BLOCK_SIZE * 3];
+ float *mass_tmp = (float *)&vars[BLOCK_SIZE * 4];
+ float *ux_tmp = (float *)&vars[BLOCK_SIZE * 5];
+ float *uy_tmp = (float *)&vars[BLOCK_SIZE * 6];
+ float *uz_tmp = (float *)&vars[BLOCK_SIZE * 7];
+ timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE];
+ /*Particles copied in blocks to shared memory*/
+ for (int b = first_part_in_task_blocks_cj; b < last_part_in_task_blocks_cj;
+ b += BLOCK_SIZE) {
+ int j = b + threadIdx.x;
+ x_p_tmp[threadIdx.x] = parts_soa.x_p[j];
+ y_p_tmp[threadIdx.x] = parts_soa.y_p[j];
+ z_p_tmp[threadIdx.x] = parts_soa.z_p[j];
+ h_tmp[threadIdx.x] = parts_soa.h[j];
+ mass_tmp[threadIdx.x] = parts_soa.mass[j];
+ ux_tmp[threadIdx.x] = parts_soa.ux[j];
+ uy_tmp[threadIdx.x] = parts_soa.uy[j];
+ uz_tmp[threadIdx.x] = parts_soa.uz[j];
+ timebin[threadIdx.x] = parts_soa.time_bin[j];
+ __syncthreads();
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ j = j_block + b;
+ if (j < last_part_in_task_blocks_cj) {
+ /* Compute the pairwise distance. */
+ const float pjx = x_p_tmp[j_block] - cellx;
+ const float pjy = y_p_tmp[j_block] - celly;
+ const float pjz = z_p_tmp[j_block] - cellz;
+ const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+
+ if (r2 < hig2 && r2 > (0.01f / dx) * (0.01f / dx)) {
+ Found_neighbours = 1;
+ const float r = sqrt(r2);
+ /* Recover some data */
+ const float mj = mass_tmp[j_block];
+ /* Get the kernel for hi. */
+ if (hi < 1.f / dx) printf("h < dx\n");
+ const float h_inv = 1.f / hi;
+ const float ui = r * h_inv;
+ float wi, wi_dx;
+
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ rhoi += mj * wi;
+ rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx);
+
+ wcounti += wi;
+ wcount_dhi -= (hydro_dimension * wi + ui * wi_dx);
+
+ const float r_inv = 1.f / r;
+ const float faci = mj * wi_dx * r_inv;
+
+ /* Compute dv dot r */
+ float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block],
+ dvz = uzi - uz_tmp[j_block];
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+
+ div_vi -= faci * dvdr;
+
+ /* Compute dv cross r */
+ float curlvrx = dvy * zij - dvz * yij;
+ float curlvry = dvz * xij - dvx * zij;
+ float curlvrz = dvx * yij - dvy * xij;
+
+ rot_uxi += faci * curlvrx;
+ rot_uyi += faci * curlvry;
+ rot_uzi += faci * curlvrz;
+ }
+ }
+ }
+ __syncthreads();
+ }
+ if (pid < last_part_in_task_blocks_ci) {
+ parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi;
+ parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi;
+ parts_soa.div_v[pid] = div_vi;
+ parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi;
+ parts_soa.rot_uz[pid] = rot_uzi;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
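+/* Non-symmetric pair density interaction (SoA layout): only the particles of
+ * cell ci in [ci_start, ci_end) are updated. Cell-j particles in
+ * [cj_start, cj_end) are read through shared-memory tiles and both cells are
+ * shifted by the per-task shift vectors d_shift_x/y/z. */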
+__device__ void DOPAIR2NONSYMGPU(struct part_soa parts_soa, int pid,
+ const int ci_start, const int ci_end,
+ const int cj_start, const int cj_end,
+ float d_a, float d_H, float *vars_pair,
+ double *d_shift_x, double *d_shift_y,
+ double *d_shift_z, const int task_id_tmp,
+ int flip_order) {
+
+ float dx =
+ 1.f / 64.f; // Value used to avoid interacting parts with themselves
+
+ float hi = 0.0, hig2 = hi * hi * kernel_gamma2;
+
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ float pix = 0.0;
+ float piy = 0.0;
+ float piz = 0.0;
+ float rhoi = 0.0;
+ float rho_dhi = 0.0;
+ float wcounti = 0.0;
+ float wcount_dhi = 0.0;
+ float div_vi = 0.0;
+ float rot_uxi = 0.0;
+ float rot_uyi = 0.0;
+ float rot_uzi = 0.0;
+ int Found_neighbours = 0;
+ int count_i = cj_start;
+ // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i
+ // last_part_in_task_blocks_ci %i\n",
+ // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj,
+ // last_part_in_task_blocks_ci);
+ if (pid < ci_end) {
+ hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2;
+ mi = parts_soa.mass[pid];
+ uxi = parts_soa.ux[pid];
+ uyi = parts_soa.uy[pid];
+ uzi = parts_soa.uz[pid];
+ pix = parts_soa.x_p[pid] - d_shift_x[task_id_tmp];
+ piy = parts_soa.y_p[pid] - d_shift_y[task_id_tmp];
+ piz = parts_soa.z_p[pid] - d_shift_z[task_id_tmp];
+ }
+
+  /* The pointers below ("x_p_tmp", etc.) address separate regions of the
+   * single shared-memory buffer "vars_pair" allocated at kernel launch. */
+ float *x_p_tmp = (float *)&vars_pair[0];
+ float *y_p_tmp = (float *)&x_p_tmp[BLOCK_SIZE];
+ float *z_p_tmp = (float *)&y_p_tmp[BLOCK_SIZE];
+ float *h_tmp = (float *)&z_p_tmp[BLOCK_SIZE];
+ float *mass_tmp = (float *)&h_tmp[BLOCK_SIZE];
+ float *ux_tmp = (float *)&mass_tmp[BLOCK_SIZE];
+ float *uy_tmp = (float *)&ux_tmp[BLOCK_SIZE];
+ float *uz_tmp = (float *)&uy_tmp[BLOCK_SIZE];
+ timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE];
+
+ /*Particles copied in blocks to shared memory*/
+ for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) {
+ const int tid_x = threadIdx.x;
+ int j = b + tid_x;
+ x_p_tmp[tid_x] = parts_soa.x_p[j];
+ y_p_tmp[tid_x] = parts_soa.y_p[j];
+ z_p_tmp[tid_x] = parts_soa.z_p[j];
+ // h_tmp[tid_x] = parts_soa.h[j];
+ mass_tmp[tid_x] = parts_soa.mass[j];
+ ux_tmp[tid_x] = parts_soa.ux[j];
+ uy_tmp[tid_x] = parts_soa.uy[j];
+ uz_tmp[tid_x] = parts_soa.uz[j];
+ timebin[tid_x] = parts_soa.time_bin[j];
+
+ __syncthreads();
+ const float shift_x_j = d_shift_x[task_id_tmp + flip_order];
+ const float shift_y_j = d_shift_y[task_id_tmp + flip_order];
+ const float shift_z_j = d_shift_z[task_id_tmp + flip_order];
+ /*j_block is the particle's index in the block. Loop through particles in
+ * shared memory one by one*/
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ int jj = b + j_block;
+ if (jj < cj_end && pid < ci_end && pid >= ci_start) {
+
+ const float pjx = x_p_tmp[j_block] - shift_x_j;
+ const float pjy = y_p_tmp[j_block] - shift_y_j;
+ const float pjz = z_p_tmp[j_block] - shift_z_j;
+
+ const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz;
+ // const float xij = (pix - pjx) * flip_order, yij = (piy -
+ // pjy) * flip_order, zij = (piz - pjz) * flip_order;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ if (r2 < hig2) {
+ /* Recover some data */
+ const float mj = mass_tmp[j_block];
+ const float r = sqrt(r2);
+ /* Get the kernel for hi. */
+ const float h_inv = 1.f / hi;
+ const float ui = r * h_inv;
+ float wi, wi_dx;
+
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ rhoi += mj * wi;
+ rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx);
+
+ wcounti += wi;
+ wcount_dhi -= (hydro_dimension * wi + ui * wi_dx);
+
+ const float r_inv = 1.f / r;
+ const float faci = mj * wi_dx * r_inv;
+ /* Compute dv dot r */
+ const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block],
+ dvz = uzi - uz_tmp[j_block];
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ /* Compute dv cross r */
+ const float curlvrx = dvy * zij - dvz * yij;
+ const float curlvry = dvz * xij - dvx * zij;
+ const float curlvrz = dvx * yij - dvy * xij;
+
+ div_vi -= faci * dvdr;
+
+ rot_uxi += faci * curlvrx;
+ rot_uyi += faci * curlvry;
+ rot_uzi += faci * curlvrz;
+ }
+ } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/
+    } /* End of loop over the particles currently staged in shared memory */
+ __syncthreads();
+ } /*Loop through parts in cell j one BLOCK_SIZE at a time*/
+ if (pid >= ci_start && pid < ci_end) {
+ parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi;
+ parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi;
+ parts_soa.div_v[pid] = div_vi;
+ parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi;
+ parts_soa.rot_uz[pid] = rot_uzi;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
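+/* AoS counterpart of DOPAIR2NONSYMGPU: same tiling scheme, but the shift
+ * arrays are currently unused and positions are taken as stored. */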
+__device__ void DOPAIR2NONSYMGPUAOS(struct part_aos *parts_aos, int pid,
+ const int ci_start, const int ci_end,
+ const int cj_start, const int cj_end,
+ float d_a, float d_H, float *vars_pair_aos,
+ double *d_shift_x, double *d_shift_y,
+ double *d_shift_z, const int task_id_tmp,
+ int flip_order) {
+
+ float dx =
+ 1.f / 64.f; // Value used to avoid interacting parts with themselves
+
+ float hi = 0.0, hig2 = 0.0;
+
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ float pix = 0.0;
+ float piy = 0.0;
+ float piz = 0.0;
+ float rhoi = 0.0;
+ float rho_dhi = 0.0;
+ float wcounti = 0.0;
+ float wcount_dhi = 0.0;
+ float div_vi = 0.0;
+ float rot_uxi = 0.0;
+ float rot_uyi = 0.0;
+ float rot_uzi = 0.0;
+ int Found_neighbours = 0;
+ int count_i = cj_start;
+ // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i
+ // last_part_in_task_blocks_ci %i\n",
+ // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj,
+ // last_part_in_task_blocks_ci);
+ if (pid < ci_end) {
+ hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2;
+ mi = parts_aos[pid].mass;
+ uxi = parts_aos[pid].ux;
+ uyi = parts_aos[pid].uy;
+ uzi = parts_aos[pid].uz;
+ pix = parts_aos[pid].x_p; // - d_shift_x[task_id_tmp];
+ piy = parts_aos[pid].y_p; // - d_shift_y[task_id_tmp];
+ piz = parts_aos[pid].z_p; // - d_shift_z[task_id_tmp];
+ }
+
+  /* The pointers below ("x_p_tmp", etc.) address separate regions of the
+   * single shared-memory buffer "vars_pair_aos" allocated at kernel launch. */
+ float *x_p_tmp = (float *)&vars_pair_aos[0];
+ float *y_p_tmp = (float *)&x_p_tmp[BLOCK_SIZE];
+ float *z_p_tmp = (float *)&y_p_tmp[BLOCK_SIZE];
+ float *h_tmp = (float *)&z_p_tmp[BLOCK_SIZE];
+ float *mass_tmp = (float *)&h_tmp[BLOCK_SIZE];
+ float *ux_tmp = (float *)&mass_tmp[BLOCK_SIZE];
+ float *uy_tmp = (float *)&ux_tmp[BLOCK_SIZE];
+ float *uz_tmp = (float *)&uy_tmp[BLOCK_SIZE];
+ int *timebin = (int *)&uz_tmp[BLOCK_SIZE];
+
+ /*Particles copied in blocks to shared memory*/
+ for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) {
+ const int tid_x = threadIdx.x;
+ int j = b + tid_x;
+ x_p_tmp[tid_x] = parts_aos[j].x_p;
+ y_p_tmp[tid_x] = parts_aos[j].y_p;
+ z_p_tmp[tid_x] = parts_aos[j].z_p;
+ // h_tmp[tid_x] = parts_aos[j].h;
+ mass_tmp[tid_x] = parts_aos[j].mass;
+ ux_tmp[tid_x] = parts_aos[j].ux;
+ uy_tmp[tid_x] = parts_aos[j].uy;
+ uz_tmp[tid_x] = parts_aos[j].uz;
+ timebin[tid_x] = parts_aos[j].time_bin;
+ // const float shift_x_j = d_shift_x[task_id_tmp + flip_order];
+ // const float shift_y_j = d_shift_y[task_id_tmp + flip_order];
+ // const float shift_z_j = d_shift_z[task_id_tmp + flip_order];
+ __syncthreads();
+ /*j_block is the particle's index in the block. Loop through particles in
+ * shared memory one by one*/
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ int jj = b + j_block;
+ if (jj < cj_end && pid < ci_end && pid >= ci_start) {
+
+ const float pjx = x_p_tmp[j_block]; // - shift_x_j;
+ const float pjy = y_p_tmp[j_block]; // - shift_y_j;
+ const float pjz = z_p_tmp[j_block]; // - shift_z_j;
+
+ const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz;
+ // const float xij = (pix - pjx) * flip_order, yij = (piy -
+ // pjy) * flip_order, zij = (piz - pjz) * flip_order;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ if (r2 < hig2) {
+ /* Recover some data */
+ const float mj = mass_tmp[j_block];
+ const float r = sqrt(r2);
+ /* Get the kernel for hi. */
+ const float h_inv = 1.f / hi;
+ const float ui = r * h_inv;
+ float wi, wi_dx;
+
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ rhoi += mj * wi;
+ rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx);
+
+ wcounti += wi;
+ wcount_dhi -= (hydro_dimension * wi + ui * wi_dx);
+
+ const float r_inv = 1.f / r;
+ const float faci = mj * wi_dx * r_inv;
+ /* Compute dv dot r */
+ const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block],
+ dvz = uzi - uz_tmp[j_block];
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ /* Compute dv cross r */
+ const float curlvrx = dvy * zij - dvz * yij;
+ const float curlvry = dvz * xij - dvx * zij;
+ const float curlvrz = dvx * yij - dvy * xij;
+
+ div_vi -= faci * dvdr;
+
+ rot_uxi += faci * curlvrx;
+ rot_uyi += faci * curlvry;
+ rot_uzi += faci * curlvrz;
+ // if(timebin[j_block] != 1000 && timebin[j_block] !=
+ // 20)printf("incorrect timebin %i\n", timebin[j_block]);
+ }
+ } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/
+    } /* End of loop over the particles currently staged in shared memory */
+ __syncthreads();
+ } /*Loop through parts in cell j one BLOCK_SIZE at a time*/
+ if (pid >= ci_start && pid < ci_end) {
+ // printf("timebin %i\n", parts_aos[pid].time_bin);
+ parts_aos[pid].rho = rhoi, parts_aos[pid].rho_dh = rho_dhi;
+ parts_aos[pid].wcount = wcounti, parts_aos[pid].wcount_dh = wcount_dhi;
+ parts_aos[pid].div_v = div_vi;
+ parts_aos[pid].rot_ux = rot_uxi, parts_aos[pid].rot_uy = rot_uyi;
+ parts_aos[pid].rot_uz = rot_uzi;
+ parts_aos[pid].time_bin = 20;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
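+/* Non-symmetric pair density interaction for the float4-packed send/recv
+ * layout: cell-j data are staged as float4 tiles in shared memory and the
+ * density and curl/divergence sums are written to the compact recv buffer. */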
+__device__ void DOPAIR2NONSYMGPUAOSF4(
+ struct part_aos_f4_send *__restrict__ parts_send,
+ struct part_aos_f4_recv *__restrict__ parts_recv, int pid,
+ const int ci_start, const int ci_end, const int cj_start, const int cj_end,
+ float d_a, float d_H, float4 *vars_pair_aos_f4) {
+
+ float dx =
+ 1.f / 64.f; // Value used to avoid interacting parts with themselves
+
+ float hi = 0.0, hig2 = 0.0;
+
+ int Found_neighbours = 0;
+ int count_i = cj_start;
+
+ float4 res_rho = {0.0, 0.0, 0.0, 0.0};
+ float4 res_rot = {0.0, 0.0, 0.0, 0.0};
+ const part_aos_f4_send pi = parts_send[pid];
+ const float4 x_pi = pi.x_p_h;
+ const float4 ux_pi = pi.ux_m;
+ // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i
+ // last_part_in_task_blocks_ci %i\n",
+ // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj,
+ // last_part_in_task_blocks_ci);
+ // if (pid < ci_end) {
+ hi = x_pi.w, hig2 = hi * hi * kernel_gamma2;
+ // }
+
+  /* The pointers below ("x_p_h_tmp", "ux_m_tmp") address separate regions of
+   * the single shared-memory buffer "vars_pair_aos_f4" allocated at kernel
+   * launch. */
+ float4 *__restrict__ x_p_h_tmp = (float4 *)&vars_pair_aos_f4[0];
+ float4 *__restrict__ ux_m_tmp = (float4 *)&vars_pair_aos_f4[BLOCK_SIZE];
+
+ /*Particles copied in blocks to shared memory*/
+ for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) {
+ const int tid_x = threadIdx.x;
+ int j = b + tid_x;
+ struct part_aos_f4_send pj = parts_send[j];
+ x_p_h_tmp[tid_x] = pj.x_p_h;
+ ux_m_tmp[tid_x] = pj.ux_m;
+ __syncthreads();
+ /*j_block is the particle's index in the block. Loop through particles in
+ * shared memory one by one*/
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ int jj = b + j_block;
+ if (jj < cj_end && pid < ci_end && pid >= ci_start) {
+
+ const float4 x_p_h_j = x_p_h_tmp[j_block];
+ const float4 ux_m_j = ux_m_tmp[j_block];
+
+ const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y,
+ zij = x_pi.z - x_p_h_j.z;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ if (r2 < hig2) {
+ /* Recover some data */
+ const float mj = ux_m_j.w;
+ const float r = sqrt(r2);
+ /* Get the kernel for hi. */
+ const float h_inv = 1.f / hi;
+ const float ui = r * h_inv;
+ float wi, wi_dx;
+
+ d_kernel_deval(ui, &wi, &wi_dx);
+ /*Add to sums of rho, rho_dh, wcount and wcount_dh*/
+ res_rho.x += mj * wi;
+ res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx);
+ res_rho.z += wi;
+ res_rho.w -= (hydro_dimension * wi + ui * wi_dx);
+
+ const float r_inv = 1.f / r;
+ const float faci = mj * wi_dx * r_inv;
+ /* Compute dv dot r */
+ const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y,
+ dvz = ux_pi.z - ux_m_j.z;
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ /* Compute dv cross r */
+ const float curlvrx = dvy * zij - dvz * yij;
+ const float curlvry = dvz * xij - dvx * zij;
+ const float curlvrz = dvx * yij - dvy * xij;
+
+ res_rot.x += faci * curlvrx;
+ res_rot.y += faci * curlvry;
+ res_rot.z += faci * curlvrz;
+ res_rot.w -= faci * dvdr;
+ }
+ } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/
+    } /* End of loop over the particles currently staged in shared memory */
+ __syncthreads();
+ } /*Loop through parts in cell j one BLOCK_SIZE at a time*/
+ if (pid >= ci_start && pid < ci_end) {
+ parts_recv[pid].rho_dh_wcount = res_rho;
+ parts_recv[pid].rot_ux_div_v = res_rot;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
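+/* Naive pair density interaction for the float4-packed send/recv layout:
+ * no shared-memory tiling, the thread's particle pi is interacted directly
+ * with every cell-j particle in [cj_start, cj_end). */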
+__device__ void DOPAIR2NAIVEGPUAOSF4(
+ const struct part_aos_f4_send pi,
+ struct part_aos_f4_send *__restrict__ parts_send,
+ struct part_aos_f4_recv *__restrict__ parts_recv, int pid,
+ const int cj_start, const int cj_end, float d_a, float d_H) {
+
+ float dx =
+ 1.f / 64.f; // Value used to avoid interacting parts with themselves
+
+ float hi = 0.0, hig2 = 0.0;
+
+ int Found_neighbours = 0;
+ int count_i = cj_start;
+
+ float4 res_rho = {0.0, 0.0, 0.0, 0.0};
+ float4 res_rot = {0.0, 0.0, 0.0, 0.0};
+ // const part_aos_f4_send pi = parts_send[pid];
+ const float4 x_pi = pi.x_p_h;
+ const float4 ux_pi = pi.ux_m;
+ // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i
+ // last_part_in_task_blocks_ci %i\n",
+ // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj,
+ // last_part_in_task_blocks_ci);
+ // if (pid < ci_end) {
+ hi = x_pi.w, hig2 = hi * hi * kernel_gamma2;
+ // }
+
+ // printf("js %i je %i\n", cj_start, cj_end);
+ /*Particles copied in blocks to shared memory*/
+ for (int j = cj_start; j < cj_end; j++) {
+ struct part_aos_f4_send pj = parts_send[j];
+
+ const float4 x_p_h_j = pj.x_p_h;
+ const float4 ux_m_j = pj.ux_m;
+
+ const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y,
+ zij = x_pi.z - x_p_h_j.z;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ // printf("r2 %f \n", r2);
+ if (r2 < hig2) {
+ /* Recover some data */
+ const float mj = ux_m_j.w;
+ const float r = sqrt(r2);
+ /* Get the kernel for hi. */
+ const float h_inv = 1.f / hi;
+ const float ui = r * h_inv;
+ float wi, wi_dx;
+
+ d_kernel_deval(ui, &wi, &wi_dx);
+ /*Add to sums of rho, rho_dh, wcount and wcount_dh*/
+ res_rho.x += mj * wi;
+ res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx);
+ res_rho.z += wi;
+ res_rho.w -= (hydro_dimension * wi + ui * wi_dx);
+
+ const float r_inv = 1.f / r;
+ const float faci = mj * wi_dx * r_inv;
+ /* Compute dv dot r */
+ const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y,
+ dvz = ux_pi.z - ux_m_j.z;
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ /* Compute dv cross r */
+ const float curlvrx = dvy * zij - dvz * yij;
+ const float curlvry = dvz * xij - dvx * zij;
+ const float curlvrz = dvx * yij - dvy * xij;
+
+ res_rot.x += faci * curlvrx;
+ res_rot.y += faci * curlvry;
+ res_rot.z += faci * curlvrz;
+ res_rot.w -= faci * dvdr;
+ }
+ } /*Loop through parts in cell j one BLOCK_SIZE at a time*/
+ // if (pid >= ci_start && pid < ci_end) {
+ parts_recv[pid].rho_dh_wcount = res_rho;
+ parts_recv[pid].rot_ux_div_v = res_rot;
+ // }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
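+/* Non-symmetric pair gradient interaction (AoS layout): accumulates the
+ * signal velocity, the Laplacian of u and the maximal neighbour viscosity
+ * alpha for the particles of cell ci, reading cell j through shared-memory
+ * tiles shifted by the per-task shift vectors. */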
+__device__ void DOPAIR2NONSYMGPUAOSG(struct part_aos_g *parts_aos, int pid,
+ const int ci_start, const int ci_end,
+ const int cj_start, const int cj_end,
+ float d_a, float d_H,
+ float *vars_pair_aosg, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z,
+ const int task_id_tmp, int flip_order) {
+
+ float dx =
+ 1.f / 64.f; // Value used to avoid interacting parts with themselves
+
+ float hi = 0.0, hig2 = 0.0;
+
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ float pix = 0.0;
+ float piy = 0.0;
+ float piz = 0.0;
+ float rhoi = 0.0;
+ float div_vi = 0.0;
+ int Found_neighbours = 0;
+ float v_sig;
+ float u = 0.f;
+ float laplace_u = 0.0;
+ float alpha_visc_max_ngb = 0.0;
+ float ci = 0.0;
+
+ int count_i = cj_start;
+ if (pid < ci_end) {
+ hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2;
+ mi = parts_aos[pid].mass;
+ uxi = parts_aos[pid].ux;
+ uyi = parts_aos[pid].uy;
+ uzi = parts_aos[pid].uz;
+ ci = parts_aos[pid].soundspeed;
+ v_sig = parts_aos[pid].v_sig;
+ u = parts_aos[pid].u;
+ laplace_u = parts_aos[pid].laplace_u;
+ alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb;
+
+ pix = parts_aos[pid].x_p - d_shift_x[task_id_tmp];
+ piy = parts_aos[pid].y_p - d_shift_y[task_id_tmp];
+ piz = parts_aos[pid].z_p - d_shift_z[task_id_tmp];
+ }
+
+  /* The pointers below ("x_p_tmp", etc.) address separate regions of the
+   * single shared-memory buffer "vars_pair_aosg" allocated at kernel launch. */
+ float *x_p_tmp = (float *)&vars_pair_aosg[0];
+ float *y_p_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE];
+ float *z_p_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 2];
+ float *h_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 3];
+ float *mass_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 4];
+ float *ux_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 5];
+ float *uy_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 6];
+ float *uz_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 7];
+ float *cj_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 8];
+ float *alpha_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 9];
+ float *u_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 10];
+ float *rho_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 11];
+ int *timebin = (int *)&vars_pair_aosg[BLOCK_SIZE * 12];
+
+ /*Particles copied in blocks to shared memory*/
+ for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) {
+ const int tid_x = threadIdx.x;
+ int j = b + tid_x;
+ x_p_tmp[threadIdx.x] = parts_aos[j].x_p;
+ y_p_tmp[threadIdx.x] = parts_aos[j].y_p;
+ z_p_tmp[threadIdx.x] = parts_aos[j].z_p;
+ h_tmp[threadIdx.x] = parts_aos[j].h;
+ mass_tmp[threadIdx.x] = parts_aos[j].mass;
+ ux_tmp[threadIdx.x] = parts_aos[j].ux;
+ uy_tmp[threadIdx.x] = parts_aos[j].uy;
+ uz_tmp[threadIdx.x] = parts_aos[j].uz;
+ timebin[threadIdx.x] = parts_aos[j].time_bin;
+ cj_tmp[threadIdx.x] = parts_aos[j].soundspeed;
+ alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha;
+ u_tmp[threadIdx.x] = parts_aos[j].u;
+ rho_tmp[threadIdx.x] = parts_aos[j].rho;
+ const float shift_x_j = d_shift_x[task_id_tmp + flip_order];
+ const float shift_y_j = d_shift_y[task_id_tmp + flip_order];
+ const float shift_z_j = d_shift_z[task_id_tmp + flip_order];
+ __syncthreads();
+ /*j_block is the particle's index in the block. Loop through particles in
+ * shared memory one by one*/
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ int jj = b + j_block;
+ if (jj < cj_end && pid < ci_end && pid >= ci_start) {
+
+ const float pjx = x_p_tmp[j_block] - shift_x_j;
+ const float pjy = y_p_tmp[j_block] - shift_y_j;
+ const float pjz = z_p_tmp[j_block] - shift_z_j;
+ const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ if (r2 < hig2) {
+ /* Recover some data */
+ const float mj = mass_tmp[j_block];
+ const float r = sqrt(r2);
+ const float r_inv = 1.f / r;
+ /* Get the kernel for hi. */
+ const float h_inv = 1.f / hi;
+ float wi, wi_dx;
+ /* Cosmology terms for the signal velocity */
+ const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a);
+ const float a2_Hubble = d_a * d_a * d_H;
+ /* Compute dv dot r */
+ const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block],
+ dvz = uzi - uz_tmp[j_block];
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ /* Add Hubble flow */
+ const float dvdr_Hubble = dvdr + a2_Hubble * r2;
+          /* Are the particles moving towards each other? */
+ const float omega_ij = min(dvdr_Hubble, 0.f);
+ const float mu_ij =
+ fac_mu * r_inv * omega_ij; /* This is 0 or negative */
+
+ /* Signal velocity */
+ const float new_v_sig =
+ ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij;
+ /* Update if we need to */
+ v_sig = max(v_sig, new_v_sig);
+ /* Calculate Del^2 u for the thermal diffusion coefficient. */
+ /* Need to get some kernel values F_ij = wi_dx */
+ const float ui = r * h_inv;
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ const float delta_u_factor = (u - u_tmp[j_block]) * r_inv;
+ laplace_u += mj * delta_u_factor * wi_dx / rho_tmp[j_block];
+
+ /* Set the maximal alpha from the previous step over the neighbours
+ * (this is used to limit the diffusion in hydro_prepare_force) */
+ const float alpha_j = alpha_tmp[j_block];
+ alpha_visc_max_ngb = max(alpha_visc_max_ngb, alpha_j);
+ }
+ } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/
+    } /* End of loop over the particles currently staged in shared memory */
+ __syncthreads();
+ } /*Loop through parts in cell j one BLOCK_SIZE at a time*/
+ if (pid >= ci_start && pid < ci_end) {
+ parts_aos[pid].v_sig = v_sig, parts_aos[pid].laplace_u = laplace_u;
+ parts_aos[pid].alpha_visc_max_ngb = alpha_visc_max_ngb;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
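+/* Naive pair gradient interaction for the float4-packed send/recv layout:
+ * loops directly over the cell-j particles and writes v_sig, laplace_u and
+ * the maximal neighbour alpha_visc to the recv buffer. */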
+__device__ void DOPAIR2NAIVEGPUAOSF4G(
+ const struct part_aos_f4_g_send pi,
+ struct part_aos_f4_g_send *__restrict__ parts_send,
+ struct part_aos_f4_g_recv *__restrict__ parts_recv, int pid,
+ const int cj_start, const int cj_end, float d_a, float d_H) {
+
+ float dx =
+ 1.f / 64.f; // Value used to avoid interacting parts with themselves
+
+ float hi = 0.0, hig2 = 0.0;
+
+ int Found_neighbours = 0;
+ int count_i = cj_start;
+
+ float4 res_rho = {0.0, 0.0, 0.0, 0.0};
+ float4 res_rot = {0.0, 0.0, 0.0, 0.0};
+ // const part_aos_f4_send pi = parts_send[pid];
+ const float4 x_h_i = pi.x_h;
+ const float4 ux_m_i = pi.ux_m;
+ const float4 rho_avisc_u_c_i = pi.rho_avisc_u_c;
+ float3 vsig_lapu_aviscmax_i = {0.f, 0.f, 0.f};
+
+ // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i
+ // last_part_in_task_blocks_ci %i\n",
+ // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj,
+ // last_part_in_task_blocks_ci);
+ // if (pid < ci_end) {
+ hi = x_h_i.w, hig2 = hi * hi * kernel_gamma2;
+ // }
+
+ // printf("js %i je %i\n", cj_start, cj_end);
+ /*Particles copied in blocks to shared memory*/
+ for (int j = cj_start; j < cj_end; j++) {
+ struct part_aos_f4_g_send pj = parts_send[j];
+
+ const float4 x_h_j = pj.x_h;
+ const float4 ux_m_j = pj.ux_m;
+ const float4 rho_avisc_u_c_j = pj.rho_avisc_u_c;
+ const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y,
+ zij = x_h_i.z - x_h_j.z;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ // printf("r2 %f \n", r2);
+ if (r2 < hig2) {
+ const float r = sqrt(r2);
+ const float r_inv = 1.f / r;
+ /* Recover some data */
+ const float mj = ux_m_j.w;
+ /* Get the kernel for hi. */
+ const float h_inv = 1.f / hi;
+ float wi, wi_dx;
+ /* Cosmology terms for the signal velocity */
+ const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a);
+ const float a2_Hubble = d_a * d_a * d_H;
+ /* Compute dv dot r */
+ float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y,
+ dvz = ux_m_i.z - ux_m_j.z;
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ /* Add Hubble flow */
+ const float dvdr_Hubble = dvdr + a2_Hubble * r2;
+      /* Are the particles moving towards each other? */
+ const float omega_ij = min(dvdr_Hubble, 0.f);
+ const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */
+
+ /* Signal velocity */
+ const float new_v_sig =
+ rho_avisc_u_c_i.w + rho_avisc_u_c_j.w - const_viscosity_beta * mu_ij;
+ /* Update if we need to */
+ vsig_lapu_aviscmax_i.x = fmaxf(vsig_lapu_aviscmax_i.x, new_v_sig);
+ /* Calculate Del^2 u for the thermal diffusion coefficient. */
+ /* Need to get some kernel values F_ij = wi_dx */
+ const float ui = r * h_inv;
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ const float delta_u_factor =
+ (rho_avisc_u_c_i.z - rho_avisc_u_c_j.z) * r_inv;
+ vsig_lapu_aviscmax_i.y += mj * delta_u_factor * wi_dx / rho_avisc_u_c_j.x;
+
+ /* Set the maximal alpha from the previous step over the neighbours
+ * (this is used to limit the diffusion in hydro_prepare_force) */
+ const float alpha_j = rho_avisc_u_c_j.y;
+ vsig_lapu_aviscmax_i.z = fmaxf(vsig_lapu_aviscmax_i.z, alpha_j);
+ }
+ } /*Loop through parts in cell j one BLOCK_SIZE at a time*/
+ // if (pid >= ci_start && pid < ci_end) {
+ parts_recv[pid].vsig_lapu_aviscmax = vsig_lapu_aviscmax_i;
+ // }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
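+/* Non-symmetric pair force interaction (AoS layout): accumulates the hydro
+ * acceleration, u_dt, h_dt, signal velocity and minimal neighbour time-bin
+ * for the particles of cell ci, reading cell j through shared-memory tiles
+ * shifted by the per-task shift vectors. */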
+__device__ void DOPAIR2NONSYMGPUAOSF(struct part_aos_f *parts_aos, int pid,
+ const int ci_start, const int ci_end,
+ const int cj_start, const int cj_end,
+ float d_a, float d_H,
+ float *vars_pair_aosf, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z,
+ const int task_id_tmp, int flip_order) {
+
+ float ci = 0.0, cj = 0.0;
+ float hi = 0.0, hig2 = 0.0;
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ float pix = 0.0;
+ float piy = 0.0;
+ float piz = 0.0;
+ float rhoi = 0.0;
+ float div_vi = 0.0;
+ int Found_neighbours = 0;
+ float v_sigi;
+ float ui = 0.f;
+ float u_dti = 0.f;
+ float laplace_ui = 0.0;
+ float alpha_visc_max_ngb = 0.0;
+ float pressurei = 0.0;
+ float alphavisci = 0.0;
+ float alphadiffi = 0.0;
+ float fi = 0.0;
+ float balsarai = 0.0;
+ float ahydroxi = 0.0;
+ float ahydroyi = 0.0;
+ float ahydrozi = 0.0;
+ float h_dti = 0.0;
+ int min_ngb_time_bin = 0;
+ if (pid < ci_end) {
+ hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2;
+ mi = parts_aos[pid].mass;
+ uxi = parts_aos[pid].ux;
+ uyi = parts_aos[pid].uy;
+ uzi = parts_aos[pid].uz;
+ ci = parts_aos[pid].soundspeed;
+ fi = parts_aos[pid].f;
+ v_sigi = parts_aos[pid].v_sig;
+ ui = parts_aos[pid].u;
+ rhoi = parts_aos[pid].rho;
+ pressurei = parts_aos[pid].pressure;
+ balsarai = parts_aos[pid].balsara;
+ alphavisci = parts_aos[pid].alpha_visc;
+ alphadiffi = parts_aos[pid].alpha_diff;
+ min_ngb_time_bin = parts_aos[pid].min_ngb_time_bin;
+ pix = parts_aos[pid].x_p - d_shift_x[task_id_tmp];
+ piy = parts_aos[pid].y_p - d_shift_y[task_id_tmp];
+ piz = parts_aos[pid].z_p - d_shift_z[task_id_tmp];
+ }
+ // if (threadIdx.x == 0) {
+ // first_part_tid_0 = first_part;
+ // last_part_tid_0 = last_part;
+ // }
+ // __syncthreads();
+ int n_neighbours = 0;
+  /* The pointers below ("x_p_tmp", etc.) address separate regions of the
+   * single shared-memory buffer "vars_pair_aosf" allocated at kernel launch. */
+ float *x_p_tmp = (float *)&vars_pair_aosf[0];
+ float *y_p_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE];
+ float *z_p_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 2];
+ float *h_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 3];
+ float *mass_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 4];
+ float *ux_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 5];
+ float *uy_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 6];
+ float *uz_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 7];
+ float *cj_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 8];
+ float *alphavisc_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 9];
+ float *alphadiff_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 10];
+ float *u_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 11];
+ float *rho_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 12];
+ float *pressure_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 13];
+ float *f_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 14];
+ float *balsara_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 15];
+ int *timebin = (int *)&vars_pair_aosf[BLOCK_SIZE * 16];
+ /*Particles copied in blocks to shared memory*/
+ for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) {
+ int j = b + threadIdx.x;
+ x_p_tmp[threadIdx.x] = parts_aos[j].x_p;
+ y_p_tmp[threadIdx.x] = parts_aos[j].y_p;
+ z_p_tmp[threadIdx.x] = parts_aos[j].z_p;
+ h_tmp[threadIdx.x] = parts_aos[j].h;
+ mass_tmp[threadIdx.x] = parts_aos[j].mass;
+ ux_tmp[threadIdx.x] = parts_aos[j].ux;
+ uy_tmp[threadIdx.x] = parts_aos[j].uy;
+ uz_tmp[threadIdx.x] = parts_aos[j].uz;
+ timebin[threadIdx.x] = parts_aos[j].time_bin;
+ cj_tmp[threadIdx.x] = parts_aos[j].soundspeed;
+ // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha;
+ u_tmp[threadIdx.x] = parts_aos[j].u;
+ rho_tmp[threadIdx.x] = parts_aos[j].rho;
+ alphavisc_tmp[threadIdx.x] = parts_aos[j].alpha_visc;
+ alphadiff_tmp[threadIdx.x] = parts_aos[j].alpha_diff;
+ pressure_tmp[threadIdx.x] = parts_aos[j].pressure;
+ f_tmp[threadIdx.x] = parts_aos[j].f;
+ balsara_tmp[threadIdx.x] = parts_aos[j].balsara;
+ const float shift_x_j = d_shift_x[task_id_tmp + flip_order];
+ const float shift_y_j = d_shift_y[task_id_tmp + flip_order];
+ const float shift_z_j = d_shift_z[task_id_tmp + flip_order];
+ __syncthreads();
+ /*j_block is the particle's index in the block. Loop through particles in
+ * shared memory one by one*/
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ int jj = b + j_block;
+ if (jj < cj_end && pid < ci_end && pid >= ci_start) {
+ /* Compute the pairwise distance. */
+ const float pjx = x_p_tmp[j_block] - shift_x_j;
+ const float pjy = y_p_tmp[j_block] - shift_y_j;
+ const float pjz = z_p_tmp[j_block] - shift_z_j;
+ const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ if (r2 < hig2) {
+
+ // /* Cosmology terms for the signal velocity */
+ const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a);
+ const float a2_Hubble = d_a * d_a * d_H;
+ const float r = sqrt(r2);
+ const float r_inv = 1.f / r;
+ // /* Recover some data */
+ const float mj = mass_tmp[j_block];
+ // /* Get the kernel for hi. */
+ const float hi_inv = 1.f / hi;
+ const float hid_inv =
+ d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */
+ const float xi = r * hi_inv;
+ float wi, wi_dx;
+ d_kernel_deval(xi, &wi, &wi_dx);
+ const float wi_dr = hid_inv * wi_dx;
+ /* Get the kernel for hj. */
+ const float hj = h_tmp[j_block];
+ const float hj_inv = 1.0f / hj;
+ const float hjd_inv =
+ d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */
+ const float xj = r * hj_inv;
+ float wj, wj_dx;
+ d_kernel_deval(xj, &wj, &wj_dx);
+ const float wj_dr = hjd_inv * wj_dx;
+ // /* Compute dv dot r */
+ float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block],
+ dvz = uzi - uz_tmp[j_block];
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ // /* Add Hubble flow */
+ const float dvdr_Hubble = dvdr + a2_Hubble * r2;
+          // /* Are the particles moving towards each other? */
+ const float omega_ij = min(dvdr_Hubble, 0.f);
+ const float mu_ij =
+ fac_mu * r_inv * omega_ij; /* This is 0 or negative */
+ //
+ // /* Signal velocity */
+ const float v_sig =
+ ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij;
+
+ /* Variable smoothing length term */
+ const float f_ij = 1.f - fi / mj;
+ const float f_ji = 1.f - f_tmp[j_block] / mi;
+
+ /* Balsara term */
+ const float balsaraj = balsara_tmp[j_block];
+ /* Construct the full viscosity term */
+ const float rhoj = rho_tmp[j_block];
+ const float pressurej = pressure_tmp[j_block];
+ const float rho_ij = rhoi + rhoj;
+ const float alpha = alphavisci + alphavisc_tmp[j_block];
+ const float visc =
+ -0.25f * alpha * v_sig * mu_ij * (balsarai + balsaraj) / rho_ij;
+ /* Convolve with the kernel */
+ const float visc_acc_term =
+ 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv;
+ /* Compute gradient terms */
+ const float P_over_rho2_i = pressurei / (rhoi * rhoi) * f_ij;
+ const float P_over_rho2_j = pressurej / (rhoj * rhoj) * f_ji;
+
+ /* SPH acceleration term */
+ const float sph_acc_term =
+ (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv;
+
+ /* Assemble the acceleration */
+ const float acc = sph_acc_term + visc_acc_term;
+ /* Use the force Luke ! */
+ ahydroxi -= mj * acc * xij;
+ ahydroyi -= mj * acc * yij;
+ ahydrozi -= mj * acc * zij;
+ // if(rhoi == 0 || rhoj == 0 || pressurei == 0 || pressurej
+ // == 0)printf("ri %f rj %f pi %f pj %f\n", rhoi, rhoj,
+ // pressurei, pressurej);
+ /* Get the time derivative for u. */
+ const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr;
+
+ /* Viscosity term */
+ const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble;
+ const float press_sum = pressurei + pressurej;
+ /* Diffusion term */
+ /* Combine the alpha_diff into a pressure-based switch -- this allows
+ * the alpha from the highest pressure particle to dominate, so that
+ * the diffusion limited particles always take precedence - another
+ * trick to allow the scheme to work with thermal feedback. */
+          float alpha_diff = 0.f;
+          if (fabsf(press_sum) > 1e-10f)
+            alpha_diff = (pressurei * alphadiffi +
+                          pressurej * alphadiff_tmp[j_block]) /
+                         press_sum;
+ const float v_diff =
+ alpha_diff * 0.5f *
+ (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) +
+ fabsf(fac_mu * r_inv * dvdr_Hubble));
+ /* wi_dx + wj_dx / 2 is F_ij */
+ const float diff_du_term =
+ v_diff * (ui - u_tmp[j_block]) *
+ (f_ij * wi_dr / rhoi + f_ji * wj_dr / rhoj);
+
+ /* Assemble the energy equation term */
+ const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term;
+
+ /* Internal energy time derivative */
+ u_dti += du_dt_i * mj;
+ if (mj == 0.f) printf("zero mass mj %f\n", mj);
+
+ /* Get the time derivative for h. */
+ h_dti -= mj * dvdr * r_inv / rhoj * wi_dr;
+
+ /* Update if we need to; this should be guaranteed by the gradient
+ * loop but due to some possible synchronisation problems this is here
+ * as a _quick fix_. Added: 14th August 2019. To be removed by 1st Jan
+ * 2020. (JB) */
+ v_sigi = max(v_sigi, v_sig);
+ int time_bin_j = timebin[j_block];
+ if (time_bin_j > 0)
+ min_ngb_time_bin = min(min_ngb_time_bin, time_bin_j);
+ // printf("Got in\n");
+ }
+ }
+ }
+ __syncthreads();
+ } /*Loop through parts in cell j one BLOCK_SIZE at a time*/
+ if (pid >= ci_start && pid < ci_end) {
+ parts_aos[pid].v_sig = v_sigi;
+ parts_aos[pid].h_dt = h_dti;
+ parts_aos[pid].u_dt = u_dti;
+ parts_aos[pid].a_hydrox = ahydroxi;
+ parts_aos[pid].a_hydroy = ahydroyi;
+ parts_aos[pid].a_hydroz = ahydrozi;
+ parts_aos[pid].min_ngb_time_bin = min_ngb_time_bin;
+ // printf("%f %f %f %f %f %f\n", v_sigi, h_dti, u_dti, ahydroxi,
+ // ahydroyi, ahydrozi);
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
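+/* Naive pair force interaction for the float4-packed send/recv layout: no
+ * shared-memory tiling, pi is interacted directly with every cell-j particle
+ * in [cj_start, cj_end). */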
+__device__ void DOPAIR2NAIVEGPUAOSF4F(
+ const struct part_aos_f4_f_send pi,
+ struct part_aos_f4_f_send *__restrict__ parts_send,
+ struct part_aos_f4_f_recv *__restrict__ parts_recv, int pid,
+ const int cj_start, const int cj_end, float d_a, float d_H) {
+
+ float dx =
+ 1.f / 64.f; // Value used to avoid interacting parts with themselves
+
+ int Found_neighbours = 0;
+
+ // const part_aos_f4_send pi = parts_send[pid];
+ const float4 x_h_i = pi.x_h;
+ const float4 ux_m_i = pi.ux_m;
+
+ float4 f_b_t_mintbinngb_i = pi.f_bals_timebin_mintimebin_ngb;
+ const float4 rho_p_c_vsig_i = pi.rho_p_c_vsigi;
+ const float3 u_avisc_adiff_i = pi.u_alphavisc_alphadiff;
+
+ const float mi = ux_m_i.w;
+ const float pressurei = rho_p_c_vsig_i.y;
+ const float ci = rho_p_c_vsig_i.z;
+ float3 ahydro = {0.0, 0.0, 0.0};
+ float4 udt_hdt_vsig_mintbinngb = {0.0, 0.0, 0.0, 0.0};
+ udt_hdt_vsig_mintbinngb.z = rho_p_c_vsig_i.w;
+ udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w;
+
+ const float hi = x_h_i.w;
+ const float hig2 = hi * hi * kernel_gamma2;
+
+ // printf("js %i je %i\n", cj_start, cj_end);
+ /*Particles copied in blocks to shared memory*/
+ for (int j = cj_start; j < cj_end; j++) {
+ struct part_aos_f4_f_send pj = parts_send[j];
+ const float4 x_h_j = pj.x_h;
+ const float4 ux_m_j = pj.ux_m;
+ const float4 f_b_t_mintbinngb_j = pj.f_bals_timebin_mintimebin_ngb;
+ const float4 rho_p_c_vsig_j = pj.rho_p_c_vsigi;
+ // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha;
+ const float3 u_avisc_adiff_j = pj.u_alphavisc_alphadiff;
+ const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y,
+ zij = x_h_i.z - x_h_j.z;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ // printf("r2 %f \n", r2);
+ if (r2 < hig2) {
+ // /* Cosmology terms for the signal velocity */
+ const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a);
+ const float a2_Hubble = d_a * d_a * d_H;
+ const float r = sqrt(r2);
+ const float r_inv = 1.f / r;
+ // /* Recover some data */
+ const float mj = ux_m_j.w;
+ // /* Get the kernel for hi. */
+ const float hi_inv = 1.f / hi;
+ const float hid_inv = d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */
+ const float xi = r * hi_inv;
+ float wi, wi_dx;
+ d_kernel_deval(xi, &wi, &wi_dx);
+ const float wi_dr = hid_inv * wi_dx;
+ /* Get the kernel for hj. */
+ const float hj = x_h_j.w;
+ const float hj_inv = 1.0f / hj;
+ const float hjd_inv = d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */
+ const float xj = r * hj_inv;
+ float wj, wj_dx;
+ d_kernel_deval(xj, &wj, &wj_dx);
+ const float wj_dr = hjd_inv * wj_dx;
+ // /* Compute dv dot r */
+ float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y,
+ dvz = ux_m_i.z - ux_m_j.z;
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ // /* Add Hubble flow */
+ const float dvdr_Hubble = dvdr + a2_Hubble * r2;
+ // /* Are the particles moving towards each others ? */
+ const float omega_ij = min(dvdr_Hubble, 0.f);
+ const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */
+ //
+ // /* Signal velocity */
+ const float cj = rho_p_c_vsig_j.z;
+ const float v_sig = ci + cj - const_viscosity_beta * mu_ij;
+
+ /* Variable smoothing length term */
+ const float f_ij = 1.f - f_b_t_mintbinngb_i.x / mj;
+ const float f_ji = 1.f - f_b_t_mintbinngb_j.x / mi;
+
+ /* Construct the full viscosity term */
+ const float pressurej = rho_p_c_vsig_j.y;
+ const float rho_ij = rho_p_c_vsig_i.x + rho_p_c_vsig_j.x;
+ const float alpha = u_avisc_adiff_i.y + u_avisc_adiff_j.y;
+ const float visc = -0.25f * alpha * v_sig * mu_ij *
+ (f_b_t_mintbinngb_i.y + f_b_t_mintbinngb_j.y) / rho_ij;
+ /* Convolve with the kernel */
+ const float visc_acc_term =
+ 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv;
+ /* Compute gradient terms */
+ const float rhoi2 = rho_p_c_vsig_i.x * rho_p_c_vsig_i.x;
+ const float rhoj2 = rho_p_c_vsig_j.x * rho_p_c_vsig_j.x;
+ const float P_over_rho2_i = pressurei / (rhoi2)*f_ij;
+ const float P_over_rho2_j = pressurej / (rhoj2)*f_ji;
+
+ /* SPH acceleration term */
+ const float sph_acc_term =
+ (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv;
+
+ /* Assemble the acceleration */
+ const float acc = sph_acc_term + visc_acc_term;
+ /* Use the force Luke ! */
+ ahydro.x -= mj * acc * xij;
+ ahydro.y -= mj * acc * yij;
+ ahydro.z -= mj * acc * zij;
+ /* Get the time derivative for u. */
+ const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr;
+
+ /* Viscosity term */
+ const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble;
+ /* Diffusion term */
+ /* Combine the alpha_diff into a pressure-based switch -- this allows the
+ * alpha from the highest pressure particle to dominate, so that the
+ * diffusion limited particles always take precedence - another trick to
+ * allow the scheme to work with thermal feedback. */
+ float alpha_diff =
+ (pressurei * u_avisc_adiff_i.z + pressurej * u_avisc_adiff_j.z) /
+ (pressurei + pressurej);
+ if (fabsf(pressurei + pressurej) < 1e-10) alpha_diff = 0.f;
+ const float v_diff = alpha_diff * 0.5f *
+ (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) +
+ fabsf(fac_mu * r_inv * dvdr_Hubble));
+ /* wi_dx + wj_dx / 2 is F_ij */
+ const float diff_du_term =
+ v_diff * (u_avisc_adiff_i.x - u_avisc_adiff_j.x) *
+ (f_ij * wi_dr / rho_p_c_vsig_i.x + f_ji * wj_dr / rho_p_c_vsig_j.x);
+
+ /* Assemble the energy equation term */
+ const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term;
+
+ /* Internal energy time derivative */
+ udt_hdt_vsig_mintbinngb.x += du_dt_i * mj;
+
+ /* Get the time derivative for h. */
+ udt_hdt_vsig_mintbinngb.y -= mj * dvdr * r_inv / rho_p_c_vsig_j.x * wi_dr;
+
+ /* Update if we need to; this should be guaranteed by the gradient loop
+ * but due to some possible synchronisation problems this is here as a
+ * _quick fix_. Added: 14th August 2019. To be removed by 1st Jan 2020.
+ * (JB) */
+ udt_hdt_vsig_mintbinngb.z = fmaxf(udt_hdt_vsig_mintbinngb.z, v_sig);
+ unsigned int time_bin_j = (f_b_t_mintbinngb_j.z + 0.5f);
+ unsigned int min_tb_i = (f_b_t_mintbinngb_i.w + 0.5f);
+ if (time_bin_j > 0) f_b_t_mintbinngb_i.w = min(min_tb_i, time_bin_j);
+ // printf("Got in\n");
+ }
+ } /*Loop through parts in cell j one BLOCK_SIZE at a time*/
+ // if (pid >= ci_start && pid < ci_end) {
+ udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w;
+ parts_recv[pid].udt_hdt_vsig_mintimebin_ngb = udt_hdt_vsig_mintbinngb;
+ parts_recv[pid].a_hydro = ahydro;
+ // }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
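+/* Symmetric density pair interaction on the SoA layout: cell-j particles are
+ * staged through shared memory in BLOCK_SIZE tiles; the sums for particle
+ * pid (cell i) accumulate in registers while the mirrored contributions to
+ * the j particles are written back to global memory with atomicAdd. */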
+__device__ void DOPAIR2GPU(struct part_soa parts_soa, int pid,
+ const int ci_start, const int ci_end,
+ const int cj_start, const int cj_end, float d_a,
+ float d_H, int time_bin_inhibited, float *vars_pair,
+ double *d_shift_x, double *d_shift_y,
+ double *d_shift_z, const int task_id_tmp) {
+
+ float dx =
+ 1.f / 64.f; // Value used to avoid interacting parts with themselves
+
+ float cellx = 0.0, celly = 0.0, cellz = 0.0;
+ float cellxj = 0.0, cellyj = 0.0, cellzj = 0.0;
+ float hi = 0.0, hig2 = hi * hi * kernel_gamma2;
+
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ double pix = 0.0;
+ double piy = 0.0;
+ double piz = 0.0;
+ float rhoi = 0.0;
+ float rho_dhi = 0.0;
+ float wcounti = 0.0;
+ float wcount_dhi = 0.0;
+ float div_vi = 0.0;
+ float rot_uxi = 0.0;
+ float rot_uyi = 0.0;
+ float rot_uzi = 0.0;
+ int Found_neighbours = 0;
+ int count_i = cj_start;
+ // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i
+ // last_part_in_task_blocks_ci %i\n",
+ // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj,
+ // last_part_in_task_blocks_ci);
+
+ if (pid < ci_end) {
+ cellx = parts_soa.locx[pid];
+ celly = parts_soa.locy[pid];
+ cellz = parts_soa.locz[pid];
+ const int j = cj_start;
+ cellxj = parts_soa.locx[j];
+ cellyj = parts_soa.locy[j];
+ cellzj = parts_soa.locz[j];
+ hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2;
+ mi = parts_soa.mass[pid];
+ uxi = parts_soa.ux[pid];
+ uyi = parts_soa.uy[pid];
+ uzi = parts_soa.uz[pid];
+ pix = parts_soa.x_p[pid] - d_shift_x[task_id_tmp];
+ piy = parts_soa.y_p[pid] - d_shift_y[task_id_tmp];
+ piz = parts_soa.z_p[pid] - d_shift_z[task_id_tmp];
+ }
+
+ int n_neighbours = 0;
+ float av_dist = 0.f;
+ float av_distx = 0.f;
+ float av_disty = 0.f;
+ float av_distz = 0.f;
+ float distby2h = 0.f;
+  /* The pointers "x_p_tmp", etc. carve the single dynamically allocated
+   * shared-memory buffer "vars_pair" (provided at kernel launch) into
+   * per-quantity arrays of BLOCK_SIZE entries each. */
+ double *x_p_tmp = (double *)&vars_pair[0];
+ double *y_p_tmp = (double *)&x_p_tmp[BLOCK_SIZE];
+ double *z_p_tmp = (double *)&y_p_tmp[BLOCK_SIZE];
+ float *h_tmp = (float *)&z_p_tmp[BLOCK_SIZE];
+ float *mass_tmp = (float *)&h_tmp[BLOCK_SIZE];
+ float *ux_tmp = (float *)&mass_tmp[BLOCK_SIZE];
+ float *uy_tmp = (float *)&ux_tmp[BLOCK_SIZE];
+ float *uz_tmp = (float *)&uy_tmp[BLOCK_SIZE];
+ timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE];
+ float *rho_tmp = (float *)&timebin[BLOCK_SIZE];
+ float *rho_dh_tmp = (float *)&rho_tmp[BLOCK_SIZE];
+ float *wcount_tmp = (float *)&rho_dh_tmp[BLOCK_SIZE];
+ float *wcount_dh_tmp = (float *)&wcount_tmp[BLOCK_SIZE];
+ float *div_v_tmp = (float *)&wcount_dh_tmp[BLOCK_SIZE];
+ float *rot_ux_tmp = (float *)&div_v_tmp[BLOCK_SIZE];
+ float *rot_uy_tmp = (float *)&rot_ux_tmp[BLOCK_SIZE];
+ float *rot_uz_tmp = (float *)&rot_uy_tmp[BLOCK_SIZE];
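+  /* The arrays above need at least BLOCK_SIZE * (3 * sizeof(double) +
+   * 13 * sizeof(float) + sizeof(timebin_t)) bytes of dynamic shared memory,
+   * which the kernel launch configuration must provide. */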
+
+ /*Particles copied in blocks to shared memory*/
+ for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) {
+ const int tid_x = threadIdx.x;
+ int j = b + tid_x;
+ x_p_tmp[tid_x] = parts_soa.x_p[j];
+ y_p_tmp[tid_x] = parts_soa.y_p[j];
+ z_p_tmp[tid_x] = parts_soa.z_p[j];
+ h_tmp[tid_x] = parts_soa.h[j];
+ mass_tmp[tid_x] = parts_soa.mass[j];
+ ux_tmp[tid_x] = parts_soa.ux[j];
+ uy_tmp[tid_x] = parts_soa.uy[j];
+ uz_tmp[tid_x] = parts_soa.uz[j];
+ timebin[tid_x] = parts_soa.time_bin[j];
+ rho_tmp[tid_x] = 0.f;
+ rho_dh_tmp[tid_x] = 0.f;
+ wcount_tmp[tid_x] = 0.f;
+ wcount_dh_tmp[tid_x] = 0.f;
+ div_v_tmp[tid_x] = 0.f;
+ rot_ux_tmp[tid_x] = 0.f;
+ rot_uy_tmp[tid_x] = 0.f;
+ rot_uz_tmp[tid_x] = 0.f;
+ __syncthreads();
+ const double shift_x_j = d_shift_x[task_id_tmp + 1];
+ const double shift_y_j = d_shift_y[task_id_tmp + 1];
+ const double shift_z_j = d_shift_z[task_id_tmp + 1];
+ /*j_block is the particle's index in the block. Loop through particles in
+ * shared memory one by one*/
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ int jj = b + j_block;
+ if (jj < cj_end && pid < ci_end && pid >= ci_start) {
+
+ const double pjx = x_p_tmp[j_block] - shift_x_j;
+ const double pjy = y_p_tmp[j_block] - shift_y_j;
+ const double pjz = z_p_tmp[j_block] - shift_z_j;
+
+ const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz;
+ // const float xij = pjx - pix, yij = pjy - piy, zij = pjz
+ //- piz;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ const float hj = h_tmp[j_block];
+ const float hjg2 = hj * hj * kernel_gamma2;
+ // if(r2 > 32.f * hig2 && hig2 != 0.f) printf("x %f y %f z
+ //%f r %f hig2 %f\n", xij/dx, yij/dx, zij/dx, sqrt(r2)/dx);
+ /* Compute dv dot r */
+ const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block],
+ dvz = uzi - uz_tmp[j_block];
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+ /* Compute dv cross r */
+ const float curlvrx = dvy * zij - dvz * yij;
+ const float curlvry = dvz * xij - dvx * zij;
+ const float curlvrz = dvx * yij - dvy * xij;
+
+ const float r = sqrt(r2);
+ if (r2 < hig2) {
+ /* Recover some data */
+ const float mj = mass_tmp[j_block];
+ /* Get the kernel for hi. */
+ // if(hi<1.f/dx)printf("h < dx\n");
+ const float h_inv = 1.f / hi;
+ const float ui = r * h_inv;
+ float wi, wi_dx;
+
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ rhoi += mj * wi;
+ rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx);
+
+ wcounti += wi;
+ wcount_dhi -= (hydro_dimension * wi + ui * wi_dx);
+
+ const float r_inv = 1.f / r;
+ const float faci = mj * wi_dx * r_inv;
+
+ div_vi -= faci * dvdr;
+
+ rot_uxi += faci * curlvrx;
+ rot_uyi += faci * curlvry;
+ rot_uzi += faci * curlvrz;
+ //
+ }
+ if (r2 < hjg2) {
+ /* Recover some data */
+ /* Get the kernel for hi. */
+ const float hj_inv = 1.f / hj;
+ const float uj = r * hj_inv;
+ float wj, wj_dx;
+
+ d_kernel_deval(uj, &wj, &wj_dx);
+
+ // atomicAdd(&rho_tmp[j_block], mi * wj);
+ atomicAdd(&parts_soa.rho[j], mi * wj);
+ // atomicAdd(&rho_dh_tmp[j_block], -mi * (hydro_dimension
+ //* wj + uj * wj_dx));
+ atomicAdd(&parts_soa.rho_dh[j],
+ -mi * (hydro_dimension * wj + uj * wj_dx));
+
+ // atomicAdd(&wcount_tmp[j_block], wj);
+ atomicAdd(&parts_soa.wcount[j], wj);
+ // atomicAdd(&wcount_dh_tmp[j_block], -(hydro_dimension *
+ // wj + uj * wj_dx));
+ atomicAdd(&parts_soa.wcount_dh[j],
+ -(hydro_dimension * wj + uj * wj_dx));
+
+ const float r_inv = 1.f / r;
+ const float facj = mi * wj_dx * r_inv;
+
+ // atomicAdd(&div_v_tmp[j_block], -facj * dvdr);
+ atomicAdd(&parts_soa.div_v[j], -facj * dvdr);
+
+ // atomicAdd(&rot_ux_tmp[j_block], facj * curlvrx);
+ // atomicAdd(&rot_uy_tmp[j_block], facj * curlvry);
+ // atomicAdd(&rot_uz_tmp[j_block], facj * curlvrz);
+ atomicAdd(&parts_soa.rot_ux[j], facj * curlvrx);
+ atomicAdd(&parts_soa.rot_uy[j], facj * curlvry);
+ atomicAdd(&parts_soa.rot_uz[j], facj * curlvrz);
+ // printf("rho %f rho_dh %f wcount %f wcount_dh %f div_v
+ //%f rotux %f rotuy %f rotuz %f\n" ,rhoi,
+ // rho_dhi, wcounti, wcount_dhi, div_vi, rot_uxi, rot_uyi, rot_uzi);
+          } /* if (r2 < hjg2) */
+        } /* if (jj < cj_end && pid < ci_end && pid >= ci_start) */
+ } /*End of looping through particles in shared memory---Shared arrays
+ zero'ed for next step in outer loop*/
+ __syncthreads();
+ // if(j < cj_end){
+ // atomicAdd(&parts_soa.rho[j], rho_tmp[threadIdx.x]);
+ // atomicAdd(&parts_soa.rho_dh[j], rho_dh_tmp[threadIdx.x]);
+ // atomicAdd(&parts_soa.wcount[j], wcount_tmp[threadIdx.x]);
+ // atomicAdd(&parts_soa.wcount_dh[j], wcount_dh_tmp[threadIdx.x]);
+ // atomicAdd(&parts_soa.div_v[j], div_v_tmp[threadIdx.x]);
+ // atomicAdd(&parts_soa.rot_ux[j], rot_ux_tmp[threadIdx.x]);
+ // atomicAdd(&parts_soa.rot_uy[j], rot_uy_tmp[threadIdx.x]);
+ // atomicAdd(&parts_soa.rot_uz[j], rot_uz_tmp[threadIdx.x]);
+ // }
+ // __syncthreads();
+ // parts_soa.rho[j] += rho_tmp[threadIdx.x];
+ // parts_soa.rho_dh[j] += rho_dh_tmp[threadIdx.x];
+ // parts_soa.wcount[j] += wcount_tmp[threadIdx.x];
+ // parts_soa.wcount_dh[j] =+ wcount_dh_tmp[threadIdx.x];
+ // parts_soa.div_v[j] += div_v_tmp[threadIdx.x];
+ // parts_soa.rot_ux[j] += rot_ux_tmp[threadIdx.x];
+ // parts_soa.rot_uy[j] =+ rot_uy_tmp[threadIdx.x];
+ // parts_soa.rot_uz[j] += rot_uz_tmp[threadIdx.x];
+ } /*Loop through parts in cell j one BLOCK_SIZE at a time*/
+ if (pid >= ci_start && pid < ci_end) {
+ // if(n_neighbours > 0){
+ // distby2h = distby2h/n_neighbours;
+ // av_dist = av_dist/(n_neighbours*dx);
+ // }
+ // av_distx = av_distx/(n_neighbours*dx);
+ // av_disty = av_disty/(n_neighbours*dx);
+ // av_distz = av_distz/(n_neighbours*dx);
+ parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi;
+ parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi;
+ parts_soa.div_v[pid] = div_vi;
+ parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi;
+ parts_soa.rot_uz[pid] = rot_uzi;
+ // if(rhoi != 0.f)printf("rho i %f, rho_dh i %f\n", rhoi, rho_dhi);
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
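+/* Pair density kernel: one grid row (blockIdx.y) per task in the bundle.
+ * DOPAIRGPU is called twice so that ci particles interact with cj and then
+ * cj particles with ci; both calls contain __syncthreads(), hence the
+ * warnings about never wrapping them in divergent branches. */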
+__global__ void runner_do_pair_density_GPU(
+ struct part_soa parts_soa, int *d_task_first_part_ci,
+ int *d_task_first_part_cj, int *d_task_last_part_ci,
+ int *d_task_last_part_cj, float d_a, float d_H, int bid, int tid,
+ int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, int time_bin_inhibited) {
+
+ extern __shared__ float vars[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+
+ first_part_in_task_blocks_ci = d_task_first_part_ci[task_id];
+ last_part_in_task_blocks_ci = d_task_last_part_ci[task_id];
+ first_part_in_task_blocks_cj = d_task_first_part_cj[task_id];
+ last_part_in_task_blocks_cj = d_task_last_part_cj[task_id];
+
+ // Now we start calculations for particles in cell i
+ const int pid = threadid + first_part_in_task_blocks_ci;
+
+ /*Don't ever put me in an if statement. I've got __syncthreads inside*/
+ DOPAIRGPU(parts_soa, pid, last_part_in_task_blocks_ci,
+ first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, d_a, d_H,
+ time_bin_inhibited, vars);
+ // __syncthreads();
+ // Now we start calculations for particles in cell i
+ const int pjd = threadid + last_part_in_task_blocks_ci;
+ /*Don't ever put me in an if statement. I've got __syncthreads inside*/
+ DOPAIRGPU(parts_soa, pjd, last_part_in_task_blocks_cj,
+ first_part_in_task_blocks_ci, last_part_in_task_blocks_ci, d_a, d_H,
+ time_bin_inhibited, vars);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_pair_sym_density_GPU(
+ struct part_soa parts_soa, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid,
+ int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, int time_bin_inhibited, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z) {
+
+ extern __shared__ float vars_pair[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+ const int task_id_tmp = 2 * task_id;
+ const int ci_start = d_task_first_parts_pair[task_id_tmp];
+ const int ci_end = d_task_last_parts_pair[task_id_tmp];
+ const int cj_start = d_task_first_parts_pair[task_id_tmp + 1];
+ const int cj_end = d_task_last_parts_pair[task_id_tmp + 1];
+
+ // Now we start calculations for particles in cell i
+ const int pid = threadid + ci_start;
+
+ /*Don't ever put me in an if statement. I've got __syncthreads inside*/
+ DOPAIR2GPU(parts_soa, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H,
+ time_bin_inhibited, vars_pair, d_shift_x, d_shift_y, d_shift_z,
+ task_id_tmp);
+ // __syncthreads();
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
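+/* Non-symmetric pair density kernel: ci and cj are each handled by a
+ * separate DOPAIR2NONSYMGPU call, with a barrier in between so that shared
+ * memory tiles from the two cells never coexist. */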
+__global__ void runner_do_pair_nonsym_density_GPU(
+ struct part_soa parts_soa, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid,
+ int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, int time_bin_inhibited, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z) {
+
+ extern __shared__ float vars_pair[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+ const int task_id_tmp = 2 * task_id;
+ const int ci_start = d_task_first_parts_pair[task_id_tmp];
+ const int ci_end = d_task_last_parts_pair[task_id_tmp];
+ const int cj_start = d_task_first_parts_pair[task_id_tmp + 1];
+ const int cj_end = d_task_last_parts_pair[task_id_tmp + 1];
+
+ /* Start calculations for particles in cell i
+ * Don't ever put me in an if statement. I've got __syncthreads inside*/
+ const int pid = threadid + ci_start;
+ const int flip_i = 1;
+ DOPAIR2NONSYMGPU(parts_soa, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H,
+ vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp,
+ flip_i);
+
+ /*Necessary evil to stop parts from j and i co-existing on shared memory for
+ * sums*/
+ __syncthreads();
+
+ /*Now do cj
+ * Don't ever put me in an if statement. I've got __syncthreads inside*/
+ const int pjd = threadid + cj_start;
+ const int flip_j = -1;
+ DOPAIR2NONSYMGPU(parts_soa, pjd, cj_start, cj_end, ci_start, ci_end, d_a, d_H,
+ vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1,
+ flip_j);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_pair_ci_density_GPU(
+ struct part_soa parts_soa, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid,
+ int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, double *d_shift_x, double *d_shift_y,
+ double *d_shift_z) {
+
+ extern __shared__ float vars_pair[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+ const int task_id_tmp = 2 * task_id;
+ const int ci_start = d_task_first_parts_pair[task_id_tmp];
+ const int ci_end = d_task_last_parts_pair[task_id_tmp];
+ const int cj_start = d_task_first_parts_pair[task_id_tmp + 1];
+ const int cj_end = d_task_last_parts_pair[task_id_tmp + 1];
+
+ /* Start calculations for particles in cell i
+ * Don't ever put me in an if statement. I've got __syncthreads inside*/
+ const int pid = threadid + ci_start;
+ const int flip_i = 1;
+ DOPAIR2NONSYMGPU(parts_soa, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H,
+ vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp,
+ flip_i);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_pair_cj_density_GPU(
+ struct part_soa parts_soa, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid,
+ int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, double *d_shift_x, double *d_shift_y,
+ double *d_shift_z) {
+
+ extern __shared__ float vars_pair[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+ const int task_id_tmp = 2 * task_id;
+ const int ci_start = d_task_first_parts_pair[task_id_tmp];
+ const int ci_end = d_task_last_parts_pair[task_id_tmp];
+ const int cj_start = d_task_first_parts_pair[task_id_tmp + 1];
+ const int cj_end = d_task_last_parts_pair[task_id_tmp + 1];
+
+ /*Now do cj
+ * Don't ever put me in an if statement. I've got __syncthreads inside*/
+ const int pjd = threadid + cj_start;
+ const int flip_j = -1;
+ DOPAIR2NONSYMGPU(parts_soa, pjd, cj_start, cj_end, ci_start, ci_end, d_a, d_H,
+ vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1,
+ flip_j);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_pair_ci_density_GPU_aos(
+ struct part_aos *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid,
+ int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, double *d_shift_x, double *d_shift_y,
+ double *d_shift_z) {
+
+ extern __shared__ float vars_pair_aos[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+ const int task_id_tmp = 2 * task_id;
+ const int ci_start = d_task_first_parts_pair[task_id_tmp];
+ const int ci_end = d_task_last_parts_pair[task_id_tmp];
+ const int cj_start = d_task_first_parts_pair[task_id_tmp + 1];
+ const int cj_end = d_task_last_parts_pair[task_id_tmp + 1];
+
+ /* Start calculations for particles in cell i
+ * Don't ever put me in an if statement. I've got __syncthreads inside*/
+ const int pid = threadid + ci_start;
+ const int flip_i = 1;
+ DOPAIR2NONSYMGPUAOS(parts_aos, pid, ci_start, ci_end, cj_start, cj_end, d_a,
+ d_H, vars_pair_aos, d_shift_x, d_shift_y, d_shift_z,
+ task_id_tmp, flip_i);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_pair_cj_density_GPU_aos(
+ struct part_aos *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid,
+ int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, double *d_shift_x, double *d_shift_y,
+ double *d_shift_z) {
+
+ extern __shared__ float vars_pair_aos[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+ const int task_id_tmp = 2 * task_id;
+ const int ci_start = d_task_first_parts_pair[task_id_tmp];
+ const int ci_end = d_task_last_parts_pair[task_id_tmp];
+ const int cj_start = d_task_first_parts_pair[task_id_tmp + 1];
+ const int cj_end = d_task_last_parts_pair[task_id_tmp + 1];
+
+ /*Now do cj
+ * Don't ever put me in an if statement. I've got __syncthreads inside*/
+ const int pjd = threadid + cj_start;
+ const int flip_j = -1;
+ DOPAIR2NONSYMGPUAOS(parts_aos, pjd, cj_start, cj_end, ci_start, ci_end, d_a,
+ d_H, vars_pair_aos, d_shift_x, d_shift_y, d_shift_z,
+ task_id_tmp + 1, flip_j);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_pair_ci_density_GPU_aos_f4(
+ struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv,
+ int4 *fparti_fpartj_lparti_lpartj_dens, float d_a, float d_H,
+ int bundle_first_task) {
+
+ extern __shared__ float4 vars_pair_i_f4[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+ const int ci_start = fparti_fpartj_lparti_lpartj_dens[task_id].x;
+ const int cj_start = fparti_fpartj_lparti_lpartj_dens[task_id].y;
+ const int ci_end = fparti_fpartj_lparti_lpartj_dens[task_id].z;
+ const int cj_end = fparti_fpartj_lparti_lpartj_dens[task_id].w;
+
+ /* Start calculations for particles in cell i
+ * Don't ever put me in an if statement. I've got __syncthreads inside*/
+ const int pid = threadid + ci_start;
+
+ DOPAIR2NONSYMGPUAOSF4(parts_send, parts_recv, pid, ci_start, ci_end, cj_start,
+ cj_end, d_a, d_H, vars_pair_i_f4);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_pair_cj_density_GPU_aos_f4(
+ struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv,
+ int4 *fparti_fpartj_lparti_lpartj_dens, float d_a, float d_H,
+ int bundle_first_task) {
+
+ extern __shared__ float4 vars_pair_j_f4[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ const int ci_start = fparti_fpartj_lparti_lpartj_dens[task_id].x;
+ const int cj_start = fparti_fpartj_lparti_lpartj_dens[task_id].y;
+ const int ci_end = fparti_fpartj_lparti_lpartj_dens[task_id].z;
+ const int cj_end = fparti_fpartj_lparti_lpartj_dens[task_id].w;
+
+ /*Now do cj
+ * Don't ever put me in an if statement. I've got __syncthreads inside*/
+ const int pjd = threadid + cj_start;
+ DOPAIR2NONSYMGPUAOSF4(parts_send, parts_recv, pjd, cj_start, cj_end, ci_start,
+ ci_end, d_a, d_H, vars_pair_j_f4);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
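+/* Naive f4 pair density kernel: one thread per send particle in the bundle.
+ * Each particle carries its own neighbour-cell range in cjs_cje, so no
+ * per-task index arrays or shared-memory staging are needed. */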
+__global__ void runner_do_pair_density_GPU_aos_f4(
+ struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv,
+ float d_a, float d_H, int bundle_first_part, int bundle_n_parts) {
+
+ // extern __shared__ float4 vars_pair_i_f4[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int pid = bundle_first_part + threadid;
+ // const int task_id = bundle_first_part + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+ if (pid < bundle_first_part + bundle_n_parts) {
+ const struct part_aos_f4_send pi = parts_send[pid];
+ const int cj_start = pi.cjs_cje.x;
+ const int cj_end = pi.cjs_cje.y;
+
+ /* Start calculations for particles in cell i*/
+ DOPAIR2NAIVEGPUAOSF4(pi, parts_send, parts_recv, pid, cj_start, cj_end, d_a,
+ d_H);
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_pair_ci_density_GPU_aos_g(
+ struct part_aos_g *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid,
+ int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, double *d_shift_x, double *d_shift_y,
+ double *d_shift_z) {
+
+ extern __shared__ float vars_pair_aosg[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+ const int task_id_tmp = 2 * task_id;
+ const int ci_start = d_task_first_parts_pair[task_id_tmp];
+ const int ci_end = d_task_last_parts_pair[task_id_tmp];
+ const int cj_start = d_task_first_parts_pair[task_id_tmp + 1];
+ const int cj_end = d_task_last_parts_pair[task_id_tmp + 1];
+
+ /* Start calculations for particles in cell i
+ * Don't ever put me in an if statement. I've got __syncthreads inside*/
+ const int pid = threadid + ci_start;
+ const int flip_i = 1;
+ DOPAIR2NONSYMGPUAOSG(parts_aos, pid, ci_start, ci_end, cj_start, cj_end, d_a,
+ d_H, vars_pair_aosg, d_shift_x, d_shift_y, d_shift_z,
+ task_id_tmp, flip_i);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_pair_cj_density_GPU_aos_g(
+ struct part_aos_g *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid,
+ int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, double *d_shift_x, double *d_shift_y,
+ double *d_shift_z) {
+
+ extern __shared__ float vars_pair_aosg[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+ const int task_id_tmp = 2 * task_id;
+ const int ci_start = d_task_first_parts_pair[task_id_tmp];
+ const int ci_end = d_task_last_parts_pair[task_id_tmp];
+ const int cj_start = d_task_first_parts_pair[task_id_tmp + 1];
+ const int cj_end = d_task_last_parts_pair[task_id_tmp + 1];
+
+ /*Now do cj
+ * Don't ever put me in an if statement. I've got __syncthreads inside*/
+ const int pjd = threadid + cj_start;
+ const int flip_j = -1;
+ DOPAIR2NONSYMGPUAOSG(parts_aos, pjd, cj_start, cj_end, ci_start, ci_end, d_a,
+ d_H, vars_pair_aosg, d_shift_x, d_shift_y, d_shift_z,
+ task_id_tmp + 1, flip_j);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_pair_gradient_GPU_aos_f4(
+ struct part_aos_f4_g_send *parts_send,
+ struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H,
+ int bundle_first_part, int bundle_n_parts) {
+
+ // extern __shared__ float4 vars_pair_i_f4[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int pid = bundle_first_part + threadid;
+ // const int task_id = bundle_first_part + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ if (pid < bundle_first_part + bundle_n_parts) {
+ const struct part_aos_f4_g_send pi = parts_send[pid];
+ const int cj_start = pi.cjs_cje.x;
+ const int cj_end = pi.cjs_cje.y;
+ /* Start calculations for particles in cell i*/
+ DOPAIR2NAIVEGPUAOSF4G(pi, parts_send, parts_recv, pid, cj_start, cj_end,
+ d_a, d_H);
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_pair_ci_density_GPU_aos_f(
+ struct part_aos_f *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid,
+ int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, double *d_shift_x, double *d_shift_y,
+ double *d_shift_z) {
+
+ extern __shared__ float vars_pair_aosf[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+ const int task_id_tmp = 2 * task_id;
+ const int ci_start = d_task_first_parts_pair[task_id_tmp];
+ const int ci_end = d_task_last_parts_pair[task_id_tmp];
+ const int cj_start = d_task_first_parts_pair[task_id_tmp + 1];
+ const int cj_end = d_task_last_parts_pair[task_id_tmp + 1];
+
+ /* Start calculations for particles in cell i
+ * Don't ever put me in an if statement. I've got __syncthreads inside*/
+ const int pid = threadid + ci_start;
+ const int flip_i = 1;
+ DOPAIR2NONSYMGPUAOSF(parts_aos, pid, ci_start, ci_end, cj_start, cj_end, d_a,
+ d_H, vars_pair_aosf, d_shift_x, d_shift_y, d_shift_z,
+ task_id_tmp, flip_i);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void runner_do_pair_cj_density_GPU_aos_f(
+ struct part_aos_f *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid,
+ int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, double *d_shift_x, double *d_shift_y,
+ double *d_shift_z) {
+
+ extern __shared__ float vars_pair_aosf[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci;
+ // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj;
+ const int task_id_tmp = 2 * task_id;
+ const int ci_start = d_task_first_parts_pair[task_id_tmp];
+ const int ci_end = d_task_last_parts_pair[task_id_tmp];
+ const int cj_start = d_task_first_parts_pair[task_id_tmp + 1];
+ const int cj_end = d_task_last_parts_pair[task_id_tmp + 1];
+
+ /*Now do cj
+ * Don't ever put me in an if statement. I've got __syncthreads inside*/
+ const int pjd = threadid + cj_start;
+ const int flip_j = -1;
+ DOPAIR2NONSYMGPUAOSF(parts_aos, pjd, cj_start, cj_end, ci_start, ci_end, d_a,
+ d_H, vars_pair_aosf, d_shift_x, d_shift_y, d_shift_z,
+ task_id_tmp + 1, flip_j);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
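+/* Force counterpart of the naive f4 pair kernels above: one thread per send
+ * particle, with the neighbour range taken from cjs_cje. */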
+__global__ void runner_do_pair_force_GPU_aos_f4(
+ struct part_aos_f4_f_send *parts_send,
+ struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H,
+ int bundle_first_part, int bundle_n_parts) {
+
+ // extern __shared__ float4 vars_pair_i_f4[];
+ // __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int pid = bundle_first_part + threadid;
+ // const int task_id = bundle_first_part + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ if (pid < bundle_first_part + bundle_n_parts) {
+ const struct part_aos_f4_f_send pi = parts_send[pid];
+ const int cj_start = pi.cjs_cje.x;
+ const int cj_end = pi.cjs_cje.y;
+ /* Start calculations for particles in cell i */
+ DOPAIR2NAIVEGPUAOSF4F(pi, parts_send, parts_recv, pid, cj_start, cj_end,
+ d_a, d_H);
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
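+/* Host-side wrapper: sizes a 2-D grid (x covers the larger of the two cells
+ * in BLOCK_SIZE chunks, y holds one row per task in the bundle) and enqueues
+ * the non-symmetric pair density kernel on the supplied stream. */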
+void runner_dopair1_branch_density_gpu(
+ struct part_soa parts_soa, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, int time_bin_inhibited,
+ double *d_shift_x, double *d_shift_y, double *d_shift_z) {
+
+ int max_parts = max(max_parts_j, max_parts_i);
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+ // fprintf(stderr,"max_parts %i, max_partsi %i, max_partsj %i\n, "
+ // "numBlocks_x %i, numBlocks_y %i, BLOCK_SIZE %i\n", max_parts,
+ // max_parts_i, max_parts_j, numBlocks_x, numBlocks_y,
+ // BLOCK_SIZE);
+
+ /*Do ci & cj*/
+ // fprintf(stderr, "BLOCK_SIZE %i max parts %i num idle threads %i\n",
+ // BLOCK_SIZE, max_parts, numBlocks_x * BLOCK_SIZE - max_parts);
+
+ // runner_do_pair_sym_density_GPU<<>>(
+ // parts_soa, d_task_first_parts_pair, d_task_last_parts_pair,
+ // d_a, d_H, bid, tid, count_tasks, tasksperbundle,
+ // nBlocks_per_task, bundle_first_task, time_bin_inhibited, d_shift_x,
+ // d_shift_y, d_shift_z);
+
+  /* Shared memory sized to the per-thread SoA scratch arrays (assumed:
+   * 3 doubles + 13 floats + 1 timebin_t, cf. DOPAIR2GPU). */
+  runner_do_pair_nonsym_density_GPU<<<
+      gridShape, BLOCK_SIZE,
+      BLOCK_SIZE * (3 * sizeof(double) + 13 * sizeof(float) +
+                    sizeof(timebin_t)),
+      stream>>>(
+      parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid,
+      tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+      time_bin_inhibited, d_shift_x, d_shift_y, d_shift_z);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopairci_branch_density_gpu(
+ struct part_soa parts_soa, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z) {
+
+ int max_parts = max_parts_i;
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+  /* Shared memory sized as in runner_dopair1_branch_density_gpu above. */
+  runner_do_pair_ci_density_GPU<<<
+      gridShape, BLOCK_SIZE,
+      BLOCK_SIZE * (3 * sizeof(double) + 13 * sizeof(float) +
+                    sizeof(timebin_t)),
+      stream>>>(
+      parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid,
+      tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+      d_shift_x, d_shift_y, d_shift_z);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopaircj_branch_density_gpu(
+ struct part_soa parts_soa, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z) {
+
+ int max_parts = max_parts_j;
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+  /* Shared memory sized as in runner_dopair1_branch_density_gpu above. */
+  runner_do_pair_cj_density_GPU<<<
+      gridShape, BLOCK_SIZE,
+      BLOCK_SIZE * (3 * sizeof(double) + 13 * sizeof(float) +
+                    sizeof(timebin_t)),
+      stream>>>(
+      parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid,
+      tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+      d_shift_x, d_shift_y, d_shift_z);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopairci_branch_density_gpu_aos(
+ struct part_aos *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z) {
+
+ int max_parts = max_parts_i;
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+  /* Shared memory: assumed upper bound for the AoS scratch arrays
+   * (3 doubles + 17 floats + 1 int per thread). */
+  runner_do_pair_ci_density_GPU_aos<<<
+      gridShape, BLOCK_SIZE,
+      BLOCK_SIZE * (3 * sizeof(double) + 17 * sizeof(float) + sizeof(int)),
+      stream>>>(
+      parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid,
+      tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+      d_shift_x, d_shift_y, d_shift_z);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopaircj_branch_density_gpu_aos(
+ struct part_aos *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z) {
+
+ int max_parts = max_parts_j;
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+  /* Shared memory: assumed upper bound, as for the ci AoS launch above. */
+  runner_do_pair_cj_density_GPU_aos<<<
+      gridShape, BLOCK_SIZE,
+      BLOCK_SIZE * (3 * sizeof(double) + 17 * sizeof(float) + sizeof(int)),
+      stream>>>(
+      parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid,
+      tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+      d_shift_x, d_shift_y, d_shift_z);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopairci_branch_density_gpu_aos_f4(
+ struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv,
+ float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y,
+ int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+ runner_do_pair_ci_density_GPU_aos_f4<<<
+ gridShape, BLOCK_SIZE, 2 * BLOCK_SIZE * sizeof(float4), stream>>>(
+ parts_send, parts_recv, fparti_fpartj_lparti_lpartj_dens, d_a, d_H,
+ bundle_first_task);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopaircj_branch_density_gpu_aos_f4(
+ struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv,
+ float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y,
+ int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+ runner_do_pair_cj_density_GPU_aos_f4<<<
+ gridShape, BLOCK_SIZE, 2 * BLOCK_SIZE * sizeof(float4), stream>>>(
+ parts_send, parts_recv, fparti_fpartj_lparti_lpartj_dens, d_a, d_H,
+ bundle_first_task);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopair_branch_density_gpu_aos_f4(
+ struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv,
+ float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y,
+ int bundle_first_part, int bundle_n_parts) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+ // fprintf(stderr, "nblocks %i\n", numBlocks_x);
+  /* This naive per-particle kernel uses no dynamic shared memory. */
+  runner_do_pair_density_GPU_aos_f4<<<gridShape, BLOCK_SIZE, 0, stream>>>(
+      parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopairci_branch_density_gpu_aos_g(
+ struct part_aos_g *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z) {
+
+ int max_parts = max_parts_i;
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+ runner_do_pair_ci_density_GPU_aos_g<<<
+ gridShape, BLOCK_SIZE,
+ 12 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>(
+ parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid,
+ tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+ d_shift_x, d_shift_y, d_shift_z);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopaircj_branch_density_gpu_aos_g(
+ struct part_aos_g *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z) {
+
+ int max_parts = max_parts_j;
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+ runner_do_pair_cj_density_GPU_aos_g<<<
+ gridShape, BLOCK_SIZE,
+ 12 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>(
+ parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid,
+ tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+ d_shift_x, d_shift_y, d_shift_z);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopair_branch_gradient_gpu_aos_f4(
+ struct part_aos_f4_g_send *parts_send,
+ struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H,
+ cudaStream_t stream, int numBlocks_x, int numBlocks_y,
+ int bundle_first_part, int bundle_n_parts) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+ // fprintf(stderr, "nblocks %i\n", numBlocks_x);
+  /* This naive per-particle kernel uses no dynamic shared memory. */
+  runner_do_pair_gradient_GPU_aos_f4<<<gridShape, BLOCK_SIZE, 0, stream>>>(
+      parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopairci_branch_density_gpu_aos_f(
+ struct part_aos_f *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z) {
+
+ int max_parts = max_parts_i;
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+ runner_do_pair_ci_density_GPU_aos_f<<<
+ gridShape, BLOCK_SIZE,
+ 17 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>(
+ parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid,
+ tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+ d_shift_x, d_shift_y, d_shift_z);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopaircj_branch_density_gpu_aos_f(
+ struct part_aos_f *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z) {
+
+ int max_parts = max_parts_j;
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+ runner_do_pair_cj_density_GPU_aos_f<<<
+ gridShape, BLOCK_SIZE,
+ 17 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>(
+ parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid,
+ tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+ d_shift_x, d_shift_y, d_shift_z);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void runner_dopair_branch_force_gpu_aos_f4(
+ struct part_aos_f4_f_send *parts_send,
+ struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H,
+ cudaStream_t stream, int numBlocks_x, int numBlocks_y,
+ int bundle_first_part, int bundle_n_parts) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+
+ // fprintf(stderr, "nblocks %i\n", numBlocks_x);
+  /* This naive per-particle kernel uses no dynamic shared memory. */
+  runner_do_pair_force_GPU_aos_f4<<<gridShape, BLOCK_SIZE, 0, stream>>>(
+      parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+
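+/* Brute-force (naive) self density kernel: each thread loops over all
+ * particles in its task's range directly from global memory, without
+ * shared-memory tiling. */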
+__global__ void runner_do_self_density_GPU_naive(
+ struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part,
+ float d_a, float d_H, int bid, int tid, int count_tasks, int tasksperbundle,
+ int nBlocks_per_task, int bundle_first_task, int max_parts,
+ int time_bin_inhibited) {
+
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ __shared__ int first_part_in_task_blocks, last_part_in_task_blocks;
+ first_part_in_task_blocks = d_task_first_part[task_id];
+ last_part_in_task_blocks = d_task_last_part[task_id];
+
+ const int pid = threadid + first_part_in_task_blocks;
+
+ int ttid = 0;
+ int first_part = 0;
+ int count = 0;
+ int last_part = 0;
+ float cellx = 0.0, celly = 0.0, cellz = 0.0;
+ float hi = 0.0, hig2 = hi * hi * kernel_gamma2;
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ float pix = 0.0;
+ float piy = 0.0;
+ float piz = 0.0;
+ float rhoi = 0.0;
+ float rho_dhi = 0.0;
+ float wcounti = 0.0;
+ float wcount_dhi = 0.0;
+ float div_vi = 0.0;
+ float rot_uxi = 0.0;
+ float rot_uyi = 0.0;
+ float rot_uzi = 0.0;
+ int Found_neighbours = 0;
+
+ if (pid < last_part_in_task_blocks) {
+ ttid = parts_soa.tid_p[pid];
+ first_part = d_task_first_part[ttid];
+ last_part = d_task_last_part[ttid];
+ count = last_part - first_part;
+ cellx = parts_soa.locx[pid], celly = parts_soa.locy[pid],
+ cellz = parts_soa.locz[pid];
+ hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2;
+ mi = parts_soa.mass[pid];
+ uxi = parts_soa.ux[pid];
+ uyi = parts_soa.uy[pid];
+ uzi = parts_soa.uz[pid];
+ pix = parts_soa.x_p[pid] - cellx;
+ piy = parts_soa.y_p[pid] - celly;
+ piz = parts_soa.z_p[pid] - cellz;
+
+ int n_neighbours = 0;
+
+ /*Naive loop over neighbours*/
+ for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks;
+ b += BLOCK_SIZE) {
+ for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) {
+ int j = j_block + b;
+ if (j < last_part_in_task_blocks) {
+ const float x_p_tmp = parts_soa.x_p[j];
+ const float y_p_tmp = parts_soa.y_p[j];
+ const float z_p_tmp = parts_soa.z_p[j];
+ const float h_tmp = parts_soa.h[j];
+ const float mass_tmp = parts_soa.mass[j];
+ const float ux_tmp = parts_soa.ux[j];
+ const float uy_tmp = parts_soa.uy[j];
+ const float uz_tmp = parts_soa.uz[j];
+ const timebin_t timebin = parts_soa.time_bin[j];
+
+ /* Compute the pairwise distance. */
+ const float pjx = x_p_tmp - cellx;
+ const float pjy = y_p_tmp - celly;
+ const float pjz = z_p_tmp - cellz;
+ const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz;
+ const float r2 = xij * xij + yij * yij + zij * zij;
+ const float hj = h_tmp, hjg2 = hj * hj * kernel_gamma2;
+ if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) {
+ Found_neighbours = 1;
+ const float r = sqrt(r2);
+ /* Recover some data */
+ const float mj = mass_tmp;
+ /* Get the kernel for hi. */
+ if (hi < 1.f / 128.f) printf("h < dx\n");
+ const float h_inv = 1.f / hi;
+ const float ui = r * h_inv;
+ float wi, wi_dx;
+
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ rhoi += mj * wi;
+ rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx);
+
+ wcounti += wi;
+ wcount_dhi -= (hydro_dimension * wi + ui * wi_dx);
+
+ const float r_inv = 1.f / r;
+ const float faci = mj * wi_dx * r_inv;
+
+ /* Compute dv dot r */
+ float dvx = uxi - ux_tmp, dvy = uyi - uy_tmp, dvz = uzi - uz_tmp;
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+
+ div_vi -= faci * dvdr;
+
+ /* Compute dv cross r */
+ float curlvrx = dvy * zij - dvz * yij;
+ float curlvry = dvz * xij - dvx * zij;
+ float curlvrz = dvx * yij - dvy * xij;
+
+ rot_uxi += faci * curlvrx;
+ rot_uyi += faci * curlvry;
+ rot_uzi += faci * curlvrz;
+ }
+ }
+ }
+ }
+ // float wi, wi_dx;
+ // d_kernel_deval(0.f, &wi, &wi_dx);
+ if (Found_neighbours == 0)
+ printf("Not sure what's going on but no neighbours found in GPU loop\n");
+ parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi;
+ parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi;
+ parts_soa.div_v[pid] = div_vi;
+ parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi,
+ parts_soa.rot_uz[pid] = rot_uzi;
+ }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void launch_tester_kernel(struct part_soa parts_soa, int *d_task_first_part,
+ int *d_task_last_part, float d_a, float d_H,
+ const char *loop_type, cudaStream_t stream, int bid,
+ int block_size, int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y, int tid, int offset,
+ int bundle_first_task, int max_parts,
+ int time_bin_inhibited) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+  /* Assuming the tester kernel needs no dynamic shared memory. */
+  tester<<<gridShape, BLOCK_SIZE, 0, stream>>>(
+      parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid,
+      count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+      max_parts, time_bin_inhibited);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
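+/* C-callable launcher for the self density kernel: builds the 2-D grid from
+ * the pre-computed block counts and enqueues runner_do_self_density_GPU on
+ * the given stream. */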
+void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part,
+ int *d_task_last_part, float d_a, float d_H,
+ const char *loop_type, cudaStream_t stream,
+ int block_size, int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y,
+ int bundle_first_task, int max_parts) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+  /* Shared memory sized to the per-thread SoA scratch arrays (assumed:
+   * 3 doubles + 13 floats + 1 timebin_t, cf. DOPAIR2GPU). */
+  runner_do_self_density_GPU<<<
+      gridShape, BLOCK_SIZE,
+      BLOCK_SIZE * (3 * sizeof(double) + 13 * sizeof(float) +
+                    sizeof(timebin_t)),
+      stream>>>(
+      parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, count_tasks,
+      tasksperbundle, nBlocks_per_task, bundle_first_task, max_parts);
+ // runner_do_self_density_GPU_naive<<>>(
+ // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid,
+ // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+ // max_parts, time_bin_inhibited);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void launch_gradient_aos(struct part_aos_g *parts_aos, int *d_task_first_part,
+ int *d_task_last_part, float d_a, float d_H,
+ const char *loop_type, cudaStream_t stream,
+ int block_size, int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y,
+ int bundle_first_task, int max_parts, double *d_cell_x,
+ double *d_cell_y, double *d_cell_z) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+  /* Shared memory assumed to match the pair gradient AoS launches
+   * (12 floats + 1 int per thread). */
+  DOSELF_GPU_AOS_G<<<gridShape, BLOCK_SIZE,
+                     12 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int),
+                     stream>>>(parts_aos, d_task_first_part, d_task_last_part,
+                               d_a, d_H, count_tasks, tasksperbundle,
+                               nBlocks_per_task, bundle_first_task, max_parts,
+                               d_cell_x, d_cell_y, d_cell_z);
+ // runner_do_self_density_GPU_naive<<>>(
+ // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid,
+ // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+ // max_parts, time_bin_inhibited);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void launch_gradient_aos_f4(struct part_aos_f4_g_send *parts_send,
+ struct part_aos_f4_g_recv *parts_recv, float d_a,
+ float d_H, cudaStream_t stream, int numBlocks_x,
+ int numBlocks_y, int bundle_first_task,
+ int2 *d_task_first_part_f4) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+  /* Shared memory assumed to match the f4 force self launch below
+   * (4 float4 + 1 float3 per thread). */
+  DOSELF_GPU_AOS_F4_G<<<
+      gridShape, BLOCK_SIZE,
+      4 * BLOCK_SIZE * sizeof(float4) + BLOCK_SIZE * sizeof(float3), stream>>>(
+      parts_send, parts_recv, d_a, d_H, bundle_first_task,
+      d_task_first_part_f4);
+ // runner_do_self_density_GPU_naive<<>>(
+ // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid,
+ // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task,
+ // max_parts, time_bin_inhibited);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void launch_force_aos(struct part_aos_f *parts_aos, int *d_task_first_part,
+ int *d_task_last_part, float d_a, float d_H,
+ const char *loop_type, cudaStream_t stream,
+ int block_size, int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y, int bundle_first_task,
+ int max_parts, double *d_cell_x, double *d_cell_y,
+ double *d_cell_z) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+  /* Shared memory assumed to match the pair force AoS launches
+   * (17 floats + 1 int per thread). */
+  DOSELF_GPU_AOS_F<<<gridShape, BLOCK_SIZE,
+                     17 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int),
+                     stream>>>(parts_aos, d_task_first_part, d_task_last_part,
+                               d_a, d_H, count_tasks, tasksperbundle,
+                               nBlocks_per_task, bundle_first_task, max_parts,
+                               d_cell_x, d_cell_y, d_cell_z);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void launch_force_aos_f4(struct part_aos_f4_f_send *d_parts_send,
+ struct part_aos_f4_f_recv *d_parts_recv, float d_a,
+ float d_H, cudaStream_t stream, int numBlocks_x,
+ int numBlocks_y, int bundle_first_task,
+ int2 *d_task_first_part_f4) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+ DOSELF_GPU_AOS_F4_F<<<
+ gridShape, BLOCK_SIZE,
+ 4 * BLOCK_SIZE * sizeof(float4) + BLOCK_SIZE * sizeof(float3), stream>>>(
+ d_parts_send, d_parts_recv, d_a, d_H, bundle_first_task,
+ d_task_first_part_f4);
+}
+#ifdef WITH_CUDA
+}
+#endif
diff --git a/src/cuda/GPU_runner_functions.h b/src/cuda/GPU_runner_functions.h
new file mode 100644
index 0000000000..27bbecdd92
--- /dev/null
+++ b/src/cuda/GPU_runner_functions.h
@@ -0,0 +1,148 @@
+#ifndef CUDA_HEADERS_H
+#define CUDA_HEADERS_H
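+/* Number of CUDA streams made available to the GPU runners (assumed upper
+   bound). */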
+#define n_streams 1024
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "part_gpu.h"
+void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part,
+ int *d_task_last_part, float d_a, float d_H,
+ const char *loop_type, cudaStream_t stream,
+ int block_size, int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y,
+ int bundle_first_task, int max_parts);
+void launch_density_aos(struct part_aos *parts_aos, int *d_task_first_part,
+ int *d_task_last_part, float d_a, float d_H,
+ const char *loop_type, cudaStream_t stream,
+ int block_size, int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y, int bundle_first_task,
+ int max_parts, double *d_cell_x, double *d_cell_y,
+ double *d_cell_z);
+void launch_density_aos_f4(struct part_aos_f4_send *parts_send,
+ struct part_aos_f4_recv *parts_recv, float d_a,
+ float d_H, cudaStream_t stream, int numBlocks_x,
+ int numBlocks_y, int bundle_first_task,
+ int2 *d_task_first_part_f4);
+void launch_gradient_aos(struct part_aos_g *parts_aos, int *d_task_first_part,
+ int *d_task_last_part, float d_a, float d_H,
+ const char *loop_type, cudaStream_t stream,
+ int block_size, int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y,
+ int bundle_first_task, int max_parts, double *d_cell_x,
+ double *d_cell_y, double *d_cell_z);
+void launch_gradient_aos_f4(struct part_aos_f4_g_send *parts_send,
+ struct part_aos_f4_g_recv *parts_recv, float d_a,
+ float d_H, cudaStream_t stream, int numBlocks_x,
+ int numBlocks_y, int bundle_first_task,
+ int2 *d_task_first_part_f4);
+void launch_force_aos(struct part_aos_f *parts_aos, int *d_task_first_part,
+ int *d_task_last_part, float d_a, float d_H,
+ const char *loop_type, cudaStream_t stream,
+ int block_size, int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y, int bundle_first_task,
+ int max_parts, double *d_cell_x, double *d_cell_y,
+ double *d_cell_z);
+void launch_force_aos_f4(struct part_aos_f4_f_send *parts_send,
+ struct part_aos_f4_f_recv *parts_recv, float d_a,
+ float d_H, cudaStream_t stream, int numBlocks_x,
+ int numBlocks_y, int bundle_first_task,
+ int2 *d_task_first_part_f4);
+void launch_density_pair_two_kernels(
+ struct part_soa parts_soa_ci, struct part_soa parts_soa_cj,
+ int *d_task_first_part_ci, int *d_task_first_part_cj,
+ int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H,
+ const char *loop_type, cudaStream_t stream, int bid, int block_size,
+ int count_tasks, int tasksperbundle, int max_parts_i, int max_parts_j,
+ int numBlocks_y, int tid, int offset, int bundle_first_task,
+ int max_active_bin);
+void runner_dopair1_branch_density_gpu(
+ struct part_soa parts_soa, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, int max_active_bin,
+ double *d_shift_x, double *d_shift_y, double *d_shift_z);
+void runner_dopairci_branch_density_gpu(
+ struct part_soa parts_soa, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z);
+void runner_dopaircj_branch_density_gpu(
+ struct part_soa parts_soa, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z);
+void runner_dopairci_branch_density_gpu_aos(
+ struct part_aos *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z);
+void runner_dopairci_branch_density_gpu_aos_f4(
+ struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv,
+ float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y,
+ int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens);
+void runner_dopaircj_branch_density_gpu_aos_f4(
+ struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv,
+ float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y,
+ int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens);
+void runner_dopair_branch_density_gpu_aos_f4(
+ struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv,
+ float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y,
+ int bundle_first_part, int bundle_n_parts);
+void runner_dopaircj_branch_density_gpu_aos(
+ struct part_aos *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z);
+void runner_dopairci_branch_density_gpu_aos_g(
+ struct part_aos_g *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z);
+void runner_dopaircj_branch_density_gpu_aos_g(
+ struct part_aos_g *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z);
+void runner_dopair_branch_gradient_gpu_aos_f4(
+ struct part_aos_f4_g_send *parts_send,
+ struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H,
+ cudaStream_t stream, int numBlocks_x, int numBlocks_y,
+ int bundle_first_part, int bundle_n_parts);
+void runner_dopairci_branch_density_gpu_aos_f(
+ struct part_aos_f *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z);
+void runner_dopaircj_branch_density_gpu_aos_f(
+ struct part_aos_f *parts_aos, int *d_task_first_parts_pair,
+ int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y,
+ int tid, int offset, int bundle_first_task, double *d_shift_x,
+ double *d_shift_y, double *d_shift_z);
+void runner_dopair_branch_force_gpu_aos_f4(
+ struct part_aos_f4_f_send *parts_send,
+ struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H,
+ cudaStream_t stream, int numBlocks_x, int numBlocks_y,
+ int bundle_first_part, int bundle_n_parts);
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // CUDA_HEADERS_H
diff --git a/src/cuda/Makefile.am b/src/cuda/Makefile.am
new file mode 100644
index 0000000000..5fb5bbc34f
--- /dev/null
+++ b/src/cuda/Makefile.am
@@ -0,0 +1,66 @@
+SOURCES_CUDA = GPU_runner_functions.cu tester.cu ../files_for_new_functions/arrays_malloc.cu ../files_for_new_functions/host_device_data_transfer.cu #../runner_main.cu
+include_HEADERS = GPU_runner_functions.h device_functions.h BLOCK_SIZE.h tester.h ../files_for_new_functions/arrays_malloc.h ../files_for_new_functions/host_device_data_transfer.h
+EXTRA_DIST = $(SOURCES_CUDA) $(include_HEADERS)
+
+if HAVECUDA
+
+AM_CFLAGS = -I.. $(HDF5_CPPFLAGS)
+CUDA_MYFLAGS = -D_FORCE_INLINES -O4 -lineinfo -src-in-ptx --maxrregcount=64 -ftz=true -DWITH_CUDA --default-stream per-thread --use_fast_math -lcudadevrt #-dlink -ccbin=gcc
+CUDA_MYFLAGS += -arch=sm_70
+CUDA_MYFLAGS += --extra-device-vectorization
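+# Note: -arch=sm_70 targets Volta-class GPUs; adjust this to match the local
+# hardware.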
+
+#CUDA_MYFLAGS = -D_FORCE_INLINES -O3 -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -ccbin=gcc -m64 --default-stream per-thread #-dlink
+#CUDA_MYFLAGS += -arch=sm_80 \
+#-gencode=arch=compute_80,code=sm_80 \
+#-gencode=arch=compute_86,code=sm_86 \
+#-gencode=arch=compute_87,code=sm_87 \
+#-gencode=arch=compute_86,code=compute_86
+#CUDA_MYFLAGS += --extra-device-vectorization
+
+# Assign a "safe" version number
+AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) -version-info 0:0:0
+
+#bin_PROGRAMS = test_27_cells test_125_cells
+
+# Rules to compile CUDA code.
+.cu.o:
+ $(NVCC) -c $(NVCCFLAGS) $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) $< -o $@
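+# The .cu.lo rule relies on cudalt.py (found via $(top_srcdir) on PATH) to wrap
+# nvcc so that libtool-compatible .lo objects can be built from .cu sources.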
+.cu.lo:
+ PATH=$(top_srcdir):$(PATH) && cudalt.py $@ $(NVCC) -c $(NVCCFLAGS) $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) $<
+
+# The library. Dummy C library so that we get libtool linking setup.
+lib_LTLIBRARIES = libswiftCUDA.la libswiftdummy.la
+
+# Special link command to avoid including CFLAGS which are not understood.
+libswiftCUDA_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
+ $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+ $(libswiftCUDA_la_LDFLAGS) \
+ $(LDFLAGS) -o $@
+
+libswiftCUDA_la_SOURCES = $(SOURCES_CUDA)
+libswiftCUDA_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) ../libswiftsim_cuda.la -I../
+libswiftCUDA_la_CXXFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) ../libswiftsim_cuda.la -I../
+libswiftCUDA_la_LIBADD = ../.libs/libswiftsim_cuda.la
+libswiftCUDA_la_LDFLAGS = $(AM_LDFLAGS)
+
+if HAVEMPI
+libswiftCUDA_la_CFLAGS += ../libswiftsim_mpicuda.la
+libswiftCUDA_la_CXXFLAGS += ../libswiftsim_mpicuda.la
+libswiftCUDA_la_LIBADD += ../.libs/libswiftsim_mpicuda.la
+endif
+
+libswiftdummy_la_SOURCES = dummy.c
+libswiftdummy_la_CFLAGS = $(AM_CFLAGS)
+libswiftdummy_la_LDFLAGS = $(AM_LDFLAGS)
+
+#test_27_cells_SOURCES=test27cells.c
+#test_27_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS)
+#test_27_cells_LDADD= ../.libs/libswiftsim_cuda.la ../.libs/libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart
+#test_27_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS)
+
+#test_125_cells_SOURCES=test125cells.c
+#test_125_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS)
+#test_125_cells_LDADD= ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart
+#test_125_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS)
+
+endif
diff --git a/src/cuda/device_functions.h b/src/cuda/device_functions.h
new file mode 100644
index 0000000000..afc4a1a5d8
--- /dev/null
+++ b/src/cuda/device_functions.h
@@ -0,0 +1,149 @@
+#ifndef DEVICE_FUNCTIONS_H
+#define DEVICE_FUNCTIONS_H
+#include "../../config.h"
+
+/* Local headers. */
+// #include "../dimension.h"
+// #include "../error.h"
+// #include "../inline.h"
+// #include "../minmax.h"
+// #include "../vector.h"
+
+// Is this even necessary? Probably not as our code will operate differently
+#define num_cuda_threads 128
+#define hydro_dimension 3.f
+
+/* Definitions taken from kernel_hydro.h for the cubic spline kernel.
+ * Handling of other kernel choices ('if' statements per kernel) is deferred. */
+/* First some powers of gamma = H/h */
+#define kernel_gamma ((float)(1.825742))
+#define kernel_gamma_inv ((float)(1. / kernel_gamma))
+#define kernel_gamma2 ((float)(kernel_gamma * kernel_gamma))
+#define kernel_ivals 2
+#define kernel_degree 3 /*!< Degree of the polynomial */
+#define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma))
+#define kernel_gamma_dim_plus_one \
+ ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))
+#define kernel_gamma_inv_dim \
+ ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma)))
+#define kernel_gamma_inv_dim_plus_one \
+ ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)))
+#define kernel_ivals_f ((float)kernel_ivals) /*!< Number of branches */
+#define kernel_constant ((float)(16. * M_1_PI))
+/*! Cosmology default beta=3.0.
+ * Alpha can be set in the parameter file.
+ * Beta is defined as in e.g. Price (2010) Eqn (103) */
+#define const_viscosity_beta 3.0f
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+/**
+ * @brief Returns the argument to the power given by the dimension plus one
+ *
+ * Computes \f$x^{d+1}\f$.
+ */
+__device__ float d_pow_dimension_plus_one(float x) {
+
+#if defined(HYDRO_DIMENSION_3D)
+
+ const float x2 = x * x;
+ return x2 * x2;
+
+#elif defined(HYDRO_DIMENSION_2D)
+
+ return x * x * x;
+
+#elif defined(HYDRO_DIMENSION_1D)
+
+ return x * x;
+
+#else
+
+ error("The dimension is not defined !");
+ return 0.f;
+
+#endif
+}
+
+/**
+ * @brief Return the argument to the power three adiabatic index minus five over
+ * two.
+ *
+ * Computes \f$x^{(3\gamma - 5)/2}\f$.
+ *
+ * @param x Argument
+ */
+__device__ float d_pow_three_gamma_minus_five_over_two(float x) {
+#if defined(HYDRO_GAMMA_5_3)
+
+ return 1.f; /* x^(0) */
+
+#elif defined(HYDRO_GAMMA_7_5)
+
+ return powf(x, -0.4f); /* x^(-2/5) */
+
+#elif defined(HYDRO_GAMMA_4_3)
+
+ return 1.f / sqrtf(x); /* x^(-1/2) */
+
+#elif defined(HYDRO_GAMMA_2_1)
+
+ return sqrtf(x); /* x^(1/2) */
+
+#else
+
+ error("The adiabatic index is not defined !");
+ return 0.f;
+
+#endif
+}
+
+/**
+ * @brief Computes the kernel function and its derivative.
+ *
+ * The kernel function needs to be multiplied by \f$h^{-d}\f$ and the gradient
+ * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem.
+ *
+ * Returns 0 if \f$u > \gamma = H/h\f$.
+ *
+ * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
+ * @param W (return) The value of the kernel function \f$W(x,h)\f$.
+ * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$.
+ */
+__device__ void d_kernel_deval(float u, float *__restrict__ W,
+ float *__restrict__ dW_dx) {
+
+ /* Go to the range [0,1[ from [0,H[ */
+ const float x = u * kernel_gamma_inv;
+
+ /* Pick the correct branch of the kernel */
+ const int temp = (int)(x * kernel_ivals_f);
+ const int ind = temp > kernel_ivals ? kernel_ivals : temp;
+ static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] = {
+ 3.f, -3.f, 0.f, 0.5f, /* 0 < u < 0.5 */
+ -1.f, 3.f, -3.f, 1.f, /* 0.5 < u < 1 */
+ 0.f, 0.f, 0.f, 0.f}; /* 1 < u */
+ const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+ /* First two terms of the polynomial ... */
+ float w = coeffs[0] * x + coeffs[1];
+ float dw_dx = coeffs[0];
+
+ /* ... and the rest of them */
+ for (int k = 2; k <= kernel_degree; k++) {
+ dw_dx = dw_dx * x + w;
+ w = x * w + coeffs[k];
+ }
+
+ w = fmaxf(w, 0.f);
+ dw_dx = fminf(dw_dx, 0.f);
+
+ /* Return everything */
+ *W = w * kernel_constant * kernel_gamma_inv_dim;
+ *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one;
+}
+
+#ifdef WITH_CUDA
+}
+#endif
+
+#endif // DEVICE_FUNCTIONS_H
diff --git a/src/cuda/dummy.c b/src/cuda/dummy.c
new file mode 100755
index 0000000000..c75d2d873c
--- /dev/null
+++ b/src/cuda/dummy.c
@@ -0,0 +1,9 @@
+#include <stdio.h>
+
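+/* Dummy symbol: provides a plain-C object so that the libtool linking setup
+   described in src/cuda/Makefile.am (libswiftdummy) works. */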
+#ifdef __cplusplus
+extern "C" {
+#endif
+void swiftcudadummy(void) {}
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/cuda/kernel_definitions.cu b/src/cuda/kernel_definitions.cu
new file mode 100644
index 0000000000..a272b7beee
--- /dev/null
+++ b/src/cuda/kernel_definitions.cu
@@ -0,0 +1,114 @@
+/*******************************************************************************
+ * This file contains functions used to set up and execute GPU tasks from
+ * within runner_main.c. Consider this a translator allowing .cu-based
+ * functions to be called from within runner_main.c.
+ ******************************************************************************/
+#ifdef WITH_CUDA
+#ifndef static
+#define static
+#endif
+// #ifndef restrict
+// #define restrict __restrict__
+// #endif
+#endif
+
+/* Required header files */
+#include <stdio.h>
+/* The extern "C" guard (#ifdef __cplusplus) prevents name mangling: C code
+   sees the exact function names rather than the mangled names produced by
+   C++. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cell_gpu.h"
+#include "cuda_headers.h"
+#ifdef __cplusplus
+}
+#endif
+
+/* Function to initialise the device and print out the GPU name. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+void Initialise_GPU() {
+ int devId = 0;
+ // find and print device name
+ cudaDeviceProp prop;
+ cudaGetDeviceProperties(&prop, devId);
+ printf("Device : %s\n", prop.name);
+ cudaSetDevice(devId);
+ // cuda
+}
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void CPU_runner_doself1_branch_gradient(struct cell_gpu *restrict ci_gpu) {
+ int id = ci_gpu->hydro.parts[0].id;
+ printf("id of first part %d\n", id);
+ // Do stuff here for interactions on CPU but using the temporary GPU arrays
+ // const int count_i = ci_gpu->hydro.count;
+ // const int count_j = cj_gpu->hydro.count;
+ // system("pause");
+ /* Anything to do here? */
+ // if (!cell_is_active_hydro(ci_gpu, e) && !cell_is_active_hydro(cj_gpu,
+ // e)) return;
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+void GPU_runner_doself1_branch_gradient(struct cell_gpu *restrict ci_gpu) {
+ int count = ci_gpu->hydro.count;
+ int numBlocks = (count + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+ struct cell_gpu *d_ci_gpu;
+ cudaMalloc((void **)&d_ci_gpu, sizeof(cell_gpu));
+
+ cudaMemcpy(d_ci_gpu, ci_gpu, sizeof(cell_gpu), cudaMemcpyHostToDevice);
+  SPH_Sum_Self<<<numBlocks, BLOCK_SIZE>>>(d_ci_gpu);
+ cudaMemcpy(ci_gpu, d_ci_gpu, sizeof(cell_gpu), cudaMemcpyDeviceToHost);
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__global__ void SPH_Sum_Self(cell_gpu *d_ci_gpu) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = index;
+ float sumLoc, xi, yi, zi;
+ struct part_gpu *restrict parts = d_ci_gpu->hydro.parts;
+ xi = parts[i].x[0];
+ yi = parts[i].x[1];
+ zi = parts[i].x[2];
+ sumLoc = 0.f;
+ float h = parts[i].h, mass = parts[i].mass, rho = parts[i].rho;
+ const int count = d_ci_gpu->hydro.count;
+ //__shared__ float sh_x[BLOCK_SIZE], sh_y[BLOCK_SIZE];
+ // copy neighbour particles data to shared memory
+  // for (unsigned int j1 = 0; j1 < count; j1++) { ... }
+}
+#ifdef WITH_CUDA
+}
+#endif
diff --git a/src/cuda/part_gpu.h b/src/cuda/part_gpu.h
new file mode 100644
--- /dev/null
+++ b/src/cuda/part_gpu.h
+#ifndef PART_GPU_H
+#define PART_GPU_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "../align.h"    /* SWIFT_STRUCT_ALIGNMENT */
+#include "../timeline.h" /* timebin_t */
+
+typedef struct part_soa {
+ /*Task ID*/
+ int *tid_p;
+ /*bundle ID*/
+ int *bid_p;
+ /*! Particle unique ID. */
+ long long *id;
+ /*! Pointer to corresponding gravity part. */
+ // struct gpu_gpart* gpart;
+ /*! Particle position. */
+ double *x_p;
+ double *y_p;
+ double *z_p;
+ /*! Particle predicted velocity. */
+ float *ux;
+ float *uy;
+ float *uz;
+ /*! Particle acceleration. */
+ float *a_hydrox;
+ float *a_hydroy;
+ float *a_hydroz;
+ /*! Particle mass. */
+ float *mass;
+ /*! Particle smoothing length. */
+ float *h;
+ /*! Particle internal energy. */
+ float *u;
+ /*! Time derivative of the internal energy. */
+ float *u_dt;
+ /*! Particle density. */
+ float *rho;
+ /*! Kernel summation (For testing/debugging). */
+ float *SPH_sum;
+
+ /* Cell information */
+ /*! The cell location on the grid (corner nearest to the origin). */
+ float *locx;
+ float *locy;
+ float *locz;
+ /*! The cell dimensions. */
+ float *widthx;
+ float *widthy;
+ float *widthz;
+ float *h_max;
+ int *count_p;
+ int *count_test;
+ /* Density information */
+
+ /*! Neighbour number count. */
+ float *wcount;
+
+ /*! Derivative of the neighbour number with respect to h. */
+ float *wcount_dh;
+
+ /*! Derivative of density with respect to h */
+ float *rho_dh;
+
+ /*! Particle velocity curl. */
+ float *rot_ux;
+ float *rot_uy;
+ float *rot_uz;
+
+ /* viscosity information */
+
+ /*! Particle velocity divergence */
+ float *div_v;
+
+ /*! Particle velocity divergence from previous step */
+ float *div_v_previous_step;
+
+ /*! Artificial viscosity parameter */
+ float *alpha_visc;
+
+ /*! Signal velocity */
+ float *v_sig;
+
+ /* thermal diffusion information */
+
+ /*! del^2 u, a smoothed quantity */
+ float *laplace_u;
+
+ /*! Thermal diffusion coefficient */
+ float *alpha_diff;
+
+ /* force information */
+
+ /*! "Grad h" term -- only partial in P-U */
+ float *f;
+
+ /*! Particle soundspeed. */
+ float *soundspeed;
+
+ /*! Time derivative of smoothing length */
+ float *h_dt;
+
+ /*! Balsara switch */
+ float *balsara;
+
+ /*! Particle pressure. */
+ float *pressure;
+ /*! Maximal alpha (viscosity) over neighbours */
+ float *alpha_visc_max_ngb;
+
+ /* timestep stuff */
+
+ /*! Time-step length */
+ timebin_t *time_bin;
+
+  /* All fields of struct timestep_limiter_data, flattened here because GPUs
+     do not cope well with pointer chasing, especially when memcpying. */
+ /* Need waking-up ? */
+ timebin_t *wakeup;
+
+ /*! Minimal time-bin across all neighbours */
+ timebin_t *min_ngb_time_bin;
+
+ /* Do we want this particle to be synched back on the time-line? */
+ char *to_be_synchronized;
+} part_soa;
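+/* Note: part_soa is a structure-of-arrays layout -- one device array per
+   particle field -- intended to give coalesced global-memory accesses in the
+   GPU kernels. */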
+/* Container for particle data required for density calcs */
+typedef struct part_aos {
+
+ /*! Particle position. */
+ double x_p;
+ double y_p;
+ double z_p;
+
+ /*! Particle position. */
+ double locx;
+ double locy;
+ double locz;
+
+ /*! Particle predicted velocity. */
+ float ux;
+ float uy;
+ float uz;
+ /*! Particle mass. */
+ float mass;
+ /*! Particle smoothing length. */
+ float h;
+ /*! Particle density. */
+ float rho;
+
+ /* Density information */
+ /*! Neighbour number count. */
+ float wcount;
+ /*! Derivative of the neighbour number with respect to h. */
+ float wcount_dh;
+ /*! Derivative of density with respect to h */
+ float rho_dh;
+ /*! Particle velocity curl. */
+ float rot_ux;
+ float rot_uy;
+ float rot_uz;
+
+ /* viscosity information */
+ /*! Particle velocity divergence */
+ float div_v;
+
+ /* timestep stuff */
+ /*! Time-step length */
+ int time_bin;
+} part_aos;
+
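+/* The *_f4_* variants below pack fields into float4/int2 words, intended to
+   let a particle's data move between host and device in a few 16-byte
+   transactions; the _send structs carry inputs to the device and the _recv
+   structs carry only the results back. */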
+/* Container for particle data required for density calcs */
+typedef struct part_aos_f4_send {
+ /*! Particle position and h -> x, y, z, h */
+ float4 x_p_h;
+
+ /*! Particle predicted velocity and mass -> ux, uy, uz, m */
+ float4 ux_m;
+ /*Markers for where neighbour cell j starts and stops in array indices for
+ * pair tasks*/
+ int2 cjs_cje;
+} part_aos_f4_send __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT)));
+
+typedef struct part_aos_f4_recv {
+ /* Density information; rho */
+ /*! Derivative of density with respect to h; rho_dh,
+ * Neighbour number count; w_count
+   * Derivative of the neighbour number with respect to h; w_count_dh */
+ float4 rho_dh_wcount;
+ /*! Particle velocity curl; rot_ux and
+ * velocity divergence; div_v */
+ float4 rot_ux_div_v;
+} part_aos_f4_recv;
+
+/*Container for particle data required for density calcs*/
+typedef struct part_aos_f4 {
+ /*! Particle position and h -> x, y, z, h */
+ float4 x_p_h;
+
+ /*! Particle predicted velocity and mass -> ux, uy, uz, m */
+ float4 ux_m;
+ /* Density information; rho */
+ /*! Derivative of density with respect to h; rho_dh,
+ * Neighbour number count; w_count
+   * Derivative of the neighbour number with respect to h; w_count_dh */
+ float4 rho_dh_wcount;
+
+ /*! Particle velocity curl; rot_ux and
+ * velocity divergence; div_v */
+ float4 rot_ux_div_v;
+
+} part_aos_f4;
+
+/*Container for particle data required for force calcs*/
+typedef struct part_aos_f {
+
+ /*! Particle position. */
+ double x_p;
+ double y_p;
+ double z_p;
+
+ /*! Particle predicted velocity. */
+ float ux;
+ float uy;
+ float uz;
+ /*! Particle mass. */
+ float mass;
+ /*! Particle smoothing length. */
+ float h;
+ /*! Particle density. */
+ float rho;
+ /*! Particle pressure. */
+ float pressure;
+
+ /* Density information */
+ /*! Speed of sound. */
+ float soundspeed;
+ /*! Variable smoothing length term */
+ float f;
+ /*! Derivative of density with respect to h */
+ float balsara;
+ /*! Particle velocity curl. */
+ float alpha_visc;
+ float a_hydrox;
+ float a_hydroy;
+ float a_hydroz;
+ float alpha_diff;
+
+ /* viscosity information */
+ /*! Internal energy */
+ float u;
+ float u_dt;
+ /*! h time derivative */
+ float h_dt;
+ float v_sig;
+
+ /* timestep stuff */
+ /*! Time-step length */
+ int time_bin;
+ int min_ngb_time_bin;
+} part_aos_f;
+
+/* Container for particle data required for force calcs */
+typedef struct part_aos_f4_f {
+
+ /*Data required for the calculation:
+ Values read to local GPU memory*/
+ /*! Particle position smoothing length */
+ float4 x_h;
+ /*! Particle predicted velocity and mass */
+ float4 ux_m;
+ /*! Variable smoothing length term f, balsara, timebin
+ * and initial value of min neighbour timebin */
+ float4 f_bals_timebin_mintimebin_ngb;
+ /*! Particle density, pressure, speed of sound & v_sig to read*/
+ float4 rho_p_c_vsigi;
+ /*! Particle Internal energy u, alpha constants for visc and diff */
+ float3 u_alphavisc_alphadiff;
+
+ /*Result: Values output to global GPU memory*/
+ /* change of u and h with dt, v_sig and returned value of
+ * minimum neighbour timebin */
+ float4 udt_hdt_vsig_mintimebin_ngb;
+ /*Particle acceleration vector*/
+ float3 a_hydro;
+
+} part_aos_f4_f;
+
+/* Container for particle data required for force calcs */
+typedef struct part_aos_f4_f_send {
+
+ /*Data required for the calculation:
+ Values read to local GPU memory*/
+ /*! Particle position smoothing length */
+ float4 x_h;
+ /*! Particle predicted velocity and mass */
+ float4 ux_m;
+ /*! Variable smoothing length term f, balsara, timebin
+ * and initial value of min neighbour timebin */
+ float4 f_bals_timebin_mintimebin_ngb;
+ /*! Particle density, pressure, speed of sound & v_sig to read*/
+ float4 rho_p_c_vsigi;
+ /*! Particle Internal energy u, alpha constants for visc and diff */
+ float3 u_alphavisc_alphadiff;
+
+ int2 cjs_cje;
+
+} part_aos_f4_f_send;
+
+/* Container for particle data required for force calcs */
+typedef struct part_aos_f4_f_recv {
+
+ /*Result: Values output to global GPU memory*/
+ /* change of u and h with dt, v_sig and returned value of
+ * minimum neighbour timebin */
+ float4 udt_hdt_vsig_mintimebin_ngb;
+ /*Particle acceleration vector*/
+ float3 a_hydro;
+
+} part_aos_f4_f_recv;
+
+/* Container for particle data required for gradient calcs */
+typedef struct part_aos_g {
+
+ /*! Particle position. */
+ double x_p;
+ double y_p;
+ double z_p;
+
+ /*! Particle velocity. */
+ float ux;
+ float uy;
+ float uz;
+ /*! Particle mass. */
+ float mass;
+ /*! Particle smoothing length. */
+ float h;
+ /*! Particle density. */
+ float rho;
+
+ /* viscosity information */
+ float visc_alpha;
+ float laplace_u;
+ float alpha_visc_max_ngb;
+ float v_sig;
+
+ float u;
+
+ float soundspeed;
+
+ /* timestep stuff */
+ /*! Time-step length */
+ int time_bin;
+} part_aos_g;
+
+/* Container for particle data required for gradient calcs */
+typedef struct part_aos_f4_g {
+
+ /*! Particle position & smoothing length */
+ float4 x_h;
+
+ /*! Particle velocity and mass */
+ float4 ux_m;
+
+ /*! Particle density alpha visc internal energy u and speed of sound c */
+ float4 rho_avisc_u_c;
+
+ /* viscosity information results */
+ float3 vsig_lapu_aviscmax_empty;
+
+} part_aos_f4_g;
+
+/* Container for particle data required for gradient calcs */
+typedef struct part_aos_f4_g_send {
+
+ /*! Particle position & smoothing length */
+ float4 x_h;
+
+ /*! Particle velocity and mass */
+ float4 ux_m;
+
+ /*! Particle density alpha visc internal energy u and speed of sound c */
+ float4 rho_avisc_u_c;
+
+ /* viscosity information results */
+ float3 vsig_lapu_aviscmax;
+
+ /*Data for cell start and end*/
+ int2 cjs_cje;
+
+} part_aos_f4_g_send;
+
+/* Container for particle data required for gradient calcs */
+typedef struct part_aos_f4_g_recv {
+
+ /* viscosity information results */
+ float3 vsig_lapu_aviscmax;
+
+} part_aos_f4_g_recv;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // PART_GPU_H
diff --git a/src/cuda/tester.cu b/src/cuda/tester.cu
new file mode 100644
index 0000000000..3ffaf9e10c
--- /dev/null
+++ b/src/cuda/tester.cu
@@ -0,0 +1,21 @@
+#include "tester.h"
+
+#include <iostream>
+#include <vector>
+#ifdef __cplusplus
+extern "C" {
+#endif
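+/* Small linkage check: exercises C++ features (std::vector, iostream) from an
+   nvcc-compiled object while remaining callable from C code. */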
+void testing_linkage(int a, float *b, float c) {
+ std::vector b_value_list;
+ b_value_list.reserve(a);
+ for (int i = 0; i < a; i++) {
+ (*b) = (*b) + c;
+ b_value_list.push_back((*b));
+ std::cout << "Vector value is " << b_value_list[i] << " b value is " << (*b)
+ << std::endl;
+ }
+ std::cout << "Final value of b is " << (*b) << std::endl;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/cuda/tester.h b/src/cuda/tester.h
new file mode 100755
index 0000000000..5729e66904
--- /dev/null
+++ b/src/cuda/tester.h
@@ -0,0 +1,9 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void testing_linkage(int a, float *b, float c);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/engine.c b/src/engine.c
index 6d1fa0e3f7..023885cb0c 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -1092,12 +1092,22 @@ int engine_estimate_nr_tasks(const struct engine *e) {
*/
n1 += 38;
n2 += 2;
+#ifdef WITH_CUDA // A. Nasar
+ n1 += 4; // Self force and density packs should be 2 but doubled to prevent
+ // code crash due to unpack tasks
+ n1 += 52; // Pair force and density packs should be 26 but doubled to
+ // prevent code crash due to unpack tasks
+#endif
#ifdef WITH_MPI
n1 += 6;
#endif
#ifdef EXTRA_HYDRO_LOOP
n1 += 15;
+#ifdef WITH_CUDA
+ n1 += 1; // Self gradient packs
+ n1 += 13; // Pair gradient packs
+#endif
#ifdef WITH_MPI
n1 += 2;
#endif
@@ -1750,9 +1760,13 @@ void engine_skip_force_and_kick(struct engine *e) {
t->type == task_type_rt_ghost2 || t->type == task_type_rt_tchem ||
t->type == task_type_rt_advance_cell_time ||
t->type == task_type_neutrino_weight || t->type == task_type_csds ||
- t->subtype == task_subtype_force ||
+ t->subtype == task_subtype_force || // A. Nasar
+ t->subtype == task_subtype_gpu_pack_f ||
+ t->subtype == task_subtype_gpu_unpack_f ||
t->subtype == task_subtype_limiter ||
t->subtype == task_subtype_gradient ||
+ t->subtype == task_subtype_gpu_pack_g ||
+ t->subtype == task_subtype_gpu_unpack_g ||
t->subtype == task_subtype_stars_prep1 ||
t->subtype == task_subtype_stars_prep2 ||
t->subtype == task_subtype_stars_feedback ||
@@ -2192,7 +2206,25 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs,
}
#endif
+ // scheduler_write_dependencies(&e->sched, e->verbose, e->step); // A. Nasar
+ // write deps before running first step
/* Now, launch the calculation */
+ // message("n tasks %i", e->sched.nr_tasks);
+ // for (int i = 0; i < e->sched.nr_tasks; i++){
+ // struct task *tmp_t = &e->sched.tasks[i];
+ // if(tmp_t->subtype == task_subtype_density){
+ // if(tmp_t->skip == 1)error("inactive density task");
+ // }
+ //// if(tmp_t->subtype == task_subtype_force){
+ //// if(tmp_t->skip == 1)error("inactive force task");
+ //// }
+ // if(tmp_t->subtype == task_subtype_gpu_pack_d){
+ // if(tmp_t->skip == 1)error("inactive pack task");
+ // }
+ // if(tmp_t->subtype == task_subtype_gpu_unpack_d){
+ // if(tmp_t->skip == 1)error("inactive unpack task");
+ // }
+ // }
TIMER_TIC;
engine_launch(e, "tasks");
TIMER_TOC(timer_runners);
@@ -2280,6 +2312,22 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs,
scheduler_write_cell_dependencies(&e->sched, e->verbose, e->step);
if (e->nodeID == 0) scheduler_write_task_level(&e->sched, e->step);
+ // for (int i = 0; i < e->sched.nr_tasks; i++){
+ // struct task *tmp_t = &e->sched.tasks[i];
+ // if(tmp_t->subtype == task_subtype_density){
+ // if(tmp_t->skip == 1)error("inactive density task");
+ // }
+ // if(tmp_t->subtype == task_subtype_force){
+ // if(tmp_t->skip == 1)error("inactive force task");
+ // }
+ // if(tmp_t->subtype == task_subtype_gpu_pack_d){
+ // if(tmp_t->skip == 1)error("inactive pack task");
+ // }
+ // if(tmp_t->subtype == task_subtype_gpu_unpack_d){
+ // if(tmp_t->skip == 1)error("inactive unpack task");
+ // }
+ // }
+
/* Run the 0th time-step */
TIMER_TIC2;
engine_launch(e, "tasks");
diff --git a/src/engine_config.c b/src/engine_config.c
index 5e6c4eb98c..4c0c4420c4 100644
--- a/src/engine_config.c
+++ b/src/engine_config.c
@@ -32,6 +32,19 @@
#include
#endif
+#ifdef WITH_CUDA
+#include "runner_main_clean.cu"
+
+#include <cuda_runtime.h> /* A. Nasar */
+#endif
+
+#ifdef WITH_HIP
+// #include "/opt/rocm-5.1.0/hip/include/hip/hip_runtime.h"
+#include "runner_main_clean.hip"
+
+#include <hip/hip_runtime.h>
+#endif
+
/* This object's header. */
#include "engine.h"
@@ -909,9 +922,12 @@ void engine_config(int restart, int fof, struct engine *e,
e->links_per_tasks =
parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.);
- /* Init the scheduler. */
+ /* Init the scheduler. Allow stealing*/
scheduler_init(&e->sched, e->s, maxtasks, nr_queues,
(e->policy & scheduler_flag_steal), e->nodeID, &e->threadpool);
+ /* Init the scheduler. NO stealing A. Nasar */
+ // scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID,
+ // &e->threadpool);
/* Maximum size of MPI task messages, in KB, that should not be buffered,
* that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. Can be
@@ -981,9 +997,20 @@ void engine_config(int restart, int fof, struct engine *e,
for (int k = 0; k < e->nr_threads; k++) {
e->runners[k].id = k;
e->runners[k].e = e;
+
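+    /* GPU builds start the runners on the GPU-aware main loops (runner_main2
+     * for CUDA, runner_main_hip for HIP) pulled in from runner_main_clean.*
+     * above; CPU-only builds keep the usual runner_main. */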
+#ifdef WITH_CUDA
+ if (pthread_create(&e->runners[k].thread, NULL, &runner_main2,
+ &e->runners[k]) != 0)
+ error("Failed to create GPU runner thread.");
+#elif WITH_HIP
+ if (pthread_create(&e->runners[k].thread, NULL, &runner_main_hip,
+ &e->runners[k]) != 0)
+ error("Failed to create runner thread.");
+#else
if (pthread_create(&e->runners[k].thread, NULL, &runner_main,
&e->runners[k]) != 0)
error("Failed to create runner thread.");
+#endif
/* Try to pin the runner to a given core */
if (with_aff &&
diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c
index 1c5a65d88f..a0ff23b2be 100644
--- a/src/engine_maketasks.c
+++ b/src/engine_maketasks.c
@@ -583,8 +583,13 @@ void engine_addtasks_recv_hydro(
/* Early abort (are we below the level where tasks are)? */
if (!cell_get_flag(c, cell_flag_has_tasks)) return;
- /* Have we reached a level where there are any hydro tasks ? */
- if (t_xv == NULL && c->hydro.density != NULL) {
+ /* Have we reached a level where there are any hydro tasks ? */
+#ifdef WITH_CUDA // A. Nasar
+ if (t_xv == NULL && c->hydro.density != NULL && c->hydro.density_pack != NULL)
+#else
+ if (t_xv == NULL && c->hydro.density != NULL)
+#endif /*WITH_CUDA*/
+ {
#ifdef SWIFT_DEBUG_CHECKS
/* Make sure this cell has a valid tag. */
@@ -711,6 +716,18 @@ void engine_addtasks_recv_hydro(
scheduler_addunlock(s, t_xv, l->t);
scheduler_addunlock(s, l->t, t_rho);
}
+#ifdef WITH_CUDA /* A. Nasar POSSIBLE BUG HERE (More like PROBABLE) NOT \
+ REQUIRED Ghost in for cell j is*/
+ for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) {
+ scheduler_addunlock(s, t_xv, l->t);
+ scheduler_addunlock(s, l->t, t_rho);
+ }
+ for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) {
+ scheduler_addunlock(s, l->t, t_rho);
+ }
+
+#endif
+
#ifdef EXTRA_HYDRO_LOOP
for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) {
scheduler_addunlock(s, t_rho, l->t);
@@ -720,12 +737,37 @@ void engine_addtasks_recv_hydro(
scheduler_addunlock(s, t_gradient, l->t);
scheduler_addunlock(s, l->t, tend);
}
-#else
+#ifdef WITH_CUDA
+ for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) {
+ scheduler_addunlock(s, t_rho, l->t);
+ scheduler_addunlock(s, l->t, t_gradient);
+ }
+ for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) {
+ scheduler_addunlock(s, l->t, t_gradient);
+ }
+
+ for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) {
+ scheduler_addunlock(s, t_gradient, l->t);
+ scheduler_addunlock(s, l->t, tend);
+ }
+ for (struct link *l = c->hydro.force_unpack; l != NULL; l = l->next) {
+ scheduler_addunlock(s, l->t, tend);
+ }
+
+#endif /*WITH_CUDA*/
+#else /*EXTRA_HYDRO_LOOP*/
for (struct link *l = c->hydro.force; l != NULL; l = l->next) {
scheduler_addunlock(s, t_rho, l->t);
scheduler_addunlock(s, l->t, tend);
}
-#endif
+#ifdef WITH_CUDA
+ for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) {
+ scheduler_addunlock(s, t_rho, l->t);
+ // scheduler_addunlock(s, l->t, t_ti);
+ }
+ scheduler_addunlock(s, c->hydro.super->hydro.f_unpack, tend);
+#endif /*WITH_CUDA*/
+#endif /*EXTRA_HYDRO_LOOP*/
if (with_limiter) {
for (struct link *l = c->hydro.limiter; l != NULL; l = l->next) {
@@ -2088,7 +2130,10 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements,
for (int ind = 0; ind < num_elements; ind++) {
struct task *t = &((struct task *)map_data)[ind];
-
+    if (t->ci == NULL) { /* Possible fix missing when moving code over.
+                            Prevents unpack tasks from continuing past here. */
+      break;
+    }
struct cell *ci = t->ci;
struct cell *cj = t->cj;
const enum task_types t_type = t->type;
@@ -2116,6 +2161,12 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements,
if (t_subtype == task_subtype_density) {
engine_addlink(e, &ci->hydro.density, t);
+ } else if (t_subtype == task_subtype_gpu_pack_d) { // A. Nasar
+ engine_addlink(e, &ci->hydro.density_pack, t);
+ // } else if (t_subtype == task_subtype_gpu_pack_f) {
+ // engine_addlink(e, &ci->hydro.force_pack, t);
+ // } else if (t_subtype == task_subtype_gpu_pack_g) {
+ // engine_addlink(e, &ci->hydro.gradient_pack, t);
} else if (t_subtype == task_subtype_grav) {
engine_addlink(e, &ci->grav.grav, t);
} else if (t_subtype == task_subtype_external_grav) {
@@ -2130,6 +2181,15 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements,
if (t_subtype == task_subtype_density) {
engine_addlink(e, &ci->hydro.density, t);
engine_addlink(e, &cj->hydro.density, t);
+ } else if (t_subtype == task_subtype_gpu_pack_d) { // A. Nasar
+ engine_addlink(e, &ci->hydro.density_pack, t);
+ engine_addlink(e, &cj->hydro.density_pack, t);
+ // } else if (t_subtype == task_subtype_gpu_pack_f) {
+ // engine_addlink(e, &ci->hydro.force_pack, t);
+ // engine_addlink(e, &cj->hydro.force_pack, t);
+ // } else if (t_subtype == task_subtype_gpu_pack_g) {
+ // engine_addlink(e, &ci->hydro.gradient_pack, t);
+ // engine_addlink(e, &cj->hydro.gradient_pack, t);
} else if (t_subtype == task_subtype_grav) {
engine_addlink(e, &ci->grav.grav, t);
engine_addlink(e, &cj->grav.grav, t);
@@ -2146,6 +2206,15 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements,
if (t_subtype == task_subtype_density) {
engine_addlink(e, &ci->hydro.density, t);
+ } else if (t_subtype == task_subtype_gpu_pack_d) { // A. Nasar
+ engine_addlink(e, &ci->hydro.density_pack, t);
+ // error("Abouzied: you need to code this up!");
+ } else if (t_subtype == task_subtype_gpu_pack_f) {
+ engine_addlink(e, &ci->hydro.force_pack, t);
+ // error("Abouzied: you need to code this up!");
+ } else if (t_subtype == task_subtype_gpu_pack_g) {
+ engine_addlink(e, &ci->hydro.gradient_pack, t);
+ // error("Abouzied: you need to code this up!");
} else if (t_subtype == task_subtype_grav) {
engine_addlink(e, &ci->grav.grav, t);
} else if (t_subtype == task_subtype_external_grav) {
@@ -2160,6 +2229,18 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements,
if (t_subtype == task_subtype_density) {
engine_addlink(e, &ci->hydro.density, t);
engine_addlink(e, &cj->hydro.density, t);
+ } else if (t_subtype == task_subtype_gpu_pack_d) {
+ engine_addlink(e, &ci->hydro.density_pack, t);
+ engine_addlink(e, &cj->hydro.density_pack, t);
+ // error("Abouzied: you need to code this up!");
+ } else if (t_subtype == task_subtype_gpu_pack_f) {
+ engine_addlink(e, &ci->hydro.force_pack, t);
+ engine_addlink(e, &cj->hydro.force_pack, t);
+ // error("Abouzied: you need to code this up!");
+ } else if (t_subtype == task_subtype_gpu_pack_g) {
+ engine_addlink(e, &ci->hydro.gradient_pack, t);
+ engine_addlink(e, &cj->hydro.gradient_pack, t);
+ // error("Abouzied: you need to code this up!");
} else if (t_subtype == task_subtype_grav) {
engine_addlink(e, &ci->grav.grav, t);
engine_addlink(e, &cj->grav.grav, t);
@@ -2197,7 +2278,7 @@ void engine_link_gravity_tasks(struct engine *e) {
/* Get a pointer to the task. */
struct task *t = &sched->tasks[k];
- if (t->type == task_type_none) continue;
+ if (t->type == task_type_none || t->ci == NULL) continue;
/* Get the cells we act on */
struct cell *ci = t->ci;
@@ -2425,12 +2506,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
const int with_sink = (e->policy & engine_policy_sinks);
#ifdef EXTRA_HYDRO_LOOP
struct task *t_gradient = NULL;
+ struct task *t_gradient_gpu = NULL; // A. Nasar
#endif
#ifdef EXTRA_STAR_LOOPS
struct task *t_star_prep1 = NULL;
struct task *t_star_prep2 = NULL;
#endif
struct task *t_force = NULL;
+ struct task *t_force_gpu = NULL;
struct task *t_limiter = NULL;
struct task *t_star_density = NULL;
struct task *t_star_feedback = NULL;
@@ -2466,6 +2549,33 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t);
}
+ /*Make packing depend on sorts and drift A. Nasar */
+ else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_d) {
+ scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t);
+ /* Task for the second GPU hydro loop A. Nasar */
+ t_force_gpu = scheduler_addtask(sched, task_type_self,
+ task_subtype_gpu_pack_f, 0, 0, ci, NULL);
+ /* Link the tasks to the cells. Do the same for GPU tasks A. Nasar */
+ engine_addlink(e, &ci->hydro.force_pack, t_force_gpu);
+#ifdef EXTRA_HYDRO_LOOP
+ /* Same work for the additional GPU hydro loop A. Nasar */
+ t_gradient_gpu = scheduler_addtask(
+ sched, task_type_self, task_subtype_gpu_pack_g, 0, 0, ci, NULL);
+ /* Add the link between the new loops and the cell. Same for GPU task A.
+ * Nasar */
+ engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu);
+ // A. Nasar add unlocks for pack tasks here. Unpacks depend on packs and
+ // will be used to create downstream deps later
+ scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out,
+ t_gradient_gpu);
+ scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost,
+ t_force_gpu);
+#else
+ /* Now, build all the dependencies for the hydro */
+ scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu);
+#endif
+ }
+
/* Sort tasks depend on the drift of the cell (stars version). */
else if (t_type == task_type_stars_sort && ci->nodeID == nodeID) {
scheduler_addunlock(sched, ci->hydro.super->stars.drift, t);
@@ -2549,6 +2659,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
/* Link the tasks to the cells */
engine_addlink(e, &ci->hydro.force, t_force);
+
if (with_timestep_limiter) {
engine_addlink(e, &ci->hydro.limiter, t_limiter);
}
@@ -2582,10 +2693,9 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
/* Same work for the additional hydro loop */
t_gradient = scheduler_addtask(sched, task_type_self,
task_subtype_gradient, flags, 0, ci, NULL);
-
- /* Add the link between the new loops and the cell */
+ /* Add the link between the new loops and the cell. Same for GPU task A.
+ * Nasar */
engine_addlink(e, &ci->hydro.gradient, t_gradient);
-
/* Now, build all the dependencies for the hydro */
engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force,
t_limiter, ci, with_cooling,
@@ -2727,6 +2837,80 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
}
}
+ /*Make packing depend on sorts and drift A. Nasar */
+ else if (t_type == task_type_pair && t_subtype == task_subtype_gpu_pack_d) {
+ /* Make all density tasks depend on the drift */
+ if (ci->nodeID == nodeID) {
+ scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t);
+ }
+ if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) {
+ scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t);
+ }
+ /* Make all density tasks depend on the sorts */
+ scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t);
+ if (ci->hydro.super != cj->hydro.super) {
+ scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t);
+ }
+ /* New task for the force A. Nasar */
+ t_force_gpu = scheduler_addtask(sched, task_type_pair,
+ task_subtype_gpu_pack_f, 0, 0, ci, cj);
+#ifdef MPI_SYMMETRIC_FORCE_INTERACTION
+ /* The order of operations for an inactive local cell interacting
+ * with an active foreign cell is not guaranteed because the density
+ * (and gradient) iact loops don't exist in that case. So we need
+ * an explicit dependency here to have sorted cells. */
+
+ /* Make GPU force tasks depend on the sorts A. Nasar */
+ scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu);
+ if (ci->hydro.super != cj->hydro.super) {
+ scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu);
+ }
+#endif
+    /* Do the same for GPU tasks A. Nasar */
+ engine_addlink(e, &ci->hydro.force_pack, t_force_gpu);
+ engine_addlink(e, &cj->hydro.force_pack, t_force_gpu);
+#ifdef EXTRA_HYDRO_LOOP
+ /* Start by constructing the task for the second and third GPU hydro loop
+ * A. Nasar */
+ t_gradient_gpu = scheduler_addtask(sched, task_type_pair,
+ task_subtype_gpu_pack_g, 0, 0, ci, cj);
+ // /* Add the link between the new loop and both cells */
+ engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu);
+ engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu);
+
+ /* Now, build all the dependencies for the hydro for the cells */
+ /* that are local and are not descendant of the same super_hydro-cells */
+ if (ci->nodeID == nodeID) {
+ /*Same for GPU tasks*/
+ scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out,
+ t_gradient_gpu);
+ scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost,
+ t_force_gpu);
+ }
+ if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) {
+ /*Same for GPU tasks*/
+ scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out,
+ t_gradient_gpu);
+ scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost,
+ t_force_gpu);
+ }
+#else
+ /* Now, build all the dependencies for the hydro for the cells */
+ /* that are local and are not descendant of the same super_hydro-cells */
+ if (ci->nodeID == nodeID) {
+ // GPU tasks A. Nasar
+ scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out,
+ t_force_gpu);
+ }
+ if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) {
+ // GPU tasks A. Nasar
+ scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out,
+ t_force_gpu);
+ }
+#endif
+
+ }
+
/* Otherwise, pair interaction? */
else if (t_type == task_type_pair && t_subtype == task_subtype_density) {
@@ -2849,6 +3033,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
engine_addlink(e, &ci->hydro.force, t_force);
engine_addlink(e, &cj->hydro.force, t_force);
+
if (with_timestep_limiter) {
engine_addlink(e, &ci->hydro.limiter, t_limiter);
engine_addlink(e, &cj->hydro.limiter, t_limiter);
@@ -2931,6 +3116,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
with_cooling,
with_timestep_limiter);
}
+
#endif
if (with_feedback) {
@@ -3269,7 +3455,39 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
}
}
}
+ /*Make packing depend on sorts and drift A. Nasar */
+ else if (t_type == task_type_sub_self &&
+ t_subtype == task_subtype_gpu_pack_d) {
+
+ scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t);
+ scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t);
+ /* Start by constructing the task for the second hydro loop */
+ t_force_gpu =
+ scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_f,
+ flags, 0, ci, NULL);
+ /* Add the link between the new loop and the cell */
+ engine_addlink(e, &ci->hydro.force_pack, t_force_gpu);
+#ifdef EXTRA_HYDRO_LOOP
+
+ /* Start by constructing the task for the second and third hydro loop */
+ t_gradient_gpu =
+ scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_g,
+ flags, 0, ci, NULL);
+ /* Add the link between the new loop and the cell */
+ engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu);
+ /* Now, build all the dependencies for the hydro for the cells */
+ /* that are local and are not descendant of the same super_hydro-cells */
+ scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out,
+ t_gradient_gpu);
+ scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost,
+ t_force_gpu);
+#else
+ /* Now, build all the dependencies for the hydro for the cells */
+ /* that are local and are not descendant of the same super_hydro-cells */
+ scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu);
+#endif
+ }
/* Otherwise, sub-self interaction? */
else if (t_type == task_type_sub_self &&
t_subtype == task_subtype_density) {
@@ -3355,6 +3573,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
/* Add the link between the new loop and the cell */
engine_addlink(e, &ci->hydro.force, t_force);
+
if (with_timestep_limiter) {
engine_addlink(e, &ci->hydro.limiter, t_limiter);
}
@@ -3388,10 +3607,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
/* Start by constructing the task for the second and third hydro loop */
t_gradient = scheduler_addtask(sched, task_type_sub_self,
task_subtype_gradient, flags, 0, ci, NULL);
-
/* Add the link between the new loop and the cell */
engine_addlink(e, &ci->hydro.gradient, t_gradient);
-
/* Now, build all the dependencies for the hydro for the cells */
/* that are local and are not descendant of the same super_hydro-cells */
engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force,
@@ -3541,7 +3758,64 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
/* Otherwise, sub-pair interaction? */
else if (t_type == task_type_sub_pair &&
- t_subtype == task_subtype_density) {
+ t_subtype == task_subtype_gpu_pack_d) {
+ /* Make all density pack tasks depend on the drift */
+ if (ci->nodeID == nodeID) {
+ scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t);
+ }
+ if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) {
+ scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t);
+ }
+ /* Make all density tasks depend on the sorts */
+ scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t);
+ if (ci->hydro.super != cj->hydro.super) {
+ scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t);
+ }
+ t_force_gpu = scheduler_addtask(
+ sched, task_type_sub_pair, task_subtype_gpu_pack_f, flags, 0, ci, cj);
+#ifdef MPI_SYMMETRIC_FORCE_INTERACTION
+ /* Make all force tasks depend on the sorts */
+ scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu);
+ if (ci->hydro.super != cj->hydro.super) {
+ scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu);
+ }
+#endif
+ engine_addlink(e, &ci->hydro.force_pack, t_force_gpu);
+ engine_addlink(e, &cj->hydro.force_pack, t_force_gpu);
+#ifdef EXTRA_HYDRO_LOOP
+ t_gradient_gpu = scheduler_addtask(
+ sched, task_type_sub_pair, task_subtype_gpu_pack_g, flags, 0, ci, cj);
+ engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu);
+ engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu);
+ /* Now, build all the dependencies for the hydro for the cells */
+ /* that are local and are not descendant of the same super_hydro-cells */
+ if (ci->nodeID == nodeID) {
+ scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out,
+ t_gradient_gpu);
+ scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost,
+ t_force_gpu);
+ }
+ if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) {
+ scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out,
+ t_gradient_gpu);
+ scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost,
+ t_force_gpu);
+ }
+#else
+ /* Now, build all the dependencies for the hydro for the cells */
+ /* that are local and are not descendant of the same super_hydro-cells */
+ if (ci->nodeID == nodeID) {
+ scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out,
+ t_force_gpu);
+ }
+ if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) {
+ scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out,
+ t_force_gpu);
+ }
+#endif
+
+ } else if (t_type == task_type_sub_pair &&
+ t_subtype == task_subtype_density) {
const int bcount_i = ci->black_holes.count;
const int bcount_j = cj->black_holes.count;
@@ -3724,11 +3998,9 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
/* Start by constructing the task for the second and third hydro loop */
t_gradient = scheduler_addtask(sched, task_type_sub_pair,
task_subtype_gradient, flags, 0, ci, cj);
-
/* Add the link between the new loop and both cells */
engine_addlink(e, &ci->hydro.gradient, t_gradient);
engine_addlink(e, &cj->hydro.gradient, t_gradient);
-
/* Now, build all the dependencies for the hydro for the cells */
/* that are local and are not descendant of the same super_hydro-cells */
if (ci->nodeID == nodeID) {
@@ -4142,9 +4414,13 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements,
continue;
/* If the cell is local build a self-interaction */
+ // struct task *t_pack_self; // A. Nasar
if (ci->nodeID == nodeID) {
scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci,
NULL);
+ // A. Nasar also add a pack task for GPU
+ scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_d, 0, 0,
+ ci, NULL);
}
/* Now loop over all the neighbours of this cell */
@@ -4178,6 +4454,8 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements,
const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))];
scheduler_addtask(sched, task_type_pair, task_subtype_density, sid, 0,
ci, cj);
+ scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_d, sid,
+ 0, ci, cj); // A. Nasar
#ifdef SWIFT_DEBUG_CHECKS
#ifdef WITH_MPI
@@ -4600,7 +4878,6 @@ void engine_maketasks(struct engine *e) {
struct cell *cells = s->cells_top;
const int nr_cells = s->nr_cells;
const ticks tic = getticks();
-
/* Re-set the scheduler. */
scheduler_reset(sched, engine_estimate_nr_tasks(e));
@@ -4715,7 +4992,251 @@ void engine_maketasks(struct engine *e) {
* sched->tasks, sched->nr_tasks, sizeof(struct task),
* threadpool_auto_chunk_size, e); */
}
+ int unsplit = 0, split = 0;
+  /* These loops should really be thread-mapped. A. Nasar */
+  /* Demote sub-self/sub-pair GPU pack tasks to plain self/pair tasks. */
+  for (int i = 0; i < sched->nr_tasks; i++) {
+    struct task *t = &sched->tasks[i];
+    if (t->subtype == task_subtype_gpu_pack_d ||
+        t->subtype == task_subtype_gpu_pack_g ||
+        t->subtype == task_subtype_gpu_pack_f) {
+      if (t->type == task_type_sub_self) t->type = task_type_self;
+      if (t->type == task_type_sub_pair) t->type = task_type_pair;
+    }
+  }
+
+ /* Now, create unpack tasks based on the existing packs and create
+ * the dependencies pack->unpack->ghost_in A. Nasar */
+ const int pack_size = sched->pack_size;
+ const int pack_size_pair = sched->pack_size_pair;
+
+ int count_current_self = 0;
+ int count_current_pair = 0;
+
+ struct task *last_created_self_unpack = NULL;
+ struct task *last_created_pair_unpack = NULL;
+ /* Loop over all the currently existing pack tasks
+ * These loops should be thread-mapped too but will be a bit more tricky: A.
+ * Nasar*/
+ for (int i = 0; i < sched->nr_tasks; i++) {
+
+ struct task *t = &sched->tasks[i];
+ if (t->subtype != task_subtype_gpu_pack_d) continue;
+
+ if (t->type == task_type_self || t->type == task_type_sub_self) {
+
+ if (count_current_self % pack_size == 0) {
+ last_created_self_unpack = scheduler_addtask(
+ sched, task_type_self, task_subtype_gpu_unpack_d, 0, 0, NULL, NULL);
+ last_created_self_unpack->gpu_done = 0;
+ }
+
+ /* pack -> unpack -> ghost_in */
+ scheduler_addunlock(sched, t, last_created_self_unpack);
+ scheduler_addunlock(sched, last_created_self_unpack,
+ t->ci->hydro.super->hydro
+ .ghost_in); // Keep self_unpack dependency here,
+ // pairs added later using links
+ /*Creating links between each cell and its unpack task*/
+ engine_addlink(e, &t->ci->hydro.density_unpack, last_created_self_unpack);
+ t->ci->hydro.d_unpack = last_created_self_unpack;
+ ++count_current_self;
+ }
+
+ else if (t->type == task_type_pair || t->type == task_type_sub_pair) {
+ if (count_current_pair % pack_size_pair == 0) {
+ last_created_pair_unpack = scheduler_addtask(
+ sched, task_type_pair, task_subtype_gpu_unpack_d, 0, 0, NULL, NULL);
+ }
+
+ scheduler_addunlock(sched, t, last_created_pair_unpack);
+ if (t->ci->nodeID == e->nodeID)
+ scheduler_addunlock(sched, last_created_pair_unpack,
+ t->ci->hydro.super->hydro.ghost_in);
+ if ((t->cj->nodeID == e->nodeID) &&
+ (t->ci->hydro.super != t->cj->hydro.super))
+ scheduler_addunlock(sched, last_created_pair_unpack,
+ t->cj->hydro.super->hydro.ghost_in);
+
+ engine_addlink(e, &t->ci->hydro.density_unpack, last_created_pair_unpack);
+ engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack);
+
+ ++count_current_pair;
+    } else {
+      /* Abouzied: I need to implement the sub-self and sub-pair version */
+      error("Unexpected task type for a GPU density pack task");
+    }
+ }
+#ifdef SWIFT_DEBUG_CHECKS
+ if (count_current_self != sched->nr_self_pack_tasks_d)
+ error("We did not find the correct number of self pack tasks!!");
+ if (count_current_pair != sched->nr_pair_pack_tasks_d)
+ error("We did not find the correct number of pair pack tasks!!");
+#endif
+
+ /*Now create unpacks for all gpu_pack_g (gradient) tasks A. Nasar */
+ count_current_self = 0;
+ count_current_pair = 0;
+
+ last_created_self_unpack = NULL;
+ last_created_pair_unpack = NULL;
+ /* Loop over all the currently existing gradient pack tasks */
+ for (int i = 0; i < sched->nr_tasks; i++) {
+
+ struct task *t = &sched->tasks[i];
+ if (t->subtype != task_subtype_gpu_pack_g) continue;
+
+ if (t->type == task_type_self || t->type == task_type_sub_self) {
+
+ if (count_current_self % pack_size == 0) {
+ last_created_self_unpack = scheduler_addtask(
+ sched, task_type_self, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL);
+ last_created_self_unpack->gpu_done = 0;
+ }
+
+      /* pack -> unpack -> extra_ghost */
+ scheduler_addunlock(sched, t, last_created_self_unpack);
+ scheduler_addunlock(sched, last_created_self_unpack,
+ t->ci->hydro.super->hydro.extra_ghost);
+      /*Creating links between each cell and its unpack task*/
+ engine_addlink(e, &t->ci->hydro.gradient_unpack,
+ last_created_self_unpack);
+ t->ci->hydro.g_unpack = last_created_self_unpack;
+
+ ++count_current_self;
+ }
+
+ else if (t->type == task_type_pair || t->type == task_type_sub_pair) {
+ if (count_current_pair % pack_size_pair == 0) {
+ last_created_pair_unpack = scheduler_addtask(
+ sched, task_type_pair, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL);
+ }
+
+      /* pack -> unpack -> extra_ghost */
+ scheduler_addunlock(sched, t, last_created_pair_unpack);
+ if (t->ci->nodeID == e->nodeID)
+ scheduler_addunlock(sched, last_created_pair_unpack,
+ t->ci->hydro.super->hydro.extra_ghost);
+ if ((t->cj->nodeID == e->nodeID) &&
+ (t->ci->hydro.super != t->cj->hydro.super))
+ scheduler_addunlock(sched, last_created_pair_unpack,
+ t->cj->hydro.super->hydro.extra_ghost);
+
+ engine_addlink(e, &t->ci->hydro.gradient_unpack,
+ last_created_pair_unpack);
+ engine_addlink(e, &t->cj->hydro.gradient_unpack,
+ last_created_pair_unpack);
+
+ ++count_current_pair;
+    } else {
+      /* Abouzied: I need to implement the sub-self and sub-pair version */
+      error("Unexpected task type for a GPU gradient pack task");
+    }
+ }
+#ifdef SWIFT_DEBUG_CHECKS
+  if (count_current_self != sched->nr_self_pack_tasks_g)
+    error(
+        "We did not find the correct number of G self pack tasks!! count %i "
+        "but it should be %i",
+        count_current_self, sched->nr_self_pack_tasks_g);
+  if (count_current_pair != sched->nr_pair_pack_tasks_g)
+    error(
+        "We did not find the correct number of G pair pack tasks!! count %i "
+        "but it should be %i",
+        count_current_pair, sched->nr_pair_pack_tasks_g);
+#endif
+
+ /*Now create unpacks for all gpu_pack_f (force) tasks*/
+ count_current_self = 0;
+ count_current_pair = 0;
+
+ last_created_self_unpack = NULL;
+ last_created_pair_unpack = NULL;
+  /* Loop over all the currently existing force pack tasks */
+ for (int i = 0; i < sched->nr_tasks; i++) {
+
+ struct task *t = &sched->tasks[i];
+ if (t->subtype != task_subtype_gpu_pack_f) continue;
+
+ if (t->type == task_type_self || t->type == task_type_sub_self) {
+
+ if (count_current_self % pack_size == 0) {
+ last_created_self_unpack = scheduler_addtask(
+ sched, task_type_self, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL);
+ }
+
+      /* pack -> unpack -> end_force */
+ scheduler_addunlock(sched, t, last_created_self_unpack);
+ scheduler_addunlock(sched, last_created_self_unpack,
+ t->ci->hydro.super->hydro.end_force);
+      /*Creating links between each cell and its unpack task*/
+ engine_addlink(e, &t->ci->hydro.force_unpack, last_created_self_unpack);
+
+ ++count_current_self;
+ }
+
+ else if (t->type == task_type_pair || t->type == task_type_sub_pair) {
+ if (count_current_pair % pack_size_pair == 0) {
+ last_created_pair_unpack = scheduler_addtask(
+ sched, task_type_pair, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL);
+ }
+
+      /* pack -> unpack -> end_force */
+ scheduler_addunlock(sched, t, last_created_pair_unpack);
+ if (t->ci->nodeID == e->nodeID)
+ scheduler_addunlock(sched, last_created_pair_unpack,
+ t->ci->hydro.super->hydro.end_force);
+ if ((t->cj->nodeID == e->nodeID) &&
+ (t->ci->hydro.super != t->cj->hydro.super))
+ scheduler_addunlock(sched, last_created_pair_unpack,
+ t->cj->hydro.super->hydro.end_force);
+
+ engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack);
+ engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack);
+
+ ++count_current_pair;
+    } else {
+      /* Abouzied: I need to implement the sub-self and sub-pair version */
+      error("Unexpected task type for a GPU force pack task");
+    }
+ }
+#ifdef SWIFT_DEBUG_CHECKS
+ if (count_current_self != sched->nr_self_pack_tasks_f)
+ error("We did not find the correct number of F self pack tasks!!");
+ if (count_current_pair != sched->nr_pair_pack_tasks_f)
+ error("We did not find the correct number of F pair pack tasks!!");
+#endif
+  /* Debug code to check if some tasks are not split to the desired level in
+   * the tree for the GPU */
+  //  for (int i = 0; i < sched->nr_tasks; i++) {
+  //    struct task *t = &sched->tasks[i];
+  //    if (t->ci != NULL) {
+  //      if (t->type == task_type_pair && ((t->ci->split && !t->cj->split) ||
+  //                                        (!t->ci->split && t->cj->split)))
+  //        error("one is split, the other isn't");
+  //      if (t->ci->hydro.count > 80 && t->type == task_type_self)
+  //        error("Count is %i task subtype (%s)",
+  //              t->ci->hydro.count, subtaskID_names[t->subtype]);
+  //    }
+  //  }
if (e->verbose)
message("Making extra hydroloop tasks took %.3f %s.",
clocks_from_ticks(getticks() - tic2), clocks_getunit());
@@ -4866,4 +5387,39 @@ void engine_maketasks(struct engine *e) {
if (e->verbose)
message("took %.3f %s (including reweight).",
clocks_from_ticks(getticks() - tic), clocks_getunit());
+
+  /* Loop over all the CPU hydro tasks to make them implicit (needs
+   * thread-mapping) */
+ for (int i = 0; i < sched->nr_tasks; i++) {
+
+ struct task *t = &sched->tasks[i];
+ if (t->subtype == task_subtype_density ||
+ t->subtype == task_subtype_gradient ||
+ t->subtype == task_subtype_force) {
+ t->implicit = 1;
+ }
+ // if (t->subtype == task_subtype_gpu_pack_d ||
+ // t->subtype == task_subtype_gpu_pack_g ||
+ // t->subtype == task_subtype_gpu_pack_f ||
+ // t->subtype == task_subtype_gpu_unpack_d ||
+ // t->subtype == task_subtype_gpu_unpack_g ||
+ // t->subtype == task_subtype_gpu_unpack_f){
+ // t->implicit = 1;
+ // }
+ // if (t->subtype == task_subtype_gpu_pack_g ||
+ // t->subtype == task_subtype_gpu_pack_f ||
+ // t->subtype == task_subtype_gpu_unpack_g ||
+ // t->subtype == task_subtype_gpu_unpack_f){// ||
+ //// (t->type == task_type_pair &&
+ //// t->subtype == task_subtype_gpu_pack_d)){
+ // t->implicit = 1;
+ // }
+ // if ((t->subtype == task_subtype_gpu_pack_d ||
+ // t->subtype == task_subtype_gpu_pack_g ||
+ // t->subtype == task_subtype_gpu_pack_f) &&
+ // (t->type == task_type_sub_pair ||
+ // t->type == task_type_sub_self)){
+ // t->implicit = 1;
+ //// error("STill have subs");
+ // }
+ }
}
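
The unpack-creation loops above all follow the same batching pattern: every `pack_size` self pack tasks (and every `pack_size_pair` pair pack tasks) share a single unpack task, giving the dependency chain pack -> unpack -> ghost_in (extra_ghost for the gradient loop, end_force for the force loop). The sketch below is illustrative only, not SWIFT code; the task count and pack size are made-up values, and the printed mapping simply mirrors the scheduler_addunlock calls.

```c
/* Sketch only: how N pack tasks map onto batched unpack tasks. */
#include <stdio.h>

int main(void) {
  const int n_pack_tasks = 10; /* hypothetical number of pack tasks */
  const int pack_size = 4;     /* hypothetical scheduler pack_size */
  int n_unpack_tasks = 0;

  for (int i = 0; i < n_pack_tasks; i++) {
    /* A fresh unpack task is created for every pack_size-th pack task... */
    if (i % pack_size == 0) n_unpack_tasks++;
    /* ...and every pack task unlocks the most recently created unpack,
     * which in turn unlocks the (extra_)ghost or end_force task. */
    printf("pack %d -> unpack %d\n", i, n_unpack_tasks - 1);
  }
  printf("%d pack tasks produced %d unpack tasks\n", n_pack_tasks,
         n_unpack_tasks);
  return 0;
}
```
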
diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c
index 27b31c99c4..89f5e41b74 100644
--- a/src/engine_marktasks.c
+++ b/src/engine_marktasks.c
@@ -86,6 +86,25 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
const enum task_types t_type = t->type;
const enum task_subtypes t_subtype = t->subtype;
+    // Activate GPU unpack tasks (cell-less dummy tasks, so they need
+    // activating separately)
+ if (t_type == task_type_self &&
+ (t_subtype == task_subtype_gpu_unpack_d ||
+ t_subtype == task_subtype_gpu_unpack_g ||
+ t_subtype == task_subtype_gpu_unpack_f)) { // A. Nasar
+ scheduler_activate(s, t);
+ continue;
+ }
+
+ if (t_type == task_type_pair &&
+ (t_subtype == task_subtype_gpu_unpack_d ||
+ t_subtype == task_subtype_gpu_unpack_g ||
+ t_subtype == task_subtype_gpu_unpack_f)) { // A. Nasar
+ scheduler_activate(s, t);
+ continue;
+ // fprintf(stderr,"activated pair unpack in marktasks\n");
+ }
+
/* Single-cell task? */
if (t_type == task_type_self || t_type == task_type_sub_self) {
@@ -93,7 +112,17 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
struct cell *ci = t->ci;
#ifdef SWIFT_DEBUG_CHECKS
+#ifndef WITH_CUDA // A. Nasar
if (ci->nodeID != nodeID) error("Non-local self task found");
+#else
+ if ((ci->nodeID != nodeID) && (t_subtype != task_subtype_gpu_unpack_d) &&
+ (t_subtype != task_subtype_gpu_unpack_f) &&
+ (t_subtype != task_subtype_gpu_unpack_g)) {
+      fprintf(stderr, "task is %s\n", subtaskID_names[t->subtype]);
+      error("Non-local self task found. Task subtype is %s",
+            subtaskID_names[t->subtype]);
+ }
+#endif
#endif
const int ci_active_hydro = cell_is_active_hydro(ci, e);
@@ -115,6 +144,39 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
}
}
+ /* Activate packing for GPU A. Nasar */
+ else if (t_type == task_type_self &&
+ t_subtype == task_subtype_gpu_pack_d) {
+ if (ci_active_hydro) {
+ scheduler_activate(s, t);
+ ci->pack_done = 0;
+ ci->gpu_done = 0;
+ ci->unpack_done = 0;
+ }
+ }
+
+ /* Activate packing for GPU */
+ else if (t_type == task_type_self &&
+ t_subtype == task_subtype_gpu_pack_g) {
+ if (ci_active_hydro) {
+ scheduler_activate(s, t);
+ ci->pack_done_g = 0;
+ ci->gpu_done_g = 0;
+ ci->unpack_done_g = 0;
+ }
+ }
+
+ /* Activate packing for GPU */
+ else if (t_type == task_type_self &&
+ t_subtype == task_subtype_gpu_pack_f) {
+ if (ci_active_hydro) {
+ scheduler_activate(s, t);
+ ci->pack_done_f = 0;
+ ci->gpu_done_f = 0;
+ ci->unpack_done_f = 0;
+ }
+ }
+
/* Store current values of dx_max and h_max. */
else if (t_type == task_type_sub_self &&
t_subtype == task_subtype_density) {
@@ -125,12 +187,22 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
}
}
+ /* Store current values of dx_max and h_max. A. Nasar: Unsure if we
+ actually need this*/
+ else if (t_type == task_type_sub_self &&
+ t_subtype == task_subtype_gpu_pack_d) {
+ if (ci_active_hydro) {
+ scheduler_activate(s, t);
+ }
+ }
+
else if (t_type == task_type_self && t_subtype == task_subtype_force) {
if (ci_active_hydro) scheduler_activate(s, t);
}
else if (t_type == task_type_sub_self &&
- t_subtype == task_subtype_force) {
+ (t_subtype == task_subtype_force ||
+ t_subtype == task_subtype_gpu_pack_f)) {
if (ci_active_hydro) scheduler_activate(s, t);
}
@@ -149,7 +221,8 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
}
else if (t_type == task_type_sub_self &&
- t_subtype == task_subtype_gradient) {
+ (t_subtype == task_subtype_gradient ||
+ t_subtype == task_subtype_gpu_pack_g)) {
if (ci_active_hydro) scheduler_activate(s, t);
}
@@ -409,7 +482,29 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
const int ci_active_rt = cell_is_rt_active(ci, e);
const int cj_active_rt = cell_is_rt_active(cj, e);
- /* Only activate tasks that involve a local active cell. */
+ /* Activate packing for GPU A. Nasar */
+ if (t_subtype == task_subtype_gpu_pack_d &&
+ ((ci_active_hydro && ci_nodeID == nodeID) ||
+ (cj_active_hydro && cj_nodeID == nodeID))) {
+ scheduler_activate(s, t);
+ ci->gpu_done_pair = 0;
+ cj->gpu_done_pair = 0;
+ } else if (t_subtype == task_subtype_gpu_pack_g &&
+ ((ci_active_hydro && ci_nodeID == nodeID) ||
+ (cj_active_hydro && cj_nodeID == nodeID))) {
+ scheduler_activate(s, t);
+ ci->gpu_done_pair_g = 0;
+ cj->gpu_done_pair_g = 0;
+ } else if (t_subtype == task_subtype_gpu_pack_f &&
+ ((ci_active_hydro && ci_nodeID == nodeID) ||
+ (cj_active_hydro && cj_nodeID == nodeID))) {
+ scheduler_activate(s, t);
+ ci->gpu_done_pair_f = 0;
+ cj->gpu_done_pair_f = 0;
+ }
+
+    /* Only activate tasks that involve a local active cell. A. Nasar: THIS
+     * COULD BE A SOURCE OF BUGS */
if ((t_subtype == task_subtype_density ||
t_subtype == task_subtype_gradient ||
t_subtype == task_subtype_limiter ||
diff --git a/src/error.h b/src/error.h
index a9b7481cf4..806b74f123 100644
--- a/src/error.h
+++ b/src/error.h
@@ -22,7 +22,11 @@
#define SWIFT_ERROR_H
/* Config parameters. */
+#ifdef WITH_CUDA
+#include "../config.h"
+#else
 #include <config.h>
+#endif
/* Some standard headers. */
 #include <stdio.h>
diff --git a/src/files_for_new_functions/arrays_malloc.cu b/src/files_for_new_functions/arrays_malloc.cu
new file mode 100644
index 0000000000..3bbf998231
--- /dev/null
+++ b/src/files_for_new_functions/arrays_malloc.cu
@@ -0,0 +1,363 @@
+#include "cuda/part_gpu.h"
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+
+#include "arrays_malloc.h"
+
+void allocate_host(struct part_soa *parts_soa, int count_max_parts_tmp) {
+ ///////////Malloc Host arrays
+ cudaMallocHost((void **)&parts_soa->tid_p, count_max_parts_tmp * sizeof(int));
+ cudaMallocHost((void **)&parts_soa->id,
+ count_max_parts_tmp * sizeof(long long));
+ cudaMallocHost((void **)&parts_soa->mass,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->h, count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->u, count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->u_dt,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->rho, count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->SPH_sum,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->x_p,
+ count_max_parts_tmp * sizeof(double));
+ cudaMallocHost((void **)&parts_soa->y_p,
+ count_max_parts_tmp * sizeof(double));
+ cudaMallocHost((void **)&parts_soa->z_p,
+ count_max_parts_tmp * sizeof(double));
+ cudaMallocHost((void **)&parts_soa->ux, count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->uy, count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->uz, count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->a_hydrox,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->a_hydroy,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->a_hydroz,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->locx,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->locy,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->locz,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->widthx,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->widthy,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->widthz,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->h_max,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->count_p,
+ count_max_parts_tmp * sizeof(int));
+ cudaMallocHost((void **)&parts_soa->wcount,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->wcount_dh,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->rho_dh,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->rot_ux,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->rot_uy,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->rot_uz,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->div_v,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->div_v_previous_step,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->alpha_visc,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->v_sig,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->laplace_u,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->alpha_diff,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->f, count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->soundspeed,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->h_dt,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->balsara,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->pressure,
+ count_max_parts_tmp * sizeof(float));
+ cudaMallocHost((void **)&parts_soa->alpha_visc_max_ngb,
+ count_max_parts_tmp * sizeof(float));
+ /* timestep stuff */
+ cudaMallocHost((void **)&parts_soa->time_bin,
+ count_max_parts_tmp * sizeof(timebin_t));
+ cudaMallocHost((void **)&parts_soa->wakeup,
+ count_max_parts_tmp * sizeof(timebin_t));
+ cudaMallocHost((void **)&parts_soa->min_ngb_time_bin,
+ count_max_parts_tmp * sizeof(timebin_t));
+ cudaMallocHost((void **)&parts_soa->to_be_synchronized,
+ count_max_parts_tmp * sizeof(char));
+}
+
+void allocate_device(struct part_soa d_parts_soa, int count_max_parts_tmp) {
+ ////////now malloc variables for particle data on the GPU. Sheesh
+ fprintf(stderr, "before malloc\n");
+ cudaMalloc((void **)&(d_parts_soa.tid_p), sizeof(int) * count_max_parts_tmp);
+ fprintf(stderr, "after malloc\n");
+ cudaMalloc((void **)&(d_parts_soa.id),
+ sizeof(long long) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.x_p), sizeof(double) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.y_p), sizeof(double) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.z_p), sizeof(double) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.ux), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.uy), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.uz), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.a_hydrox),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.a_hydroy),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.a_hydroz),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.mass), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.h), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.u), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.u_dt), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.rho), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.SPH_sum),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.locx), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.locy), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.locz), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.widthx),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.widthy),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.widthz),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.h_max),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.count_p),
+ sizeof(int) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.wcount),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.wcount_dh),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.rho_dh),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.rot_ux),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.rot_uy),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.rot_uz),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.div_v),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.div_v_previous_step),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.alpha_visc),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.v_sig),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.laplace_u),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.alpha_diff),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.f), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.soundspeed),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.h_dt), sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.balsara),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.pressure),
+ sizeof(float) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.alpha_visc_max_ngb),
+ sizeof(float) * count_max_parts_tmp);
+ /* timestep stuff */
+ cudaMalloc((void **)&(d_parts_soa.time_bin),
+ sizeof(timebin_t) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.wakeup),
+ sizeof(timebin_t) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.min_ngb_time_bin),
+ sizeof(timebin_t) * count_max_parts_tmp);
+ cudaMalloc((void **)&(d_parts_soa.to_be_synchronized),
+ sizeof(char) * count_max_parts_tmp);
+}
+
+cudaError_t cudaAllocInt(int **d_var, int elements) {
+ return cudaMalloc((void **)d_var, elements * sizeof(int));
+}
+cudaError_t cudaAllocFloat(float **d_var, int elements) {
+ return cudaMalloc((void **)d_var, elements * sizeof(float));
+}
+cudaError_t cudaAllocDouble(double **d_var, int elements) {
+ return cudaMalloc((void **)d_var, elements * sizeof(double));
+}
+cudaError_t cudaAllocLonglong(long long **d_var, int elements) {
+ return cudaMalloc((void **)d_var, elements * sizeof(long long));
+}
+cudaError_t cudaAllocChar(char **d_var, int elements) {
+ return cudaMalloc((void **)d_var, elements * sizeof(char));
+}
+cudaError_t cudaAllocTimebin(timebin_t **d_var, int elements) {
+ return cudaMalloc((void **)d_var, elements * sizeof(timebin_t));
+}
+
+void allocate_device_dirty(
+ int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p,
+ double **d_z_p, float **d_ux, float **d_uy, float **d_uz,
+ float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass,
+ float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx,
+ float **d_locy, float **d_locz, float **d_widthx, float **d_widthy,
+ float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount,
+ float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy,
+ float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step,
+ float **d_alpha_visc, float **d_v_sig, float **d_laplace_u,
+ float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt,
+ float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb,
+ timebin_t **d_time_bin, timebin_t **d_wakeup,
+ timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized,
+ int count_max_parts_tmp) {
+ ////////Malloc variables for particle data on the GPU. Sheesh, that's a lot
+
+ size_t free_byte;
+ size_t total_byte;
+
+ cudaError_t cuda_status = cudaMemGetInfo(&free_byte, &total_byte);
+ double free = (double)free_byte;
+ double available = (double)total_byte;
+ double used = (available - free);
+ // message("free %lf used %lf", free/10.E8, used/10.E8);
+
+ cudaError_t cu_error = cudaAllocInt(d_tid_p, count_max_parts_tmp);
+ cu_error = cudaAllocLonglong(d_id, count_max_parts_tmp);
+ cu_error = cudaAllocDouble(d_x_p, count_max_parts_tmp);
+ cu_error = cudaAllocDouble(d_y_p, count_max_parts_tmp);
+ cu_error = cudaAllocDouble(d_z_p, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_ux, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_uy, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_uz, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_a_hydrox, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_a_hydroy, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_a_hydroz, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_mass, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_h, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_u, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_u_dt, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_rho, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_locx, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_locy, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_locz, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_widthx, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_widthy, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_widthz, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_h_max, count_max_parts_tmp);
+ cu_error = cudaAllocInt(d_count_p, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_wcount, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_wcount_dh, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_rho_dh, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_rot_ux, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_rot_uy, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_rot_uz, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_div_v, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_div_v_previous_step, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_alpha_visc, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_v_sig, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_laplace_u, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_alpha_diff, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_f, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_soundspeed, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_h_dt, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_balsara, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_pressure, count_max_parts_tmp);
+ cu_error = cudaAllocFloat(d_alpha_visc_max_ngb, count_max_parts_tmp);
+ /* timestep stuff */
+ cu_error = cudaAllocTimebin(d_time_bin, count_max_parts_tmp);
+ cu_error = cudaAllocTimebin(d_wakeup, count_max_parts_tmp);
+ cu_error = cudaAllocTimebin(d_min_ngb_time_bin, count_max_parts_tmp);
+ cu_error = cudaAllocChar(d_to_be_synchronized, count_max_parts_tmp);
+// cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
+// double free_end = (double)free_byte;
+// available = (double)total_byte;
+// double used_end = (available - free_end);
+// message("cuda malloc self free %lf GB used %lf GB used to allocate
+// self"
+// " data %lf MB", free_end/10.E8, used_end/10.E8,
+// (used_end - used)/10.E5);
+// message("at end of malloc dirty: %s",
+// cudaGetErrorString(cu_error));
+#ifdef CUDA_DEBUG
+ if (cu_error != cudaSuccess) {
+ fprintf(stderr, "CUDA error at end of malloc dirty: %s\n",
+ cudaGetErrorString(cu_error));
+ exit(0);
+ }
+#endif
+}
+
+void allocate_device_test(int **tid_test, int count_max_parts_tmp) {
+ ////////now malloc variables for particle data on the GPU. Sheesh
+
+ cudaMalloc((void **)tid_test, sizeof(int) * count_max_parts_tmp);
+
+ cudaError_t cu_error = cudaPeekAtLastError(); // Get error code
+ fprintf(stderr, "malloc tid: %s\n", cudaGetErrorString(cu_error));
+
+ if (cu_error != cudaSuccess) {
+ fprintf(stderr, "CUDA error with malloc tid: %s\n",
+ cudaGetErrorString(cu_error));
+ exit(0);
+ }
+}
+/*Function to be overloaded using different part_soa structs
+ * and allocate their internal arrays
+ * alloc_type 0 for density, 1 for force, 2 for gradient*/
+void host_malloc(struct part_soa *parts_soa, int alloc_type,
+ int count_max_parts_tmp) {
+ allocate_host(parts_soa, count_max_parts_tmp);
+}
+/*Function to be overloaded using different part_soa structs
+ * and allocate their internal arrays
+ * alloc_type 0 for density, 1 for force, 2 for gradient*/
+void device_malloc(struct part_soa d_parts_soa, int alloc_type,
+ int count_max_parts_tmp) {
+ allocate_device(d_parts_soa, count_max_parts_tmp);
+}
+
+void device_malloc_dirty(
+ int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p,
+ double **d_z_p, float **d_ux, float **d_uy, float **d_uz,
+ float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass,
+ float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx,
+ float **d_locy, float **d_locz, float **d_widthx, float **d_widthy,
+ float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount,
+ float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy,
+ float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step,
+ float **d_alpha_visc, float **d_v_sig, float **d_laplace_u,
+ float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt,
+ float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb,
+ timebin_t **d_time_bin, timebin_t **d_wakeup,
+ timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized,
+ int count_max_parts_tmp) {
+
+ allocate_device_dirty(
+ d_tid_p, d_id, d_x_p, d_y_p, d_z_p, d_ux, d_uy, d_uz, d_a_hydrox,
+ d_a_hydroy, d_a_hydroz, d_mass, d_h, d_u, d_u_dt, d_rho, d_locx, d_locy,
+ d_locz, d_widthx, d_widthy, d_widthz, d_h_max, d_count_p, d_wcount,
+ d_wcount_dh, d_rho_dh, d_rot_ux, d_rot_uy, d_rot_uz, d_div_v,
+ d_div_v_previous_step, d_alpha_visc, d_v_sig, d_laplace_u, d_alpha_diff,
+ d_f, d_soundspeed, d_h_dt, d_balsara, d_pressure, d_alpha_visc_max_ngb,
+ d_time_bin, d_wakeup, d_min_ngb_time_bin, d_to_be_synchronized,
+ count_max_parts_tmp);
+}
+
+void device_malloc_test(int **tid_test, int count_max_parts_tmp) {
+
+ allocate_device_test(tid_test, count_max_parts_tmp);
+}
+
+#ifdef WITH_CUDA
+}
+#endif
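
The allocators above pair a pinned host buffer (cudaMallocHost) with a device buffer (cudaMalloc) for every field of the particle structure-of-arrays. A minimal, self-contained sketch of that pattern for a single hypothetical field (`h_rho`/`d_rho`, with an arbitrary element count) is shown below, including an error check analogous to the CUDA_DEBUG branch of allocate_device_dirty().

```c
/* Sketch only: pinned-host + device allocation for one illustrative field. */
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const int count_max_parts = 1 << 20; /* illustrative particle count */
  float *h_rho = NULL, *d_rho = NULL;

  /* Pinned host memory, needed for truly asynchronous H2D/D2H copies. */
  cudaError_t err = cudaMallocHost((void **)&h_rho,
                                   count_max_parts * sizeof(float));
  if (err == cudaSuccess)
    err = cudaMalloc((void **)&d_rho, count_max_parts * sizeof(float));

  /* Error check in the spirit of the CUDA_DEBUG branch above. */
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA allocation error: %s\n", cudaGetErrorString(err));
    exit(1);
  }

  cudaFree(d_rho);
  cudaFreeHost(h_rho);
  return 0;
}
```
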
diff --git a/src/files_for_new_functions/arrays_malloc.h b/src/files_for_new_functions/arrays_malloc.h
new file mode 100644
index 0000000000..1107b51444
--- /dev/null
+++ b/src/files_for_new_functions/arrays_malloc.h
@@ -0,0 +1,64 @@
+#include "cuda/part_gpu.h"
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+cudaError_t cudaAllocInt(int **d_var, int elements);
+cudaError_t cudaAllocFloat(float **d_var, int elements);
+cudaError_t cudaAllocDouble(double **d_var, int elements);
+cudaError_t cudaAllocLonglong(long long **d_var, int elements);
+cudaError_t cudaAllocChar(char **d_var, int elements);
+cudaError_t cudaAllocTimebin(timebin_t **d_var, int elements);
+
+void allocate_host(struct part_soa *parts_soa, int count_max_parts_tmp);
+
+void allocate_device(struct part_soa d_parts_soa, int count_max_parts_tmp);
+
+void allocate_device_dirty(
+ int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p,
+ double **d_z_p, float **d_ux, float **d_uy, float **d_uz,
+ float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass,
+ float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx,
+ float **d_locy, float **d_locz, float **d_widthx, float **d_widthy,
+ float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount,
+ float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy,
+ float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step,
+ float **d_alpha_visc, float **d_v_sig, float **d_laplace_u,
+ float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt,
+ float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb,
+ timebin_t **d_time_bin, timebin_t **d_wakeup,
+ timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized,
+ int count_max_parts_tmp);
+
+void allocate_device_test(int **tid_test, int count_max_parts_tmp);
+/*Function to be overloaded using different part_soa structs
+ * and allocate their internal arrays
+ * alloc_type 0 for density, 1 for force, 2 for gradient*/
+void host_malloc(struct part_soa *parts_soa, int alloc_type,
+ int count_max_parts_tmp);
+
+/*Function to be overloaded using different part_soa structs
+ * and allocate their internal arrays
+ * alloc_type 0 for density, 1 for force, 2 for gradient*/
+void device_malloc(struct part_soa d_parts_soa, int alloc_type,
+ int count_max_parts_tmp);
+
+void device_malloc_dirty(
+ int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p,
+ double **d_z_p, float **d_ux, float **d_uy, float **d_uz,
+ float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass,
+ float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx,
+ float **d_locy, float **d_locz, float **d_widthx, float **d_widthy,
+ float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount,
+ float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy,
+ float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step,
+ float **d_alpha_visc, float **d_v_sig, float **d_laplace_u,
+ float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt,
+ float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb,
+ timebin_t **d_time_bin, timebin_t **d_wakeup,
+ timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized,
+ int count_max_parts_tmp);
+
+void device_malloc_test(int **tid_test, int count_max_parts_tmp);
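
The header also exposes thin typed wrappers around cudaMalloc (cudaAllocInt, cudaAllocFloat, and friends), which keep allocate_device_dirty() readable. The standalone sketch below reproduces the same pattern; the wrapper name `alloc_float`, the buffer name and the element count are illustrative only, not part of the header.

```c
/* Sketch only: a typed cudaMalloc wrapper in the style of cudaAllocFloat. */
#include <cuda_runtime.h>
#include <stdio.h>

static cudaError_t alloc_float(float **d_var, int elements) {
  return cudaMalloc((void **)d_var, elements * sizeof(float));
}

int main(void) {
  float *d_wcount = NULL;
  const cudaError_t err = alloc_float(&d_wcount, 4096); /* illustrative size */
  if (err != cudaSuccess)
    fprintf(stderr, "allocation failed: %s\n", cudaGetErrorString(err));
  cudaFree(d_wcount);
  return 0;
}
```
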
diff --git a/src/files_for_new_functions/host_device_data_transfer.cu b/src/files_for_new_functions/host_device_data_transfer.cu
new file mode 100644
index 0000000000..ede719529b
--- /dev/null
+++ b/src/files_for_new_functions/host_device_data_transfer.cu
@@ -0,0 +1,566 @@
+#include "cuda/part_gpu.h"
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+
+void host2device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp) {
+ // int * tid_h;
+ // cudaMallocHost((void **)&tid_h,
+ // count_max_parts_tmp * sizeof(int));
+ for (int i = 0; i < count_max_parts_tmp; i++) {
+ tid_h[i] = 100;
+ // fprintf(stderr,"tid_h %i\n", tid_h[i]);
+ }
+
+ cudaMemcpy(d_tid_p, tid_h, count_max_parts_tmp * sizeof(int),
+ cudaMemcpyHostToDevice);
+ cudaDeviceSynchronize();
+ // cudaFree(tid_h);
+}
+
+void device2host_test(struct part_soa parts_soa, int *tid_h,
+ int count_max_parts_tmp) {
+ int *tid_p = parts_soa.tid_p;
+ cudaMemcpy(tid_h, tid_p, count_max_parts_tmp * sizeof(int),
+ cudaMemcpyDeviceToHost);
+ for (int i = 0; i < count_max_parts_tmp; i++) {
+ fprintf(stderr, "tid is %i\n", tid_h[i]);
+ }
+}
+
+void device2device_test(int *tid_p, struct part_soa parts_soa,
+ int count_max_parts_tmp) {
+ cudaMemcpy(tid_p, parts_soa.tid_p, sizeof(int *), cudaMemcpyHostToDevice);
+}
+
+/* Test wrappers exercising host<->device transfers of the tid array */
+void host_device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp) {
+
+ host2device_test(d_tid_p, tid_h, count_max_parts_tmp);
+}
+
+void device_host_test(struct part_soa parts_soa, int *tid_h,
+ int count_max_parts_tmp) {
+
+ device2host_test(parts_soa, tid_h, count_max_parts_tmp);
+}
+
+void device_device_test(int *tid_p, struct part_soa parts_soa,
+ int count_max_parts_tmp) {
+
+ device2device_test(tid_p, parts_soa, count_max_parts_tmp);
+}
+
+void device2host_density(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized,
+ int count_max_parts_tmp) {
+ cudaMemcpy(parts_soa_buffer.tid_p, tid_p, count_max_parts_tmp * sizeof(int),
+ cudaMemcpyDeviceToHost);
+}
+void device_host_cpy(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized,
+ int count_max_parts_tmp) {
+
+ device2host_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz,
+ a_hydrox, a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx,
+ locy, locz, widthx, widthy, widthz, h_max, count_p,
+ wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, div_v,
+ div_v_previous_step, alpha_visc, v_sig, laplace_u,
+ alpha_diff, f, soundspeed, h_dt, balsara, pressure,
+ alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin,
+ to_be_synchronized, count_max_parts_tmp);
+}
+
+void device2device_density(
+ struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized,
+ int count_max_parts_tmp, cudaStream_t stream) {
+
+ cudaMemcpyAsync(&(parts_soa_buffer->tid_p), &tid_p, sizeof(int *),
+ cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&(parts_soa_buffer->locx), &locx, sizeof(float *),
+ cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&(parts_soa_buffer->locy), &locy, sizeof(float *),
+ cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&(parts_soa_buffer->locz), &locz, sizeof(float *),
+ cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&(parts_soa_buffer->h), &h, sizeof(float *),
+ cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&(parts_soa_buffer->mass), &mass, sizeof(float *),
+ cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&(parts_soa_buffer->x_p), &x_p, sizeof(double *),
+ cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&(parts_soa_buffer->y_p), &y_p, sizeof(double *),
+ cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&(parts_soa_buffer->z_p), &z_p, sizeof(double *),
+ cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&(parts_soa_buffer->ux), &ux, sizeof(float *),
+ cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&(parts_soa_buffer->uy), &uy, sizeof(float *),
+ cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&(parts_soa_buffer->uz), &uz, sizeof(float *),
+ cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&(parts_soa_buffer->time_bin), &time_bin, sizeof(timebin_t *),
+ cudaMemcpyHostToDevice, stream);
+}
+
+void host2device_density(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized,
+ int count_max_parts_tmp) {
+ cudaError_t cu_error;
+  cudaMemcpy(tid_p, parts_soa_buffer.tid_p,
+             count_max_parts_tmp * sizeof(int), cudaMemcpyHostToDevice);
+}
+
+/*Function to be overloaded using different part_soa structs
+ * and allocate their internal arrays
+ * alloc_type 0 for density, 1 for force, 2 for gradient*/
+void host_device_cpy(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized,
+ int count_max_parts_tmp) {
+
+ host2device_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz,
+ a_hydrox, a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx,
+ locy, locz, widthx, widthy, widthz, h_max, count_p,
+ wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, div_v,
+ div_v_previous_step, alpha_visc, v_sig, laplace_u,
+ alpha_diff, f, soundspeed, h_dt, balsara, pressure,
+ alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin,
+ to_be_synchronized, count_max_parts_tmp);
+}
+
+/*Function to be overloaded using different part_soa structs
+ * and allocate their internal arrays
+ * alloc_type 0 for density, 1 for force, 2 for gradient*/
+void device_device_bind(
+ struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized,
+ int count_max_parts_tmp, cudaStream_t stream) {
+
+ device2device_density(
+ parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox,
+ a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx,
+ widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy,
+ rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u,
+ alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb,
+ time_bin, wakeup, min_ngb_time_bin, to_be_synchronized,
+ count_max_parts_tmp, stream);
+}
+
+void host2device_async_density(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream) {
+ cudaError_t cu_error;
+ cudaMemcpyAsync(&tid_p[first_part_tmp],
+ &(parts_soa_buffer.tid_p[first_part_tmp]),
+ bundle_n_parts * sizeof(int), cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(
+ &locx[first_part_tmp], &(parts_soa_buffer.locx[first_part_tmp]),
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(
+ &locy[first_part_tmp], &(parts_soa_buffer.locy[first_part_tmp]),
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&locz[first_part_tmp], &parts_soa_buffer.locz[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&h[first_part_tmp], &parts_soa_buffer.h[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&mass[first_part_tmp], &parts_soa_buffer.mass[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&x_p[first_part_tmp], &parts_soa_buffer.x_p[first_part_tmp],
+ bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&y_p[first_part_tmp], &parts_soa_buffer.y_p[first_part_tmp],
+ bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&z_p[first_part_tmp], &parts_soa_buffer.z_p[first_part_tmp],
+ bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&ux[first_part_tmp], &parts_soa_buffer.ux[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&uy[first_part_tmp], &parts_soa_buffer.uy[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&uz[first_part_tmp], &parts_soa_buffer.uz[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(
+ &time_bin[first_part_tmp], &parts_soa_buffer.time_bin[first_part_tmp],
+ bundle_n_parts * sizeof(timebin_t), cudaMemcpyHostToDevice, stream);
+}
+
+void host2device_async_density_pair(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream) {
+
+ // int bundle_n_parts = bundle_n_parts_i + bundle_n_parts_j;
+ cudaError_t cu_error;
+ // cudaMemcpyAsync(&tid_p[first_part_tmp],
+ // &(parts_soa_buffer.tid_p[first_part_tmp]),
+ // bundle_n_parts * sizeof(int),
+ // cudaMemcpyHostToDevice, stream);
+ // cudaMemcpyAsync(&locx[first_part_tmp],
+ // &(parts_soa_buffer.locx[first_part_tmp]),
+ // bundle_n_parts * sizeof(float),
+ // cudaMemcpyHostToDevice, stream);
+ // cudaMemcpyAsync(&locy[first_part_tmp],
+ // &(parts_soa_buffer.locy[first_part_tmp]),
+ // bundle_n_parts * sizeof(float),
+ // cudaMemcpyHostToDevice, stream);
+ // cudaMemcpyAsync(&locz[first_part_tmp],
+ // &parts_soa_buffer.locz[first_part_tmp],
+ // bundle_n_parts * sizeof(float),
+ // cudaMemcpyHostToDevice, stream);
+ cudaMemcpyAsync(&h[first_part_tmp], &parts_soa_buffer.h[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&mass[first_part_tmp], &parts_soa_buffer.mass[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&x_p[first_part_tmp], &parts_soa_buffer.x_p[first_part_tmp],
+ bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&y_p[first_part_tmp], &parts_soa_buffer.y_p[first_part_tmp],
+ bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&z_p[first_part_tmp], &parts_soa_buffer.z_p[first_part_tmp],
+ bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&ux[first_part_tmp], &parts_soa_buffer.ux[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&uy[first_part_tmp], &parts_soa_buffer.uy[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(&uz[first_part_tmp], &parts_soa_buffer.uz[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice,
+ stream);
+ cudaMemcpyAsync(
+ &time_bin[first_part_tmp], &parts_soa_buffer.time_bin[first_part_tmp],
+ bundle_n_parts * sizeof(timebin_t), cudaMemcpyHostToDevice, stream);
+}
+
+/*Function to be overloaded using different part_soa structs
+ * and allocate their internal arrays
+ * alloc_type 0 for density, 1 for force, 2 for gradient*/
+void host_device_async_cpy(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream) {
+
+ host2device_async_density(
+ parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox,
+ a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx,
+ widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy,
+ rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u,
+ alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb,
+ time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp,
+ bundle_n_parts, stream);
+}
+
+/*Function to be overloaded using different part_soa structs
+ * and allocate their internal arrays
+ * alloc_type 0 for density, 1 for force, 2 for gradient*/
+void host_device_async_cpy_pair(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp_i,
+ int bundle_n_parts, cudaStream_t stream) {
+
+ host2device_async_density_pair(
+ parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox,
+ a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx,
+ widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy,
+ rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u,
+ alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb,
+ time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp_i,
+ bundle_n_parts, stream);
+}
+
+void device2host_async_density(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream) {
+ cudaError_t cu_error;
+
+ cudaMemcpyAsync(&parts_soa_buffer.rho[first_part_tmp], &rho[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyDeviceToHost,
+ stream);
+ cudaMemcpyAsync(&parts_soa_buffer.rho_dh[first_part_tmp],
+ &rho_dh[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyAsync(&parts_soa_buffer.wcount[first_part_tmp],
+ &wcount[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyAsync(&parts_soa_buffer.wcount_dh[first_part_tmp],
+ &wcount_dh[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyAsync(&parts_soa_buffer.div_v[first_part_tmp],
+ &div_v[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyAsync(&parts_soa_buffer.rot_ux[first_part_tmp],
+ &rot_ux[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyAsync(&parts_soa_buffer.rot_uy[first_part_tmp],
+ &rot_uy[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyAsync(&parts_soa_buffer.rot_uz[first_part_tmp],
+ &rot_uz[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+}
+
+void device2host_async_density_pair(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream) {
+ cudaError_t cu_error;
+  /* For pair tasks the bundle spans the particles of both cells, so
+     bundle_n_parts here is the combined count of cells i and j. */
+
+ cudaMemcpyAsync(&parts_soa_buffer.rho[first_part_tmp], &rho[first_part_tmp],
+ bundle_n_parts * sizeof(float), cudaMemcpyDeviceToHost,
+ stream);
+ cudaMemcpyAsync(&parts_soa_buffer.rho_dh[first_part_tmp],
+ &rho_dh[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyAsync(&parts_soa_buffer.wcount[first_part_tmp],
+ &wcount[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyAsync(&parts_soa_buffer.wcount_dh[first_part_tmp],
+ &wcount_dh[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyAsync(&parts_soa_buffer.div_v[first_part_tmp],
+ &div_v[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyAsync(&parts_soa_buffer.rot_ux[first_part_tmp],
+ &rot_ux[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyAsync(&parts_soa_buffer.rot_uy[first_part_tmp],
+ &rot_uy[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyAsync(&parts_soa_buffer.rot_uz[first_part_tmp],
+ &rot_uz[first_part_tmp], bundle_n_parts * sizeof(float),
+ cudaMemcpyDeviceToHost, stream);
+}
+
+/* Wrapper around device2host_async_density(): asynchronously copies the
+ * density results for one bundle of particles back from the device. */
+void device_host_async_cpy(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream) {
+
+ device2host_async_density(
+ parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox,
+ a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx,
+ widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy,
+ rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u,
+ alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb,
+ time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp,
+ bundle_n_parts, stream);
+}
+
+/* Pair-task variant of device_host_async_cpy(): forwards to
+ * device2host_async_density_pair() for one bundle of particles. */
+void device_host_async_cpy_pair(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream) {
+
+ device2host_async_density_pair(
+ parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox,
+ a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx,
+ widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy,
+ rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u,
+ alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb,
+ time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp,
+ bundle_n_parts, stream);
+}
+
+/* Binds individual device arrays for the density loop into a part_soa
+ * struct; only pointers are assigned, no data is copied. */
+void device_device_async_bind(
+ struct part_soa *parts_soa, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized) {
+
+ parts_soa->tid_p = tid_p;
+ parts_soa->locx = locx;
+ parts_soa->locy = locy;
+ parts_soa->locz = locz;
+ parts_soa->h = h;
+ parts_soa->mass = mass;
+ parts_soa->x_p = x_p;
+ parts_soa->y_p = y_p;
+ parts_soa->z_p = z_p;
+ parts_soa->rho = rho;
+ parts_soa->rho_dh = rho_dh;
+ parts_soa->wcount = wcount;
+ parts_soa->wcount_dh = wcount_dh;
+ parts_soa->ux = ux;
+ parts_soa->uy = uy;
+ parts_soa->uz = uz;
+ parts_soa->div_v = div_v;
+ parts_soa->rot_ux = rot_ux;
+ parts_soa->rot_uy = rot_uy;
+ parts_soa->rot_uz = rot_uz;
+ parts_soa->time_bin = time_bin;
+}
+
+#ifdef WITH_CUDA
+}
+#endif
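
For reference, every copy routine above follows the same pattern: one cudaMemcpyAsync per SoA field, offset to the bundle's first particle and queued on a per-bundle stream, with the stream synchronised before the results are unpacked. A minimal sketch of that pattern, using placeholder array and stream names that are not part of the patch:

#include <cuda_runtime.h>

/* Copy the density results of one bundle back to the host and wait for the
 * transfers to complete. h_* are (ideally pinned) host arrays and d_* device
 * arrays; the names are placeholders for this sketch. */
static void copy_density_bundle_back(float *h_rho, const float *d_rho,
                                     float *h_wcount, const float *d_wcount,
                                     int first_part, int n_parts,
                                     cudaStream_t stream) {
  cudaMemcpyAsync(&h_rho[first_part], &d_rho[first_part],
                  n_parts * sizeof(float), cudaMemcpyDeviceToHost, stream);
  cudaMemcpyAsync(&h_wcount[first_part], &d_wcount[first_part],
                  n_parts * sizeof(float), cudaMemcpyDeviceToHost, stream);
  /* Block until this bundle's copies have landed on the host. */
  cudaStreamSynchronize(stream);
}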
diff --git a/src/files_for_new_functions/host_device_data_transfer.h b/src/files_for_new_functions/host_device_data_transfer.h
new file mode 100644
index 0000000000..204afd51fa
--- /dev/null
+++ b/src/files_for_new_functions/host_device_data_transfer.h
@@ -0,0 +1,234 @@
+#include "cuda/part_gpu.h"
+
+#include
+#include
+#include
+#include
+
+void host2device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp);
+
+void device2host_test(struct part_soa parts_soa, int *tid_h,
+ int count_max_parts_tmp);
+
+void device2device_test(int *tid_p, struct part_soa parts_soa,
+ int count_max_parts_tmp);
+
+/* Simple host<->device transfer test wrappers. */
+void host_device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp);
+
+void device_host_test(struct part_soa parts_soa, int *tid_h,
+ int count_max_parts_tmp);
+
+void device_device_test(int *tid_p, struct part_soa parts_soa,
+ int count_max_parts_tmp);
+
+void device2host_density(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized,
+ int count_max_parts_tmp);
+
+void device_host_cpy(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized,
+ int count_max_parts_tmp);
+
+void device2device_density(
+ struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized,
+ int count_max_parts_tmp, cudaStream_t stream);
+
+void host2device_density(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized,
+ int count_max_parts_tmp);
+
+/* Wrapper selecting the blocking host-to-device copy routine for the
+ * particle arrays (see host2device_density above). */
+void host_device_cpy(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized,
+ int count_max_parts_tmp);
+
+/* Binds individual device arrays into a part_soa struct on the device side;
+ * only pointers are assigned, no data is copied. */
+void device_device_bind(
+ struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized,
+ int count_max_parts_tmp, cudaStream_t stream);
+
+void host2device_async_density(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream);
+
+/* Wrapper selecting the asynchronous host-to-device copy routine for one
+ * bundle of particles (see host2device_async_density above). */
+void host_device_async_cpy(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream);
+
+void device2host_async_density(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream);
+/* Wrapper around device2host_async_density(): asynchronously copies the
+ * density results for one bundle of particles back from the device. */
+void device_host_async_cpy(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream);
+
+/* Binds individual device arrays into a part_soa struct so the kernels can
+ * take a single argument; only pointers are assigned, no data is copied. */
+void device_device_async_bind(
+ struct part_soa *parts_soa, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized);
+
+void host_device_async_cpy_pair(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream);
+
+void device_host_async_cpy_pair(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+ int bundle_n_parts, cudaStream_t stream);
+
+void device2host_async_density_pair(
+ struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox,
+ float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u,
+ float *u_dt, float *rho, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max, int *count_p,
+ float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux,
+ float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure,
+ float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp,
+    int bundle_n_parts, cudaStream_t stream);
diff --git a/src/hip/BLOCK_SIZE.h b/src/hip/BLOCK_SIZE.h
new file mode 100644
index 0000000000..d36e10b99b
--- /dev/null
+++ b/src/hip/BLOCK_SIZE.h
@@ -0,0 +1,10 @@
+#ifndef BLOCK_SIZE_H
+#define BLOCK_SIZE_H
+#ifdef WITH_CUDA
+// extern "C" {
+#endif
+#define BLOCK_SIZE 512
+#ifdef WITH_CUDA
+//}
+#endif
+#endif // BLOCK_SIZE_H
diff --git a/src/hip/Data_and_GPU_prep_functions.cu b/src/hip/Data_and_GPU_prep_functions.cu
new file mode 100644
index 0000000000..57cbe0ad7c
--- /dev/null
+++ b/src/hip/Data_and_GPU_prep_functions.cu
@@ -0,0 +1,229 @@
+/*
+ * Data_and_GPU_prep_functions.cu
+ *
+ * Created on: 17 Apr 2022
+ * Author: abouzied
+ */
+
+/*ifdef WITH_CUDA prevents name mangling. C code sees exact names
+ of functions rather than mangled template names produced by C++*/
+// #ifdef WITH_CUDA
+// extern "C"{
+// #endif
+
+// #include "cuda/cuda_headers.h"
+// #include "device_functions.h"
+// #include "cuda/cell_gpu.h"
+#include
+#include
+// #include "../config.h"
+
+void populate_parts_list(struct cell *ci, struct part_gpu *parts) {
+ ////////////////////////////////////////////
+ ///*****Copy variables for cell i (self interaction)*****/
+ int count = ci->hydro.count;
+
+ // fprintf(stderr,"Tester 111\n");
+ for (int p = 0; p < count; p++) {
+
+ parts[p].id = ci->hydro.parts[p].id;
+
+ // fprintf(stderr,"Tester 222\n");
+ parts[p].count = count;
+ parts[p].h_max = ci->hydro.h_max;
+
+ for (int d = 0; d < 3; d++) {
+ parts[p].x[d] = ci->hydro.parts[p].x[d];
+ parts[p].v[d] = ci->hydro.parts[p].v[d];
+ parts[p].a_hydro[d] = ci->hydro.parts[p].a_hydro[d];
+ parts[p].loc[d] = ci->loc[d];
+ }
+ parts[p].mass = ci->hydro.parts[p].mass;
+ parts[p].h = ci->hydro.parts[p].h;
+ parts[p].u = ci->hydro.parts[p].u;
+ parts[p].u_dt = ci->hydro.parts[p].u_dt;
+ parts[p].rho = ci->hydro.parts[p].rho;
+ parts[p].div_v = ci->hydro.parts[p].viscosity.div_v;
+ parts[p].div_v_previous_step =
+ ci->hydro.parts[p].viscosity.div_v_previous_step;
+ parts[p].alpha_visc = ci->hydro.parts[p].viscosity.alpha;
+ parts[p].v_sig = ci->hydro.parts[p].viscosity.v_sig;
+ parts[p].laplace_u = ci->hydro.parts[p].diffusion.laplace_u;
+ parts[p].alpha_diff = ci->hydro.parts[p].diffusion.alpha;
+ parts[p].f = ci->hydro.parts[p].force.f;
+ parts[p].soundspeed = ci->hydro.parts[p].force.soundspeed;
+ parts[p].h_dt = ci->hydro.parts[p].force.h_dt;
+ parts[p].balsara = ci->hydro.parts[p].force.balsara;
+ parts[p].pressure = ci->hydro.parts[p].force.pressure;
+ parts[p].time_bin = ci->hydro.parts[p].time_bin;
+ parts[p].wakeup = ci->hydro.parts[p].limiter_data.wakeup;
+ parts[p].min_ngb_time_bin =
+ ci->hydro.parts[p].limiter_data.min_ngb_time_bin;
+ parts[p].to_be_synchronized =
+ ci->hydro.parts[p].limiter_data.to_be_synchronized;
+ parts[p].wcount = ci->hydro.parts[p].density.wcount;
+ parts[p].wcount_dh = ci->hydro.parts[p].density.wcount_dh;
+ parts[p].rho_dh = ci->hydro.parts[p].density.rho_dh;
+ parts[p].div_v = ci->hydro.parts[p].viscosity.div_v;
+ parts[p].rot_v[0] = ci->hydro.parts[p].density.rot_v[0];
+ parts[p].rot_v[1] = ci->hydro.parts[p].density.rot_v[1];
+ parts[p].rot_v[2] = ci->hydro.parts[p].density.rot_v[2];
+ parts[p].SPH_sum = 0.f;
+ }
+}
+
+void populate_parts_list_soa(
+ int count_all_parts, struct cell *ci, int first_part_tmp, int count,
+ int tid, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p,
+ float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy,
+ float *a_hydroz, float *mass, float *h, float *u, float *u_dt, float *rho,
+ float *SPH_sum, float *locx, float *locy, float *locz, float *widthx,
+ float *widthy, float *widthz, float *h_max, int *count_p, float *wcount,
+ float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, float *rot_w,
+ float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig,
+ float *laplace_u, float *alpha_diff, float *f, float *soundspeed,
+ float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb,
+ timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin,
+ char *to_be_synchronized) {
+ ////////////////////////////////////////////
+ struct part *ptmps;
+ ptmps = ci->hydro.parts;
+ // fprintf(stderr,"Tester 111\n");
+#pragma unroll
+ for (int p = 0; p < count; p++) {
+ int p_gid = p + first_part_tmp;
+ // if(p_gid>=count_all_parts){
+ // fprintf(stderr,"p>all parts");
+ // exit(0);
+ // }
+ id[p_gid] = ptmps[p].id;
+ count_p[p_gid] = count;
+ tid_p[p_gid] = tid;
+ h_max[p_gid] = ci->hydro.h_max;
+ x_p[p_gid] = ptmps[p].x[0];
+ y_p[p_gid] = ptmps[p].x[1];
+ z_p[p_gid] = ptmps[p].x[2];
+ ux[p_gid] = ptmps[p].v[0];
+ uy[p_gid] = ptmps[p].v[1];
+ uz[p_gid] = ptmps[p].v[2];
+ a_hydrox[p_gid] = ptmps[p].a_hydro[0];
+ a_hydroy[p_gid] = ptmps[p].a_hydro[1];
+ a_hydroz[p_gid] = ptmps[p].a_hydro[2];
+ locx[p_gid] = ci->loc[0];
+ locy[p_gid] = ci->loc[1];
+ locz[p_gid] = ci->loc[2];
+
+ mass[p_gid] = ptmps[p].mass;
+ h[p_gid] = ptmps[p].h;
+ u[p_gid] = ptmps[p].u;
+ u_dt[p_gid] = ptmps[p].u_dt;
+ rho[p_gid] = ptmps[p].rho;
+ div_v[p_gid] = ptmps[p].viscosity.div_v;
+ div_v_previous_step[p_gid] = ptmps[p].viscosity.div_v_previous_step;
+ alpha_visc[p_gid] = ptmps[p].viscosity.alpha;
+ v_sig[p_gid] = ptmps[p].viscosity.v_sig;
+ laplace_u[p_gid] = ptmps[p].diffusion.laplace_u;
+ alpha_diff[p_gid] = ptmps[p].diffusion.alpha;
+ f[p_gid] = ptmps[p].force.f;
+ soundspeed[p_gid] = ptmps[p].force.soundspeed;
+ h_dt[p_gid] = ptmps[p].force.h_dt;
+ balsara[p_gid] = ptmps[p].force.balsara;
+ pressure[p_gid] = ptmps[p].force.pressure;
+ time_bin[p_gid] = ptmps[p].time_bin;
+ wakeup[p_gid] = ptmps[p].limiter_data.wakeup;
+ min_ngb_time_bin[p_gid] = ptmps[p].limiter_data.min_ngb_time_bin;
+ to_be_synchronized[p_gid] = ptmps[p].limiter_data.to_be_synchronized;
+ wcount[p_gid] = ptmps[p].density.wcount;
+ wcount_dh[p_gid] = ptmps[p].density.wcount_dh;
+ rho_dh[p_gid] = ptmps[p].density.rho_dh;
+ div_v[p_gid] = ptmps[p].viscosity.div_v;
+ rot_u[p_gid] = ptmps[p].density.rot_v[0];
+ rot_v[p_gid] = ptmps[p].density.rot_v[1];
+ rot_w[p_gid] = ptmps[p].density.rot_v[2];
+ SPH_sum[p_gid] = 0.f;
+ // fprintf(stderr,"tid is %i\n",tid_p[p]);
+ // fprintf(stderr,"Tester 222, count=%i, p=%i\n", count,
+ // id[p_gid]);
+ }
+}
+
+void pack_data_soa(int count_all_parts, struct cell *ci, int first_part_tmp,
+ int count, int tid, int *tid_p, long long *id, double *x_p,
+ double *y_p, double *z_p, float *ux, float *uy, float *uz,
+ float *a_hydrox, float *a_hydroy, float *a_hydroz,
+ float *mass, float *h, float *u, float *u_dt, float *rho,
+ float *SPH_sum, float *locx, float *locy, float *locz,
+ float *widthx, float *widthy, float *widthz, float *h_max,
+ int *count_p, float *wcount, float *wcount_dh, float *rho_dh,
+ float *rot_u, float *rot_v, float *rot_w, float *div_v,
+ float *div_v_previous_step, float *alpha_visc, float *v_sig,
+ float *laplace_u, float *alpha_diff, float *f,
+ float *soundspeed, float *h_dt, float *balsara,
+ float *pressure, float *alpha_visc_max_ngb,
+ timebin_t *time_bin, timebin_t *wakeup,
+ timebin_t *min_ngb_time_bin, char *to_be_synchronized) {
+ ////////////////////////////////////////////
+ struct part *ptmps;
+ ptmps = ci->hydro.parts;
+ // fprintf(stderr,"Tester 111\n");
+#pragma unroll
+ for (int p = 0; p < count; p++) {
+ int p_gid = p + first_part_tmp;
+ // if(p_gid>=count_all_parts){
+ // fprintf(stderr,"p>all parts");
+ // exit(0);
+ // }
+ id[p_gid] = ptmps[p].id;
+ count_p[p_gid] = count;
+ tid_p[p_gid] = tid;
+ h_max[p_gid] = ci->hydro.h_max;
+ x_p[p_gid] = ptmps[p].x[0];
+ y_p[p_gid] = ptmps[p].x[1];
+ z_p[p_gid] = ptmps[p].x[2];
+ ux[p_gid] = ptmps[p].v[0];
+ uy[p_gid] = ptmps[p].v[1];
+ uz[p_gid] = ptmps[p].v[2];
+ a_hydrox[p_gid] = ptmps[p].a_hydro[0];
+ a_hydroy[p_gid] = ptmps[p].a_hydro[1];
+ a_hydroz[p_gid] = ptmps[p].a_hydro[2];
+ locx[p_gid] = ci->loc[0];
+ locy[p_gid] = ci->loc[1];
+ locz[p_gid] = ci->loc[2];
+
+ mass[p_gid] = ptmps[p].mass;
+ h[p_gid] = ptmps[p].h;
+ u[p_gid] = ptmps[p].u;
+ u_dt[p_gid] = ptmps[p].u_dt;
+ rho[p_gid] = ptmps[p].rho;
+ div_v[p_gid] = ptmps[p].viscosity.div_v;
+ div_v_previous_step[p_gid] = ptmps[p].viscosity.div_v_previous_step;
+ alpha_visc[p_gid] = ptmps[p].viscosity.alpha;
+ v_sig[p_gid] = ptmps[p].viscosity.v_sig;
+ laplace_u[p_gid] = ptmps[p].diffusion.laplace_u;
+ alpha_diff[p_gid] = ptmps[p].diffusion.alpha;
+ f[p_gid] = ptmps[p].force.f;
+ soundspeed[p_gid] = ptmps[p].force.soundspeed;
+ h_dt[p_gid] = ptmps[p].force.h_dt;
+ balsara[p_gid] = ptmps[p].force.balsara;
+ pressure[p_gid] = ptmps[p].force.pressure;
+ time_bin[p_gid] = ptmps[p].time_bin;
+ wakeup[p_gid] = ptmps[p].limiter_data.wakeup;
+ min_ngb_time_bin[p_gid] = ptmps[p].limiter_data.min_ngb_time_bin;
+ to_be_synchronized[p_gid] = ptmps[p].limiter_data.to_be_synchronized;
+ wcount[p_gid] = ptmps[p].density.wcount;
+ wcount_dh[p_gid] = ptmps[p].density.wcount_dh;
+ rho_dh[p_gid] = ptmps[p].density.rho_dh;
+ div_v[p_gid] = ptmps[p].viscosity.div_v;
+ rot_u[p_gid] = ptmps[p].density.rot_v[0];
+ rot_v[p_gid] = ptmps[p].density.rot_v[1];
+ rot_w[p_gid] = ptmps[p].density.rot_v[2];
+ SPH_sum[p_gid] = 0.f;
+ // fprintf(stderr,"tid is %i\n",tid_p[p]);
+ // fprintf(stderr,"Tester 222, count=%i, p=%i\n", count,
+ // id[p_gid]);
+ }
+}
+
+// #ifdef WITH_CUDA
+// }
+// #endif
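
Both packing routines above write each cell's particles at a running global offset (p_gid = p + first_part_tmp) so all tasks in a bundle share one flat set of SoA buffers. A stripped-down sketch of that bookkeeping, reduced to a single field and assuming the usual SWIFT struct cell/part definitions from ../cell.h:

#include "../cell.h"

/* Pack the particle IDs of a list of cells contiguously into one flat array;
 * first_part[t] records where task t starts, mirroring first_part_tmp above. */
static void pack_ids_flat(struct cell **cells, int count_tasks, long long *id,
                          int *first_part) {
  int offset = 0;
  for (int t = 0; t < count_tasks; t++) {
    first_part[t] = offset;
    for (int p = 0; p < cells[t]->hydro.count; p++)
      id[offset + p] = cells[t]->hydro.parts[p].id;
    offset += cells[t]->hydro.count;
  }
}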
diff --git a/src/hip/HIP_runner_functions.h b/src/hip/HIP_runner_functions.h
new file mode 100644
index 0000000000..43a52f96ed
--- /dev/null
+++ b/src/hip/HIP_runner_functions.h
@@ -0,0 +1,22 @@
+#ifndef HIP_RUNNER_FUNCTIONS_H
+#define HIP_RUNNER_FUNCTIONS_H
+#define n_streams 1024
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "part_gpu.h"
+void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part,
+ int *d_task_last_part, int *d_bundle_first_part,
+ int *d_bundle_last_part, float d_a, float d_H,
+ const char *loop_type, hipStream_t stream, int bid,
+ int block_size, int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y, int tid,
+ int offset, int bundle_first_task, int max_parts,
+ int max_active_bin);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // HIP_RUNNER_FUNCTIONS_H
diff --git a/src/hip/HIP_runner_functions.hip b/src/hip/HIP_runner_functions.hip
new file mode 100755
index 0000000000..634c67a9ad
--- /dev/null
+++ b/src/hip/HIP_runner_functions.hip
@@ -0,0 +1,229 @@
+#include "hip/hip_runtime.h"
+/*******************************************************************************
+ * This file contains functions used to set up and execute GPU tasks from
+ * within runner_main.c. Consider this a translator that allows HIP-based
+ * functions to be called from within runner_main.c.
+ ******************************************************************************/
+
+/* Hacky method to make c++ compilers not die. */
+#ifdef WITH_HIP
+#ifndef static
+#define static
+#endif
+#ifndef restrict
+#define restrict __restrict__
+#endif
+#endif
+
+/* Required header files */
+#include
+/*ifdef WITH_HIP prevents name mangling. C code sees exact names
+ of functions rather than mangled template names produced by C++*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "../../config.h"
+#include "BLOCK_SIZE.h"
+#include "HIP_runner_functions.h"
+#include "hip/device_functions.h"
+#include "part_gpu.h"
+
+void Initialise_GPU() {
+ int devId = 0;
+ // find and print device name
+ hipDeviceProp_t prop;
+ hipGetDeviceProperties(&prop, devId);
+ printf("Device : %s\n", prop.name);
+ hipSetDevice(devId);
+ // cuda
+}
+#ifdef __cplusplus
+}
+#endif
+
+__global__ void runner_do_self_density_GPU(
+ struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part,
+ int *d_bundle_first_part, int *d_bundle_last_part, float d_a, float d_H,
+ int bid, int tid, int count_tasks, int tasksperbundle, int nBlocks_per_task,
+ int bundle_first_task, int max_parts, int time_bin_inhibited) {
+ extern __shared__ float vars[];
+ __shared__ int first_part_tid_0, last_part_tid_0;
+ const int threadid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int task_id = bundle_first_task + blockIdx.y;
+
+ // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks);
+ __shared__ int first_part_in_task_blocks, last_part_in_task_blocks;
+ first_part_in_task_blocks = d_task_first_part[task_id],
+ last_part_in_task_blocks = d_task_last_part[task_id];
+ __syncthreads();
+ const int b_first_part = d_bundle_first_part[bid];
+ const int pid = threadid + first_part_in_task_blocks;
+ const int b_last_part = d_bundle_last_part[bid];
+
+ int ttid = 0;
+ int first_part = 0;
+ int count = 0;
+ int last_part = 0;
+ float cellx = 0.0, celly = 0.0, cellz = 0.0;
+ float hi = 0.0, hig2 = hi * hi * kernel_gamma2;
+ float mi = 0.0;
+ float uxi = 0.0;
+ float uyi = 0.0;
+ float uzi = 0.0;
+ float pix = 0.0;
+ float piy = 0.0;
+ float piz = 0.0;
+ float rhoi = 0.0;
+ float rho_dhi = 0.0;
+ float wcounti = 0.0;
+ float wcount_dhi = 0.0;
+ float div_vi = 0.0;
+ float rot_uxi = 0.0;
+ float rot_uyi = 0.0;
+ float rot_uzi = 0.0;
+    /* ... neighbour-loop body: read particle i's data for pid, tile the
+       task's particles j through shared memory, form the separation
+       (xij, yij, zij) and r2, then take the branch below only when
+       r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f) ... */
+ const float r = sqrt(r2);
+ /* Recover some data */
+ const float mj = mass_tmp[j_block];
+ /* Get the kernel for hi. */
+        if (hi < 1.f / 128.f) printf("h < dx\n");
+ const float h_inv = 1.f / hi;
+ const float ui = r * h_inv;
+ float wi, wi_dx;
+
+ d_kernel_deval(ui, &wi, &wi_dx);
+
+ rhoi += mj * wi;
+ rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx);
+
+ wcounti += wi;
+ wcount_dhi -= (hydro_dimension * wi + ui * wi_dx);
+
+ const float r_inv = 1.f / r;
+ const float faci = mj * wi_dx * r_inv;
+
+ /* Compute dv dot r */
+ float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block],
+ dvz = uzi - uz_tmp[j_block];
+ const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+
+ div_vi -= faci * dvdr;
+
+ /* Compute dv cross r */
+ float curlvrx = dvy * zij - dvz * yij;
+ float curlvry = dvz * xij - dvx * zij;
+ float curlvrz = dvx * yij - dvy * xij;
+
+ rot_uxi += faci * curlvrx;
+ rot_uyi += faci * curlvry;
+ rot_uzi += faci * curlvrz;
+ }
+ }
+ }
+ __syncthreads();
+ }
+ if (pid < last_part_in_task_blocks) {
+ float wi, wi_dx;
+ d_kernel_deval(0.f, &wi, &wi_dx);
+// printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi);
+ parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi;
+ parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi;
+ parts_soa.div_v[pid] = div_vi;
+ parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi,
+ parts_soa.rot_uz[pid] = rot_uzi;
+ }
+}
+#ifdef __cplusplus
+extern "C" {
+#endif
+void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part,
+ int *d_task_last_part, int *d_bundle_first_part,
+ int *d_bundle_last_part, float d_a, float d_H,
+ const char *loop_type, hipStream_t stream, int bid,
+ int block_size, int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y, int tid,
+ int offset, int bundle_first_task, int max_parts,
+ int time_bin_inhibited) {
+
+ dim3 gridShape = dim3(numBlocks_x, numBlocks_y);
+ int nBlocks_per_task = numBlocks_x;
+  /* 2-D grid: one y-row of blocks per task in the bundle, block_size threads
+     per block, queued on the per-bundle stream. */
+  runner_do_self_density_GPU<<<gridShape, block_size, 0, stream>>>(
+ parts_soa, d_task_first_part, d_task_last_part, d_bundle_first_part,
+ d_bundle_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle,
+ nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited);
+}
+#ifdef __cplusplus
+}
+#endif
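
The kernel above identifies its task through blockIdx.y (task_id = bundle_first_task + blockIdx.y) and its particle through blockIdx.x/threadIdx.x, so the grid handed to launch_density_kernel is presumably sized along these lines (illustrative helper only, not part of the patch):

#include "BLOCK_SIZE.h"

/* One y-row of blocks per task in the bundle, and enough BLOCK_SIZE-wide
 * blocks in x to cover the largest task. */
static void size_density_grid(int max_parts, int tasksperbundle,
                              int *numBlocks_x, int *numBlocks_y) {
  *numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
  *numBlocks_y = tasksperbundle;
}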
diff --git a/src/hip/Makefile.am b/src/hip/Makefile.am
new file mode 100755
index 0000000000..fc626b8831
--- /dev/null
+++ b/src/hip/Makefile.am
@@ -0,0 +1,55 @@
+SOURCES_HIP = HIP_runner_functions.hip
+include_HEADERS = HIP_runner_functions.h device_functions.h BLOCK_SIZE.h tester.h
+EXTRA_DIST = $(SOURCES_HIP) $(include_HEADERS)
+
+if HAVEHIP
+
+AM_CFLAGS = -I.. $(HDF5_CPPFLAGS)
+HIP_MYFLAGS = -D_FORCE_INLINES -O3 -g -DWITH_HIP --offload-arch=gfx90a
+#HIP_MYFLAGS = -D_FORCE_INLINES -O3 -g -v -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_HIP -ccbin=gcc -m64 --default-stream per-thread#-dlink
+
+# Assign a "safe" version number
+AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) -version-info 0:0:0
+
+#bin_PROGRAMS = test_27_cells test_125_cells
+
+# Rules to compile HIP code.
+.hip.o:
+ $(HIPCC) -c $(HIPFLAGS) $(AM_CFLAGS) $(HIP_CFLAGS) $(HIP_MYFLAGS) $< -o $@
+.hip.lo:
+ PATH=$(top_srcdir):$(PATH) && cudalt.py $@ $(HIPCC) -c $(HIPFLAGS) $(AM_CFLAGS) $(HIP_CFLAGS) $(HIP_MYFLAGS) $<
+
+# The library. Dummy C library so that we get libtool linking setup.
+lib_LTLIBRARIES = libswiftHIP.la libswiftdummy.la
+
+# Special link command to avoid including CFLAGS which are not understood.
+libswiftHIP_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
+ $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+ $(libswiftHIP_la_LDFLAGS) \
+ $(LDFLAGS) -o $@
+
+libswiftHIP_la_SOURCES = $(SOURCES_HIP)
+libswiftHIP_la_CFLAGS = $(AM_CFLAGS) $(HIP_CFLAGS) $(HIP_MYFLAGS) -I../
+libswiftHIP_la_LIBADD = ../.libs/libswiftsim_hip.la
+libswiftHIP_la_LDFLAGS = $(AM_LDFLAGS)
+
+if HAVEMPI
+libswiftHIP_la_LIBADD += ../.libs/libswiftsim_mpihip.la
+endif
+
+libswiftdummy_la_SOURCES = dummy.c
+libswiftdummy_la_CFLAGS = $(AM_CFLAGS)
+libswiftdummy_la_LDFLAGS = $(AM_LDFLAGS)
+
+#test_27_cells_SOURCES=test27cells.c
+#test_27_cells_CFLAGS=$(AM_CFLAGS) -DWITH_HIP $(HIP_CFLAGS)
+#test_27_cells_LDADD= ../.libs/libswiftsim_hip.la ../.libs/libswiftsim_mpihip.la libswiftHIP.la $(MPI_LIBS) $(EXTRA_LIBS) $(HIP_LIBS)
+#test_27_cells_LDFLAGS = $(AM_LDFLAGS) $(HIP_CFLAGS)
+
+#test_125_cells_SOURCES=test125cells.c
+#test_125_cells_CFLAGS=$(AM_CFLAGS) -DWITH_HIP $(HIP_CFLAGS)
+#test_125_cells_LDADD= ../libswiftsim_hip.la ../libswiftsim_mpihip.la libswiftHIP.la $(MPI_LIBS) $(EXTRA_LIBS) $(HIP_LIBS)
+#test_125_cells_LDFLAGS = $(AM_LDFLAGS) $(HIP_CFLAGS)
+
+endif
diff --git a/src/hip/am--include-marker b/src/hip/am--include-marker
new file mode 100644
index 0000000000..9ce06a81ea
--- /dev/null
+++ b/src/hip/am--include-marker
@@ -0,0 +1 @@
+# dummy
diff --git a/src/hip/cell_gpu.h b/src/hip/cell_gpu.h
new file mode 100644
index 0000000000..dc8d9306f2
--- /dev/null
+++ b/src/hip/cell_gpu.h
@@ -0,0 +1,292 @@
+#ifndef CELL_GPU_H
+#define CELL_GPU_H
+/* Config parameters. */
+#include "../config.h"
+typedef int8_t timebin_t;
+struct xpart_gpu {
+ /*! Offset between current position and position at last tree rebuild. */
+ float x_diff[3];
+ /*! Offset between the current position and position at the last sort. */
+ float x_diff_sort[3];
+ /*! Velocity at the last full step. */
+ float v_full[3];
+ /*! Internal energy at the last full step. */
+ float u_full;
+};
+struct part_gpu {
+ /*Task ID*/
+ int tid;
+ /*! Particle unique ID. */
+ long long id;
+ /*! Pointer to corresponding gravity part. */
+ // struct gpu_gpart* gpart;
+ /*! Particle position. */
+ float x[3];
+ /*! Particle predicted velocity. */
+ float v[3];
+ /*! Particle acceleration. */
+ float a_hydro[3];
+ /*! Particle mass. */
+ float mass;
+ /*! Particle smoothing length. */
+ float h;
+ /*! Particle internal energy. */
+ float u;
+ /*! Time derivative of the internal energy. */
+ float u_dt;
+ /*! Particle density. */
+ float rho;
+ /*! Kernel summation (For testing/debugging). */
+ float SPH_sum;
+
+ /* Cell information */
+ /*! The cell location on the grid (corner nearest to the origin). */
+ float loc[3];
+ /*! The cell dimensions. */
+ float width[3];
+ float h_max;
+ int count;
+ /* Density information */
+
+ /*! Neighbour number count. */
+ float wcount;
+
+ /*! Derivative of the neighbour number with respect to h. */
+ float wcount_dh;
+
+ /*! Derivative of density with respect to h */
+ float rho_dh;
+
+ /*! Particle velocity curl. */
+ float rot_v[3];
+
+ /* viscosity information */
+
+ /*! Particle velocity divergence */
+ float div_v;
+
+ /*! Particle velocity divergence from previous step */
+ float div_v_previous_step;
+
+ /*! Artificial viscosity parameter */
+ float alpha_visc;
+
+ /*! Signal velocity */
+ float v_sig;
+
+ /* thermal diffusion information */
+
+ /*! del^2 u, a smoothed quantity */
+ float laplace_u;
+
+ /*! Thermal diffusion coefficient */
+ float alpha_diff;
+
+ /* force information */
+
+ /*! "Grad h" term -- only partial in P-U */
+ float f;
+
+ /*! Particle soundspeed. */
+ float soundspeed;
+
+ /*! Time derivative of smoothing length */
+ float h_dt;
+
+ /*! Balsara switch */
+ float balsara;
+
+ /*! Particle pressure. */
+ float pressure;
+ /*! Maximal alpha (viscosity) over neighbours */
+ float alpha_visc_max_ngb;
+
+ /* timestep stuff */
+
+ /*! Time-step length */
+ timebin_t time_bin;
+
+  /* All fields of struct timestep_limiter_data, flattened here because
+     pointer chasing is costly on GPUs, especially when memcpying. */
+ /* Need waking-up ? */
+ timebin_t wakeup;
+
+ /*! Minimal time-bin across all neighbours */
+ timebin_t min_ngb_time_bin;
+
+ /* Do we want this particle to be synched back on the time-line? */
+ char to_be_synchronized;
+};
+
+typedef struct part_soa {
+ /*Task ID*/
+ int *tid_p;
+ /*bundle ID*/
+ int *bid_p;
+ /*! Particle unique ID. */
+ long long *id;
+ /*! Pointer to corresponding gravity part. */
+ // struct gpu_gpart* gpart;
+ /*! Particle position. */
+ double *x_p;
+ double *y_p;
+ double *z_p;
+ /*! Particle predicted velocity. */
+ float *ux;
+ float *uy;
+ float *uz;
+ /*! Particle acceleration. */
+ float *a_hydrox;
+ float *a_hydroy;
+ float *a_hydroz;
+ /*! Particle mass. */
+ float *mass;
+ /*! Particle smoothing length. */
+ float *h;
+ /*! Particle internal energy. */
+ float *u;
+ /*! Time derivative of the internal energy. */
+ float *u_dt;
+ /*! Particle density. */
+ float *rho;
+ /*! Kernel summation (For testing/debugging). */
+ float *SPH_sum;
+
+ /* Cell information */
+ /*! The cell location on the grid (corner nearest to the origin). */
+ float *locx;
+ float *locy;
+ float *locz;
+ /*! The cell dimensions. */
+ float *widthx;
+ float *widthy;
+ float *widthz;
+ float *h_max;
+ int *count_p;
+ int *count_test;
+ /* Density information */
+
+ /*! Neighbour number count. */
+ float *wcount;
+
+ /*! Derivative of the neighbour number with respect to h. */
+ float *wcount_dh;
+
+ /*! Derivative of density with respect to h */
+ float *rho_dh;
+
+ /*! Particle velocity curl. */
+ float *rot_ux;
+ float *rot_uy;
+ float *rot_uz;
+
+ /* viscosity information */
+
+ /*! Particle velocity divergence */
+ float *div_v;
+
+ /*! Particle velocity divergence from previous step */
+ float *div_v_previous_step;
+
+ /*! Artificial viscosity parameter */
+ float *alpha_visc;
+
+ /*! Signal velocity */
+ float *v_sig;
+
+ /* thermal diffusion information */
+
+ /*! del^2 u, a smoothed quantity */
+ float *laplace_u;
+
+ /*! Thermal diffusion coefficient */
+ float *alpha_diff;
+
+ /* force information */
+
+ /*! "Grad h" term -- only partial in P-U */
+ float *f;
+
+ /*! Particle soundspeed. */
+ float *soundspeed;
+
+ /*! Time derivative of smoothing length */
+ float *h_dt;
+
+ /*! Balsara switch */
+ float *balsara;
+
+ /*! Particle pressure. */
+ float *pressure;
+ /*! Maximal alpha (viscosity) over neighbours */
+ float *alpha_visc_max_ngb;
+
+ /* timestep stuff */
+
+ /*! Time-step length */
+ timebin_t *time_bin;
+
+  /* All fields of struct timestep_limiter_data, flattened here because
+     pointer chasing is costly on GPUs, especially when memcpying. */
+ /* Need waking-up ? */
+ timebin_t *wakeup;
+
+ /*! Minimal time-bin across all neighbours */
+ timebin_t *min_ngb_time_bin;
+
+ /* Do we want this particle to be synched back on the time-line? */
+ char *to_be_synchronized;
+
+} part_soa;
+
+struct task_cell {
+ struct part_gpu *parts;
+};
+// struct parts_gpu_SoA{
+// struct task_cell *tasks;
+// };
+
+struct cell_hydro_gpu {
+ // struct part_gpu *parts;
+ // struct xpart_gpu *xparts;
+ float h_max;
+ int count;
+};
+struct cell_gpu {
+ /*! The cell location on the grid (corner nearest to the origin). */
+ float loc[3];
+ /*! The cell dimensions. */
+ float width[3];
+ /*Details of contents (particles) and properties*/
+ struct cell_hydro_gpu hydro;
+};
+struct cell_gpu_flat {
+ /*! The cell location on the grid (corner nearest to the origin). */
+ float loc[3];
+ /*! The cell dimensions. */
+ float width[3];
+ float h_max;
+ int count;
+};
+
+struct cells_gpu_flat {
+ float *locx;
+ float *locy;
+ float *locz;
+ /*! The cell dimensions. */
+ float *widthx;
+ float *widthy;
+ float *widthz;
+ /*! The cell location on the grid (corner nearest to the origin). */
+ /* float *loc[3];*/
+ /*! The cell dimensions. */
+ /* float *width[3];*/
+ float *h_max;
+ int *count;
+};
+
+struct cells_gpu_flat_test {
+ float *locx;
+};
+
+#endif // CELL_GPU_H
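
Since part_soa only stores pointers, each field needs its own device allocation before the bind/copy helpers can be used. A minimal sketch for the density-loop fields, shown with the CUDA runtime for brevity (hipMalloc is the analogous HIP call); error checking is omitted:

#include <cuda_runtime.h>

#include "cell_gpu.h"

/* Allocate the density-loop fields of a part_soa on the device for n
 * particles. */
static void alloc_density_soa(struct part_soa *p, int n) {
  cudaMalloc((void **)&p->rho, n * sizeof(float));
  cudaMalloc((void **)&p->rho_dh, n * sizeof(float));
  cudaMalloc((void **)&p->wcount, n * sizeof(float));
  cudaMalloc((void **)&p->wcount_dh, n * sizeof(float));
  cudaMalloc((void **)&p->div_v, n * sizeof(float));
  cudaMalloc((void **)&p->rot_ux, n * sizeof(float));
  cudaMalloc((void **)&p->rot_uy, n * sizeof(float));
  cudaMalloc((void **)&p->rot_uz, n * sizeof(float));
}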
diff --git a/src/hip/cuda_headers.h b/src/hip/cuda_headers.h
new file mode 100644
index 0000000000..2df61a53b5
--- /dev/null
+++ b/src/hip/cuda_headers.h
@@ -0,0 +1,63 @@
+#ifndef CUDA_HEADERS_H
+#define CUDA_HEADERS_H
+#define n_streams 1024
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+
+void GPU_runner_doself1_branch_gradient(struct cell_gpu *ci_gpu,
+ struct part_gpu *parts_gpu);
+void cuda_tester(struct cell **ci_list_mgd, int numBlocksTest,
+ int block_size_test, int count_tasks);
+void launch_cuda_kernel(struct cell_gpu *ci_gpu, struct part_gpu *parts,
+ int numBlocks, float d_a, float d_H,
+ const char *loop_type);
+void launch_cuda_kernel_streams(struct part_gpu *d_parts, int numBlocks,
+ float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int tid, int count,
+ int max_count, float cellx, float celly,
+ float cellz, int first_part, int last_part);
+void launch_cuda_kernel_bundles(struct cell_gpu *d_all_cells,
+ struct part_gpu **d_all_parts, int numBlocks,
+ float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size,
+ int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y, int tid,
+ int offset);
+void launch_cuda_kernel_bundles_revised(
+ struct part_gpu *d_all_parts, int *d_task_first_part, int *d_task_last_part,
+ int *d_bundle_first_part, int *d_bundle_last_part, int numBlocks, float d_a,
+ float d_H, const char *loop_type, cudaStream_t stream, int bid,
+ int block_size, int count_tasks, int tasksperbundle, int numBlocks_x,
+ int numBlocks_y, int tid, int offset);
+void launch_cuda_kernel_bundles_revised_soa(
+ struct part_soa parts_gpu_soa, int *d_task_first_part,
+ int *d_task_last_part, int *d_bundle_first_part, int *d_bundle_last_part,
+ int numBlocks, float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size, int count_tasks,
+ int tasksperbundle, int numBlocks_x, int numBlocks_y, int tid, int offset,
+ int bundle_first_task, int max_parts);
+void launch_cuda_print_streams(int numBlocks, cudaStream_t stream, int tid);
+void launch_cuda_kernel_tester(struct cell_gpu *d_ci_gpu,
+ struct part_gpu **d_parts, int numBlocks,
+ float d_a, float d_H, const char *loop_type,
+ cudaStream_t stream, int bid, int block_size,
+ int count_tasks, int tasksperbundle,
+ int numBlocks_x, int numBlocks_y, int tid);
+void launch_cuda_kernel_bundles_test(struct cell_gpu *d_all_cells,
+ struct part_gpu **d_all_parts,
+ int numBlocks, float d_a, float d_H,
+ int count_tasks);
+void mgd_mem_cuda_kernel_bundles(struct part_gpu **parts_gpu_list,
+ int numBlocks, float d_a, float d_H,
+ const char *loop_type, cudaStream_t stream,
+ int bid, int block_size, int count_tasks,
+ int tasksperbundle, int numBlocks_x,
+ int numBlocks_y, int tid, int offset);
+
+#ifdef WITH_CUDA
+}
+#endif
+
+#endif // CUDA_HEADERS_H
diff --git a/src/hip/device_functions.h b/src/hip/device_functions.h
new file mode 100644
index 0000000000..237c87dec1
--- /dev/null
+++ b/src/hip/device_functions.h
@@ -0,0 +1,149 @@
+#ifndef DEVICE_FUNCTIONS_H
+#define DEVICE_FUNCTIONS_H
+#include "../../config.h"
+
+/* Local headers. */
+// #include "../dimension.h"
+// #include "../error.h"
+// #include "../inline.h"
+// #include "../minmax.h"
+// #include "../vector.h"
+
+// Is this even necessary? Probably not as our code will operate differently
+#define num_cuda_threads 128
+#define hydro_dimension 3.f
+
+/* Definitions taken from kernel_hydro.h for the cubic-spline kernel.
+ * Support for the other kernel choices is deferred for now. */
+/* First some powers of gamma = H/h */
+#define kernel_gamma ((float)(1.825742))
+#define kernel_gamma_inv ((float)(1. / kernel_gamma))
+#define kernel_gamma2 ((float)(kernel_gamma * kernel_gamma))
+#define kernel_ivals 2
+#define kernel_degree 3 /*!< Degree of the polynomial */
+#define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma))
+#define kernel_gamma_dim_plus_one \
+ ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))
+#define kernel_gamma_inv_dim \
+ ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma)))
+#define kernel_gamma_inv_dim_plus_one \
+ ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)))
+#define kernel_ivals_f ((float)kernel_ivals) /*!< Number of branches */
+#define kernel_constant ((float)(16. * M_1_PI))
+/*! Cosmology default beta=3.0.
+ * Alpha can be set in the parameter file.
+ * Beta is defined as in e.g. Price (2010) Eqn (103) */
+#define const_viscosity_beta 3.0f
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+/**
+ * @brief Returns the argument to the power given by the dimension plus one
+ *
+ * Computes \f$x^{d+1}\f$.
+ */
+__device__ float d_pow_dimension_plus_one(float x) {
+
+#if defined(HYDRO_DIMENSION_3D)
+
+ const float x2 = x * x;
+ return x2 * x2;
+
+#elif defined(HYDRO_DIMENSION_2D)
+
+ return x * x * x;
+
+#elif defined(HYDRO_DIMENSION_1D)
+
+ return x * x;
+
+#else
+
+ error("The dimension is not defined !");
+ return 0.f;
+
+#endif
+}
+
+/**
+ * @brief Return the argument to the power three adiabatic index minus five over
+ * two.
+ *
+ * Computes \f$x^{(3\gamma - 5)/2}\f$.
+ *
+ * @param x Argument
+ */
+__device__ float d_pow_three_gamma_minus_five_over_two(float x) {
+#if defined(HYDRO_GAMMA_5_3)
+
+ return 1.f; /* x^(0) */
+
+#elif defined(HYDRO_GAMMA_7_5)
+
+ return powf(x, -0.4f); /* x^(-2/5) */
+
+#elif defined(HYDRO_GAMMA_4_3)
+
+ return 1.f / sqrtf(x); /* x^(-1/2) */
+
+#elif defined(HYDRO_GAMMA_2_1)
+
+ return sqrtf(x); /* x^(1/2) */
+
+#else
+
+ error("The adiabatic index is not defined !");
+ return 0.f;
+
+#endif
+}
+
+/**
+ * @brief Computes the kernel function and its derivative.
+ *
+ * The kernel function needs to be multiplied by \f$h^{-d}\f$ and the gradient
+ * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem.
+ *
+ * Returns 0 if \f$u > \gamma = H/h\f$.
+ *
+ * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
+ * @param W (return) The value of the kernel function \f$W(x,h)\f$.
+ * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$.
+ */
+__device__ void d_kernel_deval(float u, float *restrict W,
+ float *restrict dW_dx) {
+
+ /* Go to the range [0,1[ from [0,H[ */
+ const float x = u * kernel_gamma_inv;
+
+ /* Pick the correct branch of the kernel */
+ const int temp = (int)(x * kernel_ivals_f);
+ const int ind = temp > kernel_ivals ? kernel_ivals : temp;
+ static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] = {
+ 3.f, -3.f, 0.f, 0.5f, /* 0 < u < 0.5 */
+ -1.f, 3.f, -3.f, 1.f, /* 0.5 < u < 1 */
+ 0.f, 0.f, 0.f, 0.f}; /* 1 < u */
+ const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+ /* First two terms of the polynomial ... */
+ float w = coeffs[0] * x + coeffs[1];
+ float dw_dx = coeffs[0];
+
+ /* ... and the rest of them */
+ for (int k = 2; k <= kernel_degree; k++) {
+ dw_dx = dw_dx * x + w;
+ w = x * w + coeffs[k];
+ }
+
+ w = max(w, 0.f);
+ dw_dx = min(dw_dx, 0.f);
+
+ /* Return everything */
+ *W = w * kernel_constant * kernel_gamma_inv_dim;
+ *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one;
+}
+
+#ifdef WITH_CUDA
+}
+#endif
+
+#endif // DEVICE_FUNCTIONS_H
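
As the comment on d_kernel_deval notes, the returned value is already scaled by kernel_constant * kernel_gamma_inv_dim and the caller still applies the h^-3 (and h^-4) factors. For the cubic spline the unnormalised value at r = 0 is w(0) = 0.5, which allows a quick device-side sanity check (sketch only; it reuses the restrict workaround from HIP_runner_functions.hip):

/* Assumed workaround, as in HIP_runner_functions.hip, so the restrict
 * qualifier in the header compiles under a C++ device compiler. */
#ifndef restrict
#define restrict __restrict__
#endif
#include "device_functions.h"

/* Writes W(0) to *W0; the expected value is
 * 0.5f * kernel_constant * kernel_gamma_inv_dim (before the h^-3 factor). */
__global__ void check_kernel_at_zero(float *W0) {
  float W, dW_dx;
  d_kernel_deval(0.f, &W, &dW_dx);
  *W0 = W;
}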
diff --git a/src/hip/dummy.c b/src/hip/dummy.c
new file mode 100755
index 0000000000..66ab4665f9
--- /dev/null
+++ b/src/hip/dummy.c
@@ -0,0 +1,2 @@
+#include
+void swiftcudadummy() {}
diff --git a/src/hip/dummy.cpp b/src/hip/dummy.cpp
new file mode 100755
index 0000000000..66ab4665f9
--- /dev/null
+++ b/src/hip/dummy.cpp
@@ -0,0 +1,2 @@
+#include
+void swiftcudadummy() {}
diff --git a/src/hip/part_gpu.h b/src/hip/part_gpu.h
new file mode 100644
index 0000000000..5d7e32c611
--- /dev/null
+++ b/src/hip/part_gpu.h
@@ -0,0 +1,137 @@
+#ifndef PART_GPU_H
+#define PART_GPU_H
+/* Config parameters. */
+#include "../../config.h"
+typedef int8_t timebin_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// extern "C" {
+
+typedef struct part_soa {
+ /*Task ID*/
+ int *tid_p;
+ /*bundle ID*/
+ int *bid_p;
+ /*! Particle unique ID. */
+ long long *id;
+ /*! Pointer to corresponding gravity part. */
+ // struct gpu_gpart* gpart;
+ /*! Particle position. */
+ double *x_p;
+ double *y_p;
+ double *z_p;
+ /*! Particle predicted velocity. */
+ float *ux;
+ float *uy;
+ float *uz;
+ /*! Particle acceleration. */
+ float *a_hydrox;
+ float *a_hydroy;
+ float *a_hydroz;
+ /*! Particle mass. */
+ float *mass;
+ /*! Particle smoothing length. */
+ float *h;
+ /*! Particle internal energy. */
+ float *u;
+ /*! Time derivative of the internal energy. */
+ float *u_dt;
+ /*! Particle density. */
+ float *rho;
+ /*! Kernel summation (For testing/debugging). */
+ float *SPH_sum;
+
+ /* Cell information */
+ /*! The cell location on the grid (corner nearest to the origin). */
+ float *locx;
+ float *locy;
+ float *locz;
+ /*! The cell dimensions. */
+ float *widthx;
+ float *widthy;
+ float *widthz;
+ float *h_max;
+ int *count_p;
+ int *count_test;
+ /* Density information */
+
+ /*! Neighbour number count. */
+ float *wcount;
+
+ /*! Derivative of the neighbour number with respect to h. */
+ float *wcount_dh;
+
+ /*! Derivative of density with respect to h */
+ float *rho_dh;
+
+ /*! Particle velocity curl. */
+ float *rot_ux;
+ float *rot_uy;
+ float *rot_uz;
+
+ /* viscosity information */
+
+ /*! Particle velocity divergence */
+ float *div_v;
+
+ /*! Particle velocity divergence from previous step */
+ float *div_v_previous_step;
+
+ /*! Artificial viscosity parameter */
+ float *alpha_visc;
+
+ /*! Signal velocity */
+ float *v_sig;
+
+ /* thermal diffusion information */
+
+ /*! del^2 u, a smoothed quantity */
+ float *laplace_u;
+
+ /*! Thermal diffusion coefficient */
+ float *alpha_diff;
+
+ /* force information */
+
+ /*! "Grad h" term -- only partial in P-U */
+ float *f;
+
+ /*! Particle soundspeed. */
+ float *soundspeed;
+
+ /*! Time derivative of smoothing length */
+ float *h_dt;
+
+ /*! Balsara switch */
+ float *balsara;
+
+ /*! Particle pressure. */
+ float *pressure;
+ /*! Maximal alpha (viscosity) over neighbours */
+ float *alpha_visc_max_ngb;
+
+ /* timestep stuff */
+
+ /*! Time-step length */
+ timebin_t *time_bin;
+
+  /* All fields of struct timestep_limiter_data, flattened here because
+     pointer chasing is costly on GPUs, especially when memcpying. */
+ /* Need waking-up ? */
+ timebin_t *wakeup;
+
+ /*! Minimal time-bin across all neighbours */
+ timebin_t *min_ngb_time_bin;
+
+ /* Do we want this particle to be synched back on the time-line? */
+ char *to_be_synchronized;
+} part_soa;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // PART_GPU_H
diff --git a/src/hip/print_something.cu b/src/hip/print_something.cu
new file mode 100755
index 0000000000..b69ad05dd4
--- /dev/null
+++ b/src/hip/print_something.cu
@@ -0,0 +1,37 @@
+#ifdef WITH_CUDA
+#ifndef static
+#define static
+#endif
+#ifndef restrict
+#define restrict __restrict__
+#endif
+#endif
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cuda_headers.h"
+#ifdef __cplusplus
+}
+#endif
+
+extern "C" {
+void print_something_cu() { printf("In Here\n"); }
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void Initialise_GPU() {
+ int devId = 0;
+ // find and print device name
+ cudaDeviceProp prop;
+ cudaGetDeviceProperties(&prop, devId);
+ printf("Device : %s\n", prop.name);
+ cudaSetDevice(devId);
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/hip/tasks_gpu.h b/src/hip/tasks_gpu.h
new file mode 100755
index 0000000000..a3912aee2c
--- /dev/null
+++ b/src/hip/tasks_gpu.h
@@ -0,0 +1,74 @@
+/* Config parameters. */
+#include "../config.h"
+
+struct tasks_self_gpu {
+ struct task_gpu *tgpu;
+};
+
+/**
+ * @brief A task to be run by the #scheduler.
+ */
+struct task_gpu {
+
+ /*! Pointers to the cells this task acts upon */
+ struct cell *ci, *cj;
+
+ /*! List of tasks unlocked by this one */
+ struct task_gpu **unlock_tasks;
+
+ /*! Flags used to carry additional information (e.g. sort directions) */
+ long long flags;
+
+#ifdef WITH_MPI
+
+ /*! Buffer for this task's communications */
+ void *buff;
+
+ /*! MPI request corresponding to this task */
+ MPI_Request req;
+
+#endif
+
+ /*! Rank of a task in the order */
+ int rank;
+
+ /*! Weight of the task */
+ float weight;
+
+ /*! Number of tasks unlocked by this one */
+ int nr_unlock_tasks;
+
+ /*! Number of unsatisfied dependencies */
+ int wait;
+
+ /*! Type of the task */
+ enum task_types type;
+
+  /*! Sub-type of the task (for the tasks that have one) */
+ enum task_subtypes subtype;
+
+ /*! Should the scheduler skip this task ? */
+ char skip;
+
+ /*! Is this task implicit (i.e. does not do anything) ? */
+ char implicit;
+
+#ifdef SWIFT_DEBUG_TASKS
+ /*! ID of the queue or runner owning this task */
+ short int rid;
+
+ /*! Information about the direction of the pair task */
+ short int sid;
+#endif
+
+ /*! Start and end time of this task */
+ ticks tic, toc;
+
+ /* Total time spent running this task */
+ ticks total_ticks;
+
+#ifdef SWIFT_DEBUG_CHECKS
+ /* When was this task last run? */
+ integertime_t ti_run;
+#endif /* SWIFT_DEBUG_CHECKS */
+};
diff --git a/src/hip/tester.cu b/src/hip/tester.cu
new file mode 100644
index 0000000000..3ffaf9e10c
--- /dev/null
+++ b/src/hip/tester.cu
@@ -0,0 +1,21 @@
+#include "tester.h"
+
+#include <iostream>
+#include <vector>
+#ifdef __cplusplus
+extern "C" {
+#endif
+void testing_linkage(int a, float *b, float c) {
+  std::vector<float> b_value_list;
+ b_value_list.reserve(a);
+ for (int i = 0; i < a; i++) {
+ (*b) = (*b) + c;
+ b_value_list.push_back((*b));
+ std::cout << "Vector value is " << b_value_list[i] << " b value is " << (*b)
+ << std::endl;
+ }
+ std::cout << "Final value of b is " << (*b) << std::endl;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/hip/tester.h b/src/hip/tester.h
new file mode 100755
index 0000000000..5729e66904
--- /dev/null
+++ b/src/hip/tester.h
@@ -0,0 +1,9 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void testing_linkage(int a, float *b, float c);
+
+#ifdef __cplusplus
+}
+#endif
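tester.cu and tester.h appear to exist purely to check that C++ code built by the GPU compiler can be called from SWIFT's C code through the extern "C" interface. A minimal sketch of the C calling side, assuming the two objects are linked together and tester.h is on the include path, could look like this (the main() below is illustrative and not part of the patch):

/* Illustrative C caller for the linkage test above. */
#include <stdio.h>

#include "tester.h" /* declares testing_linkage() with C linkage */

int main(void) {
  float b = 0.f;
  /* Ask the C++ side to add 0.5 to b three times, printing as it goes. */
  testing_linkage(3, &b, 0.5f);
  printf("b after the C++ call: %f\n", b);
  return 0;
}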
diff --git a/src/memuse.h b/src/memuse.h
index 5883e68684..d51ab4282d 100644
--- a/src/memuse.h
+++ b/src/memuse.h
@@ -20,8 +20,11 @@
#define SWIFT_MEMUSE_H
/* Config parameters. */
+#ifdef WITH_CUDA
+#include "../config.h"
+#else
#include <config.h>
-
+#endif
/* Includes. */
#include <stdlib.h>
diff --git a/src/queue.c b/src/queue.c
index 30601667cd..790b6b1335 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -178,7 +178,6 @@ void queue_insert(struct queue *q, struct task *t) {
}
}
}
-
/* Increase the incoming count. */
atomic_inc(&q->count_incoming);
}
diff --git a/src/queue.h b/src/queue.h
index 0576403bef..b90ca90b46 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -75,6 +75,28 @@ struct queue {
int *tid_incoming;
volatile unsigned int first_incoming, last_incoming, count_incoming;
+  /* Counters for GPU pack tasks. A. Nasar */
+  /*! Number of self pack tasks left in the queue (density, force, gradient) */
+  volatile int n_packs_self_left_d;
+  volatile int n_packs_self_left_f;
+  volatile int n_packs_self_left_g;
+
+  /*! Number of pair pack tasks left in the queue (density, force, gradient) */
+  volatile int n_packs_pair_left_d;
+  volatile int n_packs_pair_left_f;
+  volatile int n_packs_pair_left_g;
+
+  /*! Number of self pack tasks stolen from this queue (density, force,
+   *  gradient) */
+  volatile int n_packs_self_stolen_d;
+  volatile int n_packs_self_stolen_f;
+  volatile int n_packs_self_stolen_g;
+
+  /*! Number of pair pack tasks stolen from this queue (density, force,
+   *  gradient) */
+  volatile int n_packs_pair_stolen_d;
+  volatile int n_packs_pair_stolen_f;
+  volatile int n_packs_pair_stolen_g;
+
} __attribute__((aligned(queue_struct_align)));
/* Function prototypes. */
diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h
new file mode 100644
index 0000000000..a78ec6409c
--- /dev/null
+++ b/src/runner_doiact_functions_hydro_gpu.h
@@ -0,0 +1,2116 @@
+#include "scheduler.h"
+#include "runner_doiact_hydro.h"
+#include "active.h"
+#include <stdio.h>
+struct pack_vars_self {
+ /*List of tasks and respective cells to be packed*/
+ struct task **task_list;
+ struct task **top_task_list;
+ struct cell **cell_list;
+ /*List of cell positions*/
+ double *cellx;
+ double *celly;
+ double *cellz;
+  /*Device-side copies of the cell positions*/
+ double *d_cellx;
+ double *d_celly;
+ double *d_cellz;
+  /*How many tasks go into each GPU bundle*/
+  int bundle_size;
+  /*Running count of particles packed into the send buffer so far*/
+  int count_parts;
+  /*Number of tasks and top-level tasks packed so far*/
+  int tasks_packed;
+  int top_tasks_packed;
+ int *task_first_part;
+ int *task_last_part;
+ int *d_task_first_part;
+ int *d_task_last_part;
+ int *bundle_first_part;
+ int *bundle_last_part;
+ int *bundle_first_task_list;
+ int count_max_parts;
+ int launch;
+ int launch_leftovers;
+ int target_n_tasks;
+ int nBundles;
+ int tasksperbundle;
+
+} pack_vars_self;
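The bundle_size, nBundles and target_n_tasks fields above drive the launch logic later in this file: packed tasks are grouped into bundles of bundle_size, and a leftover launch rounds the bundle count up. A standalone illustration of that rounding, using made-up numbers, is shown below.

/* Standalone illustration of the bundle arithmetic used by the launch
 * functions later in this file (values are made up). */
#include <stdio.h>

int main(void) {
  const int bundle_size = 16;  /* tasks per GPU bundle */
  const int tasks_packed = 37; /* leftover tasks forcing an early launch */
  /* Same rounding-up as (tasks_packed + bundle_size - 1) / bundle_size
   * in runner_doself1_launch_f4() and friends. */
  const int n_bundles = (tasks_packed + bundle_size - 1) / bundle_size;
  printf("%d tasks -> %d bundles, the last holding %d tasks\n", tasks_packed,
         n_bundles, tasks_packed - (n_bundles - 1) * bundle_size);
  return 0;
}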
+struct leaf_cell_list {
+  /*Lists of leaf-cell pairs gathered by the pair-task recursion*/
+  struct cell **ci;
+  struct cell **cj;
+  /*Number of leaf pairs found and book-keeping for packing them*/
+  int n_leaves;
+  int n_start;
+  int n_end;
+  int n_packed;
+};
+struct pack_vars_pair {
+ /*List of tasks and respective cells to be packed*/
+ struct task **task_list;
+ struct task **top_task_list;
+ struct leaf_cell_list * leaf_list;
+ struct cell **ci_list;
+ struct cell **cj_list;
+ /*List of cell shifts*/
+ double *shiftx;
+ double *shifty;
+ double *shiftz;
+  /*Device-side copies of the cell shifts*/
+ double *d_shiftx;
+ double *d_shifty;
+ double *d_shiftz;
+  /*How many tasks go into each GPU bundle*/
+  int bundle_size;
+  /*Running count of particles packed into the send buffer so far*/
+  int count_parts;
+  /*Number of tasks and top-level tasks packed so far*/
+  int tasks_packed;
+  int top_tasks_packed;
+ int *task_first_part;
+ int *task_last_part;
+ int *d_task_first_part;
+ int *d_task_last_part;
+ int *bundle_first_part;
+ int *bundle_last_part;
+ int *bundle_first_task_list;
+ int count_max_parts;
+ int launch;
+ int launch_leftovers;
+ int target_n_tasks;
+ int nBundles;
+ int tasksperbundle;
+ int task_locked;
+
+} pack_vars_pair;
+
+struct pack_vars_pair_f4 {
+ /*List of tasks and respective cells to be packed*/
+ struct task **task_list;
+ struct cell **ci_list;
+ struct cell **cj_list;
+ /*List of cell shifts*/
+ float3 *shift;
+  /*Device-side copy of the cell shifts*/
+ float3 *d_shift;
+  /*How many tasks go into each GPU bundle*/
+  int bundle_size;
+  /*Running count of particles packed into the send buffer so far*/
+  int count_parts;
+  /*Number of tasks packed so far*/
+  int tasks_packed;
+ int4 *fparti_fpartj_lparti_lpartj;
+ int4 *d_fparti_fpartj_lparti_lpartj;
+ int *bundle_first_part;
+ int *bundle_last_part;
+ int *bundle_first_task_list;
+ int count_max_parts;
+ int launch;
+ int launch_leftovers;
+ int target_n_tasks;
+ int nBundles;
+ int tasksperbundle;
+
+} pack_vars_pair_f4;
+
+#include "cuda/BLOCK_SIZE.h"
+#include "cuda/GPU_runner_functions.h"
+#include "runner_gpu_pack_functions.h"
+#include "task.h"
+#define CUDA_DEBUG
+
+double runner_doself1_pack_f4(struct runner *r, struct scheduler *s,
+ struct pack_vars_self *pack_vars, struct cell *ci,
+ struct task *t,
+ struct part_aos_f4_send *parts_send,
+ int2 *task_first_part_f4) {
+  /* Timers: t0 and t1 bracket the packing work done in this function. */
+  struct timespec t0, t1;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ /* Find my queue for use later*/
+ int qid = r->qid;
+ /*Place pointers to the task and cells packed in an array for use later
+ * when unpacking after the GPU offload*/
+ int tasks_packed = pack_vars->tasks_packed;
+ pack_vars->task_list[tasks_packed] = t;
+ pack_vars->cell_list[tasks_packed] = ci;
+ /* Identify row in particle arrays where this task starts*/
+ task_first_part_f4[tasks_packed].x = pack_vars->count_parts;
+ int *count_parts_self = &pack_vars->count_parts;
+ /* This re-arranges the particle data from cell->hydro->parts into a
+ long array of part structs*/
+ runner_doself1_gpu_pack_neat_aos_f4(
+ r, ci, parts_send, 0 /*timer. 0 no timing, 1 for timing*/,
+ count_parts_self, tasks_packed, pack_vars->count_max_parts);
+ /* Identify the row in the array where this task ends (row id of its
+ last particle)*/
+ task_first_part_f4[tasks_packed].y = pack_vars->count_parts;
+ /* Identify first particle for each bundle of tasks */
+ const int bundle_size = pack_vars->bundle_size;
+ if (tasks_packed % bundle_size == 0) {
+ int bid = tasks_packed / bundle_size;
+ pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x;
+ pack_vars->bundle_first_task_list[bid] = tasks_packed;
+ }
+ /* Tell the cell it has been packed */
+ ci->pack_done++;
+ /* Record that we have now done a packing (self) */
+ t->done = 1;
+ pack_vars->tasks_packed++;
+ pack_vars->launch = 0;
+ pack_vars->launch_leftovers = 0;
+
+ /*Get a lock to the queue so we can safely decrement counter and check for launch leftover condition*/
+ lock_lock(&s->queues[qid].lock);
+ s->queues[qid].n_packs_self_left_d--;
+ if (s->queues[qid].n_packs_self_left_d < 1) pack_vars->launch_leftovers = 1;
+ lock_unlock(&s->queues[qid].lock);
+ /*Have we packed enough tasks to offload to GPU?*/
+ if (pack_vars->tasks_packed == pack_vars->target_n_tasks)
+ pack_vars->launch = 1;
+
+ /*Record the end of packing time*/
+ clock_gettime(CLOCK_REALTIME, &t1);
+ /* Release the lock on the cell */
+ cell_unlocktree(ci);
+ t->gpu_done = 1;
+ /*Calculate time spent packing and return to runner_main*/
+ return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+}
+
+double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s,
+ struct pack_vars_self *pack_vars,
+ struct cell *ci, struct task *t,
+ struct part_aos_f4_g_send *parts_send,
+ int2 *task_first_part_f4) {
+
+  /* Timers: t0 and t1 bracket the packing work done in this function. */
+  struct timespec t0, t1;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ /* Find my queue for use later*/
+ int qid = r->qid;
+ /*Place pointers to the task and cells packed in an array for use later
+ * when unpacking after the GPU offload*/
+ int tasks_packed = pack_vars->tasks_packed;
+ pack_vars->task_list[tasks_packed] = t;
+ pack_vars->cell_list[tasks_packed] = ci;
+ /* Identify row in particle arrays where this task starts*/
+ task_first_part_f4[tasks_packed].x = pack_vars->count_parts;
+ int *count_parts_self = &pack_vars->count_parts;
+ /* This re-arranges the particle data from cell->hydro->parts into a
+ long array of part structs*/
+ runner_doself1_gpu_pack_neat_aos_f4_g(
+ r, ci, parts_send, 0 /*timer. 0 no timing, 1 for timing*/,
+ count_parts_self, tasks_packed, pack_vars->count_max_parts);
+ /* identify the row in the array where this task ends (row id of its
+ last particle)*/
+ task_first_part_f4[tasks_packed].y = pack_vars->count_parts;
+ /* Identify first particle for each bundle of tasks */
+ const int bundle_size = pack_vars->bundle_size;
+ if (tasks_packed % bundle_size == 0) {
+ int bid = tasks_packed / bundle_size;
+ pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x;
+ pack_vars->bundle_first_task_list[bid] = tasks_packed;
+ }
+ /* Tell the cell it has been packed */
+ ci->pack_done_g++;
+ /* Record that we have now done a packing (self) */
+ t->done = 1;
+ pack_vars->tasks_packed++;
+ pack_vars->launch = 0;
+ pack_vars->launch_leftovers = 0;
+ /*Get a lock to the queue so we can safely decrement counter and check for launch leftover condition*/
+ lock_lock(&s->queues[qid].lock);
+ s->queues[qid].n_packs_self_left_g--;
+ if (s->queues[qid].n_packs_self_left_g < 1) pack_vars->launch_leftovers = 1;
+ lock_unlock(&s->queues[qid].lock);
+
+ if (pack_vars->tasks_packed == pack_vars->target_n_tasks)
+ pack_vars->launch = 1;
+ /*Add time to packing_time. Timer for end of GPU work after the if(launch ||
+ * launch_leftovers statement)*/
+ clock_gettime(CLOCK_REALTIME, &t1);
+ /* Release the lock on the cell */
+ cell_unlocktree(ci);
+ /*Calculate time spent packing and return to runner_main*/
+ return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+}
+
+double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s,
+ struct pack_vars_self *pack_vars,
+ struct cell *ci, struct task *t,
+ struct part_aos_f4_f_send *parts_send,
+ int2 *task_first_part_f4) {
+
+  /* Timers: t0 and t1 bracket the packing work done in this function. */
+  struct timespec t0, t1;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ /* Find my queue for use later*/
+ int qid = r->qid;
+ /*Place pointers to the task and cells packed in an array for use later
+ * when unpacking after the GPU offload*/
+ int tasks_packed = pack_vars->tasks_packed;
+ pack_vars->task_list[tasks_packed] = t;
+ pack_vars->cell_list[tasks_packed] = ci;
+ /* Identify row in particle arrays where this task starts*/
+ task_first_part_f4[tasks_packed].x = pack_vars->count_parts;
+ int *count_parts_self = &pack_vars->count_parts;
+ /* This re-arranges the particle data from cell->hydro->parts into a
+ long array of part structs*/
+ runner_doself1_gpu_pack_neat_aos_f4_f(
+ r, ci, parts_send, 0 /*timer. 0 no timing, 1 for timing*/,
+ count_parts_self, tasks_packed, pack_vars->count_max_parts);
+ /* Identify the row in the array where this task ends (row id of its
+ last particle) */
+ task_first_part_f4[tasks_packed].y = pack_vars->count_parts;
+ /* Identify first particle for each bundle of tasks */
+ const int bundle_size = pack_vars->bundle_size;
+ if (tasks_packed % bundle_size == 0) {
+ int bid = tasks_packed / bundle_size;
+ pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x;
+ pack_vars->bundle_first_task_list[bid] = tasks_packed;
+ }
+ /* Tell the cell it has been packed */
+ ci->pack_done_f++;
+ /* Record that we have now done a packing (self) */
+ t->done = 1;
+ pack_vars->tasks_packed++;
+ pack_vars->launch = 0;
+ pack_vars->launch_leftovers = 0;
+ /*Get a lock to the queue so we can safely decrement counter and check for launch leftover condition*/
+ lock_lock(&s->queues[qid].lock);
+ s->queues[qid].n_packs_self_left_f--;
+ if (s->queues[qid].n_packs_self_left_f < 1) pack_vars->launch_leftovers = 1;
+ lock_unlock(&s->queues[qid].lock);
+ /*Have we packed enough tasks to offload to GPU?*/
+ if (pack_vars->tasks_packed == pack_vars->target_n_tasks)
+ pack_vars->launch = 1;
+
+ /*Record the end of packing time*/
+ clock_gettime(CLOCK_REALTIME, &t1);
+ /* Release the lock on the cell */
+ cell_unlocktree(ci);
+ /*Calculate time spent packing and return to runner_main*/
+ return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+}
+
+void runner_recurse_gpu(struct runner *r, struct scheduler *s,
+ struct pack_vars_pair *restrict pack_vars,
+ struct cell *ci, struct cell *cj, struct task *t,
+ struct part_aos_f4_send *parts_send,
+ struct engine *e,
+ int4 *fparti_fpartj_lparti_lpartj, int *n_leafs_found,
+ int depth, int n_expected_tasks) {
+
+ /* Should we even bother? A. Nasar: For GPU code we need to be clever about this */
+ if (!CELL_IS_ACTIVE(ci, e) && !CELL_IS_ACTIVE(cj, e)) return;
+ if (ci->hydro.count == 0 || cj->hydro.count == 0) return;
+
+ /* Get the type of pair and flip ci/cj if needed. */
+ double shift[3];
+ const int sid = space_getsid_and_swap_cells(s, &ci, &cj, shift);
+
+ /* Recurse? */
+ if (cell_can_recurse_in_pair_hydro_task(ci) &&
+ cell_can_recurse_in_pair_hydro_task(cj)) {
+ struct cell_split_pair *csp = &cell_split_pairs[sid];
+ for (int k = 0; k < csp->count; k++) {
+ const int pid = csp->pairs[k].pid;
+ const int pjd = csp->pairs[k].pjd;
+ /*Do we want to do anything before we recurse?*/
+
+ /*We probably want to record */
+ if (ci->progeny[pid] != NULL && cj->progeny[pjd] != NULL){
+ runner_recurse_gpu(r, s, pack_vars, ci->progeny[pid], cj->progeny[pjd], t, parts_send, e, fparti_fpartj_lparti_lpartj,
+ n_leafs_found, depth + 1, n_expected_tasks);
+// message("recursing to depth %i", depth + 1);
+ }
+ }
+ }
+ else if (CELL_IS_ACTIVE(ci, e) || CELL_IS_ACTIVE(cj, e)) {
+    /* If either cell is empty: skip */
+    if (ci->hydro.count == 0 || cj->hydro.count == 0) return;
+    int leafs_found = *n_leafs_found;
+    /* For all leaves to be sent, add them to the cell list */
+    //    cells_left[leafs_found] = ci;
+    //    cells_right[leafs_found] = cj;
+    /* Add the leaf cells to the list kept for each top-level task */
+    pack_vars->leaf_list[pack_vars->top_tasks_packed].ci[leafs_found] = ci;
+    pack_vars->leaf_list[pack_vars->top_tasks_packed].cj[leafs_found] = cj;
+    pack_vars->leaf_list[pack_vars->top_tasks_packed].n_leaves++;
+    *n_leafs_found = leafs_found + 1;
+    if (*n_leafs_found >= n_expected_tasks)
+      error("Created more leaf cells (%i) than expected. depth %i",
+            *n_leafs_found, depth);
+  }
+
+}
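runner_recurse_gpu walks a pair task's cell tree and collects the active leaf-cell pairs into the flat leaf_list, so the packing code can later iterate over them without recursion. The self-contained toy below shows the same gather-by-recursion pattern on a small binary tree; every name in it is illustrative and not part of SWIFT.

/* Toy example: recursively gathering the leaves of a binary tree into a flat
 * list, mirroring how runner_recurse_gpu fills leaf_list. */
#include <stddef.h>
#include <stdio.h>

struct node {
  int value;
  struct node *left, *right;
};

void gather_leaves(const struct node *n, const struct node **out, int *n_found,
                   int max_out) {
  if (n == NULL) return;
  if (n->left == NULL && n->right == NULL) {
    /* Guard against overflowing the output list, like the n_expected_tasks
     * check above. */
    if (*n_found >= max_out) return;
    out[(*n_found)++] = n;
    return;
  }
  gather_leaves(n->left, out, n_found, max_out);
  gather_leaves(n->right, out, n_found, max_out);
}

int main(void) {
  struct node d = {4, NULL, NULL}, e = {5, NULL, NULL}, c = {3, NULL, NULL};
  struct node b = {2, &d, &e}, a = {1, &b, &c};
  const struct node *leaves[8];
  int n_found = 0;
  gather_leaves(&a, leaves, &n_found, 8);
  for (int i = 0; i < n_found; i++) printf("leaf %d\n", leaves[i]->value);
  return 0;
}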
+
+double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s,
+ struct pack_vars_pair *restrict pack_vars,
+ struct cell *ci, struct cell *cj, struct task *t,
+ struct part_aos_f4_send *parts_send,
+ struct engine *e,
+ int4 *fparti_fpartj_lparti_lpartj) {
+  /* Timers: t0 and t1 bracket the packing work done in this function. */
+  struct timespec t0, t1;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ int tasks_packed = pack_vars->tasks_packed;
+ int qid = r->qid;
+
+ double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0;
+  struct cell *citmp = ci, *cjtmp = cj;
+  /* Check whether space_getsid would flip ci/cj for this pair; the GPU
+     packing below assumes the original order. */
+  double shift[3];
+  space_getsid_and_swap_cells(s, &citmp, &cjtmp, shift);
+  if (citmp != ci) error("Cell pair was flipped by space_getsid_and_swap_cells");
+ /*Get the shifts in case of periodics*/
+ space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp);
+
+ /*Get pointers to the list of tasks and cells packed*/
+// pack_vars->task_list[tasks_packed] = t;
+ pack_vars->ci_list[tasks_packed] = ci;
+ pack_vars->cj_list[tasks_packed] = cj;
+
+ float3 shift_tmp = {x_tmp, y_tmp, z_tmp};
+
+ const int count_ci = ci->hydro.count;
+ const int count_cj = cj->hydro.count;
+
+ /*Assign an id for this task*/
+ const int tid = tasks_packed;
+
+  /* Record where the particles of ci and cj start in the send buffer
+     (.x for ci, .y for cj) */
+ fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts;
+ fparti_fpartj_lparti_lpartj[tasks_packed].y =
+ pack_vars->count_parts + count_ci;
+
+ int *count_parts = &pack_vars->count_parts;
+ /* This re-arranges the particle data from cell->hydro->parts into a
+ long array of part structs*/
+ runner_do_ci_cj_gpu_pack_neat_aos_f4(
+ r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/,
+ count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj,
+ shift_tmp);
+ /* Find last parts in task for ci and cj*/
+ fparti_fpartj_lparti_lpartj[tasks_packed].z =
+ pack_vars->count_parts - count_cj;
+ fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts;
+
+ /* Tell the cells they have been packed */
+ ci->pack_done++;
+ cj->pack_done++;
+
+ /* Identify first particle for each bundle of tasks */
+ const int bundle_size = pack_vars->bundle_size;
+ if (tasks_packed % bundle_size == 0) {
+ int bid = tasks_packed / bundle_size;
+ pack_vars->bundle_first_part[bid] =
+ fparti_fpartj_lparti_lpartj[tasks_packed].x;
+ pack_vars->bundle_first_task_list[bid] = tasks_packed;
+ }
+ /* Record that we have now done a packing (self) */
+ t->done = 1;
+ pack_vars->tasks_packed++;
+ pack_vars->launch = 0;
+ pack_vars->launch_leftovers = 0;
+ pack_vars->leaf_list[pack_vars->top_tasks_packed - 1].n_packed++;
+
+ //A. Nasar: Need to come back to this at some point!
+ lock_lock(&s->queues[qid].lock);
+ s->queues[qid].n_packs_pair_left_d--;
+ if (s->queues[qid].n_packs_pair_left_d < 1) pack_vars->launch_leftovers = 1;
+ lock_unlock(&s->queues[qid].lock);
+ if (pack_vars->tasks_packed == pack_vars->target_n_tasks){
+ pack_vars->launch = 1;
+ }
+ /*Add time to packing_time. Timer for end of GPU work after the if(launch ||
+ * launch_leftovers statement)*/
+ clock_gettime(CLOCK_REALTIME, &t1);
+ return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+}
+
+double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s,
+ struct pack_vars_pair *restrict pack_vars,
+ struct cell *ci, struct cell *cj,
+ struct task *t,
+ struct part_aos_f4_g_send *parts_send,
+ struct engine *e,
+ int4 *fparti_fpartj_lparti_lpartj) {
+
+  /* Timers: t0 and t1 bracket the packing work done in this function. */
+  struct timespec t0, t1;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ int tasks_packed = pack_vars->tasks_packed;
+
+ int qid = r->qid;
+ // pthread_mutex_lock(&s->sleep_mutex);
+ // atomic_dec(&(s->p_g_left[qid]));
+ // pthread_cond_broadcast(&s->sleep_cond);
+ // pthread_mutex_unlock(&s->sleep_mutex);
+
+ double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0;
+ /*Get the shifts in case of periodics*/
+ space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp);
+
+ /*Get pointers to the list of tasks and cells packed*/
+ pack_vars->task_list[tasks_packed] = t;
+ pack_vars->ci_list[tasks_packed] = ci;
+ pack_vars->cj_list[tasks_packed] = cj;
+
+ float3 shift_tmp = {x_tmp, y_tmp, z_tmp};
+
+ const int count_ci = ci->hydro.count;
+ const int count_cj = cj->hydro.count;
+
+ /*Assign an id for this task*/
+ const int tid = tasks_packed;
+
+ /* Find first parts in task for ci and cj. Packed_tmp is index for cell i.
+ * packed_tmp+1 is index for cell j */
+ // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts;
+ // pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts +
+ // count_ci;
+
+ fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts;
+ fparti_fpartj_lparti_lpartj[tasks_packed].y =
+ pack_vars->count_parts + count_ci;
+
+ int *count_parts = &pack_vars->count_parts;
+ // if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid,
+ // pack_vars->count_parts);
+ /* This re-arranges the particle data from cell->hydro->parts into a
+ long array of part structs*/
+ runner_do_ci_cj_gpu_pack_neat_aos_f4_g(
+ r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/,
+ count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj,
+ shift_tmp);
+ // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no
+ // timing, 1 for timing*/, count_parts, tasks_packed,
+ // pack_vars->count_max_parts); //This may cause an issue. Be sure to test
+ // that
+ // pack_vars->count_parts is actually increment here
+ /* Find last parts in task for ci and cj. Packed_tmp is index for cell i.
+ * packed_tmp+1 is index for cell j */
+
+ // if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count
+ // %i\n", r->cpuid, *count_parts, pack_vars->count_parts);
+ fparti_fpartj_lparti_lpartj[tasks_packed].z =
+ pack_vars->count_parts - count_cj;
+ fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts;
+ // pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts -
+ // count_cj; pack_vars->task_last_part[packed_tmp + 1] =
+ // pack_vars->count_parts;
+
+ /* Tell the cells they have been packed */
+ ci->pack_done_g++;
+ cj->pack_done_g++;
+
+ /* Identify first particle for each bundle of tasks */
+ const int bundle_size = pack_vars->bundle_size;
+ if (tasks_packed % bundle_size == 0) {
+ int bid = tasks_packed / bundle_size;
+ pack_vars->bundle_first_part[bid] =
+ fparti_fpartj_lparti_lpartj[tasks_packed].x;
+ pack_vars->bundle_first_task_list[bid] = tasks_packed;
+ }
+
+ /* Record that we have now done a packing (self) */
+ t->done = 1;
+ /* Copies done. Release the lock ! */
+ cell_unlocktree(ci);
+ cell_unlocktree(cj);
+ pack_vars->tasks_packed++;
+ pack_vars->launch = 0;
+ pack_vars->launch_leftovers = 0;
+ /* Record that we have now done a packing (self) */
+ // int qid = r->qid;
+ // atomic_dec(&(s->queues[qid].n_packs_pair_left_g));
+
+ lock_lock(&s->queues[qid].lock);
+
+ s->queues[qid].n_packs_pair_left_g--;
+
+ if (s->queues[qid].n_packs_pair_left_g < 1) pack_vars->launch_leftovers = 1;
+
+ lock_unlock(&s->queues[qid].lock);
+
+ // if ((s->p_g_left[qid] < 1))
+ // pack_vars->launch_leftovers = 1;
+ if (pack_vars->tasks_packed == pack_vars->target_n_tasks)
+ pack_vars->launch = 1;
+ /*Add time to packing_time. Timer for end of GPU work after the if(launch ||
+ * launch_leftovers statement)*/
+ clock_gettime(CLOCK_REALTIME, &t1);
+ return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+}
+
+double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s,
+ struct pack_vars_pair *restrict pack_vars,
+ struct cell *ci, struct cell *cj,
+ struct task *t,
+ struct part_aos_f4_f_send *parts_send,
+ struct engine *e,
+ int4 *fparti_fpartj_lparti_lpartj) {
+
+  /* Timers: t0 and t1 bracket the packing work done in this function. */
+  struct timespec t0, t1;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ int tasks_packed = pack_vars->tasks_packed;
+
+ /* Record that we have now done a packing (self) */
+ int qid = r->qid;
+ // atomic_dec(&(s->queues[qid].n_packs_pair_left_f));
+ // pthread_mutex_lock(&s->sleep_mutex);
+ atomic_dec(&(s->p_f_left[qid]));
+ // pthread_cond_broadcast(&s->sleep_cond);
+ // pthread_mutex_unlock(&s->sleep_mutex);
+
+ double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0;
+ /*Get the shifts in case of periodics*/
+ space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp);
+
+ /*Get pointers to the list of tasks and cells packed*/
+ pack_vars->task_list[tasks_packed] = t;
+ pack_vars->ci_list[tasks_packed] = ci;
+ pack_vars->cj_list[tasks_packed] = cj;
+
+ float3 shift_tmp = {x_tmp, y_tmp, z_tmp};
+
+ const int count_ci = ci->hydro.count;
+ const int count_cj = cj->hydro.count;
+
+ /*Assign an id for this task*/
+ const int tid = tasks_packed;
+
+ /* Find first parts in task for ci and cj. Packed_tmp is index for cell i.
+ * packed_tmp+1 is index for cell j */
+ // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts;
+ // pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts +
+ // count_ci;
+
+ fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts;
+ fparti_fpartj_lparti_lpartj[tasks_packed].y =
+ pack_vars->count_parts + count_ci;
+
+ int *count_parts = &pack_vars->count_parts;
+ // if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid,
+ // pack_vars->count_parts);
+ /* This re-arranges the particle data from cell->hydro->parts into a
+ long array of part structs*/
+ runner_do_ci_cj_gpu_pack_neat_aos_f4_f(
+ r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/,
+ count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj,
+ shift_tmp);
+ // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no
+ // timing, 1 for timing*/, count_parts, tasks_packed,
+ // pack_vars->count_max_parts); //This may cause an issue. Be sure to test
+ // that
+ // pack_vars->count_parts is actually increment here
+ /* Find last parts in task for ci and cj. Packed_tmp is index for cell i.
+ * packed_tmp+1 is index for cell j */
+
+ // if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count
+ // %i\n", r->cpuid, *count_parts, pack_vars->count_parts);
+ fparti_fpartj_lparti_lpartj[tasks_packed].z =
+ pack_vars->count_parts - count_cj;
+ fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts;
+ // pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts -
+ // count_cj; pack_vars->task_last_part[packed_tmp + 1] =
+ // pack_vars->count_parts;
+
+ /* Tell the cells they have been packed */
+ ci->pack_done_f++;
+ cj->pack_done_f++;
+
+ /* Identify first particle for each bundle of tasks */
+ const int bundle_size = pack_vars->bundle_size;
+ if (tasks_packed % bundle_size == 0) {
+ int bid = tasks_packed / bundle_size;
+ pack_vars->bundle_first_part[bid] =
+ fparti_fpartj_lparti_lpartj[tasks_packed].x;
+ pack_vars->bundle_first_task_list[bid] = tasks_packed;
+ }
+
+ /* Record that we have now done a packing (self) */
+ t->done = 1;
+ /* Copies done. Release the lock ! */
+ cell_unlocktree(ci);
+ cell_unlocktree(cj);
+ pack_vars->tasks_packed++;
+ pack_vars->launch = 0;
+ pack_vars->launch_leftovers = 0;
+
+ lock_lock(&s->queues[qid].lock);
+
+ s->queues[qid].n_packs_pair_left_f--;
+
+ if (s->queues[qid].n_packs_pair_left_f < 1) pack_vars->launch_leftovers = 1;
+
+ lock_unlock(&s->queues[qid].lock);
+
+ // if ((s->p_f_left[qid] < 1))
+ // pack_vars->launch_leftovers = 1;
+ if (pack_vars->tasks_packed == pack_vars->target_n_tasks)
+ pack_vars->launch = 1;
+ /*Add time to packing_time. Timer for end of GPU work after the if(launch ||
+ * launch_leftovers statement)*/
+ clock_gettime(CLOCK_REALTIME, &t1);
+ return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+}
+
+void runner_doself1_launch_f4(
+ struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars,
+ struct cell *ci, struct task *t, struct part_aos_f4_send *parts_send,
+ struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send,
+ struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a,
+ float d_H, struct engine *e, double *packing_time, double *gpu_time,
+ double *unpack_time, int devId,
+ int2 *task_first_part_f4, int2 *d_task_first_part_f4,
+ cudaEvent_t *self_end) {
+
+ struct timespec t0, t1, tp0, tp1; //
+ clock_gettime(CLOCK_REALTIME, &t0);
+
+ /* Identify the number of GPU bundles to run in ideal case*/
+ int nBundles_temp = pack_vars->nBundles;
+
+ /*How many tasks have we packed?*/
+ const int tasks_packed = pack_vars->tasks_packed;
+
+ /*How many tasks should be in a bundle?*/
+ const int bundle_size = pack_vars->bundle_size;
+
+  /* Special case for incomplete bundles (when the leftover tasks are not
+   * enough to fill a bundle) */
+ if (pack_vars->launch_leftovers) {
+ nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size;
+ if (tasks_packed == 0)
+ error("zero tasks packed but somehow got into GPU loop");
+ // pack_vars->bundle_first_part[nBundles_temp] =
+ // pack_vars->task_first_part[tasks_packed - 1];
+ pack_vars->bundle_first_part[nBundles_temp] =
+ task_first_part_f4[tasks_packed - 1].x;
+ }
+ /* Identify the last particle for each bundle of tasks */
+ for (int bid = 0; bid < nBundles_temp - 1; bid++) {
+ pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1];
+ }
+ /* special treatment for the last bundle */
+ if (nBundles_temp > 1)
+ pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts;
+ else
+ pack_vars->bundle_last_part[0] = pack_vars->count_parts;
+ // clock_gettime(CLOCK_REALTIME, &t0hmemcpy);
+ /*Copy arrays containing first and last part for each task to GPU*/
+ // cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part,
+ // tasks_packed * sizeof(int), cudaMemcpyHostToDevice);
+ // cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part,
+ // tasks_packed * sizeof(int), cudaMemcpyHostToDevice);
+ // cudaMemPrefetchAsync(d_task_first_part_self_dens_f4, tasks_packed *
+ // sizeof(int2), devId, NULL);
+ /*Copy cell shifts to device*/
+ // cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx,
+ // tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+ // cudaMemcpy(pack_vars->d_celly, pack_vars->celly,
+ // tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+ // cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz,
+ // tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+ // clock_gettime(CLOCK_REALTIME, &t1hmemcpy);
+ // *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) +
+ // (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0;
+ /* Launch the copies for each bundle and run the GPU kernel */
+  /* We do not enter this loop if tasks_left_self == 1, as nBundles_temp
+     will be zero in that case. */
+ int max_parts;
+ for (int bid = 0; bid < nBundles_temp; bid++) {
+
+ max_parts = 0;
+ int parts_in_bundle = 0;
+ const int first_task = bid * bundle_size;
+ int last_task = (bid + 1) * bundle_size;
+ for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+ if (tid < tasks_packed) {
+ /*Get an estimate for the max number of parts per cell in the bundle.
+ * Used for determining the number of GPU CUDA blocks*/
+ int count = task_first_part_f4[tid].y - task_first_part_f4[tid].x;
+ parts_in_bundle += count;
+ max_parts = max(max_parts, count);
+ last_task = tid;
+ }
+ }
+ // const int n_tasks = last_task - first_task;
+
+ const int first_part_tmp = pack_vars->bundle_first_part[bid];
+ const int bundle_n_parts =
+ pack_vars->bundle_last_part[bid] - first_part_tmp;
+ // clock_gettime(CLOCK_REALTIME, &t0hmemcpy);
+ // cudaMemPrefetchAsync(&d_task_first_part_self_dens_f4[first_task],
+ // (last_task - first_task) * sizeof(int2),
+ // devId, stream[bid]);
+ cudaMemcpyAsync(&d_task_first_part_f4[first_task],
+ &task_first_part_f4[first_task],
+ (last_task + 1 - first_task) * sizeof(int2),
+ cudaMemcpyHostToDevice, stream[bid]);
+    //  cudaError_t cu_error = cudaPeekAtLastError();
+    //  if (cu_error != cudaSuccess) {
+    //    fprintf(stderr,
+    //            "CUDA error in density self host 2 device memcpy: %s "
+    //            "cpuid id is: %i\n ",
+    //            cudaGetErrorString(cu_error), r->cpuid);
+    //    exit(0);
+    //  }
+ // clock_gettime(CLOCK_REALTIME, &t1hmemcpy);
+ // *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) +
+ // (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) /
+ // 1000000000.0;
+ cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp],
+ bundle_n_parts * sizeof(struct part_aos_f4_send),
+ cudaMemcpyHostToDevice, stream[bid]);
+
+    // #ifdef CUDA_DEBUG
+    //   cudaError_t cu_error = cudaPeekAtLastError(); /* Get error code */
+    //   if (cu_error != cudaSuccess) {
+    //     fprintf(stderr,
+    //             "CUDA error in density self host 2 device memcpy: %s "
+    //             "cpuid id is: %i\n ",
+    //             cudaGetErrorString(cu_error), r->cpuid);
+    //     exit(0);
+    //   }
+    // #endif
+ const int tasksperbundle = pack_vars->tasksperbundle;
+ int tasks_left = tasksperbundle;
+ if (bid == nBundles_temp - 1) {
+ tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle;
+ }
+    // Will launch a 2d grid of GPU thread blocks (the number of tasks is
+    // the y dimension and max_parts is the x dimension)
+ int numBlocks_y = tasks_left;
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ int bundle_first_task = pack_vars->bundle_first_task_list[bid];
+ // const char *loop_type = "density";
+ // struct first_part first_parts;
+ // for(int i = 0; i < numBlocks_y; i++) first_parts.list[i] =
+ // pack_vars->task_first_part[i]; fprintf(stderr, "Launching kernel with
+ // %i tasks leftovers %i\n", tasks_packed,
+ // pack_vars->launch_leftovers);
+ // Launch the kernel
+ launch_density_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid],
+ numBlocks_x, numBlocks_y, bundle_first_task,
+ d_task_first_part_f4);
+    // #ifdef CUDA_DEBUG
+    //   cu_error = cudaPeekAtLastError(); /* Get error code */
+    //   if (cu_error != cudaSuccess) {
+    //     fprintf(stderr,
+    //             "CUDA error with self density kernel launch: %s "
+    //             "cpuid id is: %i\n ",
+    //             cudaGetErrorString(cu_error), r->cpuid);
+    //     exit(0);
+    //   }
+    // #endif
+ cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp],
+ bundle_n_parts * sizeof(struct part_aos_f4_recv),
+ cudaMemcpyDeviceToHost, stream[bid]);
+ cudaEventRecord(self_end[bid], stream[bid]);
+    // #ifdef CUDA_DEBUG
+    //   cu_error = cudaPeekAtLastError(); /* Get error code */
+    //   if (cu_error != cudaSuccess) {
+    //     fprintf(stderr,
+    //             "CUDA error with self density D2H memcpy: %s "
+    //             "cpuid id is: %i\n ",
+    //             cudaGetErrorString(cu_error), r->cpuid);
+    //     error("Something's up with your cuda code");
+    //   }
+    // #endif
+ } /*End of looping over bundles to launch in streams*/
+ /* Make sure all the kernels and copies back are finished */
+ // cudaDeviceSynchronize();
+
+ /*Time end of GPU work*/
+ clock_gettime(CLOCK_REALTIME, &t1);
+ *gpu_time +=
+ (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+
+ /* Now copy the data back from the CPU thread-local buffers to the cells */
+ /* Pack length counter for use in unpacking */
+ int pack_length_unpack = 0;
+ ticks total_cpu_unpack_ticks = 0.;
+ for (int bid = 0; bid < nBundles_temp; bid++) {
+
+ clock_gettime(CLOCK_REALTIME, &t0);
+
+ // cudaStreamSynchronize(stream[bid]);
+ cudaEventSynchronize(self_end[bid]);
+
+ clock_gettime(CLOCK_REALTIME, &t1);
+ *gpu_time +=
+ (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+
+ /*Time unpacking*/
+ // clock_gettime(CLOCK_REALTIME, &tp0);
+
+ for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+
+ if (tid < tasks_packed) {
+ struct cell *cii = pack_vars->cell_list[tid];
+ struct task *tii = pack_vars->task_list[tid];
+
+ // struct cell *cii = ci_list_self_dens[tid];
+ // struct task *tii = task_list_self_dens[tid];
+
+ clock_gettime(CLOCK_REALTIME, &tp0);
+
+ // clock_gettime(CLOCK_REALTIME, &t0hmemcpy);
+ while (cell_locktree(cii)) {
+ ; /* spin until we acquire the lock */
+ }
+ // clock_gettime(CLOCK_REALTIME, &t1hmemcpy);
+ // *hmemcpy_time += (t1hmemcpy.tv_sec -
+ // t0hmemcpy.tv_sec) + (t1hmemcpy.tv_nsec -
+ // t0hmemcpy.tv_nsec) / 1000000000.0;
+ const ticks tic = getticks();
+ /* Do the copy */
+ runner_doself1_gpu_unpack_neat_aos_f4(r, cii, parts_recv, 0,
+ &pack_length_unpack, tid,
+ pack_vars->count_max_parts, e);
+ const ticks toc = getticks();
+
+ total_cpu_unpack_ticks += toc - tic;
+ /* Record things for debugging */
+ cii->gpu_done++;
+ /*Time end of unpacking*/
+ clock_gettime(CLOCK_REALTIME, &tp1);
+ *unpack_time += (tp1.tv_sec - tp0.tv_sec) +
+ (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+ pthread_mutex_lock(&s->sleep_mutex);
+ atomic_dec(&s->waiting);
+ pthread_cond_broadcast(&s->sleep_cond);
+ pthread_mutex_unlock(&s->sleep_mutex);
+ /* Release the lock */
+ cell_unlocktree(cii);
+
+ /*schedule my dependencies (Only unpacks really)*/
+ enqueue_dependencies(s, tii);
+ /*Signal sleeping runners*/
+ // MATTHIEU signal_sleeping_runners(s, tii);
+
+ tii->gpu_done = 1;
+ }
+ }
+ /*Time end of unpacking*/
+ // clock_gettime(CLOCK_REALTIME, &tp1);
+ // *hmemcpy_time += (tp1.tv_sec - tp0.tv_sec) +
+ // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+ // *packing_time += (tp1.tv_sec - tp0.tv_sec) +
+ // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+ }
+ /* Zero counters for the next pack operations */
+ pack_vars->count_parts = 0;
+ pack_vars->tasks_packed = 0;
+
+ t->total_cpu_unpack_ticks += total_cpu_unpack_ticks;
+
+} /*End of GPU work Self*/
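The launch function above follows, per bundle, the usual CUDA overlap pattern: an asynchronous host-to-device copy, a kernel launch, an asynchronous device-to-host copy and an event record, all queued on the same stream, with the host synchronising on the event only when it needs the results for unpacking. A stripped-down, self-contained sketch of that pattern is given below; the kernel and buffer names are illustrative, and the host buffer is page-locked so the asynchronous copies can actually overlap.

/* Minimal stream/event overlap sketch (illustrative names, compile with nvcc). */
#include <cuda_runtime.h>
#include <stdio.h>

__global__ void scale_kernel(float *data, int n, float factor) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main(void) {
  const int n = 1 << 20;
  float *host_buf, *dev_buf;
  cudaMallocHost((void **)&host_buf, n * sizeof(float)); /* pinned host memory */
  cudaMalloc((void **)&dev_buf, n * sizeof(float));
  for (int i = 0; i < n; i++) host_buf[i] = 1.f;

  cudaStream_t stream;
  cudaEvent_t done;
  cudaStreamCreate(&stream);
  cudaEventCreate(&done);

  /* H2D copy, kernel and D2H copy all queued on one stream, then an event. */
  cudaMemcpyAsync(dev_buf, host_buf, n * sizeof(float), cudaMemcpyHostToDevice,
                  stream);
  scale_kernel<<<(n + 255) / 256, 256, 0, stream>>>(dev_buf, n, 2.f);
  cudaMemcpyAsync(host_buf, dev_buf, n * sizeof(float), cudaMemcpyDeviceToHost,
                  stream);
  cudaEventRecord(done, stream);

  /* Block only when the results are actually needed. */
  cudaEventSynchronize(done);
  printf("host_buf[0] = %f\n", host_buf[0]); /* expect 2.0 */

  cudaEventDestroy(done);
  cudaStreamDestroy(stream);
  cudaFree(dev_buf);
  cudaFreeHost(host_buf);
  return 0;
}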
+
+void runner_doself1_launch_f4_g(
+ struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars,
+ struct cell *ci, struct task *t, struct part_aos_f4_g_send *parts_send,
+ struct part_aos_f4_g_recv *parts_recv,
+ struct part_aos_f4_g_send *d_parts_send,
+ struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t *stream, float d_a,
+ float d_H, struct engine *e, double *packing_time, double *gpu_time,
+ int2 *task_first_part_f4, int2 *d_task_first_part_f4, cudaEvent_t *self_end,
+ double *unpack_time) {
+
+ struct timespec t0, t1, tp0, tp1;
+ clock_gettime(CLOCK_REALTIME, &t0);
+
+ /* Identify the number of GPU bundles to run in ideal case*/
+ int nBundles_temp = pack_vars->nBundles;
+
+ /*How many tasks have we packed?*/
+ const int tasks_packed = pack_vars->tasks_packed;
+
+ /*How many tasks should be in a bundle?*/
+ const int bundle_size = pack_vars->bundle_size;
+
+  /* Special case for incomplete bundles (when the leftover tasks are not
+   * enough to fill a bundle) */
+ if (pack_vars->launch_leftovers) {
+ nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size;
+ // if(tasks_packed == 0) error("zero tasks packed but somehow got into
+ // GPU loop");
+ pack_vars->bundle_first_part[nBundles_temp] =
+ task_first_part_f4[tasks_packed - 1].x;
+ }
+ /* Identify the last particle for each bundle of tasks */
+ for (int bid = 0; bid < nBundles_temp - 1; bid++) {
+ pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1];
+ }
+ /* special treatment for the last bundle */
+ if (nBundles_temp > 1)
+ pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts;
+ else
+ pack_vars->bundle_last_part[0] = pack_vars->count_parts;
+
+ /* Launch the copies for each bundle and run the GPU kernel */
+  /* We do not enter this loop if tasks_left_self == 1, as nBundles_temp
+     will be zero in that case. */
+ int max_parts;
+ for (int bid = 0; bid < nBundles_temp; bid++) {
+
+ max_parts = 0;
+ int parts_in_bundle = 0;
+ const int first_task = bid * bundle_size;
+ int last_task = (bid + 1) * bundle_size;
+ for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+ if (tid < tasks_packed) {
+ /*Get an estimate for the max number of parts per cell in the bundle.
+ * Used for determining the number of GPU CUDA blocks*/
+ int count = task_first_part_f4[tid].y - task_first_part_f4[tid].x;
+ parts_in_bundle += count;
+ max_parts = max(max_parts, count);
+ last_task = tid;
+ }
+ }
+
+ const int first_part_tmp = pack_vars->bundle_first_part[bid];
+ const int bundle_n_parts =
+ pack_vars->bundle_last_part[bid] - first_part_tmp;
+
+ cudaMemcpyAsync(&d_task_first_part_f4[first_task],
+ &task_first_part_f4[first_task],
+ (last_task + 1 - first_task) * sizeof(int2),
+ cudaMemcpyHostToDevice, stream[bid]);
+
+ cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp],
+ bundle_n_parts * sizeof(struct part_aos_f4_g_send),
+ cudaMemcpyHostToDevice, stream[bid]);
+ // fprintf(stderr, "bid %i first_part %i nparts %i\n", bid,
+ // first_part_tmp, bundle_n_parts);
+
+#ifdef CUDA_DEBUG
+ cudaError_t cu_error =
+ cudaPeekAtLastError(); // cudaGetLastError(); //
+ // Get error code
+ if (cu_error != cudaSuccess) {
+ fprintf(stderr,
+ "CUDA error in gradient self host 2 device memcpy: %s cpuid id "
+ "is: %i\n ",
+ cudaGetErrorString(cu_error), r->cpuid);
+ exit(0);
+ }
+#endif
+ const int tasksperbundle = pack_vars->tasksperbundle;
+ int tasks_left = tasksperbundle;
+ if (bid == nBundles_temp - 1) {
+ tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle;
+ }
+    // Will launch a 2d grid of GPU thread blocks (the number of tasks is
+    // the y dimension and max_parts is the x dimension)
+ int numBlocks_y = tasks_left;
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ int bundle_first_task = pack_vars->bundle_first_task_list[bid];
+ // const char *loop_type = "density";
+ // Launch the kernel
+ launch_gradient_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid],
+ numBlocks_x, numBlocks_y, bundle_first_task,
+ d_task_first_part_f4);
+#ifdef CUDA_DEBUG
+ cu_error = cudaPeekAtLastError(); // Get error code
+ if (cu_error != cudaSuccess) {
+ fprintf(
+ stderr,
+ "CUDA error with self gradient kernel launch: %s cpuid id is: %i\n ",
+ cudaGetErrorString(cu_error), r->cpuid);
+ exit(0);
+ }
+#endif
+ cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp],
+ bundle_n_parts * sizeof(struct part_aos_f4_g_recv),
+ cudaMemcpyDeviceToHost, stream[bid]);
+ cudaEventRecord(self_end[bid], stream[bid]);
+
+#ifdef CUDA_DEBUG
+ cu_error = cudaPeekAtLastError(); // cudaGetLastError(); //
+ // Get error code
+ if (cu_error != cudaSuccess) {
+ fprintf(stderr,
+ "CUDA error with self gradient D2H memcpy: %s cpuid id is: %i\n ",
+ cudaGetErrorString(cu_error), r->cpuid);
+ error("Something's up with your cuda code");
+ }
+#endif
+ } /*End of looping over bundles to launch in streams*/
+ // exit(0);
+ /* Make sure all the kernels and copies back are finished */
+ // cudaDeviceSynchronize();
+
+ /*Time end of GPU work*/
+ clock_gettime(CLOCK_REALTIME, &t1);
+ *gpu_time +=
+ (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+
+ /* Now copy the data back from the CPU thread-local buffers to the cells */
+ /* Pack length counter for use in unpacking */
+ int pack_length_unpack = 0;
+ ticks total_cpu_unpack_ticks = 0.;
+ for (int bid = 0; bid < nBundles_temp; bid++) {
+
+ clock_gettime(CLOCK_REALTIME, &t0);
+
+ // cudaStreamSynchronize(stream[bid]);
+ cudaEventSynchronize(self_end[bid]);
+
+ clock_gettime(CLOCK_REALTIME, &t1);
+ *gpu_time +=
+ (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+
+ /*Time unpacking*/
+ // clock_gettime(CLOCK_REALTIME, &tp0);
+
+ for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+
+ if (tid < tasks_packed) {
+
+ struct cell *cii = pack_vars->cell_list[tid];
+ struct task *tii = pack_vars->task_list[tid];
+
+ // struct cell *cii = ci_list_self_dens[tid];
+ // struct task *tii = task_list_self_dens[tid];
+
+ while (cell_locktree(cii)) {
+ ; /* spin until we acquire the lock */
+ }
+ /*Time unpacking*/
+ clock_gettime(CLOCK_REALTIME, &tp0);
+ const ticks tic = getticks();
+
+ /* Do the copy */
+ runner_doself1_gpu_unpack_neat_aos_f4_g(r, cii, parts_recv, 0,
+ &pack_length_unpack, tid,
+ pack_vars->count_max_parts, e);
+ const ticks toc = getticks();
+
+ total_cpu_unpack_ticks += toc - tic;
+ /*Time end of unpacking*/
+ clock_gettime(CLOCK_REALTIME, &tp1);
+ *unpack_time += (tp1.tv_sec - tp0.tv_sec) +
+ (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+
+ /* Record things for debugging */
+ cii->gpu_done_g++;
+ pthread_mutex_lock(&s->sleep_mutex);
+ atomic_dec(&s->waiting);
+ pthread_cond_broadcast(&s->sleep_cond);
+ pthread_mutex_unlock(&s->sleep_mutex);
+ /* Release the lock */
+ cell_unlocktree(cii);
+
+ /*schedule my dependencies (Only unpacks really)*/
+ enqueue_dependencies(s, tii);
+ /*Signal sleeping runners*/
+ // MATTHIEU signal_sleeping_runners(s, tii);
+
+ tii->gpu_done = 1;
+ }
+ }
+ /*Time end of unpacking*/
+ // clock_gettime(CLOCK_REALTIME, &tp1);
+ // *unpack_time += (tp1.tv_sec - tp0.tv_sec) +
+ // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+ // *packing_time += (tp1.tv_sec - tp0.tv_sec) +
+ // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+ }
+ /* Zero counters for the next pack operations */
+ pack_vars->count_parts = 0;
+ pack_vars->tasks_packed = 0;
+
+ t->total_cpu_unpack_ticks += total_cpu_unpack_ticks;
+
+} /*End of GPU work Self Gradient*/
+
+void runner_doself1_launch_f4_f(
+ struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars,
+ struct cell *ci, struct task *t, struct part_aos_f4_f_send *parts_send,
+ struct part_aos_f4_f_recv *parts_recv,
+ struct part_aos_f4_f_send *d_parts_send,
+ struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t *stream, float d_a,
+ float d_H, struct engine *e, double *packing_time, double *gpu_time,
+ int2 *task_first_part_f4_f, int2 *d_task_first_part_f4_f,
+ cudaEvent_t *self_end, double *unpack_time) {
+
+ struct timespec t0, t1, tp0, tp1; //
+ clock_gettime(CLOCK_REALTIME, &t0);
+
+ /* Identify the number of GPU bundles to run in ideal case*/
+ int nBundles_temp = pack_vars->nBundles;
+
+ /*How many tasks have we packed?*/
+ const int tasks_packed = pack_vars->tasks_packed;
+
+ /*How many tasks should be in a bundle?*/
+ const int bundle_size = pack_vars->bundle_size;
+
+  /* Special case for incomplete bundles (when the leftover tasks are not
+   * enough to fill a bundle) */
+ if (pack_vars->launch_leftovers) {
+ nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size;
+ if (tasks_packed == 0)
+ error("zero tasks packed but somehow got into GPU loop");
+ pack_vars->bundle_first_part[nBundles_temp] =
+ task_first_part_f4_f[tasks_packed - 1].x;
+ }
+ /* Identify the last particle for each bundle of tasks */
+ for (int bid = 0; bid < nBundles_temp - 1; bid++) {
+ pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1];
+ }
+ /* special treatment for the last bundle */
+ if (nBundles_temp > 1)
+ pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts;
+ else
+ pack_vars->bundle_last_part[0] = pack_vars->count_parts;
+ /*Copy arrays containing first and last part for each task to GPU*/
+ // cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part,
+ // tasks_packed * sizeof(int), cudaMemcpyHostToDevice);
+ // cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part,
+ // tasks_packed * sizeof(int), cudaMemcpyHostToDevice);
+
+ /*Copy cell shifts to device*/
+ // cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx,
+ // tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+ // cudaMemcpy(pack_vars->d_celly, pack_vars->celly,
+ // tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+ // cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz,
+ // tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+
+ /* Launch the copies for each bundle and run the GPU kernel */
+  /* We do not enter this loop if tasks_left_self == 1, as nBundles_temp
+     will be zero in that case. */
+ int max_parts = 0;
+ for (int bid = 0; bid < nBundles_temp; bid++) {
+
+ max_parts = 0;
+ int parts_in_bundle = 0;
+ const int first_task = bid * bundle_size;
+ int last_task = (bid + 1) * bundle_size;
+ for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+ if (tid < tasks_packed) {
+ /*Get an estimate for the max number of parts per cell in the bundle.
+ * Used for determining the number of GPU CUDA blocks*/
+ int count = task_first_part_f4_f[tid].y - task_first_part_f4_f[tid].x;
+ parts_in_bundle += count;
+ max_parts = max(max_parts, count);
+ last_task = tid;
+ }
+ }
+
+ const int first_part_tmp = pack_vars->bundle_first_part[bid];
+ const int bundle_n_parts =
+ pack_vars->bundle_last_part[bid] - first_part_tmp;
+ cudaMemcpyAsync(&d_task_first_part_f4_f[first_task],
+ &task_first_part_f4_f[first_task],
+ (last_task + 1 - first_task) * sizeof(int2),
+ cudaMemcpyHostToDevice, stream[bid]);
+
+ cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp],
+ bundle_n_parts * sizeof(struct part_aos_f4_f_send),
+ cudaMemcpyHostToDevice, stream[bid]);
+
+#ifdef CUDA_DEBUG
+ cudaError_t cu_error =
+ cudaPeekAtLastError(); // cudaGetLastError(); //
+ // Get error code
+ if (cu_error != cudaSuccess) {
+      fprintf(stderr,
+              "CUDA error in force self host 2 device memcpy: %s cpuid id "
+              "is: %i\n ",
+ cudaGetErrorString(cu_error), r->cpuid);
+ exit(0);
+ }
+#endif
+ const int tasksperbundle = pack_vars->tasksperbundle;
+ int tasks_left = tasksperbundle;
+ if (bid == nBundles_temp - 1) {
+ tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle;
+ }
+    // Will launch a 2d grid of GPU thread blocks (the number of tasks is
+    // the y dimension and max_parts is the x dimension)
+ int numBlocks_y = tasks_left;
+ int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ int bundle_first_task = pack_vars->bundle_first_task_list[bid];
+ // Launch the kernel
+ launch_force_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid],
+ numBlocks_x, numBlocks_y, bundle_first_task,
+ d_task_first_part_f4_f);
+#ifdef CUDA_DEBUG
+ cu_error = cudaPeekAtLastError(); // Get error code
+ if (cu_error != cudaSuccess) {
+ fprintf(stderr,
+ "CUDA error with self force kernel launch: %s cpuid id is: %i\n ",
+ cudaGetErrorString(cu_error), r->cpuid);
+ exit(0);
+ }
+#endif
+ cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp],
+ bundle_n_parts * sizeof(struct part_aos_f4_f_recv),
+ cudaMemcpyDeviceToHost, stream[bid]);
+ cudaEventRecord(self_end[bid], stream[bid]);
+
+#ifdef CUDA_DEBUG
+ cu_error = cudaPeekAtLastError(); // cudaGetLastError(); //
+ // Get error code
+ if (cu_error != cudaSuccess) {
+      fprintf(stderr,
+              "CUDA error with self force D2H memcpy: %s cpuid id is: %i\n ",
+ cudaGetErrorString(cu_error), r->cpuid);
+ error("Something's up with your cuda code");
+ }
+#endif
+ } /*End of looping over bundles to launch in streams*/
+
+ /* Make sure all the kernels and copies back are finished */
+ // cudaDeviceSynchronize();
+
+ /*Time end of GPU work*/
+ clock_gettime(CLOCK_REALTIME, &t1);
+ *gpu_time +=
+ (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+
+ /* Now copy the data back from the CPU thread-local buffers to the cells */
+ /* Pack length counter for use in unpacking */
+ int pack_length_unpack = 0;
+ ticks total_cpu_unpack_ticks = 0.;
+ for (int bid = 0; bid < nBundles_temp; bid++) {
+
+ clock_gettime(CLOCK_REALTIME, &t0);
+
+ // cudaStreamSynchronize(stream[bid]);
+ cudaEventSynchronize(self_end[bid]);
+
+ clock_gettime(CLOCK_REALTIME, &t1);
+ *gpu_time +=
+ (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+
+ /*Time unpacking*/
+ // clock_gettime(CLOCK_REALTIME, &tp0);
+
+ for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+
+ if (tid < tasks_packed) {
+ struct cell *cii = pack_vars->cell_list[tid];
+ struct task *tii = pack_vars->task_list[tid];
+
+ // struct cell *cii = ci_list_self_dens[tid];
+ // struct task *tii = task_list_self_dens[tid];
+
+ while (cell_locktree(cii)) {
+ ; /* spin until we acquire the lock */
+ }
+ clock_gettime(CLOCK_REALTIME, &tp0);
+ const ticks tic = getticks();
+
+ /* Do the copy */
+ runner_doself1_gpu_unpack_neat_aos_f4_f(r, cii, parts_recv, 0,
+ &pack_length_unpack, tid,
+ pack_vars->count_max_parts, e);
+ const ticks toc = getticks();
+
+ total_cpu_unpack_ticks += toc - tic;
+ /* Record things for debugging */
+ cii->gpu_done_f++;
+ clock_gettime(CLOCK_REALTIME, &tp1);
+ *unpack_time += (tp1.tv_sec - tp0.tv_sec) +
+ (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+ pthread_mutex_lock(&s->sleep_mutex);
+ atomic_dec(&s->waiting);
+ pthread_cond_broadcast(&s->sleep_cond);
+ pthread_mutex_unlock(&s->sleep_mutex);
+ /* Release the lock */
+ cell_unlocktree(cii);
+
+ /*schedule my dependencies (Only unpacks really)*/
+ enqueue_dependencies(s, tii);
+ /*Signal sleeping runners*/
+ // MATTHIEU signal_sleeping_runners(s, tii);
+
+ tii->gpu_done = 1;
+ }
+ }
+ /*Time end of unpacking*/
+ // clock_gettime(CLOCK_REALTIME, &tp1);
+ // *unpack_time += (tp1.tv_sec - tp0.tv_sec) +
+ // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+ }
+
+ /* Zero counters for the next pack operations */
+ pack_vars->count_parts = 0;
+ pack_vars->tasks_packed = 0;
+
+ t->total_cpu_unpack_ticks += total_cpu_unpack_ticks;
+} /*End of GPU work Self Force*/
+
+void runner_dopair1_launch_f4_one_memcpy(
+ struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars,
+ struct task *t, struct part_aos_f4_send *parts_send,
+ struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send,
+ struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a,
+ float d_H, struct engine *e, double *packing_time, double *gpu_time,
+ double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens,
+ cudaEvent_t *pair_end) {
+
+ struct timespec t0, t1, tp0, tp1; //
+ clock_gettime(CLOCK_REALTIME, &t0);
+
+ /* Identify the number of GPU bundles to run in ideal case*/
+ int nBundles_temp = pack_vars->nBundles;
+ /*How many tasks have we packed?*/
+ const int tasks_packed = pack_vars->tasks_packed;
+
+ /*How many tasks should be in a bundle?*/
+ const int bundle_size = pack_vars->bundle_size;
+
+  /* Special case for incomplete bundles (when the leftover tasks are not
+   * enough to fill a bundle) */
+ if (pack_vars->launch_leftovers) {
+ nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size;
+ if (tasks_packed == 0)
+ error("zero pair tasks packed but somehow got into GPU loop");
+ // pack_vars->bundle_first_part[nBundles_temp] =
+ // pack_vars->task_first_part[packed_tmp - 2];
+ pack_vars->bundle_first_part[nBundles_temp] =
+ fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x;
+ }
+ /* Identify the last particle for each bundle of tasks */
+ for (int bid = 0; bid < nBundles_temp - 1; bid++) {
+ pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1];
+ }
+ /* special treatment for the last bundle */
+ if (nBundles_temp > 1)
+ pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts;
+ else
+ pack_vars->bundle_last_part[0] = pack_vars->count_parts;
+
+ /* Launch the copies for each bundle and run the GPU kernel */
+ for (int bid = 0; bid < nBundles_temp; bid++) {
+
+ int max_parts_i = 0;
+ int max_parts_j = 0;
+ int parts_in_bundle_ci = 0;
+ int parts_in_bundle_cj = 0;
+ for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+ if (tid < tasks_packed) {
+ /*Get an estimate for the max number of parts per cell in each bundle.
+ * Used for determining the number of GPU CUDA blocks*/
+ int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z -
+ fparti_fpartj_lparti_lpartj_dens[tid].x;
+ parts_in_bundle_ci += count_i;
+ max_parts_i = max(max_parts_i, count_i);
+ int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w -
+ fparti_fpartj_lparti_lpartj_dens[tid].y;
+ parts_in_bundle_cj += count_j;
+ max_parts_j = max(max_parts_j, count_j);
+ // if(count_i > 100 || count_j > 100)
+ // error("Sending data for excessive n parts %i %i",
+ // count_i, count_j);
+ }
+ }
+ const int first_part_tmp_i = pack_vars->bundle_first_part[bid];
+ const int bundle_n_parts =
+ pack_vars->bundle_last_part[bid] - first_part_tmp_i;
+
+ cudaMemcpyAsync(&d_parts_send[first_part_tmp_i],
+ &parts_send[first_part_tmp_i],
+ bundle_n_parts * sizeof(struct part_aos_f4_send),
+ cudaMemcpyHostToDevice, stream[bid]);
+
+#ifdef CUDA_DEBUG
+ cudaError_t cu_error =
+ cudaPeekAtLastError(); // cudaGetLastError(); //
+ // Get error code
+ if (cu_error != cudaSuccess) {
+ fprintf(stderr,
+ "CUDA error with pair density H2D async memcpy ci: %s cpuid id "
+ "is: %i\n ",
+ cudaGetErrorString(cu_error), r->cpuid);
+ error("Something's up with your cuda code first_part %i bundle size %i",
+ first_part_tmp_i, bundle_n_parts);
+ }
+#endif
+ /* LAUNCH THE GPU KERNELS for ci & cj */
+ // Setup 2d grid of GPU thread blocks for ci (number of tasks is
+ // the y dimension and max_parts is the x dimension
+ int numBlocks_y = 0; // tasks_left;
+ int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
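+    /* Enough blocks of BLOCK_SIZE threads to cover every particle in the
+     * bundle (rounded up); bundle_part_0 and bundle_n_parts are handed to the
+     * kernel so it can map thread ids onto this slice of the buffers. */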
+ int bundle_part_0 = pack_vars->bundle_first_part[bid];
+ /* Launch the kernel for ci using data for ci and cj */
+ runner_dopair_branch_density_gpu_aos_f4(
+ d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x,
+ numBlocks_y, bundle_part_0, bundle_n_parts);
+
+#ifdef CUDA_DEBUG
+ cu_error = cudaPeekAtLastError(); // Get error code
+ if (cu_error != cudaSuccess) {
+ fprintf(
+ stderr,
+ "CUDA error with pair density kernel launch: %s cpuid id is: %i\n "
+ "nbx %i nby %i max_parts_i %i max_parts_j %i\n",
+ cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y,
+ max_parts_i, max_parts_j);
+ error("Something's up with kernel launch.");
+ }
+#endif
+
+ // Copy results back to CPU BUFFERS
+ cudaMemcpyAsync(&parts_recv[first_part_tmp_i],
+ &d_parts_recv[first_part_tmp_i],
+ bundle_n_parts * sizeof(struct part_aos_f4_recv),
+ cudaMemcpyDeviceToHost, stream[bid]);
+ cudaEventRecord(pair_end[bid], stream[bid]);
+
+#ifdef CUDA_DEBUG
+ cu_error = cudaPeekAtLastError(); // cudaGetLastError(); //
+ // Get error code
+ if (cu_error != cudaSuccess) {
+      fprintf(stderr,
+              "CUDA error with pair density D2H memcpy: %s cpuid is: %i\n",
+              cudaGetErrorString(cu_error), r->cpuid);
+ error("Something's up with your cuda code");
+ }
+#endif
+ } /*End of looping over bundles to launch in streams*/
+
+ /* Make sure all the kernels and copies back are finished */
+ // cudaDeviceSynchronize();
+
+ /*Time end of GPU work*/
+ clock_gettime(CLOCK_REALTIME, &t1);
+ *gpu_time +=
+ (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+
+ /* Now copy the data back from the CPU thread-local buffers to the cells */
+ /* Pack length counter for use in unpacking */
+
+ int pack_length_unpack = 0;
+ ticks total_cpu_unpack_ticks = 0;
+
+ for (int bid = 0; bid < nBundles_temp; bid++) {
+ /*Time unpacking*/
+ clock_gettime(CLOCK_REALTIME, &t0);
+
+ // cudaStreamSynchronize(stream[bid]);
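+    /* Block on the event recorded after this bundle's D2H copy: once it has
+     * fired, the bundle's results are visible in the host recv buffer. */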
+ cudaEventSynchronize(pair_end[bid]);
+
+ clock_gettime(CLOCK_REALTIME, &t1);
+ *gpu_time +=
+ (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+
+ ////////////
+
+ /*Time unpacking*/
+ // clock_gettime(CLOCK_REALTIME, &tp0);
+
+// for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+//
+// if (tid < tasks_packed) {
+// clock_gettime(CLOCK_REALTIME, &tp0);
+// /*grab cell and task pointers*/
+// struct cell *cii = pack_vars->ci_list[tid];
+// struct cell *cjj = pack_vars->cj_list[tid];
+// struct task *tii = pack_vars->task_list[tid];
+//
+//// if(!pack_vars->task_locked){
+//// /*Let's lock ci*/
+//// while (cell_locktree(cii)) {
+//// ; /* spin until we acquire the lock */
+//// }
+//// /*Let's lock cj*/
+//// while (cell_locktree(cjj)) {
+//// ; /* spin until we acquire the lock */
+//// }
+//// pack_vars->task_locked = 1;
+//// }
+//
+// const ticks tic = getticks();
+//
+// /* Do the copy */
+// runner_do_ci_cj_gpu_unpack_neat_aos_f4(
+// r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid,
+// 2 * pack_vars->count_max_parts, e);
+//
+// const ticks toc = getticks();
+//
+// total_cpu_unpack_ticks += toc - tic;
+//
+// /* Record things for debugging */
+// cii->gpu_done_pair++;
+// cjj->gpu_done_pair++;
+//
+//// if(pack_vars->task_locked){
+//// /* Release the locks */
+//// cell_unlocktree(cii);
+//// /* Release the locks */
+//// cell_unlocktree(cjj);
+// pack_vars->task_locked = 0;
+//// }
+//
+// /*Time end of unpacking*/
+// clock_gettime(CLOCK_REALTIME, &tp1);
+// *unpack_time += (tp1.tv_sec - tp0.tv_sec) +
+// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+// /*Signal sleeping runners*/
+// // MATTHIEU signal_sleeping_runners(s, tii);
+//
+// tii->gpu_done = 1;
+// }
+// }
+ }
+
+ /* Zero counters for the next pack operations */
+// pack_vars->count_parts = 0;
+// pack_vars->tasks_packed = 0;
+
+ // /*Time end of unpacking*/
+ // clock_gettime(CLOCK_REALTIME, &t1);
+ // *packing_time += (t1.tv_sec - t0.tv_sec) +
+ // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+
+ /* Write the timers back to the task */
+ t->total_cpu_unpack_ticks += total_cpu_unpack_ticks;
+
+} /*End of GPU work*/
+
+void runner_dopair1_unpack_f4(
+ struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars,
+ struct task *t, struct part_aos_f4_send *parts_send,
+ struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send,
+ struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a,
+ float d_H, struct engine *e, double *packing_time, double *gpu_time,
+ double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens,
+    cudaEvent_t *pair_end, int cstart, int n_leaves_found) {
+
+ int topid;
+ int pack_length_unpack = 0;
+ ticks total_cpu_unpack_ticks = 0;
+ /*Loop over top level tasks*/
+ for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) {
+ const ticks tic = getticks();
+ /* Loop through each daughter task */
+ int n_leaves_in_task = pack_vars->leaf_list[topid].n_packed;
+ int nstart = pack_vars->leaf_list[topid].n_start;
+    for (int tid = nstart; tid < n_leaves_in_task + nstart; tid++) {
+      /* Get pointers to the leaf cells. NOTE: the pointers obtained here are
+       * not always correct, most likely due to incorrect book-keeping
+       * upstream. */
+      struct cell *cii_l = pack_vars->leaf_list[topid].ci[tid];
+      struct cell *cjj_l = pack_vars->leaf_list[topid].cj[tid];
+      message("loc %f %f %f topid %i tid %i nleaves %i",
+              pack_vars->leaf_list[topid].ci[tid]->loc[0],
+              pack_vars->leaf_list[topid].ci[tid]->loc[1],
+              pack_vars->leaf_list[topid].ci[tid]->loc[2], topid, tid,
+              n_leaves_in_task);
+// if(*cii_l == NULL || *cjj_l == NULL)error("stop");
+ runner_do_ci_cj_gpu_unpack_neat_aos_f4(
+ r, cii_l, cjj_l, parts_recv, 0, &pack_length_unpack, tid,
+ 2 * pack_vars->count_max_parts, e);
+ }
+
+ const ticks toc = getticks();
+ total_cpu_unpack_ticks += toc - tic;
+ pack_vars->count_parts = 0;
+    /* The code currently fails when given a leaf pair task; this check stops
+     * it from trying to unlock the same cells twice. */
+ if(topid == pack_vars->top_tasks_packed -1 && cstart != n_leaves_found)
+ continue;
+ enqueue_dependencies(s, pack_vars->top_task_list[topid]);
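+    /* Drop the scheduler's waiting counter and wake any sleeping runners so
+     * the dependencies enqueued above can be picked up. */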
+ pthread_mutex_lock(&s->sleep_mutex);
+ atomic_dec(&s->waiting);
+ pthread_cond_broadcast(&s->sleep_cond);
+ pthread_mutex_unlock(&s->sleep_mutex);
+ }
+}
+void runner_dopair1_launch_f4_g_one_memcpy(
+ struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars,
+ struct task *t, struct part_aos_f4_g_send *parts_send,
+ struct part_aos_f4_g_recv *parts_recv,
+ struct part_aos_f4_g_send *d_parts_send,
+ struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t *stream, float d_a,
+ float d_H, struct engine *e, double *packing_time, double *gpu_time,
+ double *unpack_time, int4 *fparti_fpartj_lparti_lpartj,
+ cudaEvent_t *pair_end) {
+
+  struct timespec t0, t1, tp0, tp1;
+ clock_gettime(CLOCK_REALTIME, &t0);
+
+ /* Identify the number of GPU bundles to run in ideal case*/
+ int nBundles_temp = pack_vars->nBundles;
+ /*How many tasks have we packed?*/
+ const int tasks_packed = pack_vars->tasks_packed;
+
+ /*How many tasks should be in a bundle?*/
+ const int bundle_size = pack_vars->bundle_size;
+
+  /* tasks_packed needs decrementing before calculating packed_tmp as it was
+   * incremented in runner_dopair1_pack */
+ // const int packed_tmp = 2 * (tasks_packed - 1);
+
+ /* Special case for incomplete bundles (when having leftover tasks not enough
+ * to fill a bundle) */
+ if (pack_vars->launch_leftovers) {
+ nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size;
+ if (tasks_packed == 0)
+ error("zero pair tasks packed but somehow got into GPU loop");
+ // pack_vars->bundle_first_part[nBundles_temp] =
+ // pack_vars->task_first_part[packed_tmp - 2];
+ pack_vars->bundle_first_part[nBundles_temp] =
+ fparti_fpartj_lparti_lpartj[tasks_packed - 1].x;
+ }
+ /* Identify the last particle for each bundle of tasks */
+ for (int bid = 0; bid < nBundles_temp - 1; bid++) {
+ pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1];
+ }
+ /* special treatment for the last bundle */
+ if (nBundles_temp > 1)
+ pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts;
+ else
+ pack_vars->bundle_last_part[0] = pack_vars->count_parts;
+
+ /* Launch the copies for each bundle and run the GPU kernel */
+  /* Note: this loop is skipped when nBundles_temp is zero (e.g. when
+   * tasks_left_self == 1). */
+ // int max_parts = 0;
+ for (int bid = 0; bid < nBundles_temp; bid++) {
+
+ int max_parts_i = 0;
+ int max_parts_j = 0;
+ int parts_in_bundle_ci = 0;
+ int parts_in_bundle_cj = 0;
+ for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+ if (tid < tasks_packed) {
+ /*Get an estimate for the max number of parts per cell in each bundle.
+ * Used for determining the number of GPU CUDA blocks*/
+ int count_i = fparti_fpartj_lparti_lpartj[tid].z -
+ fparti_fpartj_lparti_lpartj[tid].x;
+ parts_in_bundle_ci += count_i;
+ max_parts_i = max(max_parts_i, count_i);
+ int count_j = fparti_fpartj_lparti_lpartj[tid].w -
+ fparti_fpartj_lparti_lpartj[tid].y;
+ parts_in_bundle_cj += count_j;
+ max_parts_j = max(max_parts_j, count_j);
+ }
+ }
+ const int first_part_tmp_i = pack_vars->bundle_first_part[bid];
+ const int bundle_n_parts =
+ pack_vars->bundle_last_part[bid] - first_part_tmp_i;
+
+ cudaMemcpyAsync(&d_parts_send[first_part_tmp_i],
+ &parts_send[first_part_tmp_i],
+ bundle_n_parts * sizeof(struct part_aos_f4_g_send),
+ cudaMemcpyHostToDevice, stream[bid]);
+
+#ifdef CUDA_DEBUG
+ cudaError_t cu_error =
+ cudaPeekAtLastError(); // cudaGetLastError(); //
+ // Get error code
+ if (cu_error != cudaSuccess) {
+      fprintf(stderr,
+              "CUDA error with pair gradient H2D async memcpy: %s cpuid is: "
+              "%i\n",
+ cudaGetErrorString(cu_error), r->cpuid);
+ error("Something's up with your cuda code");
+ }
+#endif
+
+ // const int tasksperbundle = pack_vars->tasksperbundle;
+ /* LAUNCH THE GPU KERNELS for ci & cj */
+ // Setup 2d grid of GPU thread blocks for ci (number of tasks is
+ // the y dimension and max_parts is the x dimension
+ int numBlocks_y = 0; // tasks_left;
+ int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ int bundle_part_0 = pack_vars->bundle_first_part[bid];
+ // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n",
+ // bundle_part_0, bundle_first_task);
+
+ /* Launch the kernel for ci using data for ci and cj */
+ runner_dopair_branch_gradient_gpu_aos_f4(
+ d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x,
+ numBlocks_y, bundle_part_0, bundle_n_parts);
+
+#ifdef CUDA_DEBUG
+ cu_error = cudaPeekAtLastError(); // Get error code
+ if (cu_error != cudaSuccess) {
+      fprintf(
+          stderr,
+          "CUDA error with pair gradient kernel launch: %s cpuid is: %i\n "
+          "nbx %i nby %i max_parts_i %i max_parts_j %i\n",
+          cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y,
+          max_parts_i, max_parts_j);
+      error("Something's up with the pair gradient kernel launch.");
+ }
+#endif
+
+ // Copy results back to CPU BUFFERS
+ cudaMemcpyAsync(&parts_recv[first_part_tmp_i],
+ &d_parts_recv[first_part_tmp_i],
+ bundle_n_parts * sizeof(struct part_aos_f4_g_recv),
+ cudaMemcpyDeviceToHost, stream[bid]);
+ cudaEventRecord(pair_end[bid], stream[bid]);
+
+#ifdef CUDA_DEBUG
+ cu_error = cudaPeekAtLastError(); // cudaGetLastError(); //
+ // Get error code
+ if (cu_error != cudaSuccess) {
+      fprintf(stderr,
+              "CUDA error with pair gradient D2H memcpy: %s cpuid is: %i\n",
+              cudaGetErrorString(cu_error), r->cpuid);
+ error("Something's up with your cuda code");
+ }
+#endif
+ } /*End of looping over bundles to launch in streams*/
+
+ /* Make sure all the kernels and copies back are finished */
+ // cudaDeviceSynchronize();
+
+ /*Time end of GPU work*/
+ clock_gettime(CLOCK_REALTIME, &t1);
+ *gpu_time +=
+ (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+ /* Now copy the data back from the CPU thread-local buffers to the cells */
+ /* Pack length counter for use in unpacking */
+ int pack_length_unpack = 0;
+
+ ticks total_cpu_unpack_ticks = 0.;
+
+ for (int bid = 0; bid < nBundles_temp; bid++) {
+ /*Time unpacking*/
+ clock_gettime(CLOCK_REALTIME, &t0);
+
+ // cudaStreamSynchronize(stream[bid]);
+ cudaEventSynchronize(pair_end[bid]);
+
+ clock_gettime(CLOCK_REALTIME, &t1);
+ *gpu_time +=
+ (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+
+ /*Time unpacking*/
+ // clock_gettime(CLOCK_REALTIME, &tp0);
+ // int bundle_first_task = pack_vars->bundle_first_task_list[bid];
+
+ for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+
+ if (tid < tasks_packed) {
+ clock_gettime(CLOCK_REALTIME, &tp0);
+ /*grab cell and task pointers*/
+ struct cell *cii = pack_vars->ci_list[tid];
+ struct cell *cjj = pack_vars->cj_list[tid];
+ struct task *tii = pack_vars->task_list[tid];
+ /*Let's lock ci*/
+ while (cell_locktree(cii)) {
+ ; /* spin until we acquire the lock */
+ }
+ /*Let's lock cj*/
+ while (cell_locktree(cjj)) {
+ ; /* spin until we acquire the lock */
+ }
+
+ const ticks tic = getticks();
+
+ /* Do the copy */
+ runner_do_ci_cj_gpu_unpack_neat_aos_f4_g(
+ r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid,
+ 2 * pack_vars->count_max_parts, e);
+
+ const ticks toc = getticks();
+
+ total_cpu_unpack_ticks += toc - tic;
+
+ /* Record things for debugging */
+ cii->gpu_done_pair_g++;
+ cjj->gpu_done_pair_g++;
+ pthread_mutex_lock(&s->sleep_mutex);
+ atomic_dec(&s->waiting);
+ pthread_cond_broadcast(&s->sleep_cond);
+ pthread_mutex_unlock(&s->sleep_mutex);
+ /* Release the locks */
+ cell_unlocktree(cii);
+ /* Release the locks */
+ cell_unlocktree(cjj);
+
+ /*Time end of unpacking*/
+ clock_gettime(CLOCK_REALTIME, &tp1);
+ *unpack_time += (tp1.tv_sec - tp0.tv_sec) +
+ (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+
+ /*schedule my dependencies (Only unpacks really)*/
+ enqueue_dependencies(s, tii);
+ /*Signal sleeping runners*/
+ // MATTHIEU signal_sleeping_runners(s, tii);
+
+ tii->gpu_done = 1;
+ }
+ }
+ }
+ /* Zero counters for the next pack operations */
+ pack_vars->count_parts = 0;
+ pack_vars->tasks_packed = 0;
+
+ /* Write the timers back to the task */
+ t->total_cpu_unpack_ticks += total_cpu_unpack_ticks;
+ // /*Time end of unpacking*/
+ // clock_gettime(CLOCK_REALTIME, &t1);
+ // *packing_time += (t1.tv_sec - t0.tv_sec) +
+ // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+} /*End of GPU work*/
+
+void runner_dopair1_launch_f4_f_one_memcpy(
+ struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars,
+ struct task *t, struct part_aos_f4_f_send *parts_send,
+ struct part_aos_f4_f_recv *parts_recv,
+ struct part_aos_f4_f_send *d_parts_send,
+ struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t *stream, float d_a,
+ float d_H, struct engine *e, double *packing_time, double *gpu_time,
+ double *unpack_time, int4 *fparti_fpartj_lparti_lpartj,
+ cudaEvent_t *pair_end) {
+
+  struct timespec t0, t1, tp0, tp1;
+ clock_gettime(CLOCK_REALTIME, &t0);
+
+ /* Identify the number of GPU bundles to run in ideal case*/
+ int nBundles_temp = pack_vars->nBundles;
+ /*How many tasks have we packed?*/
+ const int tasks_packed = pack_vars->tasks_packed;
+
+ /*How many tasks should be in a bundle?*/
+ const int bundle_size = pack_vars->bundle_size;
+
+  /* tasks_packed needs decrementing before calculating packed_tmp as it was
+   * incremented in runner_dopair1_pack */
+ // const int packed_tmp = 2 * (tasks_packed - 1);
+
+ /* Special case for incomplete bundles (when having leftover tasks not enough
+ * to fill a bundle) */
+ if (pack_vars->launch_leftovers) {
+ nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size;
+ if (tasks_packed == 0)
+ error("zero pair tasks packed but somehow got into GPU loop");
+ // pack_vars->bundle_first_part[nBundles_temp] =
+ // pack_vars->task_first_part[packed_tmp - 2];
+ pack_vars->bundle_first_part[nBundles_temp] =
+ fparti_fpartj_lparti_lpartj[tasks_packed - 1].x;
+ }
+ /* Identify the last particle for each bundle of tasks */
+ for (int bid = 0; bid < nBundles_temp - 1; bid++) {
+ pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1];
+ }
+ /* special treatment for the last bundle */
+ if (nBundles_temp > 1)
+ pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts;
+ else
+ pack_vars->bundle_last_part[0] = pack_vars->count_parts;
+
+ /* Launch the copies for each bundle and run the GPU kernel */
+  /* Note: this loop is skipped when nBundles_temp is zero (e.g. when
+   * tasks_left_self == 1). */
+ // int max_parts = 0;
+ for (int bid = 0; bid < nBundles_temp; bid++) {
+
+ int max_parts_i = 0;
+ int max_parts_j = 0;
+ int parts_in_bundle_ci = 0;
+ int parts_in_bundle_cj = 0;
+ // const int first_task = bid * pack_vars->bundle_size;
+ // int last_task = (bid + 1) * bundle_size;
+ for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+ if (tid < tasks_packed) {
+ /*Get an estimate for the max number of parts per cell in each bundle.
+ * Used for determining the number of GPU CUDA blocks*/
+ int count_i = fparti_fpartj_lparti_lpartj[tid].z -
+ fparti_fpartj_lparti_lpartj[tid].x;
+ parts_in_bundle_ci += count_i;
+ max_parts_i = max(max_parts_i, count_i);
+ int count_j = fparti_fpartj_lparti_lpartj[tid].w -
+ fparti_fpartj_lparti_lpartj[tid].y;
+ parts_in_bundle_cj += count_j;
+ max_parts_j = max(max_parts_j, count_j);
+
+ // last_task = tid;
+ }
+ }
+ const int first_part_tmp_i = pack_vars->bundle_first_part[bid];
+ const int bundle_n_parts =
+ pack_vars->bundle_last_part[bid] - first_part_tmp_i;
+
+ cudaMemcpyAsync(&d_parts_send[first_part_tmp_i],
+ &parts_send[first_part_tmp_i],
+ bundle_n_parts * sizeof(struct part_aos_f4_f_send),
+ cudaMemcpyHostToDevice, stream[bid]);
+
+#ifdef CUDA_DEBUG
+ cudaError_t cu_error =
+ cudaPeekAtLastError(); // cudaGetLastError(); //
+ // Get error code
+ if (cu_error != cudaSuccess) {
+      fprintf(stderr,
+              "CUDA error with pair force H2D async memcpy: %s cpuid is: "
+              "%i\n",
+ cudaGetErrorString(cu_error), r->cpuid);
+ error("Something's up with your cuda code");
+ }
+#endif
+
+ // const int tasksperbundle = pack_vars->tasksperbundle;
+ /* LAUNCH THE GPU KERNELS for ci & cj */
+ // int tid = 0;
+ // int offset = bid * tasksperbundle;
+ // int tasks_left = tasksperbundle;
+ // if (bid == nBundles_temp - 1) {
+ // tasks_left =
+ // tasks_packed - (nBundles_temp - 1) * tasksperbundle;
+ // }
+
+ // Setup 2d grid of GPU thread blocks for ci (number of tasks is
+ // the y dimension and max_parts is the x dimension
+ int numBlocks_y = 0; // tasks_left;
+ int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ int bundle_part_0 = pack_vars->bundle_first_part[bid];
+ // int bundle_first_task = pack_vars->bundle_first_task_list[bid];
+ // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n",
+ // bundle_part_0, bundle_first_task);
+
+ /* Launch the kernel for ci using data for ci and cj */
+ runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H,
+ stream[bid], numBlocks_x, numBlocks_y,
+ bundle_part_0, bundle_n_parts);
+
+#ifdef CUDA_DEBUG
+ cu_error = cudaPeekAtLastError(); // Get error code
+ if (cu_error != cudaSuccess) {
+      fprintf(
+          stderr,
+          "CUDA error with pair force kernel launch: %s cpuid is: %i\n "
+          "nbx %i nby %i max_parts_i %i max_parts_j %i\n",
+          cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y,
+          max_parts_i, max_parts_j);
+      error("Something's up with the pair force kernel launch.");
+ }
+#endif
+
+ // Copy results back to CPU BUFFERS
+ cudaMemcpyAsync(&parts_recv[first_part_tmp_i],
+ &d_parts_recv[first_part_tmp_i],
+ bundle_n_parts * sizeof(struct part_aos_f4_f_recv),
+ cudaMemcpyDeviceToHost, stream[bid]);
+ cudaEventRecord(pair_end[bid], stream[bid]);
+
+#ifdef CUDA_DEBUG
+ cu_error = cudaPeekAtLastError(); // cudaGetLastError(); //
+ // Get error code
+ if (cu_error != cudaSuccess) {
+      fprintf(stderr,
+              "CUDA error with pair force D2H memcpy: %s cpuid is: %i\n",
+              cudaGetErrorString(cu_error), r->cpuid);
+ error("Something's up with your cuda code");
+ }
+#endif
+ } /*End of looping over bundles to launch in streams*/
+
+ /* Make sure all the kernels and copies back are finished */
+ // cudaDeviceSynchronize();
+
+ /*Time end of GPU work*/
+ clock_gettime(CLOCK_REALTIME, &t1);
+ *gpu_time +=
+ (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+ /* Now copy the data back from the CPU thread-local buffers to the cells */
+ /* Pack length counter for use in unpacking */
+ int pack_length_unpack = 0;
+ ticks total_cpu_unpack_ticks = 0.;
+ for (int bid = 0; bid < nBundles_temp; bid++) {
+ /*Time unpacking*/
+ clock_gettime(CLOCK_REALTIME, &t0);
+
+ // cudaStreamSynchronize(stream[bid]);
+ cudaEventSynchronize(pair_end[bid]);
+
+ clock_gettime(CLOCK_REALTIME, &t1);
+ *gpu_time +=
+ (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+
+ /*Time unpacking*/
+ // clock_gettime(CLOCK_REALTIME, &tp0);
+ // int bundle_first_task = pack_vars->bundle_first_task_list[bid];
+
+ for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+
+ if (tid < tasks_packed) {
+ clock_gettime(CLOCK_REALTIME, &tp0);
+ /*grab cell and task pointers*/
+ struct cell *cii = pack_vars->ci_list[tid];
+ struct cell *cjj = pack_vars->cj_list[tid];
+ struct task *tii = pack_vars->task_list[tid];
+ /*Let's lock ci*/
+ while (cell_locktree(cii)) {
+ ; /* spin until we acquire the lock */
+ }
+ /*Let's lock cj*/
+ while (cell_locktree(cjj)) {
+ ; /* spin until we acquire the lock */
+ }
+
+ const ticks tic = getticks();
+
+ /* Do the copy */
+ runner_do_ci_cj_gpu_unpack_neat_aos_f4_f(
+ r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid,
+ 2 * pack_vars->count_max_parts, e);
+
+ const ticks toc = getticks();
+
+ total_cpu_unpack_ticks += toc - tic;
+
+ /* Record things for debugging */
+ cii->gpu_done_pair_f++;
+ cjj->gpu_done_pair_f++;
+ pthread_mutex_lock(&s->sleep_mutex);
+ atomic_dec(&s->waiting);
+ pthread_cond_broadcast(&s->sleep_cond);
+ pthread_mutex_unlock(&s->sleep_mutex);
+ // /* Release the locks */
+ cell_unlocktree(cii);
+ // /* Release the locks */
+ cell_unlocktree(cjj);
+
+ /*Time end of unpacking*/
+ clock_gettime(CLOCK_REALTIME, &tp1);
+ *unpack_time += (tp1.tv_sec - tp0.tv_sec) +
+ (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+
+ /*schedule my dependencies (Only unpacks really)*/
+ enqueue_dependencies(s, tii);
+ /*Signal sleeping runners*/
+ // MATTHIEU signal_sleeping_runners(s, tii);
+
+ tii->gpu_done = 1;
+ }
+ }
+ }
+ /* Zero counters for the next pack operations */
+ pack_vars->count_parts = 0;
+ pack_vars->tasks_packed = 0;
+
+ /* Write the timers back to the task */
+ t->total_cpu_unpack_ticks += total_cpu_unpack_ticks;
+ // /*Time end of unpacking*/
+ // clock_gettime(CLOCK_REALTIME, &t1);
+ // *packing_time += (t1.tv_sec - t0.tv_sec) +
+ // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+} /*End of GPU work*/
diff --git a/src/runner_gpu_pack_functions.c b/src/runner_gpu_pack_functions.c
new file mode 100644
index 0000000000..af743e6172
--- /dev/null
+++ b/src/runner_gpu_pack_functions.c
@@ -0,0 +1,813 @@
+// #include "active.h"
+// #include "cuda/cell_gpu.h"
+// #include "runner_gpu_functions.cuh"
+/* This object's header. */
+#include "runner.h"
+/* Local headers. */
+#include "active.h"
+#include "engine.h"
+#include "runner_gpu_pack_functions.h"
+#include "scheduler.h"
+#include "space_getsid.h"
+#include "timers.h"
+#include "runner_doiact_hydro.h"
+
+void runner_doself1_gpu_pack_neat_aos_f4(
+ struct runner *r, struct cell *__restrict__ c,
+ struct part_aos_f4_send *__restrict__ parts_aos_buffer, int timer,
+ int *pack_length, int tid, int count_max_parts_tmp) {
+
+ TIMER_TIC;
+
+ /* Anything to do here? */
+ if (c->hydro.count == 0) return;
+
+ int count = c->hydro.count;
+ int local_pack_position = (*pack_length);
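+  /* pack_length is the running offset into this thread's send buffer; the
+   * cell's particles are appended starting at this position. */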
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (local_pack_position + count >= count_max_parts_tmp) {
+ fprintf(stderr,
+ "Exceeded count_max_parts_tmp. Make arrays bigger! count_max %i "
+ "count %i\n",
+ count_max_parts_tmp, local_pack_position + count);
+    error("Exceeded count_max_parts_tmp in GPU self pack.");
+ }
+#endif
+ int2 frst_lst_prts = {local_pack_position, local_pack_position + count};
+ /* Pack the particle data into CPU-side buffers*/
+ pack_neat_aos_f4(c, parts_aos_buffer, tid, local_pack_position, count,
+ frst_lst_prts);
+ /* Increment pack length accordingly */
+ (*pack_length) += count;
+
+ if (timer) TIMER_TOC(timer_doself_gpu_pack);
+}
+
+void runner_doself1_gpu_pack_neat_aos_f4_g(
+ struct runner *r, struct cell *c,
+ struct part_aos_f4_g_send *parts_aos_buffer, int timer, int *pack_length,
+ int tid, int count_max_parts_tmp) {
+
+ TIMER_TIC;
+
+ /* Anything to do here? */
+ if (c->hydro.count == 0) return;
+
+ int count = c->hydro.count;
+ int local_pack_position = (*pack_length);
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (local_pack_position + count >= count_max_parts_tmp) {
+ fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n");
+ exit(0);
+ }
+#endif
+
+ /* Pack the particle data into CPU-side buffers*/
+ pack_neat_aos_f4_g(c, parts_aos_buffer, tid, local_pack_position, count);
+ /* Increment pack length accordingly */
+ (*pack_length) += count;
+
+ if (timer) TIMER_TOC(timer_doself_gpu_pack);
+}
+
+void runner_doself1_gpu_pack_neat_aos_f4_f(
+ struct runner *r, struct cell *restrict c,
+ struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer,
+ int *pack_length, int tid, int count_max_parts_tmp) {
+
+ TIMER_TIC;
+
+ /* Anything to do here? */
+ if (c->hydro.count == 0) return;
+
+ int count = c->hydro.count;
+ int local_pack_position = (*pack_length);
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (local_pack_position + count >= count_max_parts_tmp) {
+ fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n");
+ exit(0);
+ }
+#endif
+
+ /* Pack the particle data into CPU-side buffers*/
+ pack_neat_aos_f4_f(c, parts_aos_buffer, tid, local_pack_position, count);
+ /* Increment pack length accordingly */
+ (*pack_length) += count;
+
+ if (timer) TIMER_TOC(timer_doself_gpu_pack);
+}
+
+extern inline void pack_neat_pair_aos_f4(
+ struct cell *__restrict c,
+ struct part_aos_f4_send *__restrict parts_aos_buffer, int tid,
+ const int local_pack_position, const int count, const float3 shift,
+ const int2 cstarts) {
+ /*Data to be copied to GPU*/
+ for (int i = 0; i < count; i++) {
+ const int id_in_pack = i + local_pack_position;
+ parts_aos_buffer[id_in_pack].x_p_h.x = c->hydro.parts[i].x[0] - shift.x;
+ parts_aos_buffer[id_in_pack].x_p_h.y = c->hydro.parts[i].x[1] - shift.y;
+ parts_aos_buffer[id_in_pack].x_p_h.z = c->hydro.parts[i].x[2] - shift.z;
+ parts_aos_buffer[id_in_pack].x_p_h.w = c->hydro.parts[i].h;
+ parts_aos_buffer[id_in_pack].ux_m.x = c->hydro.parts[i].v[0];
+ parts_aos_buffer[id_in_pack].ux_m.y = c->hydro.parts[i].v[1];
+ parts_aos_buffer[id_in_pack].ux_m.z = c->hydro.parts[i].v[2];
+ parts_aos_buffer[id_in_pack].ux_m.w = c->hydro.parts[i].mass;
+ parts_aos_buffer[id_in_pack].cjs_cje.x = cstarts.x;
+ parts_aos_buffer[id_in_pack].cjs_cje.y = cstarts.y;
+ }
+}
+
+void pack_neat_aos_f4(struct cell *__restrict__ c,
+ struct part_aos_f4_send *__restrict__ parts_aos_buffer,
+ int tid, int local_pack_position, int count,
+ int2 frst_lst_prts) {
+
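+  /* Stage a copy of the cell's particles in a stack-allocated (VLA) temporary
+   * before converting them to the compact float4 send layout below. */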
+ struct part ptmps[count];
+ memcpy(ptmps, (c->hydro.parts), count * sizeof(struct part));
+ // ptmps = c->hydro.parts;
+ const float cellx = c->loc[0], celly = c->loc[1], cellz = c->loc[2];
+ for (int i = 0; i < count; i++) {
+ const int id_in_pack = i + local_pack_position;
+ // const struct part p = ptmps[i];
+ /*Data to be copied to GPU*/
+ parts_aos_buffer[id_in_pack].x_p_h.x = ptmps[i].x[0] - cellx;
+ parts_aos_buffer[id_in_pack].x_p_h.y = ptmps[i].x[1] - celly;
+ parts_aos_buffer[id_in_pack].x_p_h.z = ptmps[i].x[2] - cellz;
+ parts_aos_buffer[id_in_pack].x_p_h.w = ptmps[i].h;
+ parts_aos_buffer[id_in_pack].ux_m.x = ptmps[i].v[0];
+ parts_aos_buffer[id_in_pack].ux_m.y = ptmps[i].v[1];
+ parts_aos_buffer[id_in_pack].ux_m.z = ptmps[i].v[2];
+ parts_aos_buffer[id_in_pack].ux_m.w = ptmps[i].mass;
+ // /*Initialise sums to zero before CPU/GPU copy*/
+ // const float4 zeroes = {0.0, 0.0, 0.0, 0.0};
+ // parts_aos_buffer[id_in_pack].rho_dh_wcount = zeroes;
+ // parts_aos_buffer[id_in_pack].rot_ux_div_v = zeroes;
+ }
+}
+
+void pack_neat_aos_f4_g(struct cell *c,
+ struct part_aos_f4_g_send *parts_aos_buffer, int tid,
+ int local_pack_position, int count) {
+
+ const struct part *ptmps;
+ ptmps = c->hydro.parts;
+ const float cellx = c->loc[0], celly = c->loc[1], cellz = c->loc[2];
+ for (int i = 0; i < count; i++) {
+ int id_in_pack = i + local_pack_position;
+ const struct part p = ptmps[i];
+ /*Data to be copied to GPU*/
+ parts_aos_buffer[id_in_pack].x_h.x = p.x[0] - cellx;
+ parts_aos_buffer[id_in_pack].x_h.y = p.x[1] - celly;
+ parts_aos_buffer[id_in_pack].x_h.z = p.x[2] - cellz;
+ parts_aos_buffer[id_in_pack].x_h.w = p.h;
+ parts_aos_buffer[id_in_pack].ux_m.x = p.v[0];
+ parts_aos_buffer[id_in_pack].ux_m.y = p.v[1];
+ parts_aos_buffer[id_in_pack].ux_m.z = p.v[2];
+ parts_aos_buffer[id_in_pack].ux_m.w = p.mass;
+ parts_aos_buffer[id_in_pack].rho_avisc_u_c.x = p.rho;
+ parts_aos_buffer[id_in_pack].rho_avisc_u_c.y = p.viscosity.alpha;
+ parts_aos_buffer[id_in_pack].rho_avisc_u_c.z = p.u; // p.density.rot_v[0];
+ parts_aos_buffer[id_in_pack].rho_avisc_u_c.w =
+ p.force.soundspeed; // p.density.rot_v[0];
+ }
+}
+
+extern inline void pack_neat_pair_aos_f4_g(
+ struct cell *__restrict c,
+ struct part_aos_f4_g_send *__restrict parts_aos_buffer, int tid,
+ const int local_pack_position, const int count, const float3 shift,
+ const int2 cstarts) {
+ /*Data to be copied to GPU*/
+ for (int i = 0; i < count; i++) {
+ const int id_in_pack = i + local_pack_position;
+ parts_aos_buffer[id_in_pack].x_h.x = c->hydro.parts[i].x[0] - shift.x;
+ parts_aos_buffer[id_in_pack].x_h.y = c->hydro.parts[i].x[1] - shift.y;
+ parts_aos_buffer[id_in_pack].x_h.z = c->hydro.parts[i].x[2] - shift.z;
+ parts_aos_buffer[id_in_pack].x_h.w = c->hydro.parts[i].h;
+ parts_aos_buffer[id_in_pack].ux_m.x = c->hydro.parts[i].v[0];
+ parts_aos_buffer[id_in_pack].ux_m.y = c->hydro.parts[i].v[1];
+ parts_aos_buffer[id_in_pack].ux_m.z = c->hydro.parts[i].v[2];
+ parts_aos_buffer[id_in_pack].ux_m.w = c->hydro.parts[i].mass;
+ parts_aos_buffer[id_in_pack].rho_avisc_u_c.x = c->hydro.parts[i].rho;
+ parts_aos_buffer[id_in_pack].rho_avisc_u_c.y =
+ c->hydro.parts[i].viscosity.alpha;
+ parts_aos_buffer[id_in_pack].rho_avisc_u_c.z =
+ c->hydro.parts[i].u; // p.density.rot_v[0];
+ parts_aos_buffer[id_in_pack].rho_avisc_u_c.w =
+ c->hydro.parts[i].force.soundspeed; // p.density.rot_v[0];
+ parts_aos_buffer[id_in_pack].cjs_cje.x = cstarts.x;
+ parts_aos_buffer[id_in_pack].cjs_cje.y = cstarts.y;
+ }
+}
+
+void pack_neat_aos_f4_f(const struct cell *restrict c,
+ struct part_aos_f4_f_send *restrict parts_aos, int tid,
+ int local_pack_position, int count) {
+
+ // const struct part *restrict ptmps;
+ // ptmps = c->hydro.parts;
+ const int pp = local_pack_position;
+ const float cellx = c->loc[0];
+ const float celly = c->loc[1];
+ const float cellz = c->loc[2];
+ /*Data to be copied to GPU local memory*/
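+  /* The send fields are filled in several passes over the same particle
+   * range, one float4 member per loop, presumably to keep each pass streaming
+   * a single member and make it easier for the compiler to vectorise. */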
+ for (int i = 0; i < count; i++) {
+ parts_aos[i + pp].x_h.x = c->hydro.parts[i].x[0] - cellx;
+ parts_aos[i + pp].x_h.y = c->hydro.parts[i].x[1] - celly;
+ parts_aos[i + pp].x_h.z = c->hydro.parts[i].x[2] - cellz;
+ parts_aos[i + pp].x_h.w = c->hydro.parts[i].h;
+ }
+ for (int i = 0; i < count; i++) {
+ parts_aos[i + pp].ux_m.x = c->hydro.parts[i].v[0];
+ parts_aos[i + pp].ux_m.y = c->hydro.parts[i].v[1];
+ parts_aos[i + pp].ux_m.z = c->hydro.parts[i].v[2];
+ parts_aos[i + pp].ux_m.w = c->hydro.parts[i].mass;
+ }
+ for (int i = 0; i < count; i++) {
+ parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.x =
+ c->hydro.parts[i].force.f;
+ parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.y =
+ c->hydro.parts[i].force.balsara;
+ parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.z =
+ c->hydro.parts[i].time_bin;
+ parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.w =
+ c->hydro.parts[i].limiter_data.min_ngb_time_bin;
+ }
+ for (int i = 0; i < count; i++) {
+ parts_aos[i + pp].rho_p_c_vsigi.x = c->hydro.parts[i].rho;
+ parts_aos[i + pp].rho_p_c_vsigi.y = c->hydro.parts[i].force.pressure;
+ parts_aos[i + pp].rho_p_c_vsigi.z = c->hydro.parts[i].force.soundspeed;
+ parts_aos[i + pp].rho_p_c_vsigi.w = c->hydro.parts[i].viscosity.v_sig;
+ }
+ for (int i = 0; i < count; i++) {
+ parts_aos[i + pp].u_alphavisc_alphadiff.x = c->hydro.parts[i].u;
+ parts_aos[i + pp].u_alphavisc_alphadiff.y =
+ c->hydro.parts[i].viscosity.alpha;
+ parts_aos[i + pp].u_alphavisc_alphadiff.z =
+ c->hydro.parts[i].diffusion.alpha;
+ }
+}
+
+extern inline void pack_neat_pair_aos_f4_f(
+ struct cell *__restrict c, struct part_aos_f4_f_send *__restrict parts_aos,
+ int tid, const int local_pack_position, const int count, const float3 shift,
+ const int2 cstarts) {
+ // const struct part *restrict ptmps;
+ // ptmps = c->hydro.parts;
+ const int pp = local_pack_position;
+ /*Data to be copied to GPU local memory*/
+ for (int i = 0; i < count; i++) {
+ const int id = i + pp;
+ parts_aos[id].x_h.x = c->hydro.parts[i].x[0] - shift.x;
+ parts_aos[id].x_h.y = c->hydro.parts[i].x[1] - shift.y;
+ parts_aos[id].x_h.z = c->hydro.parts[i].x[2] - shift.z;
+ parts_aos[id].x_h.w = c->hydro.parts[i].h;
+ parts_aos[id].ux_m.x = c->hydro.parts[i].v[0];
+ parts_aos[id].ux_m.y = c->hydro.parts[i].v[1];
+ parts_aos[id].ux_m.z = c->hydro.parts[i].v[2];
+ parts_aos[id].ux_m.w = c->hydro.parts[i].mass;
+ parts_aos[id].f_bals_timebin_mintimebin_ngb.x = c->hydro.parts[i].force.f;
+ parts_aos[id].f_bals_timebin_mintimebin_ngb.y =
+ c->hydro.parts[i].force.balsara;
+ parts_aos[id].f_bals_timebin_mintimebin_ngb.z = c->hydro.parts[i].time_bin;
+ parts_aos[id].f_bals_timebin_mintimebin_ngb.w =
+ c->hydro.parts[i].limiter_data.min_ngb_time_bin;
+ parts_aos[id].rho_p_c_vsigi.x = c->hydro.parts[i].rho;
+ parts_aos[id].rho_p_c_vsigi.y = c->hydro.parts[i].force.pressure;
+ parts_aos[id].rho_p_c_vsigi.z = c->hydro.parts[i].force.soundspeed;
+ parts_aos[id].rho_p_c_vsigi.w = c->hydro.parts[i].viscosity.v_sig;
+ parts_aos[id].u_alphavisc_alphadiff.x = c->hydro.parts[i].u;
+ parts_aos[id].u_alphavisc_alphadiff.y = c->hydro.parts[i].viscosity.alpha;
+ parts_aos[id].u_alphavisc_alphadiff.z = c->hydro.parts[i].diffusion.alpha;
+ parts_aos[id].cjs_cje.x = cstarts.x;
+ parts_aos[id].cjs_cje.y = cstarts.y;
+ }
+}
+
+void runner_doself1_gpu_unpack_neat_aos_f4(
+ struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer,
+ int timer, int *pack_length, int tid, int count_max_parts_tmp,
+ struct engine *e) {
+ TIMER_TIC;
+
+ /* Anything to do here? */
+ if (c->hydro.count == 0) return;
+ if (!cell_is_active_hydro(c, e)) {
+ message("Inactive cell\n");
+ return;
+ }
+ int count = c->hydro.count;
+ int local_pack_position = (*pack_length);
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (local_pack_position + count >= count_max_parts_tmp) {
+    fprintf(stderr,
+            "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is "
+            "%i, pointer to pack_length is %p, local_pack_position is %i, "
+            "count is %i\n",
+            (*pack_length), (void *)pack_length, local_pack_position, count);
+ }
+#endif
+
+ /* Copy particle data from CPU buffers to cells */
+ unpack_neat_aos_f4(c, parts_aos_buffer, tid, local_pack_position, count, e);
+ // Increment pack length accordingly
+ (*pack_length) += count;
+}
+
+void runner_doself1_gpu_unpack_neat_aos_f4_g(
+ struct runner *r, struct cell *c,
+ struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length,
+ int tid, int count_max_parts_tmp, struct engine *e) {
+ TIMER_TIC;
+
+ /* Anything to do here? */
+ if (c->hydro.count == 0) return;
+ if (!cell_is_active_hydro(c, e)) {
+ message("Inactive cell\n");
+ return;
+ }
+ int count = c->hydro.count;
+ int local_pack_position = (*pack_length);
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (local_pack_position + count >= count_max_parts_tmp) {
+    fprintf(stderr,
+            "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is "
+            "%i, pointer to pack_length is %p, local_pack_position is %i, "
+            "count is %i\n",
+            (*pack_length), (void *)pack_length, local_pack_position, count);
+ }
+#endif
+
+ /* Copy particle data from CPU buffers to cells */
+ unpack_neat_aos_f4_g(c, parts_aos_buffer, tid, local_pack_position, count, e);
+ // Increment pack length accordingly
+ (*pack_length) += count;
+}
+
+void runner_doself1_gpu_unpack_neat_aos_f4_f(
+ struct runner *r, struct cell *c,
+ struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length,
+ int tid, int count_max_parts_tmp, struct engine *e) {
+ TIMER_TIC;
+
+ /* Anything to do here? */
+ if (c->hydro.count == 0) return;
+ if (!cell_is_active_hydro(c, e)) {
+ message("Inactive cell\n");
+ return;
+ }
+ int count = c->hydro.count;
+ int local_pack_position = (*pack_length);
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (local_pack_position + count >= count_max_parts_tmp) {
+    fprintf(stderr,
+            "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is "
+            "%i, pointer to pack_length is %p, local_pack_position is %i, "
+            "count is %i\n",
+            (*pack_length), (void *)pack_length, local_pack_position, count);
+ }
+#endif
+
+ /* Copy particle data from CPU buffers to cells */
+ unpack_neat_aos_f4_f(c, parts_aos_buffer, tid, local_pack_position, count, e);
+ // Increment pack length accordingly
+ (*pack_length) += count;
+}
+
+#include <math.h> /* for fmaxf() used below */
+void unpack_neat_aos_f4(struct cell *c,
+ struct part_aos_f4_recv *parts_aos_buffer, int tid,
+ int local_pack_position, int count, struct engine *e) {
+
+ struct part_aos_f4_recv *parts_tmp = &parts_aos_buffer[local_pack_position];
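+  /* The density sums computed on the GPU are accumulated into the particle
+   * fields; particles that are not active this step are skipped. */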
+ for (int i = 0; i < count; i++) {
+
+ struct part_aos_f4_recv p_tmp = parts_tmp[i];
+ float4 rho_dh_wcount = p_tmp.rho_dh_wcount;
+ float4 rot_ux_div_v = p_tmp.rot_ux_div_v;
+ struct part *p = &c->hydro.parts[i];
+ if(!PART_IS_ACTIVE(p, e))continue;
+ p->rho += rho_dh_wcount.x;
+ p->density.rho_dh += rho_dh_wcount.y;
+ p->density.wcount += rho_dh_wcount.z;
+ p->density.wcount_dh += rho_dh_wcount.w;
+ p->density.rot_v[0] += rot_ux_div_v.x;
+ p->density.rot_v[1] += rot_ux_div_v.y;
+ p->density.rot_v[2] += rot_ux_div_v.z;
+ p->viscosity.div_v += rot_ux_div_v.w;
+ }
+}
+
+void unpack_neat_aos_f4_g(struct cell *c,
+ struct part_aos_f4_g_recv *parts_aos_buffer, int tid,
+ int local_pack_position, int count,
+ struct engine *e) {
+
+ struct part_aos_f4_g_recv *parts_tmp = &parts_aos_buffer[local_pack_position];
+ for (int i = 0; i < count; i++) {
+ struct part_aos_f4_g_recv p_tmp = parts_tmp[i];
+ struct part *p = &c->hydro.parts[i];
+ if(!PART_IS_ACTIVE(p, e))continue;
+ const float v_sig = p->viscosity.v_sig;
+ p->viscosity.v_sig = fmaxf(p_tmp.vsig_lapu_aviscmax.x, v_sig);
+ p->diffusion.laplace_u += p_tmp.vsig_lapu_aviscmax.y;
+ const float max_ngb = p->force.alpha_visc_max_ngb;
+ p->force.alpha_visc_max_ngb = fmaxf(p_tmp.vsig_lapu_aviscmax.z, max_ngb);
+ }
+}
+
+void unpack_neat_aos_f4_f(struct cell *restrict c,
+ struct part_aos_f4_f_recv *restrict parts_aos_buffer,
+ int tid, int local_pack_position, int count,
+ struct engine *e) {
+ int pp = local_pack_position;
+ for (int i = 0; i < count; i++) {
+ if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue;
+ c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[i + pp].a_hydro.x;
+ c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[i + pp].a_hydro.y;
+ c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[i + pp].a_hydro.z;
+ }
+ for (int i = 0; i < count; i++) {
+ if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue;
+ c->hydro.parts[i].viscosity.v_sig =
+ fmaxf(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.z,
+ c->hydro.parts[i].viscosity.v_sig);
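+    /* The minimal neighbour time bin travels through the GPU buffers as a
+     * float; adding 0.5f rounds it back to the nearest integer bin. */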
+ c->hydro.parts[i].limiter_data.min_ngb_time_bin =
+ (int)(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.w + 0.5f);
+ }
+ for (int i = 0; i < count; i++) {
+ if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue;
+ c->hydro.parts[i].u_dt +=
+ parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.x;
+ c->hydro.parts[i].force.h_dt +=
+ parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.y;
+ }
+}
+
+void unpack_neat_pair_aos_f4(struct runner *r, struct cell *restrict c,
+ struct part_aos_f4_recv *restrict parts_aos_buffer,
+ int tid, int local_pack_position, int count,
+ struct engine *e) {
+
+ // struct part_aos_f4_recv * restrict parts_tmp =
+ // &parts_aos_buffer[local_pack_position];
+ if (cell_is_active_hydro(c, e)) {
+ int pp = local_pack_position;
+ for (int i = 0; i < count; i++) {
+ int j = i + pp;
+ c->hydro.parts[i].rho += parts_aos_buffer[j].rho_dh_wcount.x;
+ c->hydro.parts[i].density.rho_dh += parts_aos_buffer[j].rho_dh_wcount.y;
+ c->hydro.parts[i].density.wcount += parts_aos_buffer[j].rho_dh_wcount.z;
+ c->hydro.parts[i].density.wcount_dh +=
+ parts_aos_buffer[j].rho_dh_wcount.w;
+ c->hydro.parts[i].density.rot_v[0] += parts_aos_buffer[j].rot_ux_div_v.x;
+ c->hydro.parts[i].density.rot_v[1] += parts_aos_buffer[j].rot_ux_div_v.y;
+ c->hydro.parts[i].density.rot_v[2] += parts_aos_buffer[j].rot_ux_div_v.z;
+ c->hydro.parts[i].viscosity.div_v += parts_aos_buffer[j].rot_ux_div_v.w;
+ }
+ }
+}
+
+void unpack_neat_pair_aos_f4_g(
+ struct runner *r, struct cell *restrict c,
+ struct part_aos_f4_g_recv *restrict parts_aos_buffer, int tid,
+ int local_pack_position, int count, struct engine *e) {
+ // struct part_aos_f4_recv * restrict parts_tmp =
+ // &parts_aos_buffer[local_pack_position]; int pp = local_pack_position; for
+ // (int i = 0; i < count; i++) {
+ // int j = i + pp;
+ // c->hydro.parts[i].viscosity.v_sig =
+ // parts_aos_buffer[j].vsig_lapu_aviscmax.x;
+ // c->hydro.parts[i].diffusion.laplace_u +=
+ // parts_aos_buffer[j].vsig_lapu_aviscmax.y;
+ // c->hydro.parts[i].force.alpha_visc_max_ngb =
+ // parts_aos_buffer[j].vsig_lapu_aviscmax.z;
+ // }
+ if (cell_is_active_hydro(c, e)) {
+
+ struct part_aos_f4_g_recv *parts_tmp =
+ &parts_aos_buffer[local_pack_position];
+ for (int i = 0; i < count; i++) {
+ struct part_aos_f4_g_recv p_tmp = parts_tmp[i];
+ struct part *p = &c->hydro.parts[i];
+ const float v_sig = p->viscosity.v_sig;
+ p->viscosity.v_sig = fmaxf(p_tmp.vsig_lapu_aviscmax.x, v_sig);
+ p->diffusion.laplace_u += p_tmp.vsig_lapu_aviscmax.y;
+ const float max_ngb = p->force.alpha_visc_max_ngb;
+ p->force.alpha_visc_max_ngb = fmaxf(p_tmp.vsig_lapu_aviscmax.z, max_ngb);
+ }
+ }
+}
+
+void unpack_neat_pair_aos_f4_f(
+ struct runner *r, struct cell *restrict c,
+ struct part_aos_f4_f_recv *restrict parts_aos_buffer, int tid,
+ int local_pack_position, int count, struct engine *e) {
+ // struct part_aos_f4_f_recv *restrict parts_tmp =
+ //&parts_aos_buffer[local_pack_position];
+ if (cell_is_active_hydro(c, e)) {
+ int pp = local_pack_position;
+ for (int i = 0; i < count; i++) {
+ // struct part_aos_f4_f_recv p_tmp = parts_tmp[i];
+ // struct part *restrict p = &c->hydro.parts[i];
+ int j = i + pp;
+ c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[j].a_hydro.x;
+ c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[j].a_hydro.y;
+ c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[j].a_hydro.z;
+ c->hydro.parts[i].viscosity.v_sig =
+ fmaxf(parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.z,
+ c->hydro.parts[i].viscosity.v_sig);
+ c->hydro.parts[i].limiter_data.min_ngb_time_bin =
+ (int)(parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.w + 0.5f);
+ c->hydro.parts[i].u_dt +=
+ parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.x;
+ c->hydro.parts[i].force.h_dt +=
+ parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.y;
+ }
+ }
+}
+
+void runner_do_ci_cj_gpu_unpack_neat_aos_f4(
+ struct runner *r, struct cell *ci, struct cell *cj,
+ struct part_aos_f4_recv *parts_aos_buffer, int timer, int *pack_length,
+ int tid, int count_max_parts_tmp, struct engine *e) {
+
+ /* Anything to do here? */
+// if (ci->hydro.count == 0 || cj->hydro.count == 0)
+// return;
+ if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) {
+ message("Inactive cell\n");
+ return;
+ }
+ int count_ci = ci->hydro.count;
+ int count_cj = cj->hydro.count;
+ int local_pack_position = (*pack_length);
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) {
+    fprintf(stderr,
+            "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is "
+            "%i, pointer to pack_length is %p, local_pack_position is %i, "
+            "count_ci is %i\n",
+            (*pack_length), (void *)pack_length, local_pack_position,
+            count_ci);
+ }
+#endif
+
+ /* Pack the particle data into CPU-side buffers*/
+ // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j
+ // %i\n", local_pack_position, count_ci, count_cj);
+ unpack_neat_pair_aos_f4(r, ci, parts_aos_buffer, tid, local_pack_position,
+ count_ci, e);
+ local_pack_position += count_ci;
+ /* Pack the particle data into CPU-side buffers*/
+ // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j
+ // %i\n", local_pack_position, count_ci, count_cj);
+ unpack_neat_pair_aos_f4(r, cj, parts_aos_buffer, tid, local_pack_position,
+ count_cj, e);
+ /* Increment pack length accordingly */
+ (*pack_length) += count_ci + count_cj;
+ // if(r->cpuid == 0)exit(0);
+}
+
+void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g(
+ struct runner *r, struct cell *ci, struct cell *cj,
+ struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length,
+ int tid, int count_max_parts_tmp, struct engine *e) {
+
+ /* Anything to do here? */
+ // if (c->hydro.count == 0)
+ // return;
+ if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) {
+ message("Inactive cell\n");
+ return;
+ }
+ int count_ci = ci->hydro.count;
+ int count_cj = cj->hydro.count;
+ int local_pack_position = (*pack_length);
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) {
+    fprintf(stderr,
+            "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is "
+            "%i, pointer to pack_length is %p, local_pack_position is %i, "
+            "count_ci is %i\n",
+            (*pack_length), (void *)pack_length, local_pack_position,
+            count_ci);
+ }
+#endif
+
+ /* Pack the particle data into CPU-side buffers*/
+ // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j
+ // %i\n", local_pack_position, count_ci, count_cj);
+ unpack_neat_pair_aos_f4_g(r, ci, parts_aos_buffer, tid, local_pack_position,
+ count_ci, e);
+ local_pack_position += count_ci;
+ /* Pack the particle data into CPU-side buffers*/
+ // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j
+ // %i\n", local_pack_position, count_ci, count_cj);
+ unpack_neat_pair_aos_f4_g(r, cj, parts_aos_buffer, tid, local_pack_position,
+ count_cj, e);
+ /* Increment pack length accordingly */
+ (*pack_length) += count_ci + count_cj;
+ // if(r->cpuid == 0)exit(0);
+}
+
+void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f(
+ struct runner *r, struct cell *ci, struct cell *cj,
+ struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length,
+ int tid, int count_max_parts_tmp, struct engine *e) {
+
+ /* Anything to do here? */
+ // if (c->hydro.count == 0)
+ // return;
+ if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) {
+ message("Inactive cell\n");
+ return;
+ }
+ int count_ci = ci->hydro.count;
+ int count_cj = cj->hydro.count;
+ int local_pack_position = (*pack_length);
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) {
+    fprintf(stderr,
+            "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is "
+            "%i, pointer to pack_length is %p, local_pack_position is %i, "
+            "count_ci is %i\n",
+            (*pack_length), (void *)pack_length, local_pack_position,
+            count_ci);
+ }
+#endif
+
+ /* Pack the particle data into CPU-side buffers*/
+ // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j
+ // %i\n", local_pack_position, count_ci, count_cj);
+ unpack_neat_pair_aos_f4_f(r, ci, parts_aos_buffer, tid, local_pack_position,
+ count_ci, e);
+ local_pack_position += count_ci;
+ /* Pack the particle data into CPU-side buffers*/
+ // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j
+ // %i\n", local_pack_position, count_ci, count_cj);
+ unpack_neat_pair_aos_f4_f(r, cj, parts_aos_buffer, tid, local_pack_position,
+ count_cj, e);
+ /* Increment pack length accordingly */
+ (*pack_length) += count_ci + count_cj;
+ // if(r->cpuid == 0)exit(0);
+}
+
+void runner_do_ci_cj_gpu_pack_neat_aos_f4(
+ struct runner *r, struct cell *restrict ci, struct cell *restrict cj,
+ struct part_aos_f4_send *restrict parts_aos_buffer, int timer,
+ int *pack_length, int tid, int count_max_parts_tmp, const int count_ci,
+ const int count_cj, float3 shift_tmp) {
+
+ TIMER_TIC;
+
+ /* Anything to do here? */
+ if (ci->hydro.count == 0) return;
+
+ int local_pack_position = (*pack_length);
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) {
+    fprintf(stderr,
+            "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i "
+            "ci %i cj %i count_max %i\n",
+            local_pack_position, count_ci, count_cj, count_max_parts_tmp);
+    error("Exceeded count_max_parts_tmp in GPU pair pack.");
+ }
+#endif
+
+ /* Pack the particle data into CPU-side buffers*/
+ const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1],
+ shift_tmp.z + cj->loc[2]};
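+  /* shift_i folds shift_tmp (presumably the periodic wrapping shift supplied
+   * by the caller) into cj's cell origin, so ci's positions end up in the
+   * same local frame as cj's, which is shifted by its own origin below. */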
+ const int lpp1 = local_pack_position;
+
+ const int2 cis_cie = {local_pack_position, local_pack_position + count_ci};
+
+ const int2 cjs_cje = {local_pack_position + count_ci,
+ local_pack_position + count_ci + count_cj};
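+  /* Each packed ci particle carries cj's [start, end) range in the send
+   * buffer (and each cj particle carries ci's), presumably so the pair kernel
+   * knows which slice of particles to interact with. */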
+
+ pack_neat_pair_aos_f4(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i,
+ cjs_cje);
+
+ local_pack_position += count_ci;
+ /* Pack the particle data into CPU-side buffers*/
+ const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]};
+ const int lpp2 = local_pack_position;
+
+ pack_neat_pair_aos_f4(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j,
+ cis_cie);
+ /* Increment pack length accordingly */
+ (*pack_length) += count_ci + count_cj;
+
+ if (timer) TIMER_TOC(timer_doself_gpu_pack);
+}
+
+void runner_do_ci_cj_gpu_pack_neat_aos_f4_g(
+ struct runner *r, struct cell *restrict ci, struct cell *restrict cj,
+ struct part_aos_f4_g_send *restrict parts_aos_buffer, int timer,
+ int *pack_length, int tid, int count_max_parts_tmp, const int count_ci,
+ const int count_cj, float3 shift_tmp) {
+
+ TIMER_TIC;
+
+ /* Anything to do here? */
+ if (ci->hydro.count == 0) return;
+
+ int local_pack_position = (*pack_length);
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) {
+    fprintf(stderr,
+            "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i "
+            "ci %i cj %i count_max %i\n",
+            local_pack_position, count_ci, count_cj, count_max_parts_tmp);
+    error("Exceeded count_max_parts_tmp in GPU pair pack.");
+ }
+#endif
+
+ /* Pack the particle data into CPU-side buffers*/
+ const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1],
+ shift_tmp.z + cj->loc[2]};
+ const int lpp1 = local_pack_position;
+
+ const int2 cis_cie = {local_pack_position, local_pack_position + count_ci};
+
+ const int2 cjs_cje = {local_pack_position + count_ci,
+ local_pack_position + count_ci + count_cj};
+
+ pack_neat_pair_aos_f4_g(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i,
+ cjs_cje);
+
+ local_pack_position += count_ci;
+ /* Pack the particle data into CPU-side buffers*/
+ const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]};
+ const int lpp2 = local_pack_position;
+
+ pack_neat_pair_aos_f4_g(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j,
+ cis_cie);
+ /* Increment pack length accordingly */
+ (*pack_length) += count_ci + count_cj;
+
+ if (timer) TIMER_TOC(timer_doself_gpu_pack);
+}
+
+void runner_do_ci_cj_gpu_pack_neat_aos_f4_f(
+ struct runner *r, struct cell *restrict ci, struct cell *restrict cj,
+ struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer,
+ int *pack_length, int tid, int count_max_parts_tmp, const int count_ci,
+ const int count_cj, float3 shift_tmp) {
+
+ TIMER_TIC;
+
+ /* Anything to do here? */
+ if (ci->hydro.count == 0) return;
+
+ int local_pack_position = (*pack_length);
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) {
+    fprintf(stderr,
+            "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i "
+            "ci %i cj %i count_max %i\n",
+            local_pack_position, count_ci, count_cj, count_max_parts_tmp);
+    error("Exceeded count_max_parts_tmp in GPU pair pack.");
+ }
+#endif
+
+ /* Pack the particle data into CPU-side buffers*/
+ const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1],
+ shift_tmp.z + cj->loc[2]};
+ const int lpp1 = local_pack_position;
+
+ const int2 cis_cie = {local_pack_position, local_pack_position + count_ci};
+
+ const int2 cjs_cje = {local_pack_position + count_ci,
+ local_pack_position + count_ci + count_cj};
+
+ pack_neat_pair_aos_f4_f(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i,
+ cjs_cje);
+
+ local_pack_position += count_ci;
+ /* Pack the particle data into CPU-side buffers*/
+ const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]};
+ const int lpp2 = local_pack_position;
+
+ pack_neat_pair_aos_f4_f(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j,
+ cis_cie);
+ /* Increment pack length accordingly */
+ (*pack_length) += count_ci + count_cj;
+
+ if (timer) TIMER_TOC(timer_doself_gpu_pack);
+}
+// #ifdef WITHCUDA
+// }
+// #endif
diff --git a/src/runner_gpu_pack_functions.h b/src/runner_gpu_pack_functions.h
new file mode 100644
index 0000000000..8730219711
--- /dev/null
+++ b/src/runner_gpu_pack_functions.h
@@ -0,0 +1,246 @@
+#include "cuda/part_gpu.h"
+void runner_doself1_gpu_pack(
+ struct runner *r, struct cell *c, int timer, int *pack_length, double *x_p,
+ double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux,
+ float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz,
+ float *mass, float *h, float *u, float *u_dt, float *rho, float *SPH_sum,
+ float *locx, float *locy, float *locz, float *widthx, float *widthy,
+ float *widthz, float *h_max, int *count_p, float *wcount, float *wcount_dh,
+ float *rho_dh, float *rot_u, float *rot_v, float *rot_w, float *div_v,
+ float *div_v_previous_step, float *alpha_visc, float *v_sig,
+ float *laplace_u, float *alpha_diff, float *f, float *soundspeed,
+ float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb,
+ timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin,
+ char *to_be_synchronized, int count_max_parts_tmp);
+void runner_doself1_gpu_pack_neat(struct runner *r, struct cell *c,
+ struct part_soa parts_soa, int timer,
+ int *pack_length, int tid,
+ int count_max_parts_tmp);
+void runner_doself1_gpu_pack_neat_aos(struct runner *r, struct cell *c,
+ struct part_aos *parts_aos, int timer,
+ int *pack_length, int tid,
+ int count_max_parts_tmp);
+void runner_doself1_gpu_pack_neat_aos_f4(
+ struct runner *r, struct cell *__restrict__ c,
+ struct part_aos_f4_send *__restrict__ parts_aos, int timer,
+ int *pack_length, int tid, int count_max_parts_tmp);
+void runner_doself1_gpu_pack_neat_aos_g(struct runner *r, struct cell *c,
+ struct part_aos_g *parts_aos, int timer,
+ int *pack_length, int tid,
+ int count_max_parts_tmp);
+void runner_doself1_gpu_pack_neat_aos_f4_g(struct runner *r, struct cell *c,
+ struct part_aos_f4_g_send *parts_aos,
+ int timer, int *pack_length, int tid,
+ int count_max_parts_tmp);
+void runner_doself1_gpu_pack_neat_aos_f(struct runner *r, struct cell *c,
+ struct part_aos_f *parts_aos, int timer,
+ int *pack_length, int tid,
+ int count_max_parts_tmp);
+void runner_doself1_gpu_pack_neat_aos_f4_f(
+ struct runner *r, struct cell *restrict c,
+ struct part_aos_f4_f_send *restrict parts_aos, int timer, int *pack_length,
+ int tid, int count_max_parts_tmp);
+void runner_doself1_gpu_pack_forc_aos(struct runner *r, struct cell *c,
+ struct part_aos *parts_aos, int timer,
+ int *pack_length, int tid,
+ int count_max_parts_tmp);
+void runner_doself1_gpu_pack_grad_aos(struct runner *r, struct cell *c,
+ struct part_aos *parts_aos, int timer,
+ int *pack_length, int tid,
+ int count_max_parts_tmp);
+void runner_doself1_gpu_unpack_neat(struct runner *r, struct cell *c,
+ struct part_soa parts_soa, int timer,
+ int *pack_length, int tid,
+ int count_max_parts_tmp, struct engine *e);
+void runner_doself1_gpu_unpack_neat_aos(struct runner *r, struct cell *c,
+ struct part_aos *parts_aos_buffer,
+ int timer, int *pack_length, int tid,
+ int count_max_parts_tmp,
+ struct engine *e);
+void runner_doself1_gpu_unpack_neat_aos_f4(
+ struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer,
+ int timer, int *pack_length, int tid, int count_max_parts_tmp,
+ struct engine *e);
+void runner_doself1_gpu_unpack_neat_aos_g(struct runner *r, struct cell *c,
+ struct part_aos_g *parts_aos_buffer,
+ int timer, int *pack_length, int tid,
+ int count_max_parts_tmp,
+ struct engine *e);
+void runner_doself1_gpu_unpack_neat_aos_f4_g(
+ struct runner *r, struct cell *c,
+ struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length,
+ int tid, int count_max_parts_tmp, struct engine *e);
+void runner_doself1_gpu_unpack_neat_aos_f(struct runner *r, struct cell *c,
+ struct part_aos_f *parts_aos_buffer,
+ int timer, int *pack_length, int tid,
+ int count_max_parts_tmp,
+ struct engine *e);
+void runner_doself1_gpu_unpack_neat_aos_f4_f(
+ struct runner *r, struct cell *restrict c,
+ struct part_aos_f4_f_recv *restrict parts_aos_buffer, int timer,
+ int *pack_length, int tid, int count_max_parts_tmp, struct engine *e);
+void pack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid,
+ int *tid_p, long long *id, float *ux, float *uy, float *uz,
+ float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass,
+ float *h, float *u, float *u_dt, float *rho, float *SPH_sum,
+ float *locx, float *locy, float *locz, float *widthx, float *widthy,
+ float *widthz, float *h_max, int *count_p, float *wcount,
+ float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v,
+ float *rot_w, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff,
+ float *f, float *soundspeed, float *h_dt, float *balsara,
+ float *pressure, float *alpha_visc_max_ngb, timebin_t *time_bin,
+ timebin_t *wakeup, timebin_t *min_ngb_time_bin,
+ char *to_be_synchronized, int local_pack_position, int count);
+void pack_neat(struct cell *c, struct part_soa parts_soa, int tid,
+ int local_pack_position, int count);
+void pack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid,
+ int local_pack_position, int count);
+void pack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer,
+ int tid, int local_pack_position, int count);
+void pack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos, int tid,
+ int local_pack_position, int count);
+void pack_neat_aos_f4(struct cell *c, struct part_aos_f4_send *parts_aos_buffer,
+ int tid, int local_pack_position, int count,
+ int2 frst_lst_prts);
+void pack_neat_aos_f4_g(struct cell *c,
+ struct part_aos_f4_g_send *parts_aos_buffer, int tid,
+ int local_pack_position, int count);
+void pack_neat_aos_f4_f(const struct cell *restrict c,
+ struct part_aos_f4_f_send *restrict parts_aos, int tid,
+ int local_pack_position, int count);
+void unpack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid,
+ int local_pack_position, int count, struct engine *e);
+void unpack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid,
+ int local_pack_position, int count, struct engine *e);
+void unpack_neat_aos_f4(struct cell *c,
+ struct part_aos_f4_recv *parts_aos_buffer, int tid,
+ int local_pack_position, int count, struct engine *e);
+void unpack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer,
+ int tid, int local_pack_position, int count,
+ struct engine *e);
+void unpack_neat_aos_f4_g(struct cell *c,
+ struct part_aos_f4_g_recv *parts_aos_buffer, int tid,
+ int local_pack_position, int count, struct engine *e);
+void unpack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos_buffer,
+ int tid, int local_pack_position, int count,
+ struct engine *e);
+void unpack_neat_aos_f4_f(struct cell *restrict c,
+ struct part_aos_f4_f_recv *restrict parts_aos_buffer,
+ int tid, int local_pack_position, int count,
+ struct engine *e);
+void unpack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid,
+ int *tid_p, long long *id, float *ux, float *uy, float *uz,
+ float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass,
+ float *h, float *u, float *u_dt, float *rho, float *SPH_sum,
+ float *locx, float *locy, float *locz, float *widthx, float *widthy,
+ float *widthz, float *h_max, int *count_p, float *wcount,
+ float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v,
+ float *rot_w, float *div_v, float *div_v_previous_step,
+ float *alpha_visc, float *v_sig, float *laplace_u,
+ float *alpha_diff, float *f, float *soundspeed, float *h_dt,
+ float *balsara, float *pressure, float *alpha_visc_max_ngb,
+ timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin,
+ char *to_be_synchronized, int local_pack_position, int count,
+ struct engine *e);
+void runner_doself1_gpu_unpack(
+ struct runner *r, struct cell *c, int timer, int *pack_length, double *x_p,
+ double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux,
+ float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz,
+ float *mass, float *h, float *u, float *u_dt, float *rho, float *SPH_sum,
+ float *locx, float *locy, float *locz, float *widthx, float *widthy,
+ float *widthz, float *h_max, int *count_p, float *wcount, float *wcount_dh,
+ float *rho_dh, float *rot_u, float *rot_v, float *rot_w, float *div_v,
+ float *div_v_previous_step, float *alpha_visc, float *v_sig,
+ float *laplace_u, float *alpha_diff, float *f, float *soundspeed,
+ float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb,
+ timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin,
+ char *to_be_synchronized, int count_max_parts_tmp, struct engine *e);
+
+void runner_do_ci_cj_gpu_pack_neat(struct runner *r, struct cell *ci,
+ struct cell *cj,
+ struct part_soa parts_soa_buffer, int timer,
+ int *pack_length, int tid,
+ int count_max_parts_tmp, int count_ci,
+ int count_cj);
+
+void runner_do_ci_cj_gpu_pack_neat_aos(struct runner *r, struct cell *ci,
+ struct cell *cj,
+ struct part_aos *parts_aos_buffer,
+ int timer, int *pack_length, int tid,
+ int count_max_parts_tmp, int count_ci,
+ int count_cj, float3 shift_tmp);
+
+void runner_do_ci_cj_gpu_pack_neat_aos_f4(
+ struct runner *r, struct cell *restrict ci, struct cell *restrict cj,
+ struct part_aos_f4_send *restrict parts_aos_buffer, int timer,
+ int *pack_length, int tid, int count_max_parts_tmp, const int count_ci,
+ const int count_cj, float3 shift_tmp);
+
+void runner_do_ci_cj_gpu_pack_neat_aos_g(struct runner *r, struct cell *ci,
+ struct cell *cj,
+ struct part_aos_g *parts_aos_buffer,
+ int timer, int *pack_length, int tid,
+ int count_max_parts_tmp, int count_ci,
+ int count_cj);
+
+void runner_do_ci_cj_gpu_pack_neat_aos_f4_g(
+ struct runner *r, struct cell *restrict ci, struct cell *restrict cj,
+ struct part_aos_f4_g_send *restrict parts_aos_buffer, int timer,
+ int *pack_length, int tid, int count_max_parts_tmp, const int count_ci,
+ const int count_cj, float3 shift_tmp);
+
+void runner_do_ci_cj_gpu_pack_neat_aos_f(struct runner *r, struct cell *ci,
+ struct cell *cj,
+ struct part_aos_f *parts_aos_buffer,
+ int timer, int *pack_length, int tid,
+ int count_max_parts_tmp, int count_ci,
+ int count_cj);
+
+void runner_do_ci_cj_gpu_pack_neat_aos_f4_f(
+ struct runner *r, struct cell *restrict ci, struct cell *restrict cj,
+ struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer,
+ int *pack_length, int tid, int count_max_parts_tmp, const int count_ci,
+ const int count_cj, float3 shift_tmp);
+
+void runner_do_ci_cj_gpu_unpack_neat(struct runner *r, struct cell *ci,
+ struct cell *cj,
+ struct part_soa parts_soa_buffer,
+ int timer, int *pack_length, int tid,
+ int count_max_parts_tmp, struct engine *e);
+
+void runner_do_ci_cj_gpu_unpack_neat_aos(struct runner *r, struct cell *ci,
+ struct cell *cj,
+ struct part_aos *parts_aos_buffer,
+ int timer, int *pack_length, int tid,
+ int count_max_parts_tmp,
+ struct engine *e);
+
+void runner_do_ci_cj_gpu_unpack_neat_aos_f4(
+ struct runner *r, struct cell *ci, struct cell *cj,
+ struct part_aos_f4_recv *parts_aos_buffer, int timer, int *pack_length,
+ int tid, int count_max_parts_tmp, struct engine *e);
+
+void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g(
+ struct runner *r, struct cell *ci, struct cell *cj,
+ struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length,
+ int tid, int count_max_parts_tmp, struct engine *e);
+
+void runner_do_ci_cj_gpu_unpack_neat_aos_g(struct runner *r, struct cell *ci,
+ struct cell *cj,
+ struct part_aos_g *parts_aos_buffer,
+ int timer, int *pack_length, int tid,
+ int count_max_parts_tmp,
+ struct engine *e);
+
+void runner_do_ci_cj_gpu_unpack_neat_aos_f(struct runner *r, struct cell *ci,
+ struct cell *cj,
+ struct part_aos_f *parts_aos_buffer,
+ int timer, int *pack_length, int tid,
+ int count_max_parts_tmp,
+ struct engine *e);
+
+void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f(
+ struct runner *r, struct cell *ci, struct cell *cj,
+ struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length,
+ int tid, int count_max_parts_tmp, struct engine *e);
diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu
new file mode 100644
index 0000000000..2376aafba7
--- /dev/null
+++ b/src/runner_main_clean.cu
@@ -0,0 +1,1864 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ * Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ * 2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+/* Config parameters. */
+#define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU
+#define GPUOFFLOAD_GRADIENT 1 // off-load hydro gradient to GPU
+#define GPUOFFLOAD_FORCE 1 // off-load hydro force to GPU
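+/* With one of these set, the corresponding hydro loop is packed and
+ * off-loaded to the GPU; with it unset, the matching CPU branch in the task
+ * switch below is compiled in instead. */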
+
+// #define DUMP_TIMINGS 1
+#include "../config.h"
+
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Config parameters. */
+#include <config.h>
+
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+/* This object's header. */
+#include "runner.h"
+
+/* Local headers. */
+#include "engine.h"
+#include "feedback.h"
+#include "runner_doiact_sinks.h"
+#include "scheduler.h"
+#include "space_getsid.h"
+#include "timers.h"
+
+/* Import the gravity loop functions. */
+#include "runner_doiact_grav.h"
+
+/* Import the density loop functions. */
+#define FUNCTION density
+#define FUNCTION_TASK_LOOP TASK_LOOP_DENSITY
+#include "runner_doiact_hydro.h"
+#include "runner_doiact_undef.h"
+
+/* Import the gradient loop functions (if required). */
+#ifdef EXTRA_HYDRO_LOOP
+#define FUNCTION gradient
+#define FUNCTION_TASK_LOOP TASK_LOOP_GRADIENT
+#include "runner_doiact_hydro.h"
+#include "runner_doiact_undef.h"
+#endif
+
+/* Import the force loop functions. */
+#define FUNCTION force
+#define FUNCTION_TASK_LOOP TASK_LOOP_FORCE
+#include "runner_doiact_hydro.h"
+#include "runner_doiact_undef.h"
+
+/* Import the limiter loop functions. */
+#define FUNCTION limiter
+#define FUNCTION_TASK_LOOP TASK_LOOP_LIMITER
+#include "runner_doiact_limiter.h"
+#include "runner_doiact_undef.h"
+
+/* Import the stars density loop functions. */
+#define FUNCTION density
+#define FUNCTION_TASK_LOOP TASK_LOOP_DENSITY
+#include "runner_doiact_stars.h"
+#include "runner_doiact_undef.h"
+
+#ifdef EXTRA_STAR_LOOPS
+
+/* Import the stars prepare1 loop functions. */
+#define FUNCTION prep1
+#define FUNCTION_TASK_LOOP TASK_LOOP_STARS_PREP1
+#include "runner_doiact_stars.h"
+#include "runner_doiact_undef.h"
+
+/* Import the stars prepare2 loop functions. */
+#define FUNCTION prep2
+#define FUNCTION_TASK_LOOP TASK_LOOP_STARS_PREP2
+#include "runner_doiact_stars.h"
+#include "runner_doiact_undef.h"
+
+#endif /* EXTRA_STAR_LOOPS */
+
+/* Import the stars feedback loop functions. */
+#define FUNCTION feedback
+#define FUNCTION_TASK_LOOP TASK_LOOP_FEEDBACK
+#include "runner_doiact_stars.h"
+#include "runner_doiact_undef.h"
+
+/* Import the black hole density loop functions. */
+#define FUNCTION density
+#define FUNCTION_TASK_LOOP TASK_LOOP_DENSITY
+#include "runner_doiact_black_holes.h"
+#include "runner_doiact_undef.h"
+
+/* Import the black hole feedback loop functions. */
+#define FUNCTION swallow
+#define FUNCTION_TASK_LOOP TASK_LOOP_SWALLOW
+#include "runner_doiact_black_holes.h"
+#include "runner_doiact_undef.h"
+
+/* Import the black hole feedback loop functions. */
+#define FUNCTION feedback
+#define FUNCTION_TASK_LOOP TASK_LOOP_FEEDBACK
+#include "runner_doiact_black_holes.h"
+#include "runner_doiact_undef.h"
+
+/* Import the RT gradient loop functions */
+#define FUNCTION rt_gradient
+#define FUNCTION_TASK_LOOP TASK_LOOP_RT_GRADIENT
+#include "runner_doiact_hydro.h"
+#include "runner_doiact_undef.h"
+
+/* Import the RT transport (force) loop functions. */
+#define FUNCTION rt_transport
+#define FUNCTION_TASK_LOOP TASK_LOOP_RT_TRANSPORT
+#include "runner_doiact_hydro.h"
+#include "runner_doiact_undef.h"
+
+#ifdef __cplusplus
+}
+#endif
+/**
+ * @brief The #runner main thread routine.
+ *
+ * @param data A pointer to this thread's data.
+ **/
+
+/* CUDA Header. Wrap in extern "C" to prevent C++ function name mangling */
+#ifdef WITH_CUDA
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "cuda/part_gpu.h"
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+#include <cuda_runtime.h>
+#include "runner_doiact_functions_hydro_gpu.h"
+#include "runner_gpu_pack_functions.h"
+#include "cuda/GPU_runner_functions.h"
+
+#ifdef __cplusplus
+}
+#endif
+
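+/* GPU-enabled variant of the runner main loop: hydro self and pair tasks
+ * flagged for GPU work are packed into pinned buffers and off-loaded to the
+ * device in bundles, while every other task type runs through the usual CPU
+ * code paths. */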
+void *runner_main2(void *data) {
+ struct runner *r = (struct runner *)data;
+ struct engine *e = r->e;
+ struct scheduler *sched = &e->sched;
+ struct space *space = e->s;
+
+ //////////Declare and allocate GPU launch control data structures/////////
+ /*pack_vars contain data required for self and pair packing tasks destined
+ * for the GPU*/
+ //A. N: Needed
+ struct pack_vars_self *pack_vars_self_dens;
+ struct pack_vars_self *pack_vars_self_forc;
+ struct pack_vars_self *pack_vars_self_grad;
+ struct pack_vars_pair *pack_vars_pair_dens;
+ struct pack_vars_pair *pack_vars_pair_forc;
+ struct pack_vars_pair *pack_vars_pair_grad;
+
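+  /* The pack-control structures are allocated with cudaMallocHost so that
+   * they live in pinned (page-locked) host memory. */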
+  cudaMallocHost((void **)&pack_vars_self_dens,
+                 sizeof(struct pack_vars_self));
+  cudaMallocHost((void **)&pack_vars_self_forc,
+                 sizeof(struct pack_vars_self));
+  cudaMallocHost((void **)&pack_vars_self_grad,
+                 sizeof(struct pack_vars_self));
+
+  cudaMallocHost((void **)&pack_vars_pair_dens,
+                 sizeof(struct pack_vars_pair));
+  cudaMallocHost((void **)&pack_vars_pair_forc,
+                 sizeof(struct pack_vars_pair));
+  cudaMallocHost((void **)&pack_vars_pair_grad,
+                 sizeof(struct pack_vars_pair));
+ ///////////////////////////////////////////////////////////////////////////
+  /*Find the GPU device(s) and print their properties*/
+  int devId = 0; /* id of the GPU device this rank will use */
+ struct cudaDeviceProp prop;
+ int nDevices;
+ int maxBlocksSM;
+ int nSMs;
+ /*Get my rank*/
+ int mpi_rank = 0;
+#ifdef WITH_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+#endif
+ cudaGetDeviceCount(&nDevices);
+  /* When running with MPI we assign one MPI rank per GPU. This was found to
+   * work well and simplifies writing Slurm scripts */
+ if (nDevices == 1) cudaSetDevice(devId);
+#ifdef WITH_MPI
+ else {
+ cudaSetDevice(mpi_rank);
+ devId = mpi_rank;
+ }
+#endif
+ //Now tell me some info about my device
+ cudaGetDeviceProperties(&prop, devId);
+ cudaDeviceGetAttribute(&maxBlocksSM, cudaDevAttrMaxBlocksPerMultiprocessor,
+ devId);
+ cudaDeviceGetAttribute(&nSMs, cudaDevAttrMultiProcessorCount, devId);
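+  /* Rough average number of particles per top-level cell, used for the
+   * block-count diagnostics printed below. */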
+ int nPartsPerCell = space->nr_parts / space->tot_cells;
+
+ if (r->cpuid == 0 && mpi_rank == 0) {
+ message("%i devices available device id is %i\n", nDevices, devId);
+ message("Device : %s\n", prop.name);
+ message("nSMs %i max blocks per SM %i maxnBlocks per stream %i\n",
+ nSMs, maxBlocksSM, nSMs * maxBlocksSM);
+ message("Target nBlocks per kernel is %i\n",
+ N_TASKS_BUNDLE_SELF * nPartsPerCell / BLOCK_SIZE);
+ message("Target nBlocks per stream is %i\n",
+ N_TASKS_PER_PACK_SELF * nPartsPerCell / BLOCK_SIZE);
+ }
+
+ cudaError_t cu_error;
+ size_t free_mem, total_mem;
+ cudaMemGetInfo(&free_mem, &total_mem);
+
+ message("free mem %lu, total mem %lu", free_mem, total_mem);
+ // how many tasks do we want for each launch of GPU kernel
+ const int target_n_tasks = sched->pack_size;
+ const int target_n_tasks_pair = sched->pack_size_pair;
+ pack_vars_self_dens->target_n_tasks = target_n_tasks;
+ pack_vars_pair_dens->target_n_tasks = target_n_tasks_pair;
+ pack_vars_self_forc->target_n_tasks = target_n_tasks;
+ pack_vars_pair_forc->target_n_tasks = target_n_tasks_pair;
+ pack_vars_self_grad->target_n_tasks = target_n_tasks;
+ pack_vars_pair_grad->target_n_tasks = target_n_tasks_pair;
+ // how many tasks we want in each bundle (used for launching kernels in
+ // different streams)
+ const int bundle_size = N_TASKS_BUNDLE_SELF;
+ const int bundle_size_pair = N_TASKS_BUNDLE_PAIR;
+ pack_vars_self_dens->bundle_size = bundle_size;
+ pack_vars_pair_dens->bundle_size = bundle_size_pair;
+ pack_vars_self_forc->bundle_size = bundle_size;
+ pack_vars_pair_forc->bundle_size = bundle_size_pair;
+ pack_vars_self_grad->bundle_size = bundle_size;
+ pack_vars_pair_grad->bundle_size = bundle_size_pair;
+ // Keep track of first and last particles for each task (particle data is
+ // arranged in long arrays containing particles from all the tasks we will
+ // work with)
+ /* A. N.: Needed for offloading self tasks as we use these to sort through
+ * which parts need to interact with which */
+ int2 *task_first_part_f4;
+ int2 *task_first_part_f4_f;
+ int2 *task_first_part_f4_g;
+ int2 *d_task_first_part_f4;
+ int2 *d_task_first_part_f4_f;
+ int2 *d_task_first_part_f4_g;
+ cudaMallocHost((void **)&task_first_part_f4, target_n_tasks * sizeof(int2));
+ cudaMalloc((void **)&d_task_first_part_f4, target_n_tasks * sizeof(int2));
+ cudaMallocHost((void **)&task_first_part_f4_f, target_n_tasks * sizeof(int2));
+ cudaMalloc((void **)&d_task_first_part_f4_f, target_n_tasks * sizeof(int2));
+ cudaMallocHost((void **)&task_first_part_f4_g, target_n_tasks * sizeof(int2));
+ cudaMalloc((void **)&d_task_first_part_f4_g, target_n_tasks * sizeof(int2));
+
+ /*A. N.: Needed but only for small part in launch functions. Might
+ be useful for recursion on the GPU so keep for now */
+ int4 *fparti_fpartj_lparti_lpartj_dens;
+ int4 *fparti_fpartj_lparti_lpartj_forc;
+ int4 *fparti_fpartj_lparti_lpartj_grad;
+ cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_dens,
+ target_n_tasks * sizeof(int4));
+ cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_forc,
+ target_n_tasks * sizeof(int4));
+ cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_grad,
+ target_n_tasks * sizeof(int4));
+
+ /* nBundles is the number of task bundles each
+ thread has ==> Used to loop through bundles */
+ int nBundles = (target_n_tasks + bundle_size - 1) / bundle_size;
+ int nBundles_pair =
+ (target_n_tasks_pair + bundle_size_pair - 1) / bundle_size_pair;
+
+ if (r->cpuid == 0) {
+ fprintf(stderr, "engine_rank %i cpuid %i nBundles/nStreams %i\n",
+ engine_rank, r->cpuid, nBundles);
+ fprintf(stderr, "nBundles/nStreams Pair %i\n", nBundles_pair);
+ }
+
+ pack_vars_self_dens->nBundles = nBundles;
+ pack_vars_pair_dens->nBundles = nBundles_pair;
+ pack_vars_self_forc->nBundles = nBundles;
+ pack_vars_pair_forc->nBundles = nBundles_pair;
+ pack_vars_self_grad->nBundles = nBundles;
+ pack_vars_pair_grad->nBundles = nBundles_pair;
+
+ // first part and last part are the first and last particle ids (locally
+ // within this thread). A. Nasar: All these are used in GPU offload setup
+
+ cudaMallocHost((void **)&pack_vars_self_dens->bundle_first_part,
+ nBundles * sizeof(int));
+ cudaMallocHost((void **)&pack_vars_self_dens->bundle_last_part,
+ nBundles * sizeof(int));
+ cudaMallocHost((void **)&pack_vars_self_dens->bundle_first_task_list,
+ nBundles * sizeof(int));
+
+ cudaMallocHost((void **)&pack_vars_pair_dens->bundle_first_part,
+ 2 * nBundles * sizeof(int));
+ cudaMallocHost((void **)&pack_vars_pair_dens->bundle_last_part,
+ 2 * nBundles * sizeof(int));
+ cudaMallocHost((void **)&pack_vars_pair_dens->bundle_first_task_list,
+ 2 * nBundles * sizeof(int));
+
+ cudaMallocHost((void **)&pack_vars_self_forc->bundle_first_part,
+ nBundles * sizeof(int));
+ cudaMallocHost((void **)&pack_vars_self_forc->bundle_last_part,
+ nBundles * sizeof(int));
+ cudaMallocHost((void **)&pack_vars_self_forc->bundle_first_task_list,
+ nBundles * sizeof(int));
+
+ cudaMallocHost((void **)&pack_vars_pair_forc->bundle_first_part,
+ 2 * nBundles * sizeof(int));
+ cudaMallocHost((void **)&pack_vars_pair_forc->bundle_last_part,
+ 2 * nBundles * sizeof(int));
+ cudaMallocHost((void **)&pack_vars_pair_forc->bundle_first_task_list,
+ 2 * nBundles * sizeof(int));
+
+ cudaMallocHost((void **)&pack_vars_self_grad->bundle_first_part,
+ nBundles * sizeof(int));
+ cudaMallocHost((void **)&pack_vars_self_grad->bundle_last_part,
+ nBundles * sizeof(int));
+ cudaMallocHost((void **)&pack_vars_self_grad->bundle_first_task_list,
+ nBundles * sizeof(int));
+
+ cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_part,
+ 2 * nBundles * sizeof(int));
+ cudaMallocHost((void **)&pack_vars_pair_grad->bundle_last_part,
+ 2 * nBundles * sizeof(int));
+ cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_task_list,
+ 2 * nBundles * sizeof(int));
+
+  /* Create streams so that we can off-load different batches of work in
+   * different streams and get some concurrency. Events are used to maximise
+   * asynchrony further */
+
+ cudaStream_t stream[nBundles];
+ cudaStream_t stream_pairs[nBundles_pair];
+
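+  /* One completion event per bundle for each of the density, gradient and
+   * force loops; these are passed to the launch functions below. */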
+ cudaEvent_t self_end[nBundles];
+ for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end[i]);
+ cudaEvent_t self_end_g[nBundles];
+ for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end_g[i]);
+ cudaEvent_t self_end_f[nBundles];
+ for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end_f[i]);
+
+  cudaEvent_t pair_end[nBundles_pair];
+  for (int i = 0; i < nBundles_pair; i++) cudaEventCreate(&pair_end[i]);
+  cudaEvent_t pair_end_g[nBundles_pair];
+  for (int i = 0; i < nBundles_pair; i++) cudaEventCreate(&pair_end_g[i]);
+  cudaEvent_t pair_end_f[nBundles_pair];
+  for (int i = 0; i < nBundles_pair; i++) cudaEventCreate(&pair_end_f[i]);
+
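+  /* Number of tasks assigned to each bundle (ceiling division). */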
+ int tasksperbundle = (target_n_tasks + nBundles - 1) / nBundles;
+ int tasksperbundle_pair =
+ (target_n_tasks_pair + nBundles_pair - 1) / nBundles_pair;
+
+ pack_vars_self_dens->tasksperbundle = tasksperbundle;
+ pack_vars_pair_dens->tasksperbundle = tasksperbundle_pair;
+ pack_vars_self_forc->tasksperbundle = tasksperbundle;
+ pack_vars_pair_forc->tasksperbundle = tasksperbundle_pair;
+ pack_vars_self_grad->tasksperbundle = tasksperbundle;
+ pack_vars_pair_grad->tasksperbundle = tasksperbundle_pair;
+
+ for (int i = 0; i < nBundles; ++i)
+ cudaStreamCreateWithFlags(&stream[i], cudaStreamNonBlocking);
+ for (int i = 0; i < nBundles_pair; ++i)
+ cudaStreamCreateWithFlags(&stream_pairs[i], cudaStreamNonBlocking);
+
+ pack_vars_self_dens->count_parts = 0;
+ pack_vars_pair_dens->count_parts = 0;
+ pack_vars_self_forc->count_parts = 0;
+ pack_vars_pair_forc->count_parts = 0;
+ pack_vars_self_grad->count_parts = 0;
+ pack_vars_pair_grad->count_parts = 0;
+
+ /*Estimate how many particles to pack for GPU for each GPU launch
+ * instruction*/
+ int nr_nodes = 1, res = 0;
+#ifdef WITH_MPI
+ if ((res = MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes)) != MPI_SUCCESS)
+ error("MPI_Comm_size failed with error %i.", res);
+#endif
+  int parts_per_top_level_cell =
+      space->nr_parts /
+      space->nr_local_cells_with_particles; /*A. Nasar: a reasonable
+                          approximation for the average number of particles
+                          in each top-level cell*/
+ float eta_neighbours = e->s->eta_neighbours;
+ int np_per_cell = ceil(2.0 * eta_neighbours);
+ np_per_cell *= np_per_cell * np_per_cell;
+ /*A. Nasar: Increase parts per recursed task-level cell by buffer to
+ ensure we allocate enough memory*/
+ int buff = ceil(0.5 * np_per_cell);
+  /*A. Nasar: The generous multiplier is also there to ensure we do not
+   * over-run the allocated memory on the buffers and the GPU. This can happen
+   * if the calculated h is larger than the cell width and splitting produces
+   * cells larger than the target cells*/
+ int count_max_parts_tmp = 64 * 8 * target_n_tasks * (np_per_cell + buff);
+
+ pack_vars_self_dens->count_max_parts = count_max_parts_tmp;
+ pack_vars_pair_dens->count_max_parts = count_max_parts_tmp;
+ pack_vars_self_forc->count_max_parts = count_max_parts_tmp;
+ pack_vars_pair_forc->count_max_parts = count_max_parts_tmp;
+ pack_vars_self_grad->count_max_parts = count_max_parts_tmp;
+ pack_vars_pair_grad->count_max_parts = count_max_parts_tmp;
+
+ /*Declare Buffer and GPU particle arrays*/
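+  /* The *_send/*_recv structs are pinned host staging buffers; the d_*
+   * pointers are their device-side counterparts. Separate sets exist for the
+   * density, gradient (_g/grad) and force (_f/forc) loops, and for self and
+   * pair interactions. */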
+ struct part_aos_f4_send *parts_aos_f4_send;
+ struct part_aos_f4_recv *parts_aos_f4_recv;
+
+ struct part_aos_f4_f_send *parts_aos_forc_f4_send;
+ struct part_aos_f4_f_recv *parts_aos_forc_f4_recv;
+
+ struct part_aos_f4_g_send *parts_aos_grad_f4_send;
+ struct part_aos_f4_g_recv *parts_aos_grad_f4_recv;
+
+ struct part_aos_f4_send *d_parts_aos_f4_send;
+ struct part_aos_f4_recv *d_parts_aos_f4_recv;
+
+ struct part_aos_f4_f_send *d_parts_aos_forc_f4_send;
+ struct part_aos_f4_f_recv *d_parts_aos_forc_f4_recv;
+
+ struct part_aos_f4_g_send *d_parts_aos_grad_f4_send;
+ struct part_aos_f4_g_recv *d_parts_aos_grad_f4_recv;
+
+ struct part_aos_f4_send *parts_aos_pair_f4_send;
+ struct part_aos_f4_recv *parts_aos_pair_f4_recv;
+
+ struct part_aos_f4_send *d_parts_aos_pair_f4_send;
+ struct part_aos_f4_recv *d_parts_aos_pair_f4_recv;
+
+ struct part_aos_f4_f_send *parts_aos_pair_f4_f_send;
+ struct part_aos_f4_f_recv *parts_aos_pair_f4_f_recv;
+
+ struct part_aos_f4_f_send *d_parts_aos_pair_f4_f_send;
+ struct part_aos_f4_f_recv *d_parts_aos_pair_f4_f_recv;
+
+ struct part_aos_f4_g_send *parts_aos_pair_f4_g_send;
+ struct part_aos_f4_g_recv *parts_aos_pair_f4_g_recv;
+
+ struct part_aos_f4_g_send *d_parts_aos_pair_f4_g_send;
+ struct part_aos_f4_g_recv *d_parts_aos_pair_f4_g_recv;
+
+ /*Now allocate memory for Buffer and GPU particle arrays*/
+ cudaMalloc((void **)&d_parts_aos_f4_send,
+ count_max_parts_tmp * sizeof(struct part_aos_f4_send));
+ cudaMalloc((void **)&d_parts_aos_f4_recv,
+ count_max_parts_tmp * sizeof(struct part_aos_f4_recv));
+
+ cudaMalloc((void **)&d_parts_aos_forc_f4_send,
+ count_max_parts_tmp * sizeof(struct part_aos_f4_f_send));
+ cudaMalloc((void **)&d_parts_aos_forc_f4_recv,
+ count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv));
+
+ cudaMalloc((void **)&d_parts_aos_grad_f4_send,
+ count_max_parts_tmp * sizeof(struct part_aos_f4_g_send));
+ cudaMalloc((void **)&d_parts_aos_grad_f4_recv,
+ count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv));
+
+ cudaMallocHost((void **)&parts_aos_f4_send,
+ count_max_parts_tmp * sizeof(struct part_aos_f4_send));
+ cudaMallocHost((void **)&parts_aos_f4_recv,
+ count_max_parts_tmp * sizeof(struct part_aos_f4_recv));
+
+ cudaMallocHost((void **)&parts_aos_forc_f4_send,
+ count_max_parts_tmp * sizeof(struct part_aos_f4_f_send));
+ cudaMallocHost((void **)&parts_aos_forc_f4_recv,
+ count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv));
+
+ cudaMallocHost((void **)&parts_aos_grad_f4_send,
+ count_max_parts_tmp * sizeof(struct part_aos_f4_g_send));
+ cudaMallocHost((void **)&parts_aos_grad_f4_recv,
+ count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv));
+
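+  /* Pair-interaction buffers hold particles from both ci and cj, hence the
+   * factor of two in their size. */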
+ cudaMalloc((void **)&d_parts_aos_pair_f4_send,
+ 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send));
+ cudaMalloc((void **)&d_parts_aos_pair_f4_recv,
+ 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_recv));
+
+ cudaMalloc((void **)&d_parts_aos_pair_f4_f_send,
+ 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_send));
+ cudaMalloc((void **)&d_parts_aos_pair_f4_f_recv,
+ 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv));
+
+ cudaMalloc((void **)&d_parts_aos_pair_f4_g_send,
+ 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_send));
+ cudaMalloc((void **)&d_parts_aos_pair_f4_g_recv,
+ 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv));
+
+ cudaMallocHost((void **)&parts_aos_pair_f4_send,
+ 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send));
+ cudaMallocHost((void **)&parts_aos_pair_f4_recv,
+ 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_recv));
+
+ cudaMallocHost((void **)&parts_aos_pair_f4_g_send,
+ 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_send));
+ cudaMallocHost((void **)&parts_aos_pair_f4_g_recv,
+ 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv));
+
+ cudaMallocHost((void **)&parts_aos_pair_f4_f_send,
+ 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_send));
+ cudaMallocHost((void **)&parts_aos_pair_f4_f_recv,
+ 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv));
+
+  /* Cosmological factors passed to the GPU kernels, plus the local step counter */
+ float d_a = e->cosmology->a;
+ float d_H = e->cosmology->H;
+ int step = 0;
+
+ // a list of the cells and tasks the GPU will work on
+ pack_vars_self_dens->task_list =
+ (struct task **)calloc(target_n_tasks, sizeof(struct task *));
+ pack_vars_self_dens->cell_list =
+ (struct cell **)calloc(target_n_tasks, sizeof(struct cell *));
+
+ pack_vars_pair_dens->task_list =
+ (struct task **)calloc(target_n_tasks, sizeof(struct task *));
+ pack_vars_pair_dens->top_task_list =
+ (struct task **)calloc(target_n_tasks, sizeof(struct task *));
+  int n_leaves_max = 4096;
+  /*Allocate one leaf-cell list per top-level pair task: each list holds up to
+   * n_leaves_max (ci, cj) leaf-cell pairs found when recursing for the GPU*/
+  pack_vars_pair_dens->leaf_list = (struct leaf_cell_list *)calloc(
+      target_n_tasks, sizeof(struct leaf_cell_list));
+  for (int i = 0; i < target_n_tasks; i++) {
+    pack_vars_pair_dens->leaf_list[i].ci =
+        (struct cell **)malloc(n_leaves_max * sizeof(struct cell *));
+    pack_vars_pair_dens->leaf_list[i].cj =
+        (struct cell **)malloc(n_leaves_max * sizeof(struct cell *));
+    pack_vars_pair_dens->leaf_list[i].n_leaves = 0;
+    pack_vars_pair_dens->leaf_list[i].n_packed = 0;
+  }
+
+ pack_vars_pair_dens->ci_list =
+ (struct cell **)calloc(target_n_tasks, sizeof(struct cell *));
+ pack_vars_pair_dens->cj_list =
+ (struct cell **)calloc(target_n_tasks, sizeof(struct cell *));
+
+ pack_vars_self_forc->task_list =
+ (struct task **)calloc(target_n_tasks, sizeof(struct task *));
+ pack_vars_self_forc->cell_list =
+ (struct cell **)calloc(target_n_tasks, sizeof(struct cell *));
+
+ pack_vars_pair_forc->task_list =
+ (struct task **)calloc(target_n_tasks, sizeof(struct task *));
+ pack_vars_pair_forc->ci_list =
+ (struct cell **)calloc(target_n_tasks, sizeof(struct cell *));
+ pack_vars_pair_forc->cj_list =
+ (struct cell **)calloc(target_n_tasks, sizeof(struct cell *));
+
+ pack_vars_self_grad->task_list =
+ (struct task **)calloc(target_n_tasks, sizeof(struct task *));
+ pack_vars_self_grad->cell_list =
+ (struct cell **)calloc(target_n_tasks, sizeof(struct cell *));
+
+ pack_vars_pair_grad->task_list =
+ (struct task **)calloc(target_n_tasks, sizeof(struct task *));
+ pack_vars_pair_grad->ci_list =
+ (struct cell **)calloc(target_n_tasks, sizeof(struct cell *));
+ pack_vars_pair_grad->cj_list =
+ (struct cell **)calloc(target_n_tasks, sizeof(struct cell *));
+
+ // number of density self tasks executed
+ int tasks_done_cpu = 0;
+ int tasks_done_gpu = 0;
+ int tasks_done_gpu_inc = 0;
+
+ /* Main loop. */
+ while (1) {
+ /*Stuff for debugging*/
+ int n_full_d_bundles = 0, n_full_g_bundles = 0, n_full_f_bundles = 0;
+ int n_full_p_d_bundles = 0, n_full_p_g_bundles = 0, n_full_p_f_bundles = 0;
+ int n_partial_d_bundles = 0, n_partial_g_bundles = 0,
+ n_partial_f_bundles = 0;
+ int n_partial_p_d_bundles = 0, n_partial_p_g_bundles = 0,
+ n_partial_p_f_bundles = 0;
+ int output = 0;
+ int packed_self = 0;
+ int packed_pair = 0;
+ int packed_self_f = 0;
+ int packed_pair_f = 0;
+ int packed_self_g = 0;
+ int packed_pair_g = 0;
+ int density = 0;
+ int density_sub = 0;
+ int unpacked = 0;
+ int unpacked_f = 0;
+ int unpacked_g = 0;
+ int unpacked_pair = 0;
+ int unpacked_pair_f = 0;
+ int unpacked_pair_g = 0;
+ int ghost_in = 0;
+ int cpu_self = 0;
+ int cpu_self_f = 0;
+ int cpu_self_g = 0;
+ int cpu_pair = 0;
+ int cpu_pair_f = 0;
+ int cpu_pair_g = 0;
+ int n_leafs_total = 0;
+ // Initialise timers to zero
+ double time_for_density_cpu = 0.0;
+ double time_for_density_cpu_pair = 0.0;
+ double time_for_cpu_g = 0.0;
+ double time_for_cpu_pair_g = 0.0;
+ double time_for_cpu_f = 0.0;
+ double time_for_cpu_pair_f = 0.0;
+ double time_for_density_cpu_sub = 0.0;
+ double time_for_density_gpu = 0.0;
+ double time_for_density_gpu_pair = 0.0;
+ double time_for_gpu_f = 0.0;
+ double time_for_gpu_pair_f = 0.0;
+ double time_for_gpu_g = 0.0;
+ double time_for_gpu_pair_g = 0.0;
+ double unpack_time_self_g = 0.0;
+ double unpack_time_self_f = 0.0;
+ double unpack_time_self = 0.0;
+ double time_for_gpu_pair = 0.0;
+ int nr_cells = space->nr_cells;
+ /* Wait at the barrier. */
+ engine_barrier(e);
+ // Initialise packing counters
+ pack_vars_self_dens->tasks_packed = 0;
+ pack_vars_pair_dens->tasks_packed = 0;
+ pack_vars_self_dens->count_parts = 0;
+ pack_vars_pair_dens->count_parts = 0;
+ pack_vars_pair_dens->task_locked = 0;
+ pack_vars_pair_dens->top_tasks_packed = 0;
+ // Initialise packing counters
+ pack_vars_self_forc->tasks_packed = 0;
+ pack_vars_pair_forc->tasks_packed = 0;
+ pack_vars_self_forc->count_parts = 0;
+ pack_vars_pair_forc->count_parts = 0;
+ // Initialise packing counters
+ pack_vars_self_grad->tasks_packed = 0;
+ pack_vars_pair_grad->tasks_packed = 0;
+ pack_vars_self_grad->count_parts = 0;
+ pack_vars_pair_grad->count_parts = 0;
+ for(int i = 0; i < target_n_tasks; i++)
+ pack_vars_pair_dens->leaf_list[i].n_leaves = 0;
+
+ int total_tasks_packed_this_time_pair = 0;
+ double packing_time = 0.0;
+ double packing_time_f = 0.0;
+ double packing_time_g = 0.0;
+ double unpacking_time = 0.0;
+ double unpacking_time_f = 0.0;
+ double unpacking_time_g = 0.0;
+ double packing_time_pair = 0.0;
+ double packing_time_pair_f = 0.0;
+ double packing_time_pair_g = 0.0;
+ double unpacking_time_pair = 0.0;
+ double unpacking_time_pair_f = 0.0;
+ double unpacking_time_pair_g = 0.0;
+ double time_for_copy_to_struct = 0.0;
+ double tot_time_for_hard_memcpys = 0.0;
+ /* Can we go home yet? */
+ if (e->step_props & engine_step_prop_done) break;
+ /* Re-set the pointer to the previous task, as there is none. */
+ struct task *t = NULL;
+ struct task *prev = NULL;
+ /*Some bits for output in case of debug*/
+ char buf5[20];
+ snprintf(buf5, sizeof(buf5), "t%dr%dstep%d", r->cpuid, engine_rank, step);
+#ifdef DUMP_TIMINGS
+ FILE *fgpu_steps;
+ fgpu_steps = fopen(buf5, "w");
+#endif
+ // if (step == 0) cudaProfilerStart();
+ step++;
+
+ sched->nr_packs_self_dens_done = 0;
+ sched->nr_packs_pair_dens_done = 0;
+ sched->nr_packs_self_forc_done = 0;
+ sched->nr_packs_pair_forc_done = 0;
+ sched->nr_packs_self_grad_done = 0;
+ sched->nr_packs_pair_grad_done = 0;
+ int n_cells_d = 0;
+ int n_cells_g = 0;
+ int n_cells_f = 0;
+ int n_cells_p_d = 0;
+ int n_cells_p_g = 0;
+ int n_cells_p_f = 0;
+ int n_w_prts_gtr_target_d = 0;
+ int n_w_prts_gtr_target_g = 0;
+ int n_w_prts_gtr_target_f = 0;
+ int n_w_prts_gtr_target_p_d = 0;
+ int n_w_prts_gtr_target_p_g = 0;
+ int n_w_prts_gtr_target_p_f = 0;
+ int g100 = 0;
+ int l100 = 0;
+ int maxcount = 0;
+ /* Loop while there are tasks... */
+ tasks_done_gpu_inc = 0;
+ ticks hang_time = getticks();
+    struct task *ttop_prev = NULL;
+ while (1) {
+ // A. Nasar: Get qid for re-use later
+ int qid = r->qid;
+ /* If there's no old task, try to get a new one. */
+ if (t == NULL) {
+ /* Get the task. */
+ TIMER_TIC
+ t = scheduler_gettask(sched, qid, prev);
+ TIMER_TOC(timer_gettask);
+ /* Did I get anything? */
+ if (t == NULL) break;
+ }
+ /* Get the cells. */
+ struct cell *ci = t->ci;
+ struct cell *cj = t->cj;
+
+ struct task * ttop = t;
+
+      if (ci == NULL && (t->subtype != task_subtype_gpu_unpack_d
+          && t->subtype != task_subtype_gpu_unpack_g
+          && t->subtype != task_subtype_gpu_unpack_f))
+        error("Task with a NULL ci that is not a GPU unpack task");
+
+#ifdef SWIFT_DEBUG_TASKS
+ /* Mark the thread we run on */
+ t->rid = r->cpuid;
+
+ /* And recover the pair direction */
+ if (t->type == task_type_pair || t->type == task_type_sub_pair) {
+ struct cell *ci_temp = ci;
+ struct cell *cj_temp = cj;
+ double shift[3];
+ if (t->subtype != task_subtype_gpu_unpack_d &&
+ t->subtype != task_subtype_gpu_unpack_g &&
+ t->subtype != task_subtype_gpu_unpack_f)
+ t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift);
+ } else {
+ t->sid = -1;
+ }
+#endif
+
+#ifdef SWIFT_DEBUG_CHECKS
+ /* Check that we haven't scheduled an inactive task */
+ t->ti_run = e->ti_current;
+ /* Store the task that will be running (for debugging only) */
+ r->t = t;
+#endif
+
+ const ticks task_beg = getticks();
+ /* Different types of tasks... */
+ switch (t->type) {
+ case task_type_self:
+ if (t->subtype == task_subtype_gpu_unpack_d) {
+ unpacked++;
+ } else if (t->subtype == task_subtype_gpu_unpack_g) {
+ unpacked_g++;
+ } else if (t->subtype == task_subtype_gpu_unpack_f) {
+ unpacked_f++;
+ } else if (t->subtype == task_subtype_density) {
+ cpu_self++;
+#ifndef GPUOFFLOAD_DENSITY
+ struct timespec t0, t1, dt;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ runner_doself1_branch_density(r, ci);
+ clock_gettime(CLOCK_REALTIME, &t1);
+ tasks_done_cpu++;
+ time_for_density_cpu += (t1.tv_sec - t0.tv_sec) +
+ (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+ density++;
+#endif
+ /* GPU WORK */
+ } else if (t->subtype == task_subtype_gpu_pack_d) {
+ packed_self++;
+#ifdef GPUOFFLOAD_DENSITY
+ ticks tic_cpu_pack = getticks();
+ packing_time +=
+ runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t,
+ parts_aos_f4_send, task_first_part_f4);
+ //Record times for task analysis
+ t->total_cpu_pack_ticks += getticks() - tic_cpu_pack;
+ /* No pack tasks left in queue, flag that we want to run */
+ int launch_leftovers = pack_vars_self_dens->launch_leftovers;
+ /*Packed enough tasks. Let's go*/
+ int launch = pack_vars_self_dens->launch;
+ /* Do we have enough stuff to run the GPU ? */
+ if (launch || launch_leftovers) {
+ /*Launch GPU tasks*/
+ int t_packed = pack_vars_self_dens->tasks_packed;
+ runner_doself1_launch_f4(
+ r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send,
+ parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv,
+ stream, d_a, d_H, e, &packing_time, &time_for_density_gpu,
+ &unpack_time_self, devId,
+ task_first_part_f4, d_task_first_part_f4, self_end);
+ } /*End of GPU work Self*/
+#endif
+ } /* self / pack */
+ else if (t->subtype == task_subtype_gpu_pack_g) {
+ packed_self_g++;
+#ifdef GPUOFFLOAD_GRADIENT
+ ticks tic_cpu_pack = getticks();
+ packing_time_g += runner_doself1_pack_f4_g(
+ r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send,
+ task_first_part_f4_g);
+ //Record times for task analysis
+ t->total_cpu_pack_ticks += getticks() - tic_cpu_pack;
+ /* No pack tasks left in queue, flag that we want to run */
+ int launch_leftovers = pack_vars_self_grad->launch_leftovers;
+ /*Packed enough tasks let's go*/
+ int launch = pack_vars_self_grad->launch;
+ /* Do we have enough stuff to run the GPU ? */
+ if (launch || launch_leftovers) {
+ /*Launch GPU tasks*/
+ int t_packed = pack_vars_self_grad->tasks_packed;
+ runner_doself1_launch_f4_g(
+ r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send,
+ parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send,
+ d_parts_aos_grad_f4_recv, stream, d_a, d_H, e,
+ &packing_time_g, &time_for_gpu_g, task_first_part_f4_g,
+ d_task_first_part_f4_g, self_end_g, &unpack_time_self_g);
+ } /*End of GPU work Self*/
+#endif // GPUGRADSELF
+ } else if (t->subtype == task_subtype_gpu_pack_f) {
+ packed_self_f++;
+#ifdef GPUOFFLOAD_FORCE
+ ticks tic_cpu_pack = getticks();
+ packing_time_f += runner_doself1_pack_f4_f(
+ r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send,
+ task_first_part_f4_f);
+ //Record times for task analysis
+ t->total_cpu_pack_ticks += getticks() - tic_cpu_pack;
+ /* No pack tasks left in queue, flag that we want to run */
+ int launch_leftovers = pack_vars_self_forc->launch_leftovers;
+ /*Packed enough tasks let's go*/
+ int launch = pack_vars_self_forc->launch;
+ /* Do we have enough stuff to run the GPU ? */
+ if (launch || launch_leftovers) {
+ /*Launch GPU tasks*/
+ int t_packed = pack_vars_self_forc->tasks_packed;
+ runner_doself1_launch_f4_f(
+ r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send,
+ parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send,
+ d_parts_aos_forc_f4_recv, stream, d_a, d_H, e,
+ &packing_time_f, &time_for_gpu_f, task_first_part_f4_f,
+ d_task_first_part_f4_f, self_end_f, &unpack_time_self_f);
+ } /*End of GPU work Self*/
+#endif
+ }
+#ifdef EXTRA_HYDRO_LOOP
+ else if (t->subtype == task_subtype_gradient) {
+ cpu_self_g++;
+#ifndef GPUOFFLOAD_GRADIENT
+ struct timespec t0, t1, dt;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ runner_doself1_branch_gradient(r, ci);
+ clock_gettime(CLOCK_REALTIME, &t1);
+ tasks_done_cpu++;
+ time_for_cpu_g += (t1.tv_sec - t0.tv_sec) +
+ (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+#endif
+ }
+#endif
+ else if (t->subtype == task_subtype_force) {
+ cpu_self_f++;
+#ifndef GPUOFFLOAD_FORCE
+ struct timespec t0, t1;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ runner_doself2_branch_force(r, ci);
+ clock_gettime(CLOCK_REALTIME, &t1);
+ tasks_done_cpu++;
+ time_for_cpu_f += (t1.tv_sec - t0.tv_sec) +
+ (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+#endif
+ } else if (t->subtype == task_subtype_limiter)
+ runner_doself1_branch_limiter(r, ci);
+ else if (t->subtype == task_subtype_grav)
+ runner_doself_recursive_grav(r, ci, 1);
+ else if (t->subtype == task_subtype_external_grav)
+ runner_do_grav_external(r, ci, 1);
+ else if (t->subtype == task_subtype_stars_density)
+ runner_doself_branch_stars_density(r, ci);
+#ifdef EXTRA_STAR_LOOPS
+ else if (t->subtype == task_subtype_stars_prep1)
+ runner_doself_branch_stars_prep1(r, ci);
+ else if (t->subtype == task_subtype_stars_prep2)
+ runner_doself_branch_stars_prep2(r, ci);
+#endif
+ else if (t->subtype == task_subtype_stars_feedback)
+ runner_doself_branch_stars_feedback(r, ci);
+ else if (t->subtype == task_subtype_bh_density)
+ runner_doself_branch_bh_density(r, ci);
+ else if (t->subtype == task_subtype_bh_swallow)
+ runner_doself_branch_bh_swallow(r, ci);
+ else if (t->subtype == task_subtype_do_gas_swallow)
+ runner_do_gas_swallow_self(r, ci, 1);
+ else if (t->subtype == task_subtype_do_bh_swallow)
+ runner_do_bh_swallow_self(r, ci, 1);
+ else if (t->subtype == task_subtype_bh_feedback)
+ runner_doself_branch_bh_feedback(r, ci);
+ else if (t->subtype == task_subtype_rt_gradient)
+ runner_doself1_branch_rt_gradient(r, ci);
+ else if (t->subtype == task_subtype_rt_transport)
+ runner_doself2_branch_rt_transport(r, ci);
+ else if (t->subtype == task_subtype_sink_swallow)
+ runner_doself_branch_sinks_swallow(r, ci);
+ else if (t->subtype == task_subtype_sink_do_gas_swallow)
+ runner_do_sinks_gas_swallow_self(r, ci, 1);
+ else if (t->subtype == task_subtype_sink_do_sink_swallow)
+ runner_do_sinks_sink_swallow_self(r, ci, 1);
+ else
+ error("Unknown/invalid task subtype (%s).",
+ subtaskID_names[t->subtype]);
+ break;
+
+ case task_type_pair:
+ if (t->subtype == task_subtype_density) {
+ cpu_pair++;
+#ifndef GPUOFFLOAD_DENSITY
+ struct timespec t0, t1, dt;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ runner_dopair1_branch_density(r, ci, cj);
+ clock_gettime(CLOCK_REALTIME, &t1);
+ tasks_done_cpu++;
+ time_for_density_cpu_pair +=
+ (t1.tv_sec - t0.tv_sec) +
+ (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+#endif
+ }
+ /* GPU WORK */
+ else if (t->subtype == task_subtype_gpu_pack_d) {
+ packed_pair++;
+#ifdef GPUOFFLOAD_DENSITY
+
+ ticks tic_cpu_pack = getticks();
+
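+            /* Overall scheme: recurse through this cell pair to collect its
+             * leaf-cell pairs, pack those leaves into the pinned send buffer,
+             * and launch (and unpack) a bundle whenever enough work has
+             * accumulated or no pack tasks are left, looping until every leaf
+             * of this top-level task has been processed. */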
+ /////////////////////W.I.P!!!////////////////////////////////////////////////////////
+ /*Call recursion here. This will be a function in runner_doiact_functions_hydro_gpu.h.
+ * We are recursing separately to find out how much work we have before offloading*/
+ //We need to allocate a list to put cell pointers into for each new task
+ int n_expected_tasks = 4096; //A. Nasar: Need to come up with a good estimate for this
+ int n_leaves_found = 0;
+ int top_tasks_packed = pack_vars_pair_dens->top_tasks_packed;
+ int depth = 0;
+
+ pack_vars_pair_dens->leaf_list[top_tasks_packed].n_leaves = 0;
+ pack_vars_pair_dens->leaf_list[top_tasks_packed].n_start = 0;
+ pack_vars_pair_dens->leaf_list[top_tasks_packed].n_packed = 0;
+
+ runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t,
+ parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leaves_found, depth, n_expected_tasks);
+
+ n_leafs_total += n_leaves_found;
+ int cstart = 0, cid = 0;
+
+ pack_vars_pair_dens->top_task_list[top_tasks_packed] = t;
+
+ pack_vars_pair_dens->top_tasks_packed++;
+ pack_vars_pair_dens->task_locked = 1;
+ int t_s, t_e;
+ t_s = 0;
+ int n_t_tasks = pack_vars_pair_dens->target_n_tasks;
+ t->total_cpu_pack_ticks += getticks() - tic_cpu_pack;
+
+ int ntop_packed = pack_vars_pair_dens->top_tasks_packed;
+
+ while(cstart < n_leaves_found){
+ tic_cpu_pack = getticks();
+
+// if(pack_vars_pair_dens->top_task_list[0] == ttop_prev)
+// error("Working on prev top level task");
+ pack_vars_pair_dens->launch_leftovers = 0;
+ pack_vars_pair_dens->launch = 0;
+ /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/
+ while(cstart < n_leaves_found && pack_vars_pair_dens->tasks_packed < n_t_tasks){
+                // n_start is incremented in pack. However, when we have launched
+                // but some daughters are still unpacked, we restart the count
+                // from zero for the packed arrays, as the daughters we previously
+                // worked on are no longer needed. Thus the counter for cii and cjj
+                // stays at cstart, while the counter for the packing/unpacking
+                // arrays is n_start, which is reset to zero after launch.
+                // count_parts is also reset to zero after launch.
+ struct cell * cii = pack_vars_pair_dens->leaf_list[ntop_packed - 1].ci[cstart];
+ struct cell * cjj = pack_vars_pair_dens->leaf_list[ntop_packed - 1].cj[cstart];
+                // Are we sure we should use cells_left/cells_right here and not
+                // pack_vars_pair_dens->leaf_list[top_tasks_packed].ci & cj?
+                packing_time_pair += runner_dopair1_pack_f4(
+                    r, sched, pack_vars_pair_dens, cii, cjj, t,
+                    parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens);
+ if(pack_vars_pair_dens->count_parts > count_max_parts_tmp)
+ error("Packed more parts than possible");
+ cstart++;
+ }
+ /* Copies done. Release the lock ! */
+ t->total_cpu_pack_ticks += getticks() - tic_cpu_pack;
+ /* Packed enough tasks or no pack tasks left in queue, flag that
+ * we want to run */
+ int launch = pack_vars_pair_dens->launch;
+ int launch_leftovers = pack_vars_pair_dens->launch_leftovers;
+ /* Do we have enough stuff to run the GPU ? */
+ if (launch || launch_leftovers) {
+ /*Launch GPU tasks*/
+ int t_packed = pack_vars_pair_dens->tasks_packed;
+ runner_dopair1_launch_f4_one_memcpy(
+ r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send,
+ parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send,
+ d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e,
+ &packing_time_pair, &time_for_density_gpu_pair,
+ &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens,
+ pair_end);
+ //A. Nasar: Unpack data and zero count_parts counter
+ runner_dopair1_unpack_f4(
+ r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send,
+ parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send,
+ d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e,
+ &packing_time_pair, &time_for_density_gpu_pair,
+ &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens,
+ pair_end, cstart, n_leaves_found);
+                  /*This ensures that if we still have leaves left we start at index 1.
+                   Otherwise, reset the index since we will be grabbing a new task*/
+ int n_packed = pack_vars_pair_dens->tasks_packed;
+ //A. Nasar: We've packed all daughters and have launched --> one way or the other
+ if(cstart == n_leaves_found){
+ pack_vars_pair_dens->top_tasks_packed = 0;
+// for(int i = 0; i < ntop_packed; i++){
+// pack_vars_pair_dens->leaf_list[i].n_leaves = 0;
+// pack_vars_pair_dens->leaf_list[i].n_packed = 0;
+// pack_vars_pair_dens->leaf_list[i].n_start = 0;
+// }
+ }
+ // A. Nasar: We've launched but we have not packed all daughters.
+ // Need to set counters so we start from the last top-task packed
+ // and it's last packed daughter-task and start packing to the beginning of GPU arrays
+ // which is reset to zero (count_parts) in "....unpack_f4()"
+ else{
+ for(int i = 1; i < pack_vars_pair_dens->top_tasks_packed; i++)
+ pack_vars_pair_dens->leaf_list[i].n_start = 0;
+ pack_vars_pair_dens->top_tasks_packed = 1;
+ pack_vars_pair_dens->top_task_list[0]= t;
+                    // A. Nasar: We've launched, so we need to restart counting
+                    // tasks from zero and reset tasks_packed to zero.
+ pack_vars_pair_dens->leaf_list[0].n_start = cstart;
+
+ pack_vars_pair_dens->leaf_list[0].n_packed = 0;
+ //A. Nasar: We have packed all daughter tasks in this parent task
+                    /*This makes it such that the remaining leaf tasks are packed
+                     starting from a fresh list, since we are still inside the
+                     while (cstart < n_leaves_found) loop*/
+ }
+ // A. Nasar: These need to be reset to zero either way as our GPU array counters
+ // need to re-start from zero
+ pack_vars_pair_dens->tasks_packed = 0;
+ pack_vars_pair_dens->launch_leftovers = 0;
+ pack_vars_pair_dens->launch = 0;
+ }
+ ///////////////////////////////////////////////////////////////////////
+ }
+ ttop_prev = t;
+ cell_unlocktree(ci);
+ cell_unlocktree(cj);
+// pack_vars_pair_dens->launch_leftovers = 0;
+// pack_vars_pair_dens->launch = 0;
+ /////////////////////W.I.P!!!////////////////////////////////////////////////////////
+
+#endif // GPUOFFLOAD_DENSITY
+ } /* pair / pack */
+ else if (t->subtype == task_subtype_gpu_pack_g) {
+ packed_pair_g++;
+#ifdef GPUOFFLOAD_GRADIENT
+ ticks tic_cpu_pack = getticks();
+ packing_time_pair_g +=
+ runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci,
+ cj, t, parts_aos_pair_f4_g_send, e,
+ fparti_fpartj_lparti_lpartj_grad);
+ t->total_cpu_pack_ticks += getticks() - tic_cpu_pack;
+ /* No pack tasks left in queue, flag that we want to run */
+ int launch_leftovers = pack_vars_pair_grad->launch_leftovers;
+ /*Packed enough tasks, let's go*/
+ int launch = pack_vars_pair_grad->launch;
+ /* Do we have enough stuff to run the GPU ? */
+ if (launch || launch_leftovers) {
+ /*Launch GPU tasks*/
+ int t_packed = pack_vars_pair_grad->tasks_packed;
+ // signal_sleeping_runners(sched, t, t_packed);
+ runner_dopair1_launch_f4_g_one_memcpy(
+ r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send,
+ parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send,
+ d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e,
+ &packing_time_pair_g, &time_for_gpu_pair_g,
+ &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad,
+ pair_end_g);
+ }
+ pack_vars_pair_grad->launch_leftovers = 0;
+#endif // GPUOFFLOAD_GRADIENT
+ } else if (t->subtype == task_subtype_gpu_pack_f) {
+ packed_pair_f++;
+#ifdef GPUOFFLOAD_FORCE
+ ticks tic_cpu_pack = getticks();
+            /*Pack data and increment counters, checking whether we should run on the GPU after packing this task*/
+ packing_time_pair_f +=
+ runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci,
+ cj, t, parts_aos_pair_f4_f_send, e,
+ fparti_fpartj_lparti_lpartj_forc);
+ /* No pack tasks left in queue, flag that we want to run */
+ int launch_leftovers = pack_vars_pair_forc->launch_leftovers;
+ /*Packed enough tasks let's go*/
+ int launch = pack_vars_pair_forc->launch;
+ /* Do we have enough stuff to run the GPU ? */
+ if (launch || launch_leftovers) {
+ /*Launch GPU tasks*/
+ int t_packed = pack_vars_pair_forc->tasks_packed;
+ // signal_sleeping_runners(sched, t, t_packed);
+ runner_dopair1_launch_f4_f_one_memcpy(
+ r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send,
+ parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send,
+ d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e,
+ &packing_time_pair_f, &time_for_gpu_pair_f,
+ &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc,
+ pair_end_f);
+
+ pack_vars_pair_forc->launch_leftovers = 0;
+ } /* End of GPU work Pairs */
+#endif // GPUOFFLOAD_FORCE
+ } else if (t->subtype == task_subtype_gpu_unpack_d) {
+ unpacked_pair++;
+ } else if (t->subtype == task_subtype_gpu_unpack_g) {
+ unpacked_pair_g++;
+ } else if (t->subtype == task_subtype_gpu_unpack_f) {
+ unpacked_pair_f++;
+ }
+#ifdef EXTRA_HYDRO_LOOP
+ else if (t->subtype == task_subtype_gradient) {
+ int Do_nothing = 0;
+#ifndef GPUOFFLOAD_GRADIENT
+ struct timespec t0, t1, dt;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ runner_dopair1_branch_gradient(r, ci, cj);
+ clock_gettime(CLOCK_REALTIME, &t1);
+ tasks_done_cpu++;
+ time_for_cpu_pair_g += (t1.tv_sec - t0.tv_sec) +
+ (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+#endif
+ }
+#endif // EXTRA_HYDRO_LOOP
+ else if (t->subtype == task_subtype_force) {
+ int Do_nothing = 0;
+#ifndef GPUOFFLOAD_FORCE
+ struct timespec t0, t1, dt;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ runner_dopair2_branch_force(r, ci, cj);
+ clock_gettime(CLOCK_REALTIME, &t1);
+ tasks_done_cpu++;
+ time_for_cpu_pair_f += (t1.tv_sec - t0.tv_sec) +
+ (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+#endif // GPUOFFLOAD_FORCE
+ } else if (t->subtype == task_subtype_limiter)
+ runner_dopair1_branch_limiter(r, ci, cj);
+ else if (t->subtype == task_subtype_grav)
+ runner_dopair_recursive_grav(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_stars_density)
+ runner_dopair_branch_stars_density(r, ci, cj);
+#ifdef EXTRA_STAR_LOOPS
+ else if (t->subtype == task_subtype_stars_prep1)
+ runner_dopair_branch_stars_prep1(r, ci, cj);
+ else if (t->subtype == task_subtype_stars_prep2)
+ runner_dopair_branch_stars_prep2(r, ci, cj);
+#endif
+ else if (t->subtype == task_subtype_stars_feedback)
+ runner_dopair_branch_stars_feedback(r, ci, cj);
+ else if (t->subtype == task_subtype_bh_density)
+ runner_dopair_branch_bh_density(r, ci, cj);
+ else if (t->subtype == task_subtype_bh_swallow)
+ runner_dopair_branch_bh_swallow(r, ci, cj);
+ else if (t->subtype == task_subtype_do_gas_swallow)
+ runner_do_gas_swallow_pair(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_do_bh_swallow)
+ runner_do_bh_swallow_pair(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_bh_feedback)
+ runner_dopair_branch_bh_feedback(r, ci, cj);
+ else if (t->subtype == task_subtype_rt_gradient)
+ runner_dopair1_branch_rt_gradient(r, ci, cj);
+ else if (t->subtype == task_subtype_rt_transport)
+ runner_dopair2_branch_rt_transport(r, ci, cj);
+ else if (t->subtype == task_subtype_sink_swallow)
+ runner_dopair_branch_sinks_swallow(r, ci, cj);
+ else if (t->subtype == task_subtype_sink_do_gas_swallow)
+ runner_do_sinks_gas_swallow_pair(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_sink_do_sink_swallow)
+ runner_do_sinks_sink_swallow_pair(r, ci, cj, 1);
+ else
+ error("Unknown/invalid task subtype (%s/%s).",
+ taskID_names[t->type], subtaskID_names[t->subtype]);
+ break;
+
+ case task_type_sub_self:
+ if (t->subtype == task_subtype_density) {
+ struct timespec t0, t1, dt;
+ const int count = ci->hydro.count;
+ density_sub++;
+ clock_gettime(CLOCK_REALTIME, &t0);
+ runner_dosub_self1_density(r, ci, 1);
+ clock_gettime(CLOCK_REALTIME, &t1);
+ tasks_done_cpu++;
+ time_for_density_cpu_sub +=
+ (t1.tv_sec - t0.tv_sec) +
+ (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+ }
+#ifdef EXTRA_HYDRO_LOOP
+ else if (t->subtype == task_subtype_gradient) {
+ runner_dosub_self1_gradient(r, ci, 1);
+ }
+#endif
+ else if (t->subtype == task_subtype_force) {
+ runner_dosub_self2_force(r, ci, 1);
+ } else if (t->subtype == task_subtype_limiter)
+ runner_dosub_self1_limiter(r, ci, 1);
+ else if (t->subtype == task_subtype_stars_density)
+ runner_dosub_self_stars_density(r, ci, 1);
+#ifdef EXTRA_STAR_LOOPS
+ else if (t->subtype == task_subtype_stars_prep1)
+ runner_dosub_self_stars_prep1(r, ci, 1);
+ else if (t->subtype == task_subtype_stars_prep2)
+ runner_dosub_self_stars_prep2(r, ci, 1);
+#endif
+ else if (t->subtype == task_subtype_stars_feedback)
+ runner_dosub_self_stars_feedback(r, ci, 1);
+ else if (t->subtype == task_subtype_bh_density)
+ runner_dosub_self_bh_density(r, ci, 1);
+ else if (t->subtype == task_subtype_bh_swallow)
+ runner_dosub_self_bh_swallow(r, ci, 1);
+ else if (t->subtype == task_subtype_do_gas_swallow)
+ runner_do_gas_swallow_self(r, ci, 1);
+ else if (t->subtype == task_subtype_do_bh_swallow)
+ runner_do_bh_swallow_self(r, ci, 1);
+ else if (t->subtype == task_subtype_bh_feedback)
+ runner_dosub_self_bh_feedback(r, ci, 1);
+ else if (t->subtype == task_subtype_rt_gradient)
+ runner_dosub_self1_rt_gradient(r, ci, 1);
+ else if (t->subtype == task_subtype_rt_transport)
+ runner_dosub_self2_rt_transport(r, ci, 1);
+ else if (t->subtype == task_subtype_sink_swallow)
+ runner_dosub_self_sinks_swallow(r, ci, 1);
+ else if (t->subtype == task_subtype_sink_do_gas_swallow)
+ runner_do_sinks_gas_swallow_self(r, ci, 1);
+ else if (t->subtype == task_subtype_sink_do_sink_swallow)
+ runner_do_sinks_sink_swallow_self(r, ci, 1);
+ else
+ error("Unknown/invalid task subtype (%s/%s).",
+ taskID_names[t->type], subtaskID_names[t->subtype]);
+ break;
+
+ case task_type_sub_pair:
+ if (t->subtype == task_subtype_density) {
+ int nothing = 0;
+ runner_dosub_pair1_density(r, ci, cj, 1);
+ }
+#ifdef EXTRA_HYDRO_LOOP
+ else if (t->subtype == task_subtype_gradient) {
+ runner_dosub_pair1_gradient(r, ci, cj, 1);
+ }
+#endif
+ else if (t->subtype == task_subtype_force) {
+ runner_dosub_pair2_force(r, ci, cj, 1);
+ } else if (t->subtype == task_subtype_limiter)
+ runner_dosub_pair1_limiter(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_stars_density)
+ runner_dosub_pair_stars_density(r, ci, cj, 1);
+#ifdef EXTRA_STAR_LOOPS
+ else if (t->subtype == task_subtype_stars_prep1)
+ runner_dosub_pair_stars_prep1(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_stars_prep2)
+ runner_dosub_pair_stars_prep2(r, ci, cj, 1);
+#endif
+ else if (t->subtype == task_subtype_stars_feedback)
+ runner_dosub_pair_stars_feedback(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_bh_density)
+ runner_dosub_pair_bh_density(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_bh_swallow)
+ runner_dosub_pair_bh_swallow(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_do_gas_swallow)
+ runner_do_gas_swallow_pair(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_do_bh_swallow)
+ runner_do_bh_swallow_pair(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_bh_feedback)
+ runner_dosub_pair_bh_feedback(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_rt_gradient)
+ runner_dosub_pair1_rt_gradient(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_rt_transport)
+ runner_dosub_pair2_rt_transport(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_sink_swallow)
+ runner_dosub_pair_sinks_swallow(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_sink_do_gas_swallow)
+ runner_do_sinks_gas_swallow_pair(r, ci, cj, 1);
+ else if (t->subtype == task_subtype_sink_do_sink_swallow)
+ runner_do_sinks_sink_swallow_pair(r, ci, cj, 1);
+ else
+ error("Unknown/invalid task subtype (%s/%s).",
+ taskID_names[t->type], subtaskID_names[t->subtype]);
+ break;
+
+ case task_type_sort:
+ /* Cleanup only if any of the indices went stale. */
+ runner_do_hydro_sort(
+ r, ci, t->flags,
+ ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin,
+ cell_get_flag(ci, cell_flag_rt_requests_sort), 1);
+ /* Reset the sort flags as our work here is done. */
+ t->flags = 0;
+ break;
+ case task_type_rt_sort:
+ /* Cleanup only if any of the indices went stale.
+ * NOTE: we check whether we reset the sort flags when the
+ * recv tasks are running. Cells without an RT recv task
+ * don't have rt_sort tasks. */
+ runner_do_hydro_sort(
+ r, ci, t->flags,
+ ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, 1, 1);
+ /* Reset the sort flags as our work here is done. */
+ t->flags = 0;
+ break;
+ case task_type_stars_sort:
+ /* Cleanup only if any of the indices went stale. */
+ runner_do_stars_sort(
+ r, ci, t->flags,
+ ci->stars.dx_max_sort_old > space_maxreldx * ci->dmin, 1);
+ /* Reset the sort flags as our work here is done. */
+ t->flags = 0;
+ break;
+ case task_type_init_grav:
+ runner_do_init_grav(r, ci, 1);
+ break;
+ case task_type_ghost:
+ runner_do_ghost(r, ci, 1);
+ break;
+#ifdef EXTRA_HYDRO_LOOP
+ case task_type_extra_ghost:
+ runner_do_extra_ghost(r, ci, 1);
+ break;
+#endif
+ case task_type_stars_ghost:
+ runner_do_stars_ghost(r, ci, 1);
+ break;
+ case task_type_bh_density_ghost:
+ runner_do_black_holes_density_ghost(r, ci, 1);
+ break;
+ case task_type_bh_swallow_ghost3:
+ runner_do_black_holes_swallow_ghost(r, ci, 1);
+ break;
+ case task_type_drift_part:
+ runner_do_drift_part(r, ci, 1);
+ break;
+ case task_type_drift_spart:
+ runner_do_drift_spart(r, ci, 1);
+ break;
+ case task_type_drift_sink:
+ runner_do_drift_sink(r, ci, 1);
+ break;
+ case task_type_drift_bpart:
+ runner_do_drift_bpart(r, ci, 1);
+ break;
+ case task_type_drift_gpart:
+ runner_do_drift_gpart(r, ci, 1);
+ break;
+ case task_type_kick1:
+ runner_do_kick1(r, ci, 1);
+ break;
+ case task_type_kick2:
+ runner_do_kick2(r, ci, 1);
+ break;
+ case task_type_end_hydro_force:
+ runner_do_end_hydro_force(r, ci, 1);
+ break;
+ case task_type_end_grav_force:
+ runner_do_end_grav_force(r, ci, 1);
+ break;
+ case task_type_csds:
+ runner_do_csds(r, ci, 1);
+ break;
+ case task_type_timestep:
+ runner_do_timestep(r, ci, 1);
+ break;
+ case task_type_timestep_limiter:
+ runner_do_limiter(r, ci, 0, 1);
+ break;
+ case task_type_timestep_sync:
+ runner_do_sync(r, ci, 0, 1);
+ break;
+ case task_type_collect:
+ runner_do_timestep_collect(r, ci, 1);
+ break;
+ case task_type_rt_collect_times:
+ runner_do_collect_rt_times(r, ci, 1);
+ break;
+#ifdef WITH_MPI
+ case task_type_send:
+ if (t->subtype == task_subtype_tend) {
+ free(t->buff);
+ } else if (t->subtype == task_subtype_sf_counts) {
+ free(t->buff);
+ } else if (t->subtype == task_subtype_part_swallow) {
+ free(t->buff);
+ } else if (t->subtype == task_subtype_bpart_merger) {
+ free(t->buff);
+ } else if (t->subtype == task_subtype_limiter) {
+ free(t->buff);
+ }
+ break;
+ case task_type_recv:
+ if (t->subtype == task_subtype_tend) {
+ cell_unpack_end_step(ci, (struct pcell_step *)t->buff);
+ free(t->buff);
+ } else if (t->subtype == task_subtype_sf_counts) {
+ cell_unpack_sf_counts(ci, (struct pcell_sf *)t->buff);
+ cell_clear_stars_sort_flags(ci, /*clear_unused_flags=*/0);
+ free(t->buff);
+ } else if (t->subtype == task_subtype_xv) {
+ runner_do_recv_part(r, ci, 1, 1);
+ } else if (t->subtype == task_subtype_rho) {
+ runner_do_recv_part(r, ci, 0, 1);
+ } else if (t->subtype == task_subtype_gradient) {
+ runner_do_recv_part(r, ci, 0, 1);
+ } else if (t->subtype == task_subtype_rt_gradient) {
+ runner_do_recv_part(r, ci, 2, 1);
+ } else if (t->subtype == task_subtype_rt_transport) {
+ runner_do_recv_part(r, ci, -1, 1);
+ } else if (t->subtype == task_subtype_part_swallow) {
+ cell_unpack_part_swallow(ci,
+ (struct black_holes_part_data *)t->buff);
+ free(t->buff);
+ } else if (t->subtype == task_subtype_bpart_merger) {
+ cell_unpack_bpart_swallow(ci,
+ (struct black_holes_bpart_data *)t->buff);
+ free(t->buff);
+ } else if (t->subtype == task_subtype_limiter) {
+ /* Nothing to do here. Unpacking done in a separate task */
+ } else if (t->subtype == task_subtype_gpart) {
+ runner_do_recv_gpart(r, ci, 1);
+ } else if (t->subtype == task_subtype_spart_density) {
+ runner_do_recv_spart(r, ci, 1, 1);
+ } else if (t->subtype == task_subtype_part_prep1) {
+ runner_do_recv_part(r, ci, 0, 1);
+ } else if (t->subtype == task_subtype_spart_prep2) {
+ runner_do_recv_spart(r, ci, 0, 1);
+ } else if (t->subtype == task_subtype_bpart_rho) {
+ runner_do_recv_bpart(r, ci, 1, 1);
+ } else if (t->subtype == task_subtype_bpart_feedback) {
+ runner_do_recv_bpart(r, ci, 0, 1);
+ } else {
+ error("Unknown/invalid task subtype (%d).", t->subtype);
+ }
+ break;
+
+ case task_type_pack:
+ runner_do_pack_limiter(r, ci, &t->buff, 1);
+ task_get_unique_dependent(t)->buff = t->buff;
+ break;
+ case task_type_unpack:
+ runner_do_unpack_limiter(r, ci, t->buff, 1);
+ break;
+#endif
+ case task_type_grav_down:
+ runner_do_grav_down(r, t->ci, 1);
+ break;
+ case task_type_grav_long_range:
+ runner_do_grav_long_range(r, t->ci, 1);
+ break;
+ case task_type_grav_mm:
+ runner_dopair_grav_mm_progenies(r, t->flags, t->ci, t->cj);
+ break;
+ case task_type_cooling:
+ runner_do_cooling(r, t->ci, 1);
+ break;
+ case task_type_star_formation:
+ runner_do_star_formation(r, t->ci, 1);
+ break;
+ case task_type_star_formation_sink:
+ runner_do_star_formation_sink(r, t->ci, 1);
+ break;
+ case task_type_stars_resort:
+ runner_do_stars_resort(r, t->ci, 1);
+ break;
+ case task_type_sink_formation:
+ runner_do_sink_formation(r, t->ci);
+ break;
+ case task_type_fof_self:
+ runner_do_fof_search_self(r, t->ci, 1);
+ break;
+ case task_type_fof_pair:
+ runner_do_fof_search_pair(r, t->ci, t->cj, 1);
+ break;
+ case task_type_fof_attach_self:
+ runner_do_fof_attach_self(r, t->ci, 1);
+ break;
+ case task_type_fof_attach_pair:
+ runner_do_fof_attach_pair(r, t->ci, t->cj, 1);
+ break;
+ case task_type_neutrino_weight:
+ runner_do_neutrino_weighting(r, ci, 1);
+ break;
+ case task_type_rt_ghost1:
+ runner_do_rt_ghost1(r, t->ci, 1);
+ break;
+ case task_type_rt_ghost2:
+ runner_do_rt_ghost2(r, t->ci, 1);
+ break;
+ case task_type_rt_tchem:
+ runner_do_rt_tchem(r, t->ci, 1);
+ break;
+ case task_type_rt_advance_cell_time:
+ runner_do_rt_advance_cell_time(r, t->ci, 1);
+ break;
+ default:
+ error("Unknown/invalid task type (%d).", t->type);
+ }
+ r->active_time += (getticks() - task_beg);
+
+/* Mark that we have run this task on these cells */
+#ifdef SWIFT_DEBUG_CHECKS
+ if (ci != NULL) {
+ ci->tasks_executed[t->type]++;
+ ci->subtasks_executed[t->subtype]++;
+ }
+ if (cj != NULL) {
+ cj->tasks_executed[t->type]++;
+ cj->subtasks_executed[t->subtype]++;
+ }
+ /* This runner is not doing a task anymore */
+ r->t = NULL;
+#endif
+
+ /* We're done with this task, see if we get a next one. */
+ prev = t;
+
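+      /* GPU pack tasks are handled specially: when the corresponding loop is
+       * offloaded to the GPU, we do not run scheduler_done() here since the
+       * unpack tasks they unlock may only be enqueued once the GPU results
+       * have been copied back. Without offload they take the normal path. */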
+ if (t->subtype == task_subtype_gpu_pack_d) {
+#ifdef GPUOFFLOAD_DENSITY
+ /* Don't enqueue unpacks yet. Just signal the runners */
+ t->skip = 1;
+ t->toc = getticks();
+ t->total_ticks += t->toc - t->tic;
+ t = NULL;
+#else
+ t = scheduler_done(sched, t);
+#endif
+ }
+
+ else if (t->subtype == task_subtype_gpu_pack_g) {
+#ifdef GPUOFFLOAD_GRADIENT
+ /* Don't enqueue unpacks yet. Just signal the runners */
+ t->skip = 1;
+ t->toc = getticks();
+ t->total_ticks += t->toc - t->tic;
+ t = NULL;
+#else
+ t = scheduler_done(sched, t);
+#endif
+ }
+
+ else if (t->subtype == task_subtype_gpu_pack_f) {
+#ifdef GPUOFFLOAD_FORCE
+ /* Don't enqueue unpacks yet. Just signal the runners */
+ t->skip = 1;
+ t->toc = getticks();
+ t->total_ticks += t->toc - t->tic;
+ t = NULL;
+#else
+ t = scheduler_done(sched, t);
+#endif
+ }
+
+      else {
+        t = scheduler_done(sched, t);
+      }
+ } /* main loop. */
+
+ message("n_leafs found %i", n_leafs_total);
+// message("cpu %i packed %i cells with %i containing more parts than target of %i max_count %i",
+// r->cpuid, n_cells_d, n_w_prts_gtr_target_d, np_per_cell, maxcount);
+// message("cpu %i packed %i cells_G with %i containing more parts than target of %i max_count %i",
+// r->cpuid, n_cells_g, n_w_prts_gtr_target_g, np_per_cell, maxcount);
+// message("cpu %i packed %i cells_F with %i containing more parts than target of %i max_count %i",
+// r->cpuid, n_cells_f, n_w_prts_gtr_target_f, np_per_cell, maxcount);
+// message("cpu %i packed %i pairs_D with %i containing more parts than target of %i max_count %i",
+// r->cpuid, n_cells_p_d, n_w_prts_gtr_target_p_d, np_per_cell, maxcount);
+// message("cpu %i packed %i pairs_G with %i containing more parts than target of %i max_count %i",
+// r->cpuid, n_cells_p_g, n_w_prts_gtr_target_p_g, np_per_cell, maxcount);
+// message("cpu %i packed %i pairs_F with %i containing more parts than target of %i max_count %i",
+// r->cpuid, n_cells_p_f, n_w_prts_gtr_target_p_f, np_per_cell, maxcount);
+
+ // message("Worked on %i supers w more than 100 parts", g100);
+ // Stuff for writing debug data to file for validation
+ //// if (step % 10 == 0 || step == 1) {
+ // if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z,
+ // rho, rhodh, v_sig, lap_u, a_visc_max, ax, ay, az\n"); for (int tid
+ // = 0; tid < space->nr_local_cells;
+ // tid++) { /* This should indeed be tasks_done_gpu as they are
+ // the only
+ //// tasks which have been done*/
+ // struct cell *ctemp = &(space->cells_top[tid]);
+ // for (int i = 0; i < ctemp->hydro.count; i++) {
+ // fprintf(fgpu_steps, "%f, %f, %f, %f, %f, %f, %f, %f, %f, %f,
+ // %f, %f\n",
+ // ctemp->hydro.parts[i].x[0],
+ // ctemp->hydro.parts[i].x[1],
+ // ctemp->hydro.parts[i].x[2], ctemp->hydro.parts[i].rho,
+ // ctemp->hydro.parts[i].density.rho_dh,
+ // ctemp->hydro.parts[i].viscosity.v_sig,
+ // ctemp->hydro.parts[i].diffusion.laplace_u,
+ // ctemp->hydro.parts[i].force.alpha_visc_max_ngb,
+ // ctemp->hydro.parts[i].a_hydro[0],
+ // ctemp->hydro.parts[i].a_hydro[1],
+ // ctemp->hydro.parts[i].a_hydro[2]);
+ //// message("wcount %f density %f",
+ /// ctemp->hydro.parts[i].density.wcount, ctemp->hydro.parts[i].rho); /
+ /// message("wcount is %f\n", ctemp->hydro.parts[i].density.wcount);
+ // }
+ // }
+ //// }
+      /* Output compute times to separate files; cat them into one file later. */
+// if (step % 11 == 0 || step == 1) {
+#ifdef DUMP_TIMINGS
+#if defined(GPUOFFLOAD_DENSITY) || defined(GPUOFFLOAD_GRADIENT) || \
+ defined(GPUOFFLOAD_FORCE)
+ // char buffer[30];
+ // snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d",
+ // r->cpuid, step); FILE *fullbundles = fopen(buffer, "w");
+ // if(r->cpuid == 0)fprintf(fullbundles, "nfull, npartial,
+ // nfullpair, npartialpair\n"); else fprintf(fullbundles, "%i, %i,
+ // %i, %i\n", n_full_d_bundles, n_partial_d_bundles,
+ // n_full_p_d_bundles, n_partial_p_d_bundles); fflush(fullbundles);
+
+ ///////////////////////////////////////////////////////////////
+      /// To output timings, uncomment this
+ ///////////////////////////////////////////////////////////////
+ if (r->cpuid == 0 && engine_rank == 0)
+ fprintf(fgpu_steps,
+ "GPU_SD, P_SD, U_SD, GPU_PD, P_PD, U_PD, "
+ "GPU_SF, P_SF, U_SF, GPU_PF, P_PF, U_PF, GPU_SG, P_SG, U_SG, "
+ "GPU_PG, P_PG, U_PG\n "
+ "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, "
+ "%e, %e\n",
+ time_for_density_gpu, packing_time, unpack_time_self,
+ time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair,
+ time_for_gpu_f, packing_time_f, unpack_time_self_f,
+ time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f,
+ time_for_gpu_g, packing_time_g, unpack_time_self_g,
+ time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f);
+
+ else
+ fprintf(fgpu_steps,
+ "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, "
+ "%e, %e\n",
+ time_for_density_gpu, packing_time, unpack_time_self,
+ time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair,
+ time_for_gpu_f, packing_time_f, unpack_time_self_f,
+ time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f,
+ time_for_gpu_g, packing_time_g, unpack_time_self_g,
+ time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f);
+ //////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////
+
+#else // No GPU offload
+ if (r->cpuid == 0 && engine_rank == 0)
+ fprintf(fgpu_steps,
+ "CPU TIME SELF, CPU TIME PAIR, "
+ "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME "
+ "PAIR G\n "
+ "%e, %e, %e, %e, %e, %e\n",
+ time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f,
+ time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g);
+
+ else
+    fprintf(fgpu_steps, "%e, %e, %e, %e, %e, %e\n", time_for_density_cpu,
+ time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f,
+ time_for_cpu_g, time_for_cpu_pair_g);
+#endif
+ // }
+ fflush(fgpu_steps);
+ fclose(fgpu_steps);
+#endif  // DUMP_TIMINGS
+ time_for_density_cpu = 0.0;
+ time_for_density_gpu = 0.0;
+ time_for_density_cpu_pair = 0.0;
+ time_for_density_gpu_pair = 0.0;
+ time_for_density_cpu_sub = 0.0;
+ tot_time_for_hard_memcpys = 0.0;
+ tasks_done_gpu = 0;
+ tasks_done_cpu = 0;
+ tasks_done_gpu_inc = 0;
+ if (ghost_in > 0)
+      fprintf(stderr, "total tasks not done on GPU by cpu %i is %i\n",
+              r->cpuid, ghost_in);
+ packed_self = 0;
+ packed_pair = 0;
+ packed_self_f = 0;
+ packed_pair_f = 0;
+ packed_self_g = 0;
+ packed_pair_g = 0;
+ density = 0;
+ density_sub = 0;
+ unpacked = 0;
+ // if(step == 2)cudaProfilerStop();
+ // if(step == 2)exit(0);
+ // size_t free_byte ;
+ // size_t total_byte ;
+ // cudaError_t cuda_status = cudaMemGetInfo( &free_byte,
+ //&total_byte ) ; double free = (double)free_byte; double
+ // available = (double)total_byte; double used = (available - free);
+ // fprintf(stderr, "Used %f GB GPU memory\n", used/1e9);
+ /* Wait at the wait barrier. */
+ // swift_barrier_wait(&e->wait_barrier);
+ }
+ // Free all data
+ // cudaFree(d_tid_p);
+ // cudaFree(d_id);
+ // cudaFree(d_x_p);
+ // cudaFree(d_y_p);
+ // cudaFree(d_z_p);
+ // cudaFree(d_ux);
+ // cudaFree(d_uy);
+ // cudaFree(d_uz);
+ // cudaFree(d_a_hydrox);
+ // cudaFree(d_a_hydroy);
+ // cudaFree(d_a_hydroz);
+ // cudaFree(d_mass);
+ // cudaFree(d_h);
+ // cudaFree(d_u);
+ // cudaFree(d_u_dt);
+ // cudaFree(d_rho);
+ // cudaFree(d_SPH_sum);
+ // cudaFree(d_locx);
+ // cudaFree(d_locy);
+ // cudaFree(d_locz);
+ // cudaFree(d_widthx);
+ // cudaFree(d_widthy);
+ // cudaFree(d_widthz);
+ // cudaFree(d_h_max);
+ // cudaFree(d_count_p);
+ // cudaFree(d_wcount);
+ // cudaFree(d_wcount_dh);
+ // cudaFree(d_rho_dh);
+ // cudaFree(d_rot_ux);
+ // cudaFree(d_rot_uy);
+ // cudaFree(d_rot_uz);
+ // cudaFree(d_div_v);
+ // cudaFree(d_div_v_previous_step);
+ // cudaFree(d_alpha_visc);
+ // cudaFree(d_v_sig);
+ // cudaFree(d_laplace_u);
+ // cudaFree(d_alpha_diff);
+ // cudaFree(d_f);
+ // cudaFree(d_soundspeed);
+ // cudaFree(d_h_dt);
+ // cudaFree(d_balsara);
+ // cudaFree(d_pressure);
+ // cudaFree(d_alpha_visc_max_ngb);
+ // cudaFree(d_time_bin);
+ // cudaFree(d_wakeup);
+ // cudaFree(d_min_ngb_time_bin);
+ // cudaFree(d_to_be_synchronized);
+ // cudaFree(tid_p);
+ // cudaFree(id);
+ // cudaFree(mass);
+ // cudaFree(h);
+ // cudaFree(u);
+ // cudaFree(u_dt);
+ // cudaFree(rho);
+ // cudaFree(SPH_sum);
+ // cudaFree(x_p);
+ // cudaFree(y_p);
+ // cudaFree(z_p);
+ // cudaFree(ux);
+ // cudaFree(uy);
+ // cudaFree(uz);
+ // cudaFree(a_hydrox);
+ // cudaFree(a_hydroy);
+ // cudaFree(a_hydroz);
+ // cudaFree(locx);
+ // cudaFree(locy);
+ // cudaFree(locz);
+ // cudaFree(widthx);
+ // cudaFree(widthy);
+ // cudaFree(widthz);
+ // cudaFree(h_max);
+ // cudaFree(count_p);
+ // cudaFree(wcount);
+ // cudaFree(wcount_dh);
+ // cudaFree(rho_dh);
+ // cudaFree(rot_ux);
+ // cudaFree(rot_uy);
+ // cudaFree(rot_uz);
+ // cudaFree(div_v);
+ // cudaFree(div_v_previous_step);
+ // cudaFree(alpha_visc);
+ // cudaFree(v_sig);
+ // cudaFree(laplace_u);
+ // cudaFree(alpha_diff);
+ // cudaFree(f);
+ // cudaFree(soundspeed);
+ // cudaFree(h_dt);
+ // cudaFree(balsara);
+ // cudaFree(pressure);
+ // cudaFree(alpha_visc_max_ngb);
+ // cudaFree(time_bin);
+ // cudaFree(wakeup);
+ // cudaFree(min_ngb_time_bin);
+ // cudaFree(to_be_synchronized);
+ // cudaFree(partid_p);
+ // cudaFree(d_task_first_part);
+ // cudaFree(d_task_last_part);
+ // cudaFree(task_first_part_self_dens);
+ // cudaFree(task_last_part_self_dens);
+ // cudaFree(task_first_part_pair_ci);
+ // cudaFree(task_last_part_pair_ci);
+ // cudaFree(task_first_part_pair_cj);
+ // cudaFree(task_last_part_pair_cj);
+ // cudaFree(d_bundle_first_part_self_dens);
+ // cudaFree(d_bundle_last_part_self_dens);
+ // cudaFree(bundle_first_part_self_dens);
+ // cudaFree(bundle_last_part_self_dens);
+ // cudaFree(bundle_first_part_pair_ci);
+ // cudaFree(bundle_last_part_pair_ci);
+ // cudaFree(bundle_first_part_pair_cj);
+ // cudaFree(bundle_last_part_pair_cj);
+ // free(ci_list_self_dens);
+ // free(ci_list_pair);
+ // free(cj_list_pair);
+
+ /* Be kind, rewind. */
+ return NULL;
+}
+
+#endif // WITH_CUDA
+
diff --git a/src/runner_others.c b/src/runner_others.c
index cbace92a63..914b1f47a3 100644
--- a/src/runner_others.c
+++ b/src/runner_others.c
@@ -381,7 +381,7 @@ void runner_do_star_formation(struct runner *r, struct cell *c, int timer) {
/* Loop over the gas particles in this cell. */
for (int k = 0; k < count; k++) {
-
+      continue; /* A. Nasar: skip the loop body to test without inhibited particles */
/* Get a handle on the part. */
struct part *restrict p = &parts[k];
struct xpart *restrict xp = &xparts[k];
diff --git a/src/scheduler.c b/src/scheduler.c
index 2b156f8250..69203e37b6 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -61,6 +61,7 @@
int activate_by_unskip = 1;
#endif
+#include "cuda/BLOCK_SIZE.h"
/**
* @brief Re-set the list of active tasks.
*/
@@ -900,7 +901,9 @@ void scheduler_write_cell_dependencies(struct scheduler *s, int verbose,
int local_count = 0;
for (int i = 0; i < s->nr_tasks; i++) {
const struct task *ta = &s->tasks[i];
-
+ // if(ta->subtype == task_subtype_gpu_unpack_d
+ // || ta->subtype == task_subtype_gpu_unpack_f
+ // || ta->subtype == task_subtype_gpu_unpack_g)continue;
/* Are we using this task?
* For the 0-step, we wish to show all the tasks (even the inactives). */
if (step != 0 && ta->skip) continue;
@@ -952,7 +955,10 @@ void scheduler_write_cell_dependencies(struct scheduler *s, int verbose,
/* and their dependencies */
for (int j = 0; j < ta->nr_unlock_tasks; j++) {
const struct task *tb = ta->unlock_tasks[j];
-
+ if (tb->subtype == task_subtype_gpu_unpack_d ||
+ tb->subtype == task_subtype_gpu_unpack_f ||
+ tb->subtype == task_subtype_gpu_unpack_g)
+ continue;
/* Are we using this task?
* For the 0-step, we wish to show all the tasks (even the inactive). */
if (step != 0 && tb->skip) continue;
@@ -1167,6 +1173,237 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) {
const int with_black_holes =
(s->space->e->policy & engine_policy_black_holes);
+ /* Iterate on this task until we're done with it. */
+ int redo = 1;
+ while (redo) {
+ /* Reset the redo flag. */
+ redo = 0;
+
+ /* Is this a non-empty self-task? */
+ const int is_self =
+ (t->type == task_type_self) && (t->ci != NULL) &&
+ ((t->ci->hydro.count > 0) || (with_stars && t->ci->stars.count > 0) ||
+ (with_sinks && t->ci->sinks.count > 0) ||
+ (with_black_holes && t->ci->black_holes.count > 0));
+
+ /* Is this a non-empty pair-task? */
+ const int is_pair = (t->type == task_type_pair) && (t->ci != NULL) &&
+ (t->cj != NULL) &&
+ ((t->ci->hydro.count > 0) ||
+ (with_feedback && t->ci->stars.count > 0) ||
+ (with_sinks && t->ci->sinks.count > 0) ||
+ (with_black_holes && t->ci->black_holes.count > 0)) &&
+ ((t->cj->hydro.count > 0) ||
+ (with_feedback && t->cj->stars.count > 0) ||
+ (with_sinks && t->cj->sinks.count > 0) ||
+ (with_black_holes && t->cj->black_holes.count > 0));
+
+ /* Empty task? */
+ if (!is_self && !is_pair) {
+ t->type = task_type_none;
+ t->subtype = task_subtype_none;
+ t->ci = NULL;
+ t->cj = NULL;
+ t->skip = 1;
+ break;
+ }
+
+ /* Self-interaction? */
+ if (t->type == task_type_self) {
+ /* Get a handle on the cell involved. */
+ struct cell *ci = t->ci;
+
+ /* Foreign task? */
+ if (ci->nodeID != s->nodeID) {
+ t->skip = 1;
+ break;
+ }
+
+ /* Is this cell even split and the task does not violate h ? */
+ if (cell_can_split_self_hydro_task(ci)) {
+ /* Make a sub? */
+ if (scheduler_dosub && (ci->hydro.count < space_subsize_self_hydro_default) &&
+ (ci->stars.count < space_subsize_self_stars)) {
+ /* convert to a self-subtask. */
+ t->type = task_type_sub_self;
+
+ /* Otherwise, make tasks explicitly. */
+ } else {
+ /* Take a step back (we're going to recycle the current task)... */
+ redo = 1;
+
+ /* Add the self tasks. */
+ int first_child = 0;
+ while (ci->progeny[first_child] == NULL) first_child++;
+
+ t->ci = ci->progeny[first_child];
+ cell_set_flag(t->ci, cell_flag_has_tasks);
+
+ for (int k = first_child + 1; k < 8; k++) {
+ /* Do we have a non-empty progenitor? */
+ if (ci->progeny[k] != NULL &&
+ (ci->progeny[k]->hydro.count ||
+ (with_stars && ci->progeny[k]->stars.count))) {
+ scheduler_splittask_hydro(
+ scheduler_addtask(s, task_type_self, t->subtype, 0, 0,
+ ci->progeny[k], NULL),
+ s);
+ }
+ }
+
+ /* Make a task for each pair of progeny */
+ for (int j = 0; j < 8; j++) {
+ /* Do we have a non-empty progenitor? */
+ if (ci->progeny[j] != NULL &&
+ (ci->progeny[j]->hydro.count ||
+ (with_feedback && ci->progeny[j]->stars.count))) {
+ for (int k = j + 1; k < 8; k++) {
+ /* Do we have a second non-empty progenitor? */
+ if (ci->progeny[k] != NULL &&
+ (ci->progeny[k]->hydro.count ||
+ (with_feedback && ci->progeny[k]->stars.count))) {
+ scheduler_splittask_hydro(
+ scheduler_addtask(s, task_type_pair, t->subtype,
+ sub_sid_flag[j][k], 0, ci->progeny[j],
+ ci->progeny[k]),
+ s);
+ }
+ }
+ }
+ }
+ }
+
+ } /* Cell is split */
+
+ } /* Self interaction */
+
+ /* Pair interaction? */
+ else if (t->type == task_type_pair) {
+ /* Get a handle on the cells involved. */
+ struct cell *ci = t->ci;
+ struct cell *cj = t->cj;
+
+ /* Foreign task? */
+ if (ci->nodeID != s->nodeID && cj->nodeID != s->nodeID) {
+ t->skip = 1;
+ break;
+ }
+
+ /* Get the sort ID, use space_getsid_and_swap_cells and not t->flags
+ to make sure we get ci and cj swapped if needed. */
+ double shift[3];
+ const int sid = space_getsid_and_swap_cells(s->space, &ci, &cj, shift);
+
+#ifdef SWIFT_DEBUG_CHECKS
+ if (sid != t->flags)
+ error("Got pair task with incorrect flags: sid=%d flags=%lld", sid,
+ t->flags);
+#endif
+
+ /* Should this task be split-up? */
+ if (cell_can_split_pair_hydro_task(ci) &&
+ cell_can_split_pair_hydro_task(cj)) {
+
+ const int h_count_i = ci->hydro.count;
+ const int h_count_j = cj->hydro.count;
+
+ const int s_count_i = ci->stars.count;
+ const int s_count_j = cj->stars.count;
+
+ int do_sub_hydro = 1;
+ int do_sub_stars_i = 1;
+ int do_sub_stars_j = 1;
+ if (h_count_i > 0 && h_count_j > 0) {
+
+ /* Note: Use division to avoid integer overflow. */
+ do_sub_hydro =
+ h_count_i * sid_scale[sid] < space_subsize_pair_hydro_default / h_count_j;
+ }
+ if (s_count_i > 0 && h_count_j > 0) {
+
+ /* Note: Use division to avoid integer overflow. */
+ do_sub_stars_i =
+ s_count_i * sid_scale[sid] < space_subsize_pair_stars / h_count_j;
+ }
+ if (s_count_j > 0 && h_count_i > 0) {
+
+ /* Note: Use division to avoid integer overflow. */
+ do_sub_stars_j =
+ s_count_j * sid_scale[sid] < space_subsize_pair_stars / h_count_i;
+ }
+
+ /* Replace by a single sub-task? */
+ if (scheduler_dosub &&
+ (do_sub_hydro && do_sub_stars_i && do_sub_stars_j) &&
+ !sort_is_corner(sid)) {
+
+ /* Make this task a sub task. */
+ t->type = task_type_sub_pair;
+
+ /* Otherwise, split it. */
+ } else {
+ /* Take a step back (we're going to recycle the current task)... */
+ redo = 1;
+
+ /* Loop over the sub-cell pairs for the current sid and add new tasks
+ * for them. */
+ struct cell_split_pair *csp = &cell_split_pairs[sid];
+
+ t->ci = ci->progeny[csp->pairs[0].pid];
+ t->cj = cj->progeny[csp->pairs[0].pjd];
+ if (t->ci != NULL) cell_set_flag(t->ci, cell_flag_has_tasks);
+ if (t->cj != NULL) cell_set_flag(t->cj, cell_flag_has_tasks);
+
+ t->flags = csp->pairs[0].sid;
+ for (int k = 1; k < csp->count; k++) {
+ scheduler_splittask_hydro(
+ scheduler_addtask(s, task_type_pair, t->subtype,
+ csp->pairs[k].sid, 0,
+ ci->progeny[csp->pairs[k].pid],
+ cj->progeny[csp->pairs[k].pjd]),
+ s);
+ }
+ }
+
+ /* Otherwise, break it up if it is too large? */
+ } else if (scheduler_doforcesplit && ci->split && cj->split &&
+ (ci->hydro.count > space_maxsize / cj->hydro.count)) {
+
+ /* Replace the current task. */
+ t->type = task_type_none;
+
+ for (int j = 0; j < 8; j++)
+ if (ci->progeny[j] != NULL && ci->progeny[j]->hydro.count)
+ for (int k = 0; k < 8; k++)
+ if (cj->progeny[k] != NULL && cj->progeny[k]->hydro.count) {
+ struct task *tl =
+ scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
+ ci->progeny[j], cj->progeny[k]);
+ scheduler_splittask_hydro(tl, s);
+ tl->flags = space_getsid_and_swap_cells(s->space, &t->ci,
+ &t->cj, shift);
+ }
+ }
+ } /* pair interaction? */
+ } /* iterate over the current task. */
+}
+
+/**
+ * @brief Split a hydrodynamic GPU pack task if too large.
+ *
+ * @param t The #task
+ * @param s The #scheduler we are working in.
+ */
+static void scheduler_splittask_hydro_GPU(struct task *t, struct scheduler *s) {
+ /* Are we considering both stars and hydro when splitting? */
+ /* Note this is not very clean as the scheduler should not really
+ access the engine... */
+ const int with_feedback = (s->space->e->policy & engine_policy_feedback);
+ const int with_stars = (s->space->e->policy & engine_policy_stars);
+ const int with_sinks = (s->space->e->policy & engine_policy_sinks);
+ const int with_black_holes =
+ (s->space->e->policy & engine_policy_black_holes);
+
/* Iterate on this task until we're done with it. */
int redo = 1;
while (redo) {
@@ -1362,8 +1599,6 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) {
/* Otherwise, break it up if it is too large? */
} else if (scheduler_doforcesplit && ci->split && cj->split &&
(ci->hydro.count > space_maxsize / cj->hydro.count)) {
- // message( "force splitting pair with %i and %i parts." ,
- // ci->hydro.count , cj->hydro.count );
/* Replace the current task. */
t->type = task_type_none;
@@ -1651,6 +1886,19 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements,
scheduler_splittask_gravity(t, s);
} else if (t->subtype == task_subtype_grav) {
scheduler_splittask_gravity(t, s);
+      // A. Nasar: GPU pack tasks are split with their own routine
+ } else if (t->subtype == task_subtype_gpu_pack_d ||
+ t->subtype == task_subtype_gpu_pack_g ||
+ t->subtype == task_subtype_gpu_pack_f) {
+ scheduler_splittask_hydro_GPU(t, s);
+ } else if (t->subtype == task_subtype_gpu_unpack_d ||
+ t->subtype == task_subtype_gpu_unpack_g ||
+ t->subtype == task_subtype_gpu_unpack_f) {
+      /* Do nothing and grab the next task to split.
+       * These tasks are cell-less so cannot be split.
+       * This if statement can be removed if we decide to split
+       * before creating the unpack tasks. */
+ continue;
} else {
#ifdef SWIFT_DEBUG_CHECKS
error("Unexpected task sub-type %s/%s", taskID_names[t->type],
@@ -1740,6 +1988,8 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
t->tic = 0;
t->toc = 0;
t->total_ticks = 0;
+ t->total_cpu_pack_ticks = 0;
+ t->total_cpu_unpack_ticks = 0;
#ifdef SWIFT_DEBUG_CHECKS
t->activated_by_unskip = 0;
t->activated_by_marktask = 0;
@@ -1748,6 +1998,26 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
if (ci != NULL) cell_set_flag(ci, cell_flag_has_tasks);
if (cj != NULL) cell_set_flag(cj, cell_flag_has_tasks);
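+  /* Keep count of the GPU pack tasks of each flavour as they are created. */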
+ // #ifdef WITH_CUDA A. Nasar
+ if (t->subtype == task_subtype_gpu_pack_d) {
+ if (t->type == task_type_self || t->type == task_type_sub_self)
+ atomic_inc(&s->nr_self_pack_tasks_d);
+ if (t->type == task_type_pair || t->type == task_type_sub_pair)
+ atomic_inc(&s->nr_pair_pack_tasks_d);
+ }
+ if (t->subtype == task_subtype_gpu_pack_f) {
+ if (t->type == task_type_self || t->type == task_type_sub_self)
+ atomic_inc(&s->nr_self_pack_tasks_f);
+ if (t->type == task_type_pair || t->type == task_type_sub_pair)
+ atomic_inc(&s->nr_pair_pack_tasks_f);
+ }
+ if (t->subtype == task_subtype_gpu_pack_g) {
+ if (t->type == task_type_self || t->type == task_type_sub_self)
+ atomic_inc(&s->nr_self_pack_tasks_g);
+ if (t->type == task_type_pair || t->type == task_type_sub_pair)
+ atomic_inc(&s->nr_pair_pack_tasks_g);
+ }
+ // #endif
/* Add an index for it. */
// lock_lock( &s->lock );
s->tasks_ind[atomic_inc(&s->nr_tasks)] = ind;
@@ -1833,6 +2103,13 @@ void scheduler_set_unlocks(struct scheduler *s) {
struct task *t = &s->tasks[k];
for (int i = 0; i < t->nr_unlock_tasks; i++) {
for (int j = i + 1; j < t->nr_unlock_tasks; j++) {
+        /* Fix for the case where one unpack task works on the same cell
+         * connected to two pair pack tasks. */
+ if (t->subtype == task_subtype_gpu_unpack_d ||
+ t->subtype == task_subtype_gpu_unpack_g ||
+ t->subtype == task_subtype_gpu_unpack_f) {
+ continue;
+ }
if (t->unlock_tasks[i] == t->unlock_tasks[j])
error("duplicate unlock! t->type=%s/%s unlocking type=%s/%s",
taskID_names[t->type], subtaskID_names[t->subtype],
@@ -1940,13 +2217,20 @@ void scheduler_reset(struct scheduler *s, int size) {
/* Reset the counters. */
s->size = size;
s->nr_tasks = 0;
+ s->nr_self_pack_tasks_d = 0; // A. Nasar
+ s->nr_pair_pack_tasks_d = 0;
+ s->nr_self_pack_tasks_f = 0;
+ s->nr_pair_pack_tasks_f = 0;
+ s->nr_self_pack_tasks_g = 0;
+ s->nr_pair_pack_tasks_g = 0;
s->tasks_next = 0;
s->waiting = 0;
s->nr_unlocks = 0;
s->completed_unlock_writes = 0;
s->active_count = 0;
s->total_ticks = 0;
-
+ s->pack_size = N_TASKS_PER_PACK_SELF;
+ s->pack_size_pair = N_TASKS_PER_PACK_PAIR;
/* Set the task pointers in the queues. */
for (int k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks;
}
@@ -2007,6 +2291,24 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
cost = 1.f * (wscale * gcount_i) * gcount_i;
} else if (t->subtype == task_subtype_external_grav)
cost = 1.f * wscale * gcount_i;
+ else if (t->subtype == task_subtype_gpu_pack_d) // A. Nasar
+ cost = 1.f * (wscale * count_i * count_i); // * s->pack_size;
+ else if (t->subtype == task_subtype_gpu_pack_f)
+ cost = 1.f * (wscale * count_i * count_i); // * s->pack_size;
+ else if (t->subtype == task_subtype_gpu_pack_g)
+ cost = 1.f * (wscale * count_i * count_i); // * s->pack_size;
+ else if (t->subtype == task_subtype_gpu_unpack_d)
+ //cost = wscale * s->pack_size;
+ cost = (wscale * count_i) * count_i * s->pack_size;
+ // cost = 1.f * wscale * s->pack_size;
+ else if (t->subtype == task_subtype_gpu_unpack_f)
+ cost = (wscale * count_i) * count_i * s->pack_size;
+// cost = wscale * s->pack_size;
+// cost = 1.f * wscale * s->pack_size;
+ else if (t->subtype == task_subtype_gpu_unpack_g)
+ cost = (wscale * count_i) * count_i * s->pack_size;
+// cost = wscale * s->pack_size;
+// cost = 1.f * wscale * s->pack_size;
else if (t->subtype == task_subtype_stars_density ||
t->subtype == task_subtype_stars_prep1 ||
t->subtype == task_subtype_stars_prep2 ||
@@ -2045,7 +2347,36 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
cost = 3.f * (wscale * gcount_i) * gcount_j;
else
cost = 2.f * (wscale * gcount_i) * gcount_j;
-
+      // Abouzied / A. Nasar: TODO revisit these GPU pack-task cost estimates
+ } else if (t->subtype == task_subtype_gpu_pack_d) {
+ // cost = 2.f * (wscale * count_i) * count_i;
+ if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID)
+ cost = 3.f * (wscale * count_i * count_i);
+ else
+ cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags];
+ } else if (t->subtype == task_subtype_gpu_pack_f) {
+// cost = 2.f * (wscale * count_i) * count_i;
+ if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID)
+ cost = 3.f * (wscale * count_i * count_i) * sid_scale[t->flags];
+ else
+ cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags];
+
+ } else if (t->subtype == task_subtype_gpu_pack_g) {
+ if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID)
+ cost = 3.f * (wscale * count_i * count_i) * sid_scale[t->flags];
+ else
+ cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags];
+
+// cost = 2.f * (wscale * count_i) * count_i;
+ } else if (t->subtype == task_subtype_gpu_unpack_d) {
+ cost = (wscale * count_i) * count_i * s->pack_size;
+ //cost = 1.f * wscale;
+ } else if (t->subtype == task_subtype_gpu_unpack_f) {
+ cost = (wscale * count_i) * count_i * s->pack_size;
+ //cost = 1.f * wscale;
+ } else if (t->subtype == task_subtype_gpu_unpack_g) {
+ cost = (wscale * count_i) * count_i * s->pack_size;
+ //cost = 1.f * wscale;
} else if (t->subtype == task_subtype_stars_density ||
t->subtype == task_subtype_stars_prep1 ||
t->subtype == task_subtype_stars_prep2 ||
@@ -2177,7 +2508,21 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
} else if (t->subtype == task_subtype_do_bh_swallow) {
cost = 1.f * wscale * (bcount_i + bcount_j);
-
+ } else if (t->subtype == task_subtype_gpu_pack_d) {
+ cost = 2.f * (wscale * count_i) * count_i;
+ } else if (t->subtype == task_subtype_gpu_pack_f) {
+ cost = 2.f * (wscale * count_i) * count_i;
+ } else if (t->subtype == task_subtype_gpu_pack_g) {
+ cost = 2.f * (wscale * count_i) * count_i;
+ } else if (t->subtype == task_subtype_gpu_unpack_d) {
+ cost = (wscale * count_i) * count_i * s->pack_size;
+ //cost = 1.f * wscale;
+ } else if (t->subtype == task_subtype_gpu_unpack_f) {
+ cost = (wscale * count_i) * count_i * s->pack_size;
+ //cost = 1.f * wscale;
+ } else if (t->subtype == task_subtype_gpu_unpack_g) {
+ cost = (wscale * count_i) * count_i * s->pack_size;
+ //cost = 1.f * wscale;
} else if (t->subtype == task_subtype_density ||
t->subtype == task_subtype_gradient ||
t->subtype == task_subtype_force ||
@@ -2216,10 +2561,25 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
cost = 1.f * wscale * count_i;
} else if (t->subtype == task_subtype_do_bh_swallow) {
cost = 1.f * wscale * bcount_i;
- } else if (t->subtype == task_subtype_density ||
- t->subtype == task_subtype_gradient ||
- t->subtype == task_subtype_force ||
- t->subtype == task_subtype_limiter) {
+ } else if (t->subtype == task_subtype_gpu_pack_d) // A. Nasar
+ cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size;
+ else if (t->subtype == task_subtype_gpu_pack_f)
+ cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size;
+ else if (t->subtype == task_subtype_gpu_pack_g)
+ cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size;
+ else if (t->subtype == task_subtype_gpu_unpack_d)
+ cost = (wscale * count_i) * count_i * s->pack_size;
+ //cost = 1.f * wscale * s->pack_size;
+ else if (t->subtype == task_subtype_gpu_unpack_f)
+ cost = (wscale * count_i) * count_i * s->pack_size;
+ //cost = 1.f * wscale * s->pack_size;
+ else if (t->subtype == task_subtype_gpu_unpack_g)
+ cost = (wscale * count_i) * count_i * s->pack_size;
+ //cost = 1.f * wscale * s->pack_size;
+ else if (t->subtype == task_subtype_density ||
+ t->subtype == task_subtype_gradient ||
+ t->subtype == task_subtype_force ||
+ t->subtype == task_subtype_limiter) {
cost = 1.f * (wscale * count_i) * count_i;
} else if (t->subtype == task_subtype_rt_gradient) {
cost = 1.f * wscale * scount_i * count_i;
@@ -2231,10 +2591,10 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
}
break;
case task_type_ghost:
- if (t->ci == t->ci->hydro.super) cost = wscale * count_i;
+ if (t->ci == t->ci->hydro.super) cost = wscale * count_i * count_i;
break;
case task_type_extra_ghost:
- if (t->ci == t->ci->hydro.super) cost = wscale * count_i;
+ if (t->ci == t->ci->hydro.super) cost = wscale * count_i * count_i;
break;
case task_type_stars_ghost:
if (t->ci == t->ci->hydro.super) cost = wscale * scount_i;
@@ -2246,7 +2606,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
if (t->ci == t->ci->hydro.super) cost = wscale * bcount_i;
break;
case task_type_drift_part:
- cost = wscale * count_i;
+ cost = wscale * count_i * count_i;
break;
case task_type_drift_gpart:
cost = wscale * gcount_i;
@@ -2273,7 +2633,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
cost = wscale * (gcount_i + gcount_j);
break;
case task_type_end_hydro_force:
- cost = wscale * count_i;
+ cost = wscale * count_i * count_i;
break;
case task_type_end_grav_force:
cost = wscale * gcount_i;
@@ -2309,15 +2669,15 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
break;
case task_type_kick1:
cost =
- wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i);
+ wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i) * count_i;
break;
case task_type_kick2:
cost =
- wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i);
+ wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i) * count_i;
break;
case task_type_timestep:
cost =
- wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i);
+ wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i) * count_i;
break;
case task_type_timestep_limiter:
cost = wscale * count_i;
@@ -2374,6 +2734,27 @@ void scheduler_rewait_mapper(void *map_data, int num_elements,
/* Increment the task's own wait counter for the enqueueing. */
atomic_inc(&t->wait);
+ t->done = 0;
+ t->gpu_done = 0;
+
+ // if (t->type == task_type_self){ // A. Nasar increment number of
+ // waiting tasks
+ // if(t->subtype == task_subtype_gpu_pack_d)
+ // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left);
+ // if (t->subtype == task_subtype_gpu_pack_f)
+ // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_f);
+ // if (t->subtype == task_subtype_gpu_pack_g)
+ // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_g);
+ // }
+ //
+ // if (t->type == task_type_pair){
+ // if(t->subtype == task_subtype_gpu_pack_d)
+ // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left);
+ // if (t->subtype == task_subtype_gpu_pack_f)
+ // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_f);
+ // if (t->subtype == task_subtype_gpu_pack_g)
+ // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_g);
+ // }
#ifdef SWIFT_DEBUG_CHECKS
/* Check that we don't have more waits that what can be stored. */
@@ -2411,7 +2792,26 @@ void scheduler_enqueue_mapper(void *map_data, int num_elements,
* @param s The #scheduler.
*/
void scheduler_start(struct scheduler *s) {
-
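+  /* Reset the per-queue GPU pack-task counters before the tasks are
+   * re-waited and enqueued. */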
+ for (int i = 0; i < s->nr_queues; i++) { // A. Nasar
+ s->queues[i].n_packs_self_left_d = 0;
+ s->queues[i].n_packs_pair_left_d = 0;
+ s->queues[i].n_packs_self_left_f = 0;
+ s->queues[i].n_packs_pair_left_f = 0;
+ s->queues[i].n_packs_self_left_g = 0;
+ s->queues[i].n_packs_pair_left_g = 0;
+ s->queues[i].n_packs_self_stolen_d = 0;
+ s->queues[i].n_packs_pair_stolen_d = 0;
+ s->queues[i].n_packs_self_stolen_f = 0;
+ s->queues[i].n_packs_pair_stolen_f = 0;
+ s->queues[i].n_packs_self_stolen_g = 0;
+ s->queues[i].n_packs_pair_stolen_g = 0;
+ s->s_d_left[i] = 0;
+ s->s_g_left[i] = 0;
+ s->s_f_left[i] = 0;
+ s->p_d_left[i] = 0;
+ s->p_g_left[i] = 0;
+ s->p_f_left[i] = 0;
+ }
/* Re-wait the tasks. */
if (s->active_count > 1000) {
threadpool_map(s->threadpool, scheduler_rewait_mapper, s->tid_active,
@@ -2487,6 +2887,21 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
t->subtype == task_subtype_external_grav) {
qid = t->ci->grav.super->owner;
owner = &t->ci->grav.super->owner;
+ } else if (t->subtype == task_subtype_gpu_pack_d) { // A. Nasar
+ qid = t->ci->hydro.super->owner;
+ owner = &t->ci->hydro.super->owner;
+ } else if (t->subtype == task_subtype_gpu_pack_f) {
+ qid = t->ci->hydro.super->owner;
+ owner = &t->ci->hydro.super->owner;
+ } else if (t->subtype == task_subtype_gpu_pack_g) {
+ qid = t->ci->hydro.super->owner;
+ owner = &t->ci->hydro.super->owner;
+ } else if (t->subtype == task_subtype_gpu_unpack_d) {
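+      /* GPU unpack tasks are not pinned to a particular cell owner's queue. */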
+ qid = -1;
+ } else if (t->subtype == task_subtype_gpu_unpack_f) {
+ qid = -1;
+ } else if (t->subtype == task_subtype_gpu_unpack_g) {
+ qid = -1;
} else {
qid = t->ci->hydro.super->owner;
owner = &t->ci->hydro.super->owner;
@@ -2513,13 +2928,19 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
break;
case task_type_pair:
case task_type_sub_pair:
- qid = t->ci->super->owner;
- owner = &t->ci->super->owner;
- if ((qid < 0) ||
- ((t->cj->super->owner > -1) &&
- (s->queues[qid].count > s->queues[t->cj->super->owner].count))) {
- qid = t->cj->super->owner;
- owner = &t->cj->super->owner;
+ if (t->subtype == task_subtype_gpu_unpack_d ||
+ t->subtype == task_subtype_gpu_unpack_f ||
+ t->subtype == task_subtype_gpu_unpack_g) {
+ qid = -1;
+ } else {
+ qid = t->ci->super->owner;
+ owner = &t->ci->super->owner;
+ if ((qid < 0) ||
+ ((t->cj->super->owner > -1) &&
+ (s->queues[qid].count > s->queues[t->cj->super->owner].count))) {
+ qid = t->cj->super->owner;
+ owner = &t->cj->super->owner;
+ }
}
break;
case task_type_recv:
@@ -2729,12 +3150,83 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
/* Save qid as owner for next time a task accesses this cell. */
if (owner != NULL) *owner = qid;
-
+// if (t->type == task_type_self || t->type == task_type_sub_self) {
+// if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0) {
+// return;
+// }
+// if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0) {
+// return;
+// }
+// if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0) {
+// return;
+// }
+// }
+// /* A. Nasar NEED to think about how to do this with
+// MPI where ci may not be on this node/rank */
+// if (t->type == task_type_pair || t->type == task_type_sub_pair) {
+// if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) {
+// return;
+// }
+// if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) {
+// return;
+// }
+// if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) {
+// return;
+// }
+// }
/* Increase the waiting counter. */
atomic_inc(&s->waiting);
-
/* Insert the task into that queue. */
queue_insert(&s->queues[qid], t);
+ /* A. Nasar: Increment counters required for the pack tasks */
+ if (t->type == task_type_self || t->type == task_type_sub_self) {
+ if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0) {
+ lock_lock(&s->queues[qid].lock);
+ s->queues[qid].n_packs_self_left_d++;
+ if (lock_unlock(&s->queues[qid].lock) != 0)
+ error("Error unlocking queue");
+ atomic_inc(&s->s_d_left[qid]);
+ }
+ if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0) {
+ lock_lock(&s->queues[qid].lock);
+ s->queues[qid].n_packs_self_left_f++;
+ if (lock_unlock(&s->queues[qid].lock) != 0)
+ error("Error unlocking queue");
+ atomic_inc(&s->s_f_left[qid]);
+ }
+ if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0) {
+ lock_lock(&s->queues[qid].lock);
+ s->queues[qid].n_packs_self_left_g++;
+ if (lock_unlock(&s->queues[qid].lock) != 0)
+ error("Error unlocking queue");
+ atomic_inc(&s->s_g_left[qid]);
+ }
+ }
+ /* A. Nasar NEED to think about how to do this with
+ MPI where ci may not be on this node/rank */
+ if (t->type == task_type_pair || t->type == task_type_sub_pair) {
+ if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) {
+ lock_lock(&s->queues[qid].lock);
+ s->queues[qid].n_packs_pair_left_d++;
+ if (lock_unlock(&s->queues[qid].lock) != 0)
+ error("Error unlocking queue");
+ atomic_inc(&s->p_d_left[qid]);
+ }
+ if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) {
+ lock_lock(&s->queues[qid].lock);
+ s->queues[qid].n_packs_pair_left_f++;
+ if (lock_unlock(&s->queues[qid].lock) != 0)
+ error("Error unlocking queue");
+ atomic_inc(&s->p_f_left[qid]);
+ }
+ if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) {
+ lock_lock(&s->queues[qid].lock);
+ s->queues[qid].n_packs_pair_left_g++;
+ if (lock_unlock(&s->queues[qid].lock) != 0)
+ error("Error unlocking queue");
+ atomic_inc(&s->p_g_left[qid]);
+ }
+ }
}
}
@@ -2778,12 +3270,48 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) {
/* Mark the task as skip. */
t->skip = 1;
+ t->done = 1;
+
/* Return the next best task. Note that we currently do not
implement anything that does this, as getting it to respect
priorities is too tricky and currently unnecessary. */
return NULL;
}
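+
+/**
+ * @brief Wake any runners sleeping on the scheduler's condition variable.
+ *
+ * Subtracts the number of packed tasks from the scheduler's waiting counter
+ * and broadcasts on the sleep condition (skipped for implicit tasks).
+ *
+ * @param s The #scheduler.
+ * @param t The #task whose completion triggers the signal.
+ * @param tasks_packed The number of packed tasks to remove from the waiting
+ *        count.
+ */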
+struct task *signal_sleeping_runners(struct scheduler *s, struct task *t,
+ int tasks_packed) {
+ /* Mark the task as skip. */
+ // t->skip = 1;
+
+ /* Task definitely done, signal any sleeping runners. */
+ if (!t->implicit) {
+ pthread_mutex_lock(&s->sleep_mutex);
+ atomic_sub(&s->waiting, tasks_packed);
+ pthread_cond_broadcast(&s->sleep_cond);
+ pthread_mutex_unlock(&s->sleep_mutex);
+ }
+ return NULL;
+}
+
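+/**
+ * @brief Decrement the wait counters of the tasks unlocked by a task and
+ *        enqueue those that become ready, without marking the task itself
+ *        as done.
+ *
+ * @param s The #scheduler.
+ * @param t The #task whose unlock list should be processed.
+ */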
+struct task *enqueue_dependencies(struct scheduler *s, struct task *t) {
+
+ /* Loop through the dependencies and add them to a queue if
+ they are ready. */
+ for (int k = 0; k < t->nr_unlock_tasks; k++) {
+ struct task *t2 = t->unlock_tasks[k];
+ if (t2->skip) continue;
+
+ const int res = atomic_dec(&t2->wait);
+ if (res < 1) {
+ error("Negative wait!");
+ } else if (res == 1) {
+ scheduler_enqueue(s, t2);
+ }
+ }
+
+ return NULL;
+}
+
/**
* @brief Resolve a single dependency by hand.
*
@@ -2911,10 +3439,12 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
/* Check qid. */
if (qid >= nr_queues || qid < 0) error("Bad queue ID.");
+ /*Get a pointer to our queue for re-use*/
+ struct queue *q = &s->queues[qid];
/* Loop as long as there are tasks... */
while (s->waiting > 0 && res == NULL) {
/* Try more than once before sleeping. */
- for (int tries = 0; res == NULL && s->waiting && tries < scheduler_maxtries;
+ for (int tries = 0; res == NULL && s->waiting && tries < scheduler_maxtries * 100;
tries++) {
/* Try to get a task from the suggested queue. */
if (s->queues[qid].count > 0 || s->queues[qid].count_incoming > 0) {
@@ -2926,21 +3456,109 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
/* If unsuccessful, try stealing from the other queues. */
if (s->flags & scheduler_flag_steal) {
+
int count = 0, qids[nr_queues];
- for (int k = 0; k < nr_queues; k++)
+
+ /* Make list of queues that have 1 or more tasks in them */
+ for (int k = 0; k < nr_queues; k++) {
+ if (k == qid) continue;
if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) {
qids[count++] = k;
}
+ }
+
for (int k = 0; k < scheduler_maxsteal && count > 0; k++) {
+
+ /* Pick a queue at random among the non-empty ones */
const int ind = rand_r(&seed) % count;
- TIMER_TIC
- res = queue_gettask(&s->queues[qids[ind]], prev, 0);
+              /* The index of the queue we will try to steal from */
+ int qstl_id = qids[ind];
+
+ /* If we got the queue we already have, abort */
+ if (qid == qstl_id) {
+ /* Reduce the size of the list of non-empty queues */
+ qids[ind] = qids[--count];
+ continue;
+ }
+
+ /* The queue we are stealing from */
+ struct queue *q_stl = &s->queues[qstl_id];
+
+ /* Can we lock our own queue? */
+ if (lock_trylock(&q->lock) != 0) {
+
+ /* No --> continue and try a different queue */
+ continue;
+
+ } else {
+
+            /* Yes --> Try locking the queue we steal from */
+ if (lock_trylock(&q_stl->lock) != 0) {
+
+ /* Failed? --> Unlock the 1st queue and
+ try again */
+ if (lock_unlock(&q->lock) != 0)
+ error("Unlocking our queue failed");
+ continue;
+ }
+ }
+
+ /* We now have locked q and q_stl */
+
+ /* Try to get a task from that random queue */
+ TIMER_TIC;
+ res = queue_gettask(q_stl, prev, 0);
TIMER_TOC(timer_qsteal);
+
+ /* Lucky? i.e. did we actually get a task? */
if (res != NULL) {
+
+ /*A.Nasar: Get task type*/
+ enum task_types type = res->type;
+ enum task_subtypes subtype = res->subtype;
+
+                /* Adjust the pack-task counters of the robbed and robbing queues */
+ if ((type == task_type_self || type == task_type_sub_self) &&
+ subtype == task_subtype_gpu_pack_d) {
+ q->n_packs_self_left_d--;
+ q_stl->n_packs_self_left_d--;
+ }
+ if ((type == task_type_self || type == task_type_sub_self) &&
+ subtype == task_subtype_gpu_pack_g) {
+ q->n_packs_self_left_g--;
+ q_stl->n_packs_self_left_g--;
+ }
+ if ((type == task_type_self || type == task_type_sub_self) &&
+ subtype == task_subtype_gpu_pack_f) {
+ q->n_packs_self_left_f--;
+ q_stl->n_packs_self_left_f--;
+ }
+ if ((type == task_type_pair || type == task_type_sub_pair) &&
+ subtype == task_subtype_gpu_pack_d) {
+ q->n_packs_pair_left_d--;
+ q_stl->n_packs_pair_left_d--;
+ }
+ if ((type == task_type_pair || type == task_type_sub_pair) &&
+ subtype == task_subtype_gpu_pack_g) {
+ q->n_packs_pair_left_g--;
+ q_stl->n_packs_pair_left_g--;
+ }
+ if ((type == task_type_pair || type == task_type_sub_pair) &&
+ subtype == task_subtype_gpu_pack_f) {
+ q->n_packs_pair_left_f--;
+ q_stl->n_packs_pair_left_f--;
+ }
+                /* Release both locks before running with the task */
+                if (lock_unlock(&q->lock) != 0)
+                  error("Unlocking our queue failed");
+                if (lock_unlock(&q_stl->lock) != 0)
+                  error("Unlocking the stealing queue failed");
+                break;
} else {
+
+ /* Reduce the size of the list of non-empty queues */
qids[ind] = qids[--count];
}
+
+ if (lock_unlock(&q->lock) != 0) error("Unlocking our queue failed");
+ if (lock_unlock(&q_stl->lock) != 0)
+ error("Unlocking the stealing queue failed");
}
if (res != NULL) break;
}
@@ -2956,6 +3574,11 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
pthread_mutex_lock(&s->sleep_mutex);
res = queue_gettask(&s->queues[qid], prev, 1);
if (res == NULL && s->waiting > 0) {
+ // struct queue qq = s->queues[qid];
+ // message("s->waiting %i self_stolen %i, self_left %i, pair_stolen
+ // %i, pair_left %i", s->waiting,
+ // qq.n_packs_self_stolen_f, qq.n_packs_self_left_f,
+ // qq.n_packs_pair_stolen_f, qq.n_packs_pair_left_f);
pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
}
pthread_mutex_unlock(&s->sleep_mutex);
@@ -3002,6 +3625,16 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
/* Initialize each queue. */
for (int k = 0; k < nr_queues; k++) queue_init(&s->queues[k], NULL);
+  /* Allocate the per-queue counters of GPU pack tasks left to run. */
+  s->s_d_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->s_g_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->s_f_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->p_d_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->p_g_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->p_f_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+
/* Init the sleep mutex and cond. */
if (pthread_cond_init(&s->sleep_cond, NULL) != 0 ||
pthread_mutex_init(&s->sleep_mutex, NULL) != 0)
@@ -3090,6 +3723,13 @@ void scheduler_free_tasks(struct scheduler *s) {
}
s->size = 0;
s->nr_tasks = 0;
+ // reset GPU task counters too
+ s->nr_self_pack_tasks_d = 0;
+ s->nr_self_pack_tasks_f = 0;
+ s->nr_self_pack_tasks_g = 0;
+ s->nr_pair_pack_tasks_d = 0;
+ s->nr_pair_pack_tasks_f = 0;
+ s->nr_pair_pack_tasks_g = 0;
}
/**
@@ -3207,6 +3847,19 @@ void scheduler_report_task_times_mapper(void *map_data, int num_elements,
const float total_time = clocks_from_ticks(t->total_ticks);
const enum task_categories cat = task_get_category(t);
time_local[cat] += total_time;
+
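+    /* For GPU pack tasks, move the CPU time spent packing and unpacking out
+     * of the GPU category and into the dedicated pack/unpack categories. */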
+ if (t->subtype == task_subtype_gpu_pack_d ||
+ t->subtype == task_subtype_gpu_pack_f ||
+ t->subtype == task_subtype_gpu_pack_g) {
+ time_local[task_category_gpu_pack] +=
+ clocks_from_ticks(t->total_cpu_pack_ticks);
+ time_local[task_category_gpu] -=
+ clocks_from_ticks(t->total_cpu_pack_ticks);
+ time_local[task_category_gpu] -=
+ clocks_from_ticks(t->total_cpu_unpack_ticks);
+ time_local[task_category_gpu_unpack] +=
+ clocks_from_ticks(t->total_cpu_unpack_ticks);
+ }
}
/* Update the global counters */
diff --git a/src/scheduler.h b/src/scheduler.h
index 6ea7b41d58..b7f8b9f2ad 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -60,6 +60,35 @@ extern int activate_by_unskip;
/* Data of a scheduler. */
struct scheduler {
+
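+  /* Numbers of GPU pack tasks of each flavour that have been completed. */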
+ int nr_packs_self_dens_done; // A. Nasar
+ int nr_packs_pair_dens_done;
+ int nr_packs_self_forc_done;
+ int nr_packs_pair_forc_done;
+ int nr_packs_self_grad_done;
+ int nr_packs_pair_grad_done;
+
+ volatile int *s_d_left;
+ volatile int *s_g_left;
+ volatile int *s_f_left;
+ volatile int *p_d_left;
+ volatile int *p_g_left;
+ volatile int *p_f_left;
+ /* Actual number of GPU tasks. */
+ int nr_gpu_tasks;
+ /* Number of tasks we want*/
+ int target_gpu_tasks;
+ /* Actual number of density pack tasks. */
+ int nr_self_pack_tasks_d, nr_pair_pack_tasks_d;
+ /* Actual number of force pack tasks. */
+ int nr_self_pack_tasks_f, nr_pair_pack_tasks_f;
+ /* Actual number of gradient pack tasks. */
+ int nr_self_pack_tasks_g, nr_pair_pack_tasks_g;
+
+  /* How many tasks we want to try to work on at once on the GPU. */
+ int pack_size;
+ int pack_size_pair;
+
/* Scheduler flags. */
unsigned int flags;
@@ -323,5 +352,8 @@ void scheduler_write_task_level(const struct scheduler *s, int step);
void scheduler_dump_queues(struct engine *e);
void scheduler_report_task_times(const struct scheduler *s,
const int nr_threads);
+struct task *enqueue_dependencies(struct scheduler *s, struct task *t);
+struct task *signal_sleeping_runners(struct scheduler *s, struct task *t,
+ int tasks_packed);
#endif /* SWIFT_SCHEDULER_H */
diff --git a/src/space.h b/src/space.h
index 4e0e849d64..a5358c913c 100644
--- a/src/space.h
+++ b/src/space.h
@@ -48,7 +48,7 @@ struct hydro_props;
#define space_cellallocchunk 1000
#define space_splitsize_default 400
#define space_maxsize_default 8000000
-#define space_grid_split_threshold_default 400
+#define space_grid_split_threshold_default 100
#define space_extra_parts_default 0
#define space_extra_gparts_default 0
#define space_extra_sparts_default 100
@@ -94,6 +94,9 @@ extern double engine_foreign_alloc_margin;
*/
struct space {
+  /*! Copy of eta_neighbours, used to define GPU task memory allocation. */
+ float eta_neighbours;
+
/*! Spatial extent. */
double dim[3];
diff --git a/src/space_getsid.h b/src/space_getsid.h
index df81615d3c..f5e0101d30 100644
--- a/src/space_getsid.h
+++ b/src/space_getsid.h
@@ -46,7 +46,6 @@
__attribute__((always_inline, nonnull)) INLINE static int
space_getsid_and_swap_cells(const struct space *s, struct cell **ci,
struct cell **cj, double shift[3]) {
-
/* Get the relative distance between the pairs, wrapping. */
const int periodic = s->periodic;
double dx[3];
@@ -79,4 +78,89 @@ space_getsid_and_swap_cells(const struct space *s, struct cell **ci,
return sid;
}
+/* A. Nasar: same as space_getsid_and_swap_cells(), but returns the periodic
+ * shift as three separate components; only used when packing GPU cells. */
+__attribute__((always_inline, nonnull)) INLINE static int
+space_getsid_GPU(const struct space *s, struct cell **ci, struct cell **cj,
+ double *shift_x, double *shift_y, double *shift_z) {
+ /* Get the relative distance between the pairs, wrapping. */
+ const int periodic = s->periodic;
+ double dx[3];
+ for (int k = 0; k < 3; k++) dx[k] = (*cj)->loc[k] - (*ci)->loc[k];
+
+ if (periodic && dx[0] < -s->dim[0] / 2)
+ *(shift_x) = s->dim[0];
+ else if (periodic && dx[0] > s->dim[0] / 2)
+ *(shift_x) = -s->dim[0];
+ else
+ *(shift_x) = 0.0;
+
+ dx[0] += *(shift_x);
+
+ if (periodic && dx[1] < -s->dim[1] / 2)
+ *(shift_y) = s->dim[1];
+ else if (periodic && dx[1] > s->dim[1] / 2)
+ *(shift_y) = -s->dim[1];
+ else
+ *(shift_y) = 0.0;
+
+ dx[1] += *(shift_y);
+
+ if (periodic && dx[2] < -s->dim[2] / 2)
+ *(shift_z) = s->dim[2];
+ else if (periodic && dx[2] > s->dim[2] / 2)
+ *(shift_z) = -s->dim[2];
+ else
+ *(shift_z) = 0.0;
+
+ dx[2] += *(shift_z);
+
+ /* Get the sorting index. */
+ int sid = 0;
+ for (int k = 0; k < 3; k++)
+ sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1));
+
+ /* Switch the cells around? */
+ if (runner_flip[sid]) {
+ struct cell *temp = *ci;
+ *ci = *cj;
+ *cj = temp;
+ *(shift_x) = -*(shift_x);
+ *(shift_y) = -*(shift_y);
+ *(shift_z) = -*(shift_z);
+ }
+ sid = sortlistID[sid];
+
+ /* Return the sort ID. */
+ return sid;
+}
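+
+/* A minimal usage sketch (hypothetical caller, not part of this patch):
+ *
+ *   double shift_x, shift_y, shift_z;
+ *   const int sid = space_getsid_GPU(s, &ci, &cj, &shift_x, &shift_y,
+ *                                    &shift_z);
+ *
+ * ci and cj may have been swapped on return; shift_x/y/z hold the periodic
+ * wrapping shift for the returned pair ordering. */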
+
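+/* Variant of space_getsid_and_swap_cells() that computes the sort ID and
+ * periodic shift but never swaps the two cells. */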
+__attribute__((always_inline, nonnull)) INLINE static int space_getsid_filter(
+ const struct space *s, struct cell **ci, struct cell **cj,
+ double shift[3]) {
+
+ /* Get the relative distance between the pairs, wrapping. */
+ const int periodic = s->periodic;
+ double dx[3];
+ for (int k = 0; k < 3; k++) {
+ dx[k] = (*cj)->loc[k] - (*ci)->loc[k];
+ if (periodic && dx[k] < -s->dim[k] / 2)
+ shift[k] = s->dim[k];
+ else if (periodic && dx[k] > s->dim[k] / 2)
+ shift[k] = -s->dim[k];
+ else
+ shift[k] = 0.0;
+ dx[k] += shift[k];
+ }
+
+ /* Get the sorting index. */
+ int sid = 0;
+ for (int k = 0; k < 3; k++)
+ sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1));
+
+ sid = sortlistID[sid];
+
+ /* Return the sort ID. */
+ return sid;
+}
+
#endif /* SWIFT_SPACE_GETSID_H */
diff --git a/src/space_recycle.c b/src/space_recycle.c
index cf84227302..0b915ac7a2 100644
--- a/src/space_recycle.c
+++ b/src/space_recycle.c
@@ -232,6 +232,12 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements,
c->mpi.recv = NULL;
c->mpi.send = NULL;
#endif
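+    /* Reset the GPU pack/unpack task pointers. */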
+ c->hydro.density_pack = NULL; // A. Nasar
+ c->hydro.density_unpack = NULL;
+ c->hydro.gradient_pack = NULL;
+ c->hydro.gradient_unpack = NULL;
+ c->hydro.force_pack = NULL;
+ c->hydro.force_unpack = NULL;
}
}
diff --git a/src/task.c b/src/task.c
index 3b504a79e6..cbe9547e9d 100644
--- a/src/task.c
+++ b/src/task.c
@@ -164,15 +164,22 @@ const char *subtaskID_names[task_subtype_count] = {
"sink_do_gas_swallow",
"rt_gradient",
"rt_transport",
+ "gpu_pack", // A. Nasar
+ "gpu_pack_g",
+ "gpu_pack_f",
+ "gpu_unpack",
+ "gpu_unpack_g",
+ "gpu_unpack_f",
};
const char *task_category_names[task_category_count] = {
- "drift", "sorts", "resort",
- "hydro", "gravity", "feedback",
- "black holes", "cooling", "star formation",
- "limiter", "sync", "time integration",
- "mpi", "pack", "fof",
- "others", "neutrino", "sink",
+ "drift", "sorts", "resort",
+ "hydro", "gravity", "feedback",
+ "black holes", "cooling", "star formation",
+ "limiter", "sync", "time integration",
+ "mpi", "pack", "gpu",
+ "gpu_pack", "gpu_unpack", "fof",
+ "others", "neutrino", "sink",
"RT", "CSDS"};
#ifdef WITH_MPI
@@ -598,6 +605,22 @@ void task_unlock(struct task *t) {
#ifdef SWIFT_TASKS_WITHOUT_ATOMICS
cell_unlocktree(ci);
#endif
+ } else if (subtype == task_subtype_gpu_unpack_d) {
+ // for(int pp = 0; pp < 128 /*should be sched->pack_size*/;
+ // pp++){
+ // cell_unlocktree(t->ci_unpack[pp]);
+ // }
+        /* Nothing to unlock for GPU unpack tasks. */
+      } else if (subtype == task_subtype_gpu_unpack_f) {
+        /* Nothing to unlock for GPU unpack tasks. */
+      } else if (subtype == task_subtype_gpu_unpack_g) {
+        /* Nothing to unlock for GPU unpack tasks. */
+ } else if (subtype == task_subtype_gpu_pack_d) {
+ cell_unlocktree(ci);
+ } else if (subtype == task_subtype_gpu_pack_f) {
+ cell_unlocktree(ci);
+ } else if (subtype == task_subtype_gpu_pack_g) {
+ cell_unlocktree(ci);
} else { /* hydro */
cell_unlocktree(ci);
}
@@ -645,6 +668,21 @@ void task_unlock(struct task *t) {
cell_unlocktree(ci);
cell_unlocktree(cj);
#endif
+ } else if (subtype == task_subtype_gpu_pack_d) {
+ cell_unlocktree(ci);
+ cell_unlocktree(cj);
+ } else if (subtype == task_subtype_gpu_pack_f) {
+ cell_unlocktree(ci);
+ cell_unlocktree(cj);
+ } else if (subtype == task_subtype_gpu_pack_g) {
+ cell_unlocktree(ci);
+ cell_unlocktree(cj);
+ } else if (subtype == task_subtype_gpu_unpack_d) {
+ /* Nothing to do */
+ } else if (subtype == task_subtype_gpu_unpack_f) {
+ /* Nothing to do */
+ } else if (subtype == task_subtype_gpu_unpack_g) {
+ /* Nothing to do */
} else { /* hydro */
cell_unlocktree(ci);
cell_unlocktree(cj);
@@ -848,6 +886,38 @@ int task_lock(struct task *t) {
if (ci->hydro.hold) return 0;
if (cell_locktree(ci) != 0) return 0;
#endif
+ } else if (subtype == task_subtype_gpu_pack_d) {
+ /* Attempt to lock the cell */
+ if (ci->hydro.hold) return 0;
+ if (cell_locktree(ci) != 0) return 0;
+ } else if (subtype == task_subtype_gpu_pack_f) {
+ /* Attempt to lock the cell */
+ if (ci->hydro.hold) return 0;
+ if (cell_locktree(ci) != 0) return 0;
+ } else if (subtype == task_subtype_gpu_pack_g) {
+ /* Attempt to lock the cell */
+ if (ci->hydro.hold) return 0;
+ if (cell_locktree(ci) != 0) return 0;
+ } else if (subtype == task_subtype_gpu_unpack_d) {
+ // for(int pp = 0; pp < 128 /*should be sched->pack_size*/;
+ // pp++){
+ // if (t->ci_unpack[pp]->gpu_done == 0){
+ // message("trying to queue an unpack before all packs
+ // done on GPU"); return 0;
+ // }
+ //// if (t->ci_unpack[pp]->hydro.hold)
+ //// return 0;
+ //// if (cell_locktree(t->ci_unpack[pp]) != 0)
+ //// return 0;
+ // }
+ /* Nothing to do here */
+ return 1;
+ } else if (subtype == task_subtype_gpu_unpack_f) {
+ /* Nothing to do here */
+ return 1;
+ } else if (subtype == task_subtype_gpu_unpack_g) {
+ /* Nothing to do here */
+ return 1;
} else { /* subtype == hydro */
if (ci->hydro.hold) return 0;
if (cell_locktree(ci) != 0) return 0;
@@ -964,6 +1034,39 @@ int task_lock(struct task *t) {
return 0;
}
#endif
+ } else if (subtype == task_subtype_gpu_pack_d) {
+ /* Lock the parts in both cells */
+ if (ci->hydro.hold || cj->hydro.hold) return 0;
+ if (cell_locktree(ci) != 0) return 0;
+ if (cell_locktree(cj) != 0) {
+ cell_unlocktree(ci);
+ return 0;
+ }
+ } else if (subtype == task_subtype_gpu_pack_f) {
+ /* Lock the parts in both cells */
+ if (ci->hydro.hold || cj->hydro.hold) return 0;
+ if (cell_locktree(ci) != 0) return 0;
+ if (cell_locktree(cj) != 0) {
+ cell_unlocktree(ci);
+ return 0;
+ }
+ } else if (subtype == task_subtype_gpu_pack_g) {
+ /* Lock the parts in both cells */
+ if (ci->hydro.hold || cj->hydro.hold) return 0;
+ if (cell_locktree(ci) != 0) return 0;
+ if (cell_locktree(cj) != 0) {
+ cell_unlocktree(ci);
+ return 0;
+ }
+ } else if (subtype == task_subtype_gpu_unpack_d) {
+ /* Nothing to do here. */
+ return 1;
+ } else if (subtype == task_subtype_gpu_unpack_f) {
+ /* Nothing to do here. */
+ return 1;
+ } else if (subtype == task_subtype_gpu_unpack_g) {
+ /* Nothing to do here. */
+ return 1;
} else { /* subtype == hydro */
/* Lock the parts in both cells */
if (ci->hydro.hold || cj->hydro.hold) return 0;
@@ -1127,6 +1230,19 @@ void task_get_group_name(int type, int subtype, char *cluster) {
}
switch (subtype) {
+ /* A. Nasar */
+ case task_subtype_gpu_pack_d:
+ case task_subtype_gpu_unpack_d:
+ strcpy(cluster, "Density");
+ break;
+ case task_subtype_gpu_pack_f:
+ case task_subtype_gpu_unpack_f:
+ strcpy(cluster, "Force");
+ break;
+ case task_subtype_gpu_pack_g:
+ case task_subtype_gpu_unpack_g:
+ strcpy(cluster, "Gradient");
+ break;
case task_subtype_density:
strcpy(cluster, "Density");
break;
@@ -1629,8 +1745,16 @@ void task_dump_active(struct engine *e) {
/* Get destination rank of MPI requests. */
int paired = (t->cj != NULL);
- int otherrank = t->ci->nodeID;
- if (paired) otherrank = t->cj->nodeID;
+    // A. N.: Mods required to stop the code crashing when debugging GPU tasks
+    const int is_gpu_unpack = (t->subtype == task_subtype_gpu_unpack_d ||
+                               t->subtype == task_subtype_gpu_unpack_f ||
+                               t->subtype == task_subtype_gpu_unpack_g);
+    int otherrank = 0;
+    if (!is_gpu_unpack) otherrank = t->ci->nodeID;
+    if (paired && !is_gpu_unpack) otherrank = t->cj->nodeID;
fprintf(file_thread, "%i %i %s %s %i %i %lli %lli %i %i %i %i %lli\n",
engine_rank, otherrank, taskID_names[t->type],
@@ -1757,6 +1881,14 @@ enum task_categories task_get_category(const struct task *t) {
case task_subtype_force:
return task_category_hydro;
+ case task_subtype_gpu_pack_d: // A. Nasar
+ case task_subtype_gpu_unpack_d:
+ case task_subtype_gpu_pack_f:
+ case task_subtype_gpu_unpack_f:
+ case task_subtype_gpu_pack_g:
+ case task_subtype_gpu_unpack_g:
+ return task_category_gpu;
+
case task_subtype_limiter:
return task_category_limiter;
diff --git a/src/task.h b/src/task.h
index b405a0795f..c6991751b5 100644
--- a/src/task.h
+++ b/src/task.h
@@ -160,6 +160,12 @@ enum task_subtypes {
task_subtype_sink_do_gas_swallow,
task_subtype_rt_gradient,
task_subtype_rt_transport,
+ task_subtype_gpu_pack_d, // A. Nasar
+ task_subtype_gpu_pack_g,
+ task_subtype_gpu_pack_f,
+ task_subtype_gpu_unpack_d,
+ task_subtype_gpu_unpack_g,
+ task_subtype_gpu_unpack_f,
task_subtype_count
} __attribute__((packed));
@@ -196,6 +202,9 @@ enum task_categories {
task_category_time_integration,
task_category_mpi,
task_category_pack,
+ task_category_gpu,
+ task_category_gpu_pack,
+ task_category_gpu_unpack,
task_category_fof,
task_category_others,
task_category_neutrino,
@@ -235,6 +244,15 @@ struct task {
/*! Pointers to the cells this task acts upon */
struct cell *ci, *cj;
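+
+  /* Extra task state used by the GPU pack/unpack machinery (A. Nasar). */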
+ int done; // A. Nasar
+
+ int gpu_done;
+
+ int corner_pair;
+
+  /*! Pointers to the cells unpacked by this (GPU unpack) task */
+  struct cell **ci_unpack;
+
/*! List of tasks unlocked by this one */
struct task **unlock_tasks;
@@ -286,6 +304,9 @@ struct task {
/*! Start and end time of this task */
ticks tic, toc;
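+
+  /*! Time spent on the CPU packing / unpacking data for GPU tasks */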
+ ticks total_cpu_pack_ticks;
+ ticks total_cpu_unpack_ticks;
+
/* Total time spent running this task */
ticks total_ticks;
diff --git a/swift.c b/swift.c
index b63941cd63..7a9277ae5c 100644
--- a/swift.c
+++ b/swift.c
@@ -1108,7 +1108,7 @@ int main(int argc, char *argv[]) {
hydro_props_init(&hydro_properties, &prog_const, &us, params);
else
bzero(&hydro_properties, sizeof(struct hydro_props));
-
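+
+  /* Keep a copy of eta_neighbours: used to define GPU task memory
+   * allocation. */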
+ float eta_neighbours = hydro_properties.eta_neighbours;
/* Initialise the equation of state */
if (with_hydro)
eos_init(&eos, &prog_const, &us, params);
@@ -1388,7 +1388,7 @@ int main(int argc, char *argv[]) {
with_self_gravity, with_star_formation, with_sinks,
with_DM_particles, with_DM_background_particles, with_neutrinos,
talking, dry_run, nr_nodes);
-
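+
+  /* Pass the copy of eta_neighbours to the space: used to define GPU task
+   * memory allocation. */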
+ s.eta_neighbours = eta_neighbours;
/* Initialise the line of sight properties. */
if (with_line_of_sight) los_init(s.dim, &los_properties, params);