diff --git a/.gitignore b/.gitignore index 46ef541ee9..0e3cb19964 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,8 @@ swift swift_mpi fof fof_mpi +swift_cuda +swift_mpicuda src/version_string.h swift*.tar.gz diff --git a/Makefile.am b/Makefile.am index b5ede6fd97..51f34ac1ed 100644 --- a/Makefile.am +++ b/Makefile.am @@ -74,6 +74,23 @@ bin_PROGRAMS += fof_mpi endif endif +# BUILD CUDA versions as well? +if HAVECUDA +bin_PROGRAMS += swift_cuda +if HAVEMPI +bin_PROGRAMS += swift_mpicuda +endif +endif + + +# BUILD HIP versions as well? +if HAVEHIP +bin_PROGRAMS += swift_hip +if HAVEMPI +bin_PROGRAMS += swift_mpihip +endif +endif + # engine_policy_setaffinity is available? if HAVESETAFFINITY ENGINE_POLICY_SETAFFINITY=| engine_policy_setaffinity @@ -91,6 +108,28 @@ swift_mpi_SOURCES = swift.c swift_mpi_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" swift_mpi_LDADD = src/libswiftsim_mpi.la argparse/libargparse.la $(MPI_LIBS) $(VELOCIRAPTOR_MPI_LIBS) $(EXTRA_LIBS) $(LD_CSDS) +# Sources for swift_cuda +swift_cuda_SOURCES = swift.c dummy.C +swift_cuda_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(CUDA_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_CUDA +swift_cuda_LDADD = src/.libs/libswiftsim_cuda.a src/cuda/.libs/libswiftCUDA.a $(EXTRA_LIBS) $(CUDA_LIBS) -lcudart argparse/.libs/libargparse.a src/.libs/libgrav.la + +# Sources for swift_hip +swift_hip_SOURCES = swift.c dummy.C +swift_hip_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(HIP_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_HIP +swift_hip_LDADD = src/.libs/libswiftsim_hip.a src/hip/.libs/libswiftHIP.a $(EXTRA_LIBS) $(HIP_LIBS) -lamdhip64 -L/opt/rocm-5.1.0/lib -lhsa-runtime64 -L/opt/rocm-5.1.0/lib64 -lamd_comgr argparse/.libs/libargparse.a src/.libs/libgrav.la + +# Sources for swift_mpicuda, do we need an affinity policy for MPI? +swift_mpicuda_SOURCES = swift.c dummy.C +swift_mpicuda_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_CUDA +swift_mpicuda_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_CUDA +swift_mpicuda_LDADD = src/.libs/libswiftsim_mpicuda.a argparse/.libs/libargparse.a src/.libs/libgrav.la src/cuda/.libs/libswiftCUDA.a $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -lcudart + +# Sources for swift_mpihip, do we need an affinity policy for MPI? +swift_mpihip_SOURCES = swift.c dummy.C +swift_mpihip_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(HIP_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_HIP +swift_mpihip_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(HIP_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_HIP +swift_mpihip_LDADD = src/.libs/libswiftsim_mpihip.a argparse/.libs/libargparse.a src/.libs/libgrav.la src/hip/.libs/libswiftHIP.a $(MPI_LIBS) $(EXTRA_LIBS) $(HIP_LIBS) -lamdhip64 + # Sources for fof fof_SOURCES = swift_fof.c fof_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" diff --git a/configure.ac b/configure.ac index b0173c6954..59fc40aba5 100644 --- a/configure.ac +++ b/configure.ac @@ -41,6 +41,10 @@ AC_USE_SYSTEM_EXTENSIONS AC_PROG_CC AM_PROG_CC_C_O +# Find and test the C++ compiler. +AC_PROG_CXX +AC_PROG_CXX_C_O + # We need this for compilation hints and possibly FFTW. 
AX_OPENMP @@ -995,6 +999,78 @@ AH_VERBATIM([__STDC_FORMAT_MACROS], #define __STDC_FORMAT_MACROS 1 #endif]) + + +# Check for CUDA +have_cuda="no" +AC_ARG_WITH([cuda], + [AS_HELP_STRING([--with-cuda=PATH], + [root directory where CUDA is installed @<:@yes/no@:>@] + )], + [], + [with_cuda="no"] +) +if test "x$with_cuda" != "xno"; then + if test "x$with_cuda" != "xyes"; then + CUDA_CFLAGS="-I$with_cuda/include" + CUDA_LIBS="-L$with_cuda/lib -L$with_cuda/lib64 -lcudart" + NVCC="$with_cuda/bin/nvcc" + have_cuda="yes" + else + AC_PATH_PROG([NVCC],[nvcc]) + echo "Found nvcc = $NVCC" + if test -n "$NVCC"; then + CUDA_ROOT="`dirname $NVCC`/.." + CUDA_CFLAGS="-I${CUDA_ROOT}/include" + CUDA_LIBS="-L${CUDA_ROOT}/lib -L${CUDA_ROOT}/lib64 -lcudart" + have_cuda="yes" + fi + fi + if test "x$have_cuda" != "xno"; then + AC_DEFINE([HAVE_CUDA], 1, [The CUDA compiler is installed.]) + fi + CFLAGS="${CFLAGS} " +fi +AC_SUBST(CUDA_CFLAGS) +AC_SUBST(CUDA_LIBS) +AC_SUBST(NVCC) +AM_CONDITIONAL([HAVECUDA],[test -n "$NVCC"]) + +# Check for HIP +have_hip="no" +AC_ARG_WITH([hip], + [AS_HELP_STRING([--with-hip=PATH], + [root directory where HIP is installed @<:@yes/no@:>@] + )], + [], + [with_hip="no"] +) +if test "x$with_hip" != "xno"; then + if test "x$with_hip" != "xyes"; then + HIP_CFLAGS="-I$with_hip/include" + HIP_LIBS="-L$with_hip/lib -L$with_hip/lib64" + HIPCC="$with_hip/bin/hipcc" + have_hip="yes" + else + AC_PATH_PROG([HIPCC],[hipcc]) + echo "Found hipcc = $HIPCC" + if test -n "$HIPCC"; then + HIP_ROOT="`dirname $HIPCC`/.." + HIP_CFLAGS="-I${HIP_ROOT}/include" + HIP_LIBS="-L${HIP_ROOT}/lib -L${HIP_ROOT}/lib64" + have_hip="yes" + fi + fi + if test "x$have_hip" != "xno"; then + AC_DEFINE([HAVE_HIP], 1, [The HIP compiler is installed.]) + fi + CFLAGS="${CFLAGS} " +fi +AC_SUBST(HIP_CFLAGS) +AC_SUBST(HIP_LIBS) +AC_SUBST(HIPCC) +AM_CONDITIONAL([HAVEHIP],[test -n "$HIPCC"]) + # Check for FFTW. We test for this in the standard directories by default, # and only disable if using --with-fftw=no or --without-fftw. When a value # is given FFTW must be found. 
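Note: the configure.ac hunk above adds optional --with-cuda and --with-hip switches, and the Makefile.am hunk builds swift_cuda/swift_mpicuda and swift_hip/swift_mpihip binaries when the corresponding compiler is detected. A minimal usage sketch follows; the install prefixes shown are placeholders and not part of the patch:

    # CUDA build: pass the toolkit root, or plain --with-cuda to pick up nvcc from PATH
    ./configure --with-cuda=/usr/local/cuda
    make
    # produces swift_cuda (and swift_mpicuda when MPI support is also enabled)

    # HIP/ROCm build: pass the ROCm root, or plain --with-hip to pick up hipcc from PATH
    ./configure --with-hip=/opt/rocm
    make
    # produces swift_hip (and swift_mpihip when MPI support is also enabled)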
@@ -3246,6 +3322,10 @@ AC_CONFIG_FILES([tests/testSelectOutput.sh], [chmod +x tests/testSelectOutput.sh AC_CONFIG_FILES([tests/testFormat.sh], [chmod +x tests/testFormat.sh]) AC_CONFIG_FILES([tests/testNeutrinoCosmology.sh], [chmod +x tests/testNeutrinoCosmology.sh]) AC_CONFIG_FILES([tests/output_list_params.yml]) +# cuda .in file +AC_CONFIG_FILES([src/cuda/Makefile]) +# hip .in file +AC_CONFIG_FILES([src/hip/Makefile]) # Save the compilation options AC_DEFINE_UNQUOTED([SWIFT_CONFIG_FLAGS],["$swift_config_flags"],[Flags passed to configure]) @@ -3276,6 +3356,8 @@ AC_MSG_RESULT([ HDF5 enabled : $with_hdf5 - parallel : $have_parallel_hdf5 METIS/ParMETIS : $have_metis / $have_parmetis + CUDA enabled : $have_cuda + HIP enabled : $have_hip FFTW3 enabled : $have_fftw - threaded/openmp : $have_threaded_fftw / $have_openmp_fftw - MPI : $have_mpi_fftw diff --git a/cudalt.py b/cudalt.py new file mode 100755 index 0000000000..e8643cd1e6 --- /dev/null +++ b/cudalt.py @@ -0,0 +1,80 @@ +#!/usr/bin/python3 +# libtoolish hack: compile a .cu file like libtool does +import sys +import os + +lo_filepath = sys.argv[1] +o_filepath = lo_filepath.replace(".lo", ".o") + +try: + i = o_filepath.rindex("/") + lo_dir = o_filepath[0:i+1] + o_filename = o_filepath[i+1:] + +except ValueError: + lo_dir = "" + o_filename = o_filepath + +local_pic_dir = ".libs/" +local_npic_dir = "" +pic_dir = lo_dir + local_pic_dir +npic_dir = lo_dir + local_npic_dir + +pic_filepath = pic_dir + o_filename +npic_filepath = npic_dir + o_filename +local_pic_filepath = local_pic_dir + o_filename +local_npic_filepath = local_npic_dir + o_filename + +# Make lib dir +try: + os.mkdir(pic_dir) +except OSError: + pass + +# generate the command to compile the .cu for shared library +args = sys.argv[2:] +args.extend(["-Xcompiler","-fPIC"]) +# position indep code +args.append("-o") +args.append(pic_filepath) +command = " ".join(args) +print (command) + +# compile the .cu +rv = os.system(command) +if rv != 0: + sys.exit(1) + +# generate the command to compile the .cu for static library +args = sys.argv[2:] +args.append("-o") +args.append(npic_filepath) +command = " ".join(args) +print (command) + +# compile the .cu +rv = os.system(command) +if rv != 0: + sys.exit(1) + +# get libtool version +fd = os.popen("libtool --version") +libtool_version = fd.readline() +fd.close() + +# generate the .lo file +f = open(lo_filepath, "w") +f.write("# " + lo_filepath + " - a libtool object file\n") +f.write("# Generated by " + libtool_version + "\n") +f.write("#\n") +f.write("# Please DO NOT delete this file!\n") +f.write("# It is necessary for linking the library.\n\n") + +f.write("# Name of the PIC object.\n") +f.write("pic_object='" + local_pic_filepath + "'\n\n") + +f.write("# Name of the non-PIC object.\n") +f.write("non_pic_object='" + local_npic_filepath + "'\n") +f.close() + +sys.exit(0) diff --git a/dummy.C b/dummy.C new file mode 100755 index 0000000000..bbf68f8cea --- /dev/null +++ b/dummy.C @@ -0,0 +1,3 @@ +void dummy(){ + +} diff --git a/examples/HydroTests/GreshoVortex_3D/getGlass.sh b/examples/HydroTests/GreshoVortex_3D/getGlass.sh index d5c5f590ac..068986fc10 100755 --- a/examples/HydroTests/GreshoVortex_3D/getGlass.sh +++ b/examples/HydroTests/GreshoVortex_3D/getGlass.sh @@ -1,2 +1,2 @@ #!/bin/bash -wget http://virgodb.cosma.dur.ac.uk/swift-webstorage/ICs/glassCube_64.hdf5 +wget http://virgodb.cosma.dur.ac.uk/swift-webstorage/ICs/glassCube_128.hdf5 diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml 
b/examples/HydroTests/GreshoVortex_3D/gresho.yml index a95a0eae32..6c945e7473 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,21 +7,25 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 15 - + max_top_level_cells: 8 + tasks_per_cell: 200 + # deadlock_waiting_time_s: 10 + # cell_split_size: 100 + # cell_sub_size_pair_hydro: 10000 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). + # cell_sub_size_self_hydro: 100 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). time_end: 1. # The end time of the simulation (in internal units). dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units). - dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units). + dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units). # Parameters governing the snapshots Snapshots: basename: gresho # Common part of the name of output files time_first: 0. # Time of the first output (in internal units) - delta_time: 1e-1 # Time difference between consecutive outputs (in internal units) - compression: 1 + delta_time: 1e-3 # Time difference between consecutive outputs (in internal units) + # compression: 1 # Parameters governing the conserved quantities statistics Statistics: @@ -29,10 +33,11 @@ Statistics: # Parameters for the hydrodynamics scheme SPH: - resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). + resolution_eta: 1.9 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. # Parameters related to the initial conditions InitialConditions: - file_name: ./greshoVortex.hdf5 # The file to read - periodic: 1 \ No newline at end of file + file_name: greshoVortex.hdf5 + periodic: 1 + # replicate: 2 diff --git a/examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml b/examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml new file mode 100644 index 0000000000..3105787d75 --- /dev/null +++ b/examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml @@ -0,0 +1,42 @@ +# Define the system of units to use internally. +InternalUnitSystem: + UnitMass_in_cgs: 1 # Grams + UnitLength_in_cgs: 1 # Centimeters + UnitVelocity_in_cgs: 1 # Centimeters per second + UnitCurrent_in_cgs: 1 # Amperes + UnitTemp_in_cgs: 1 # Kelvin + +Scheduler: + max_top_level_cells: 16 + tasks_per_cell: 200 + cell_split_size: 700 + cell_sub_size_pair_hydro: 49000 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). + cell_sub_size_self_hydro: 700 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks +# Parameters governing the time integration +TimeIntegration: + time_begin: 0. # The starting time of the simulation (in internal units). + time_end: 1. # The end time of the simulation (in internal units). + dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units). 
+ dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units). + +# Parameters governing the snapshots +Snapshots: + basename: gresho # Common part of the name of output files + time_first: 0. # Time of the first output (in internal units) + delta_time: 1e-3 # Time difference between consecutive outputs (in internal units) + # compression: 1 + +# Parameters governing the conserved quantities statistics +Statistics: + delta_time: 1e-2 # Time between statistics output + +# Parameters for the hydrodynamics scheme +SPH: + resolution_eta: 1.9 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). + CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. + +# Parameters related to the initial conditions +InitialConditions: + file_name: greshoVortex.hdf5 + periodic: 1 + replicate: 8 diff --git a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml index 8717af63bd..bcabd810dd 100644 --- a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml +++ b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml @@ -10,6 +10,13 @@ InternalUnitSystem: UnitCurrent_in_cgs: 1 # Amperes UnitTemp_in_cgs: 1 # Kelvin + + + + + + + # Parameters for the self-gravity scheme Gravity: eta: 0.025 # Constant dimensionless multiplier for time integration. @@ -24,7 +31,7 @@ TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). time_end: 0.1 # The end time of the simulation (in internal units). dt_min: 1e-9 # The minimal time-step size of the simulation (in internal units). - dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units). + dt_max: 1e-6 # The maximal time-step size of the simulation (in internal units). # Parameters governing the snapshots Snapshots: diff --git a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh index 6931897b2c..6a2fa4d897 100755 --- a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh +++ b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh @@ -30,7 +30,7 @@ then ./getEaglePhotometryTable.sh fi -../../../swift --threads=16 --feedback --external-gravity --self-gravity --stars --star-formation --cooling --hydro --limiter --sync isolated_galaxy.yml 2>&1 | tee output.log +../../../swift_mpicuda --threads=16 --feedback --external-gravity --self-gravity --stars --star-formation --cooling --hydro --limiter --sync isolated_galaxy.yml 2>&1 | tee output.log # Kennicutt-Schmidt law plot python3 plotSolution.py 100 diff --git a/src/Makefile.am b/src/Makefile.am index 8099524651..99092acde4 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -16,7 +16,10 @@ # along with this program. If not, see . # Add the non-standard paths to the included library headers -AM_CFLAGS = $(HDF5_CPPFLAGS) $(GSL_INCS) $(FFTW_INCS) $(NUMA_INCS) $(GRACKLE_INCS) $(SUNDIALS_INCS) $(CHEALPIX_CFLAGS) +AM_CFLAGS = $(HDF5_CPPFLAGS) $(GSL_INCS) $(FFTW_INCS) $(NUMA_INCS) $(GRACKLE_INCS) $(SUNDIALS_INCS) $(CHEALPIX_CFLAGS) -O0 + +# Add HIP Path +AM_CFLAGS += -D__HIP_PLATFORM_AMD__ # Assign a "safe" version number AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) @@ -40,6 +43,22 @@ lib_LTLIBRARIES += libswiftsim_mpi.la noinst_LTLIBRARIES += libgrav_mpi.la endif +# Build a cuda version too? 
+if HAVECUDA +lib_LTLIBRARIES += libswiftsim_cuda.la +if HAVEMPI +lib_LTLIBRARIES += libswiftsim_mpicuda.la +endif +endif + +# Build a hip version too? +if HAVEHIP +lib_LTLIBRARIES += libswiftsim_hip.la +if HAVEMPI +lib_LTLIBRARIES += libswiftsim_mpihip.la +endif +endif + # List required headers include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h include_HEADERS += cell_hydro.h cell_stars.h cell_grav.h cell_sinks.h cell_black_holes.h cell_rt.h cell_grid.h @@ -161,7 +180,7 @@ endif AM_SOURCES = space.c space_rebuild.c space_regrid.c space_unique_id.c AM_SOURCES += space_sort.c space_split.c space_extras.c space_first_init.c space_init.c AM_SOURCES += space_cell_index.c space_recycle.c -AM_SOURCES += runner_main.c runner_doiact_hydro.c runner_doiact_limiter.c +AM_SOURCES += runner_main.c runner_doiact_hydro.c runner_doiact_limiter.c runner_gpu_pack_functions.c AM_SOURCES += runner_doiact_stars.c runner_doiact_black_holes.c runner_ghost.c AM_SOURCES += runner_recv.c runner_pack.c AM_SOURCES += runner_sort.c runner_drift.c runner_black_holes.c runner_time_integration.c @@ -208,7 +227,7 @@ AM_SOURCES += $(SPHM1RT_RT_SOURCES) AM_SOURCES += $(GEAR_RT_SOURCES) # Include files for distribution, not installation. -nobase_noinst_HEADERS = align.h approx_math.h atomic.h barrier.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h +nobase_noinst_HEADERS = align.h approx_math.h atomic.h barrier.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h runner_gpu_pack_functions.h nobase_noinst_HEADERS += gravity_iact.h kernel_long_gravity.h vector.h accumulate.h cache.h exp.h log.h nobase_noinst_HEADERS += runner_doiact_nosort.h runner_doiact_hydro.h runner_doiact_stars.h runner_doiact_black_holes.h runner_doiact_grav.h nobase_noinst_HEADERS += runner_doiact_functions_hydro.h runner_doiact_functions_stars.h runner_doiact_functions_black_holes.h @@ -526,6 +545,33 @@ libswiftsim_mpi_la_LDFLAGS = $(AM_LDFLAGS) $(MPI_LIBS) $(EXTRA_LIBS) -version-in libswiftsim_mpi_la_SHORTNAME = mpi libswiftsim_mpi_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav_mpi.la +# Sources and flags for regular CUDA library +libswiftsim_cuda_la_SOURCES = $(AM_SOURCES) +libswiftsim_cuda_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) -DWITH_CUDA +libswiftsim_cuda_la_CXXFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) -DWITH_CUDA +libswiftsim_cuda_la_LDFLAGS = $(AM_LDFLAGS) $(EXTRA_LIBS) $(CUDA_LIBS) +libswiftsim_cuda_la_SHORTNAME = cuda +libswiftsim_cuda_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav.la + +# Sources and flags for regular HIP library +libswiftsim_hip_la_SOURCES = $(AM_SOURCES) +libswiftsim_hip_la_CFLAGS = $(AM_CFLAGS) $(HIP_CFLAGS) -DWITH_HIP +libswiftsim_hip_la_LDFLAGS = $(AM_LDFLAGS) $(EXTRA_LIBS) $(HIP_LIBS) -lamdhip64 +libswiftsim_hip_la_SHORTNAME = hip +libswiftsim_hip_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav.la + +# Sources and flags for MPI CUDA library +libswiftsim_mpicuda_la_SOURCES = $(AM_SOURCES) +libswiftsim_mpicuda_la_CFLAGS = $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DWITH_CUDA +libswiftsim_mpicuda_la_CXXFLAGS = $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DWITH_CUDA +libswiftsim_mpicuda_la_LDFLAGS = $(AM_LDFLAGS) $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) +libswiftsim_mpicuda_la_SHORTNAME = mpicuda +libswiftsim_mpicuda_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav_mpi.la + +#subdir +SUBDIRS = . cuda +SUBDIRS += . hip + # Versioning. 
If any sources change then update the version_string.h file with # the current git revision and package version. # May have a checkout without a version_string.h file and no git command (tar/zip diff --git a/src/cell.h b/src/cell.h index cac5c49878..1d2aa0d7e1 100644 --- a/src/cell.h +++ b/src/cell.h @@ -360,6 +360,39 @@ enum cell_flags { */ struct cell { + /*Marks a cell for GPU execution A. Nasar */ + bool is_gpu_cell; + + int unpacker_cell; + + /*Marks a cell as having done its pack task 0->not 1-> yes*/ + int pack_done; + /*Marks a cell as having done its pack task 0->not 1-> yes*/ + int pack_done_g; + /*Marks a cell as having done its pack task 0->not 1-> yes*/ + int pack_done_f; + + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done; + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_g; + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_f; + + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int unpack_done; + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int unpack_done_g; + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int unpack_done_f; + + /*Has the pair task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_pair; + /*Has the pair task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_pair_g; + /*Has the pair task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_pair_f; + /*! The cell location on the grid (corner nearest to the origin). */ double loc[3]; diff --git a/src/cell_hydro.h b/src/cell_hydro.h index 39db7bc219..14b37dcd6d 100644 --- a/src/cell_hydro.h +++ b/src/cell_hydro.h @@ -61,6 +61,25 @@ struct cell_hydro { /*! Linked list of the tasks computing this cell's hydro density. */ struct link *density; + /*! Linked list of the tasks computing this cell's hydro density pack. A. + * Nasar */ + struct link *density_pack; + struct link *density_unpack; + /*! Linked list of the tasks computing this cell's hydro force pack. */ + struct link *force_pack; + struct link *force_unpack; + /*! Linked list of the tasks computing this cell's hydro gradient pack. */ + struct link *gradient_pack; + struct link *gradient_unpack; + + struct task *d_pack; + struct task *g_pack; + struct task *f_pack; + + struct task *d_unpack; + struct task *g_unpack; + struct task *f_unpack; + /* Linked list of the tasks computing this cell's hydro gradients. */ struct link *gradient; diff --git a/src/cell_unskip.c b/src/cell_unskip.c index 6ad14a3560..a9572ea3bc 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -884,7 +884,7 @@ void cell_activate_subcell_hydro_tasks(struct cell *ci, struct cell *cj, cell_activate_hydro_sorts(ci, sid, s); cell_activate_hydro_sorts(cj, sid, s); } - } /* Otherwise, pair interation */ + } /* Otherwise, pair interaction */ } /** @@ -1657,7 +1657,6 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { if ((ci_active && ci_nodeID == nodeID) || (cj_active && cj_nodeID == nodeID)) { scheduler_activate(s, t); - /* Activate hydro drift */ if (t->type == task_type_self) { if (ci_nodeID == nodeID) cell_activate_drift_part(ci, s); @@ -1903,19 +1902,94 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { #endif } } - /* Unskip all the other task types. */ int c_active = cell_is_active_hydro(c, e); if (c->nodeID == nodeID && c_active) { + for (struct link *l = c->hydro.density_pack; l != NULL; + l = l->next) { /* A. 
Nasar */ + if(l->t->type == task_type_self && l->t->ci->hydro.count > 0) + scheduler_activate(s, l->t); + else if(l->t->type == task_type_pair && l->t->ci->hydro.count > 0 && l->t->cj->hydro.count > 0) + scheduler_activate(s, l->t); +#ifdef SWIFT_DEBUG_CHECKS + if (l->t->ci != NULL) { + l->t->ci->pack_done = 0; + l->t->ci->gpu_done = 0; + l->t->ci->unpack_done = 0; + } + if (l->t->cj != NULL) { + l->t->cj->pack_done = 0; + l->t->cj->gpu_done = 0; + l->t->cj->unpack_done = 0; + } +#endif + } + for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { + scheduler_activate(s, l->t); +#ifdef SWIFT_DEBUG_CHECKS + l->t->gpu_done = 0; +#endif + } for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) { scheduler_activate(s, l->t); } for (struct link *l = c->hydro.force; l != NULL; l = l->next) { scheduler_activate(s, l->t); } - for (struct link *l = c->hydro.limiter; l != NULL; l = l->next) scheduler_activate(s, l->t); + // A. Nasar activate force and gradient packing tasks + for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { + if(l->t->type == task_type_self && l->t->ci->hydro.count > 0) + scheduler_activate(s, l->t); + else if(l->t->type == task_type_pair && l->t->ci->hydro.count > 0 && l->t->cj->hydro.count > 0) + scheduler_activate(s, l->t); +#ifdef SWIFT_DEBUG_CHECKS + if (l->t->ci != NULL) { + l->t->ci->pack_done_f = 0; + l->t->ci->gpu_done_f = 0; + l->t->ci->unpack_done_f = 0; + } + if (l->t->cj != NULL) { + l->t->cj->pack_done_f = 0; + l->t->cj->gpu_done_f = 0; + l->t->cj->unpack_done_f = 0; + } +#endif + } + for (struct link *l = c->hydro.force_unpack; l != NULL; l = l->next) { + scheduler_activate(s, l->t); +#ifdef SWIFT_DEBUG_CHECKS + l->t->gpu_done = 0; +#endif + } + +#ifdef EXTRA_HYDRO_LOOP + for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { + if(l->t->type == task_type_self && l->t->ci->hydro.count > 0) + scheduler_activate(s, l->t); + else if(l->t->type == task_type_pair && l->t->ci->hydro.count > 0 && l->t->cj->hydro.count > 0) + scheduler_activate(s, l->t); +#ifdef SWIFT_DEBUG_CHECKS + if (l->t->ci != NULL) { + l->t->ci->pack_done_g = 0; + l->t->ci->gpu_done_g = 0; + l->t->ci->unpack_done_g = 0; + } + if (l->t->cj != NULL) { + l->t->cj->pack_done_g = 0; + l->t->cj->gpu_done_g = 0; + l->t->cj->unpack_done_g = 0; + } +#endif + } + for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) { + scheduler_activate(s, l->t); +#ifdef SWIFT_DEBUG_CHECKS + l->t->gpu_done = 0; +#endif + } +#endif if (c->hydro.extra_ghost != NULL) scheduler_activate(s, c->hydro.extra_ghost); diff --git a/src/clocks.h b/src/clocks.h index e39d8e8195..4cc7cdaac7 100644 --- a/src/clocks.h +++ b/src/clocks.h @@ -20,8 +20,11 @@ #define SWIFT_CLOCKS_H /* Config parameters. */ +#ifdef WITH_CUDA +#include "../config.h" +#else #include - +#endif /* System includes. 
*/ #include diff --git a/src/cuda/BLOCK_SIZE.h b/src/cuda/BLOCK_SIZE.h new file mode 100644 index 0000000000..2d5dda1af2 --- /dev/null +++ b/src/cuda/BLOCK_SIZE.h @@ -0,0 +1,12 @@ +#ifndef BLOCK_SIZE_H +#define BLOCK_SIZE_H + +#define BLOCK_SIZE 64 +#define N_TASKS_PER_PACK_SELF 8 +#define N_TASKS_BUNDLE_SELF 2 + +#define BLOCK_SIZE_PAIR 64 +#define N_TASKS_PER_PACK_PAIR 4 +#define N_TASKS_BUNDLE_PAIR 1 + +#endif // BLOCK_SIZE_H diff --git a/src/cuda/GPU_runner_functions.cu b/src/cuda/GPU_runner_functions.cu new file mode 100644 index 0000000000..d3c08c10ae --- /dev/null +++ b/src/cuda/GPU_runner_functions.cu @@ -0,0 +1,4323 @@ +/******************************************************************************* + * This file contains functions used to setup and execute GPU tasks from within + *runner_main.c. Consider this a translator allowing .cu based functions to be + *called from within runner_main.c + ******************************************************************************/ + +/* Hacky method to make c++ compilers not die. */ +#ifdef WITH_CUDA +#ifndef static +#define static +#endif +#ifndef restrict +#define restrict __restrict__ +#endif +#endif + +/* Required header files */ +#include +/*ifdef WITH_CUDA prevents name mangling. C code sees exact names + of functions rather than mangled template names produced by C++*/ +#ifdef WITH_CUDA +extern "C" { +#endif + +#include "../../config.h" + +#ifndef BLOCK_SIZE_H +#include "BLOCK_SIZE.h" +#endif + +#include "GPU_runner_functions.h" +#include "device_functions.h" +#include "part_gpu.h" + +#include + +#ifdef WITH_CUDA +} +#endif + +/* function to initialise GPU and printout GPU name*/ +#ifdef WITH_CUDA +extern "C" { +#endif +void Initialise_GPU() { + int devId = 0; + // find and print device name + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, devId); + printf("Device : %s\n", prop.name); + cudaSetDevice(devId); + // cuda +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void tester(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, int bid, + int tid, int count_tasks, int tasksperbundle, + int nBlocks_per_task, int bundle_first_task, + int max_parts, int time_bin_inhibited) { + extern __shared__ float vars[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + if (pid < last_part_in_task_blocks) { + parts_soa.tid_p[pid] = 1; + } + // if(parts_soa.tid_p[pid] == 1 && pid < last_part_in_task_blocks) + // printf("tid %i last_part_in_blocks %i\n", parts_soa.tid_p[pid], + // last_part_in_task_blocks); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_self_density_GPU( + struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int count_tasks, int tasksperbundle, + int nBlocks_per_task, int bundle_first_task, int max_parts) { + extern __shared__ float vars[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + 
blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + // __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + // if(pid (0.01f / 128.f) * (0.01f / 128.f)) { + // if (r2 < hig2 && r2 > (0.01f/256.f)*(0.01f/256.f)) { + Found_neighbours = 1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. */ + if (hi < 1.f / 256.f) printf("h < dx\n"); + // if(hi<1.f/256.f)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + // float wi, wi_dx; + // d_kernel_deval(0.f, &wi, &wi_dx); + // printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); + // if(Found_neighbours == 0) printf("Not sure what's going on but no + // neighbours found in GPU loop\n"); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi, + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS(struct part_aos *parts_aos, + int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int count_tasks, + int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, + double *d_cell_x, double *d_cell_y, + double *d_cell_z) { + extern __shared__ float vars[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + // 
__syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + struct part_aos ipart = parts_aos[pid]; + // if(pid (0.01f / 128.f) * (0.01f / 128.f)) { + Found_neighbours = 1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + // float wi, wi_dx; + // d_kernel_deval(0.f, &wi, &wi_dx); + // printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); + // if(Found_neighbours == 0) printf("Not sure what's going on but no + // neighbours found in GPU loop\n"); + parts_aos[pid].rho = rhoi, parts_aos[pid].rho_dh = rho_dhi; + parts_aos[pid].wcount = wcounti, parts_aos[pid].wcount_dh = wcount_dhi; + parts_aos[pid].div_v = div_vi; + parts_aos[pid].rot_ux = rot_uxi, parts_aos[pid].rot_uy = rot_uyi, + parts_aos[pid].rot_uz = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +// template + +#ifdef WITH_CUDA +extern "C" { +#endif +// #include +__global__ void DOSELF_GPU_AOS_F4( + struct part_aos_f4_send *__restrict__ parts_send, + struct part_aos_f4_recv *__restrict__ parts_recv, const float d_a, + const float d_H, const int bundle_first_task, + const int2 *__restrict__ d_task_first_part_f4) { + + extern __shared__ float4 vars_f4[]; + + // auto group = cooperative_groups::this_thread_block(); + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + // cuda::barrier bar; + + int first_part_in_task_blocks, last_part_in_task_blocks; + int2 first_last_parts = d_task_first_part_f4[task_id]; + first_part_in_task_blocks = first_last_parts.x; + last_part_in_task_blocks = first_last_parts.y; + + const int pid = threadid + first_part_in_task_blocks; + + float4 res_rho = {0.0, 0.0, 0.0, 0.0}; + float4 res_rot = {0.0, 0.0, 0.0, 0.0}; + const part_aos_f4_send pi = parts_send[pid]; + const float4 x_pi = pi.x_p_h; + const float4 ux_pi = pi.ux_m; + const float hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float4 *__restrict__ x_p_h_tmp = (float4 *)&vars_f4[0]; + float4 *__restrict__ ux_m_tmp = (float4 *)&vars_f4[BLOCK_SIZE]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + struct part_aos_f4_send pj = parts_send[j]; + x_p_h_tmp[threadIdx.x] = pj.x_p_h; + ux_m_tmp[threadIdx.x] = pj.ux_m; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks) { + /* Compute the pairwise distance. */ + const float4 x_p_h_j = x_p_h_tmp[j_block]; + const float4 ux_m_j = ux_m_tmp[j_block]; + const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, + zij = x_pi.z - x_p_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + const float r = sqrtf(r2); + /* Recover some data */ + const float mj = ux_m_j.w; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ + res_rho.x += mj * wi; + res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); + res_rho.z += wi; + res_rho.w -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, + dvz = ux_pi.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + /*Add to sums of rot_u and div_v*/ + res_rot.x += faci * curlvrx; + res_rot.y += faci * curlvry; + res_rot.z += faci * curlvrz; + res_rot.w -= faci * dvdr; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + parts_recv[pid].rho_dh_wcount = res_rho; + parts_recv[pid].rot_ux_div_v = res_rot; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_density_aos(struct part_aos *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int bundle_first_task, + int max_parts, double *d_cell_x, double *d_cell_y, + double *d_cell_z) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS<<>>(parts_aos, d_task_first_part, d_task_last_part, + d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, + d_cell_x, d_cell_y, d_cell_z); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +struct first_part { + int list[32]; +}; +void launch_density_aos_f4(struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int 
nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F4<<>>(parts_send, parts_recv, d_a, d_H, + bundle_first_task, d_task_first_part_f4); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS_G(struct part_aos_g *parts_aos, + int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int count_tasks, + int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, + double *d_cell_x, double *d_cell_y, + double *d_cell_z) { + extern __shared__ float varsg[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + // __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float ci = 0.0, cj = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float div_vi = 0.0; + int Found_neighbours = 0; + float v_sig; + float u = 0.f; + float laplace_u = 0.0; + float alpha_visc_max_ngb = 0.0; + if (pid < last_part_in_task_blocks) { + ttid = task_id; + first_part = d_task_first_part[ttid]; + last_part = d_task_last_part[ttid]; + count = last_part - first_part; + cellx = d_cell_x[ttid], celly = d_cell_y[ttid], cellz = d_cell_z[ttid]; + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + pix = parts_aos[pid].x_p - cellx; + piy = parts_aos[pid].y_p - celly; + piz = parts_aos[pid].z_p - cellz; + ci = parts_aos[pid].soundspeed; + v_sig = parts_aos[pid].v_sig; + u = parts_aos[pid].u; + laplace_u = parts_aos[pid].laplace_u; + alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; + } + // if (threadIdx.x == 0) { + // first_part_tid_0 = first_part; + // last_part_tid_0 = last_part; + // } + // __syncthreads(); + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&varsg[0]; + float *y_p_tmp = (float *)&varsg[BLOCK_SIZE]; + float *z_p_tmp = (float *)&varsg[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&varsg[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&varsg[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&varsg[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&varsg[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&varsg[BLOCK_SIZE * 7]; + float *cj_tmp = (float *)&varsg[BLOCK_SIZE * 8]; + float *alpha_tmp = (float *)&varsg[BLOCK_SIZE * 9]; + float *u_tmp = (float *)&varsg[BLOCK_SIZE * 10]; + float *rho_tmp = (float *)&varsg[BLOCK_SIZE * 11]; + int *timebin = (int *)&varsg[BLOCK_SIZE * 12]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; + alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + // if ((j != pid) && (j < last_part_in_task_blocks) && + // timebin[j_block] != time_bin_inhibited) { + // if ((j < last_part_in_task_blocks) && + // timebin[j_block] != time_bin_inhibited) { + if (j < last_part_in_task_blocks) { + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp[j_block] - cellx; + const float pjy = y_p_tmp[j_block] - celly; + const float pjz = z_p_tmp[j_block] - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + Found_neighbours = 1; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + float wi, wi_dx; + /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = + ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + /* Update if we need to */ + v_sig = max(v_sig, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. 
*/ + /* Need to get some kernel values F_ij = wi_dx */ + const float ui = r * h_inv; + d_kernel_deval(ui, &wi, &wi_dx); + + const float delta_u_factor = (u - u_tmp[j_block]) * r_inv; + laplace_u += mj * delta_u_factor * wi_dx / rho_tmp[j_block]; + + /* Set the maximal alpha from the previous step over the neighbours + * (this is used to limit the diffusion in hydro_prepare_force) */ + const float alpha_j = alpha_tmp[j_block]; + alpha_visc_max_ngb = max(alpha_visc_max_ngb, alpha_j); + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + parts_aos[pid].v_sig = v_sig, parts_aos[pid].laplace_u = laplace_u; + parts_aos[pid].alpha_visc_max_ngb = alpha_visc_max_ngb; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS_F4_G( + struct part_aos_f4_g_send *__restrict__ parts_send, + struct part_aos_f4_g_recv *__restrict__ parts_recv, const float d_a, + const float d_H, const int bundle_first_task, + const int2 *__restrict__ d_task_first_part_f4) { + + extern __shared__ float4 varsf4_g[]; + + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + int2 first_last_parts = d_task_first_part_f4[task_id]; + int first_part_in_task_blocks = first_last_parts.x; + int last_part_in_task_blocks = first_last_parts.y; + // __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + /*Keep this*/ + float v_sig = 0.f; + float alpha_visc_max_ngb = 0.f; + ///////////// + + struct part_aos_f4_g_send pi = parts_send[pid]; + float4 x_h_i = pi.x_h; + float4 ux_m_i = pi.ux_m; + float4 rho_avisc_u_c_i = pi.rho_avisc_u_c; + float3 vsig_lapu_aviscmax_i = {0.f, 0.f, 0.f}; + + const float hi = x_h_i.w, hig2 = hi * hi * kernel_gamma2; + + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float4 *__restrict__ x_h_tmp = (float4 *)&varsf4_g[0]; + float4 *__restrict__ ux_m_tmp = (float4 *)&varsf4_g[BLOCK_SIZE]; + float4 *__restrict__ rho_avisc_u_c_tmp = (float4 *)&varsf4_g[BLOCK_SIZE * 2]; + + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + + int j = b + threadIdx.x; + + struct part_aos_f4_g_send pj = parts_send[j]; + x_h_tmp[threadIdx.x] = pj.x_h; + ux_m_tmp[threadIdx.x] = pj.ux_m; + rho_avisc_u_c_tmp[threadIdx.x] = pj.rho_avisc_u_c; + + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks) { + float4 x_h_j = x_h_tmp[j_block]; + float4 ux_m_j = ux_m_tmp[j_block]; + float4 rho_avisc_u_c_j = rho_avisc_u_c_tmp[j_block]; + /* Compute the pairwise distance. */ + const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, + zij = x_h_i.z - x_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + const float r = sqrt(r2); + const float r_inv = 1.f / r; + /* Recover some data */ + const float mj = ux_m_j.w; + /* Get the kernel for hi. 
*/ + const float h_inv = 1.f / hi; + float wi, wi_dx; + /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + /* Compute dv dot r */ + float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, + dvz = ux_m_i.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = rho_avisc_u_c_i.w + rho_avisc_u_c_j.w - + const_viscosity_beta * mu_ij; + /* Update if we need to */ + vsig_lapu_aviscmax_i.x = fmaxf(vsig_lapu_aviscmax_i.x, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. */ + /* Need to get some kernel values F_ij = wi_dx */ + const float ui = r * h_inv; + d_kernel_deval(ui, &wi, &wi_dx); + + const float delta_u_factor = + (rho_avisc_u_c_i.z - rho_avisc_u_c_j.z) * r_inv; + vsig_lapu_aviscmax_i.y += + mj * delta_u_factor * wi_dx / rho_avisc_u_c_j.x; + + /* Set the maximal alpha from the previous step over the neighbours + * (this is used to limit the diffusion in hydro_prepare_force) */ + const float alpha_j = rho_avisc_u_c_j.y; + vsig_lapu_aviscmax_i.z = fmaxf(vsig_lapu_aviscmax_i.z, alpha_j); + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + // printf("v %f lap %f maxvisc %f\n", vsig_lapu_aviscmax_empty_i.x, + // vsig_lapu_aviscmax_empty_i.y, vsig_lapu_aviscmax_empty_i.z); + parts_recv[pid].vsig_lapu_aviscmax = vsig_lapu_aviscmax_i; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS_F(struct part_aos_f *parts_aos, + int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int count_tasks, + int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, + double *d_cell_x, double *d_cell_y, + double *d_cell_z) { + extern __shared__ float varsf[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float ci = 0.0, cj = 0.0; + float hi = 0.0, hig2 = 0.0; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float div_vi = 0.0; + int Found_neighbours = 0; + float v_sigi; + float ui = 0.f; + float u_dti = 0.f; + float laplace_ui = 0.0; + float alpha_visc_max_ngb = 0.0; + float pressurei = 0.0; + float alphavisci = 0.0; + float alphadiffi = 0.0; + float fi = 0.0; + float balsarai = 0.0; + float ahydroxi = 0.0; + float ahydroyi = 0.0; + float ahydrozi = 0.0; + float h_dti = 0.0; + int min_ngb_time_bin = 0; + if (pid < last_part_in_task_blocks) { + ttid = task_id; + first_part = d_task_first_part[ttid]; + last_part = d_task_last_part[ttid]; + count = last_part - first_part; + cellx = d_cell_x[ttid], celly = d_cell_y[ttid], cellz = d_cell_z[ttid]; + hi = parts_aos[pid].h, 
hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + pix = parts_aos[pid].x_p - cellx; + piy = parts_aos[pid].y_p - celly; + piz = parts_aos[pid].z_p - cellz; + ci = parts_aos[pid].soundspeed; + fi = parts_aos[pid].f; + v_sigi = parts_aos[pid].v_sig; + ui = parts_aos[pid].u; + rhoi = parts_aos[pid].rho; + pressurei = parts_aos[pid].pressure; + balsarai = parts_aos[pid].balsara; + alphavisci = parts_aos[pid].alpha_visc; + alphadiffi = parts_aos[pid].alpha_diff; + min_ngb_time_bin = parts_aos[pid].min_ngb_time_bin; + // laplace_u = parts_aos[pid].laplace_u; + // alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; + } + // if (threadIdx.x == 0) { + // first_part_tid_0 = first_part; + // last_part_tid_0 = last_part; + // } + // __syncthreads(); + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&varsf[0]; + float *y_p_tmp = (float *)&varsf[BLOCK_SIZE]; + float *z_p_tmp = (float *)&varsf[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&varsf[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&varsf[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&varsf[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&varsf[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&varsf[BLOCK_SIZE * 7]; + float *cj_tmp = (float *)&varsf[BLOCK_SIZE * 8]; + float *alphavisc_tmp = (float *)&varsf[BLOCK_SIZE * 9]; + float *alphadiff_tmp = (float *)&varsf[BLOCK_SIZE * 10]; + float *u_tmp = (float *)&varsf[BLOCK_SIZE * 11]; + float *rho_tmp = (float *)&varsf[BLOCK_SIZE * 12]; + float *pressure_tmp = (float *)&varsf[BLOCK_SIZE * 13]; + float *f_tmp = (float *)&varsf[BLOCK_SIZE * 14]; + float *balsara_tmp = (float *)&varsf[BLOCK_SIZE * 15]; + int *timebin = (int *)&varsf[BLOCK_SIZE * 16]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; + // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + alphavisc_tmp[threadIdx.x] = parts_aos[j].alpha_visc; + alphadiff_tmp[threadIdx.x] = parts_aos[j].alpha_diff; + pressure_tmp[threadIdx.x] = parts_aos[j].pressure; + f_tmp[threadIdx.x] = parts_aos[j].f; + balsara_tmp[threadIdx.x] = parts_aos[j].balsara; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks) { + /* Compute the pairwise distance. 
*/ + const float pjx = x_p_tmp[j_block] - cellx; + const float pjy = y_p_tmp[j_block] - celly; + const float pjz = z_p_tmp[j_block] - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + // /* Recover some data */ + const float mj = mass_tmp[j_block]; + // /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = + d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. */ + const float hj = h_tmp[j_block]; + const float hj_inv = 1.0f / hj; + const float hjd_inv = + d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; + // /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + // /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + // /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + // + // /* Signal velocity */ + const float v_sig = + ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - fi / mj; + const float f_ji = 1.f - f_tmp[j_block] / mi; + + /* Balsara term */ + const float balsaraj = balsara_tmp[j_block]; + /* Construct the full viscosity term */ + const float rhoj = rho_tmp[j_block]; + const float pressurej = pressure_tmp[j_block]; + const float rho_ij = rhoi + rhoj; + const float alpha = alphavisci + alphavisc_tmp[j_block]; + const float visc = + -0.25f * alpha * v_sig * mu_ij * (balsarai + balsaraj) / rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float P_over_rho2_i = pressurei / (rhoi * rhoi) * f_ij; + const float P_over_rho2_j = pressurej / (rhoj * rhoj) * f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydroxi -= mj * acc * xij; + ahydroyi -= mj * acc * yij; + ahydrozi -= mj * acc * zij; + // if(rhoi == 0 || rhoj == 0 || pressurei == 0 || pressurej + // == 0)printf("ri %f rj %f pi %f pj %f\n", rhoi, rhoj, + // pressurei, pressurej); + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + const float press_sum = pressurei + pressurej; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows + * the alpha from the highest pressure particle to dominate, so that + * the diffusion limited particles always take precedence - another + * trick to allow the scheme to work with thermal feedback. 
*/ + float alpha_diff = + (pressurei * alphadiffi + pressurej * alphadiff_tmp[j_block]) / + (press_sum); + if (fabsf(press_sum) < 1e-10) alpha_diff = 0.f; + const float v_diff = + alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = + v_diff * (ui - u_tmp[j_block]) * + (f_ij * wi_dr / rhoi + f_ji * wj_dr / rhoj); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + u_dti += du_dt_i * mj; + if (mj == 0.f) printf("zero mass mj %f\n", mj); + + /* Get the time derivative for h. */ + h_dti -= mj * dvdr * r_inv / rhoj * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient + * loop but due to some possible synchronisation problems this is here + * as a _quick fix_. Added: 14th August 2019. To be removed by 1st Jan + * 2020. (JB) */ + v_sigi = max(v_sigi, v_sig); + int time_bin_j = timebin[j_block]; + if (time_bin_j > 0) + min_ngb_time_bin = min(min_ngb_time_bin, time_bin_j); + // printf("Got in\n"); + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + parts_aos[pid].v_sig = v_sigi; + parts_aos[pid].h_dt = h_dti; + parts_aos[pid].u_dt = u_dti; + parts_aos[pid].a_hydrox = ahydroxi; + parts_aos[pid].a_hydroy = ahydroyi; + parts_aos[pid].a_hydroz = ahydrozi; + parts_aos[pid].min_ngb_time_bin = min_ngb_time_bin; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS_F4_F( + struct part_aos_f4_f_send *__restrict__ parts_send, + struct part_aos_f4_f_recv *__restrict__ parts_recv, const float d_a, + const float d_H, const int bundle_first_task, + const int2 *__restrict__ d_task_first_part_f4) { + + extern __shared__ float4 varsf4_f[]; + + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + int first_part_in_task_blocks, last_part_in_task_blocks; + // first_part_in_task_blocks = d_task_first_part[task_id], + // last_part_in_task_blocks = d_task_last_part[task_id]; + int2 first_last_parts = d_task_first_part_f4[task_id]; + first_part_in_task_blocks = first_last_parts.x; + last_part_in_task_blocks = first_last_parts.y; + + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + const part_aos_f4_f_send pi = parts_send[pid]; + float4 x_h_i = pi.x_h; + float4 ux_m_i = pi.ux_m; + float4 f_b_t_mintbinngb_i = pi.f_bals_timebin_mintimebin_ngb; + float4 rho_p_c_vsig_i = pi.rho_p_c_vsigi; + float3 u_avisc_adiff_i = pi.u_alphavisc_alphadiff; + + const float mi = ux_m_i.w; + int Found_neighbours = 0; + float pressurei = rho_p_c_vsig_i.y; + const float ci = rho_p_c_vsig_i.z; + float3 ahydro = {0.0, 0.0, 0.0}; + float4 udt_hdt_vsig_mintbinngb = {0.0, 0.0, 0.0, 0.0}; + udt_hdt_vsig_mintbinngb.z = rho_p_c_vsig_i.w; + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + + float hi = x_h_i.w; + float hig2 = hi * hi * kernel_gamma2; + + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float4 *__restrict__ x_h_tmp = (float4 *)&varsf4_f[0]; + float4 *__restrict__ ux_m_tmp = (float4 *)&varsf4_f[BLOCK_SIZE]; + float4 *__restrict__ f_b_t_mintbinngb_tmp = + (float4 *)&varsf4_f[BLOCK_SIZE * 2]; + float4 *__restrict__ rho_p_c_vsig_tmp = (float4 *)&varsf4_f[BLOCK_SIZE * 3]; + float3 *__restrict__ u_avisc_adiff_tmp = (float3 *)&varsf4_f[BLOCK_SIZE * 4]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + struct part_aos_f4_f_send pj = parts_send[j]; + x_h_tmp[threadIdx.x] = pj.x_h; + ux_m_tmp[threadIdx.x] = pj.ux_m; + f_b_t_mintbinngb_tmp[threadIdx.x] = pj.f_bals_timebin_mintimebin_ngb; + rho_p_c_vsig_tmp[threadIdx.x] = pj.rho_p_c_vsigi; + // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_avisc_adiff_tmp[threadIdx.x] = pj.u_alphavisc_alphadiff; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks) { + /* Compute the pairwise distance. */ + float4 x_h_j = x_h_tmp[j_block]; + float4 ux_m_j = ux_m_tmp[j_block]; + float4 f_b_t_mintbinngb_j = f_b_t_mintbinngb_tmp[j_block]; + float4 rho_p_c_vsig_j = rho_p_c_vsig_tmp[j_block]; + float3 u_avisc_adiff_j = u_avisc_adiff_tmp[j_block]; + const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, + zij = x_h_i.z - x_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + // /* Recover some data */ + const float mj = ux_m_j.w; + // /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = + d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. */ + const float hj = x_h_j.w; + const float hj_inv = 1.0f / hj; + const float hjd_inv = + d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; + // /* Compute dv dot r */ + float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, + dvz = ux_m_i.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + // /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + // /* Are the particles moving towards each others ? 
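+           * omega_ij keeps only the approaching part of dv.r (it is <= 0), so
+           * mu_ij, and with it the viscosity term constructed below, vanishes
+           * for receding pairs.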
*/ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + // + // /* Signal velocity */ + const float cj = rho_p_c_vsig_j.z; + const float v_sig = ci + cj - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - f_b_t_mintbinngb_i.x / mj; + const float f_ji = 1.f - f_b_t_mintbinngb_j.x / mi; + + /* Construct the full viscosity term */ + const float pressurej = rho_p_c_vsig_j.y; + const float rho_ij = rho_p_c_vsig_i.x + rho_p_c_vsig_j.x; + const float alpha = u_avisc_adiff_i.y + u_avisc_adiff_j.y; + const float visc = -0.25f * alpha * v_sig * mu_ij * + (f_b_t_mintbinngb_i.y + f_b_t_mintbinngb_j.y) / + rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float rhoi2 = rho_p_c_vsig_i.x * rho_p_c_vsig_i.x; + const float rhoj2 = rho_p_c_vsig_j.x * rho_p_c_vsig_j.x; + const float P_over_rho2_i = pressurei / (rhoi2)*f_ij; + const float P_over_rho2_j = pressurej / (rhoj2)*f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydro.x -= mj * acc * xij; + ahydro.y -= mj * acc * yij; + ahydro.z -= mj * acc * zij; + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows + * the alpha from the highest pressure particle to dominate, so that + * the diffusion limited particles always take precedence - another + * trick to allow the scheme to work with thermal feedback. */ + float alpha_diff = + (pressurei * u_avisc_adiff_i.z + pressurej * u_avisc_adiff_j.z) / + (pressurei + pressurej); + if (fabsf(pressurei + pressurej) < 1e-10) alpha_diff = 0.f; + const float v_diff = + alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = v_diff * + (u_avisc_adiff_i.x - u_avisc_adiff_j.x) * + (f_ij * wi_dr / rho_p_c_vsig_i.x + + f_ji * wj_dr / rho_p_c_vsig_j.x); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + udt_hdt_vsig_mintbinngb.x += du_dt_i * mj; + + /* Get the time derivative for h. */ + udt_hdt_vsig_mintbinngb.y -= + mj * dvdr * r_inv / rho_p_c_vsig_j.x * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient + * loop but due to some possible synchronisation problems this is here + * as a _quick fix_. Added: 14th August 2019. To be removed by 1st Jan + * 2020. 
(JB) */ + udt_hdt_vsig_mintbinngb.z = fmaxf(udt_hdt_vsig_mintbinngb.z, v_sig); + unsigned int time_bin_j = (f_b_t_mintbinngb_j.z + 0.5f); + unsigned int min_tb_i = (f_b_t_mintbinngb_i.w + 0.5f); + if (time_bin_j > 0) f_b_t_mintbinngb_i.w = min(min_tb_i, time_bin_j); + // printf("Got in\n"); + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + parts_recv[pid].udt_hdt_vsig_mintimebin_ngb = udt_hdt_vsig_mintbinngb; + parts_recv[pid].a_hydro = ahydro; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_density_GPU_naive( + struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, + int *d_task_first_part_ci, int *d_task_first_part_cj, + int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, + int bid, int tid, int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int time_bin_inhibited) { + + extern __shared__ float vars[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + __shared__ int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + __shared__ int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + + first_part_in_task_blocks_ci = d_task_first_part_ci[task_id]; + last_part_in_task_blocks_ci = d_task_last_part_ci[task_id]; + first_part_in_task_blocks_cj = d_task_first_part_cj[task_id]; + last_part_in_task_blocks_cj = d_task_last_part_cj[task_id]; + + __syncthreads(); + // Now we start calculations for particles in cell i + const int pid = threadid + first_part_in_task_blocks_ci; + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + // if(pid (0.01f/128.f)*(0.01f/128.f)) { + if (r2 < hig2 && r2 > (0.01f / dx) * (0.01f / dx)) { + Found_neighbours = 1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
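+           * (The "h < dx" printf below is only a debugging check for
+           * suspiciously small smoothing lengths; it does not alter the
+           * interaction itself.)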
*/ + if (hi < 1.f / dx) printf("h < dx\n"); + // if(hi<1.f/256.f)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks_ci) { + parts_soa_ci.rho[pid] = rhoi, parts_soa_ci.rho_dh[pid] = rho_dhi; + parts_soa_ci.wcount[pid] = wcounti, + parts_soa_ci.wcount_dh[pid] = wcount_dhi; + parts_soa_ci.div_v[pid] = div_vi; + parts_soa_ci.rot_ux[pid] = rot_uxi, parts_soa_ci.rot_uy[pid] = rot_uyi; + parts_soa_ci.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_density_pair_two_kernels( + struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, + int *d_task_first_part_ci, int *d_task_first_part_cj, + int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, int bid, int block_size, + int count_tasks, int tasksperbundle, int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, + int time_bin_inhibited) { + + int max_parts = max(max_parts_j, max_parts_i); + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + /*Do ci*/ + runner_do_pair_density_GPU_naive<<>>( + parts_soa_ci, parts_soa_cj, d_task_first_part_ci, d_task_first_part_cj, + d_task_last_part_ci, d_task_last_part_cj, d_a, d_H, bid, tid, count_tasks, + tasksperbundle, nBlocks_per_task, bundle_first_task, time_bin_inhibited); + + // numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; + // gridShape = dim3(numBlocks_x, numBlocks_y); + // nBlocks_per_task = numBlocks_x; + /*Now do cj*/ + runner_do_pair_density_GPU_naive<<>>( + parts_soa_cj, parts_soa_ci, d_task_first_part_cj, d_task_first_part_ci, + d_task_last_part_cj, d_task_last_part_ci, d_a, d_H, bid, tid, count_tasks, + tasksperbundle, nBlocks_per_task, bundle_first_task, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIRGPU(struct part_soa parts_soa, int pid, + int last_part_in_task_blocks_ci, + int first_part_in_task_blocks_cj, + int last_part_in_task_blocks_cj, float d_a, float d_H, + int time_bin_inhibited, float *vars) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + + if 
(pid < last_part_in_task_blocks_ci) { + cellx = parts_soa.locx[pid], celly = parts_soa.locy[pid], + cellz = parts_soa.locz[pid]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - cellx; + piy = parts_soa.y_p[pid] - celly; + piz = parts_soa.z_p[pid] - cellz; + } + + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars[0]; + float *y_p_tmp = (float *)&vars[BLOCK_SIZE]; + float *z_p_tmp = (float *)&vars[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&vars[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&vars[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&vars[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&vars[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&vars[BLOCK_SIZE * 7]; + timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks_cj; b < last_part_in_task_blocks_cj; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_soa.x_p[j]; + y_p_tmp[threadIdx.x] = parts_soa.y_p[j]; + z_p_tmp[threadIdx.x] = parts_soa.z_p[j]; + h_tmp[threadIdx.x] = parts_soa.h[j]; + mass_tmp[threadIdx.x] = parts_soa.mass[j]; + ux_tmp[threadIdx.x] = parts_soa.ux[j]; + uy_tmp[threadIdx.x] = parts_soa.uy[j]; + uz_tmp[threadIdx.x] = parts_soa.uz[j]; + timebin[threadIdx.x] = parts_soa.time_bin[j]; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks_cj) { + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp[j_block] - cellx; + const float pjy = y_p_tmp[j_block] - celly; + const float pjz = z_p_tmp[j_block] - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + + if (r2 < hig2 && r2 > (0.01f / dx) * (0.01f / dx)) { + Found_neighbours = 1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
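+           * Further down, faci = mj * wi_dx / r is the common prefactor that
+           * weights both the dv.r contribution to the velocity divergence and
+           * the dv x r contributions to the velocity curl.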
*/ + if (hi < 1.f / dx) printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks_ci) { + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi; + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPU(struct part_soa parts_soa, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, + float d_a, float d_H, float *vars_pair, + double *d_shift_x, double *d_shift_y, + double *d_shift_z, const int task_id_tmp, + int flip_order) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + int count_i = cj_start; + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + // last_part_in_task_blocks_ci); + if (pid < ci_end) { + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - d_shift_x[task_id_tmp]; + piy = parts_soa.y_p[pid] - d_shift_y[task_id_tmp]; + piz = parts_soa.z_p[pid] - d_shift_z[task_id_tmp]; + } + + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars_pair[0]; + float *y_p_tmp = (float *)&x_p_tmp[BLOCK_SIZE]; + float *z_p_tmp = (float *)&y_p_tmp[BLOCK_SIZE]; + float *h_tmp = (float *)&z_p_tmp[BLOCK_SIZE]; + float *mass_tmp = (float *)&h_tmp[BLOCK_SIZE]; + float *ux_tmp = (float *)&mass_tmp[BLOCK_SIZE]; + float *uy_tmp = (float *)&ux_tmp[BLOCK_SIZE]; + float *uz_tmp = (float *)&uy_tmp[BLOCK_SIZE]; + timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[tid_x] = parts_soa.x_p[j]; + y_p_tmp[tid_x] = parts_soa.y_p[j]; + z_p_tmp[tid_x] = parts_soa.z_p[j]; + // h_tmp[tid_x] = parts_soa.h[j]; + mass_tmp[tid_x] = parts_soa.mass[j]; + ux_tmp[tid_x] = parts_soa.ux[j]; + uy_tmp[tid_x] = parts_soa.uy[j]; + uz_tmp[tid_x] = parts_soa.uz[j]; + timebin[tid_x] = parts_soa.time_bin[j]; + + __syncthreads(); + const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float pjx = x_p_tmp[j_block] - shift_x_j; + const float pjy = y_p_tmp[j_block] - shift_y_j; + const float pjz = z_p_tmp[j_block] - shift_z_j; + + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + // const float xij = (pix - pjx) * flip_order, yij = (piy - + // pjy) * flip_order, zij = (piz - pjz) * flip_order; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + const float r = sqrt(r2); + /* Get the kernel for hi. 
*/ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + div_vi -= faci * dvdr; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi; + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPUAOS(struct part_aos *parts_aos, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, + float d_a, float d_H, float *vars_pair_aos, + double *d_shift_x, double *d_shift_y, + double *d_shift_z, const int task_id_tmp, + int flip_order) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + int count_i = cj_start; + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + // last_part_in_task_blocks_ci); + if (pid < ci_end) { + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + pix = parts_aos[pid].x_p; // - d_shift_x[task_id_tmp]; + piy = parts_aos[pid].y_p; // - d_shift_y[task_id_tmp]; + piz = parts_aos[pid].z_p; // - d_shift_z[task_id_tmp]; + } + + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars_pair_aos[0]; + float *y_p_tmp = (float *)&x_p_tmp[BLOCK_SIZE]; + float *z_p_tmp = (float *)&y_p_tmp[BLOCK_SIZE]; + float *h_tmp = (float *)&z_p_tmp[BLOCK_SIZE]; + float *mass_tmp = (float *)&h_tmp[BLOCK_SIZE]; + float *ux_tmp = (float *)&mass_tmp[BLOCK_SIZE]; + float *uy_tmp = (float *)&ux_tmp[BLOCK_SIZE]; + float *uz_tmp = (float *)&uy_tmp[BLOCK_SIZE]; + int *timebin = (int *)&uz_tmp[BLOCK_SIZE]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[tid_x] = parts_aos[j].x_p; + y_p_tmp[tid_x] = parts_aos[j].y_p; + z_p_tmp[tid_x] = parts_aos[j].z_p; + // h_tmp[tid_x] = parts_aos[j].h; + mass_tmp[tid_x] = parts_aos[j].mass; + ux_tmp[tid_x] = parts_aos[j].ux; + uy_tmp[tid_x] = parts_aos[j].uy; + uz_tmp[tid_x] = parts_aos[j].uz; + timebin[tid_x] = parts_aos[j].time_bin; + // const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + // const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + // const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + __syncthreads(); + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float pjx = x_p_tmp[j_block]; // - shift_x_j; + const float pjy = y_p_tmp[j_block]; // - shift_y_j; + const float pjz = z_p_tmp[j_block]; // - shift_z_j; + + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + // const float xij = (pix - pjx) * flip_order, yij = (piy - + // pjy) * flip_order, zij = (piz - pjz) * flip_order; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + const float r = sqrt(r2); + /* Get the kernel for hi. 
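+           * Only particle i's sums are updated here (hence the NONSYM name);
+           * the j side is presumably covered by a second call with the two
+           * cells swapped, as in the pair launcher above.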
*/ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + div_vi -= faci * dvdr; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + // if(timebin[j_block] != 1000 && timebin[j_block] != + // 20)printf("incorrect timebin %i\n", timebin[j_block]); + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + // printf("timebin %i\n", parts_aos[pid].time_bin); + parts_aos[pid].rho = rhoi, parts_aos[pid].rho_dh = rho_dhi; + parts_aos[pid].wcount = wcounti, parts_aos[pid].wcount_dh = wcount_dhi; + parts_aos[pid].div_v = div_vi; + parts_aos[pid].rot_ux = rot_uxi, parts_aos[pid].rot_uy = rot_uyi; + parts_aos[pid].rot_uz = rot_uzi; + parts_aos[pid].time_bin = 20; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPUAOSF4( + struct part_aos_f4_send *__restrict__ parts_send, + struct part_aos_f4_recv *__restrict__ parts_recv, int pid, + const int ci_start, const int ci_end, const int cj_start, const int cj_end, + float d_a, float d_H, float4 *vars_pair_aos_f4) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + int Found_neighbours = 0; + int count_i = cj_start; + + float4 res_rho = {0.0, 0.0, 0.0, 0.0}; + float4 res_rot = {0.0, 0.0, 0.0, 0.0}; + const part_aos_f4_send pi = parts_send[pid]; + const float4 x_pi = pi.x_p_h; + const float4 ux_pi = pi.ux_m; + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + // last_part_in_task_blocks_ci); + // if (pid < ci_end) { + hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; + // } + + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float4 *__restrict__ x_p_h_tmp = (float4 *)&vars_pair_aos_f4[0]; + float4 *__restrict__ ux_m_tmp = (float4 *)&vars_pair_aos_f4[BLOCK_SIZE]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + struct part_aos_f4_send pj = parts_send[j]; + x_p_h_tmp[tid_x] = pj.x_p_h; + ux_m_tmp[tid_x] = pj.ux_m; + __syncthreads(); + /*j_block is the particle's index in the block. 
Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float4 x_p_h_j = x_p_h_tmp[j_block]; + const float4 ux_m_j = ux_m_tmp[j_block]; + + const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, + zij = x_pi.z - x_p_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = ux_m_j.w; + const float r = sqrt(r2); + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ + res_rho.x += mj * wi; + res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); + res_rho.z += wi; + res_rho.w -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, + dvz = ux_pi.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + res_rot.x += faci * curlvrx; + res_rot.y += faci * curlvry; + res_rot.z += faci * curlvrz; + res_rot.w -= faci * dvdr; + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + parts_recv[pid].rho_dh_wcount = res_rho; + parts_recv[pid].rot_ux_div_v = res_rot; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NAIVEGPUAOSF4( + const struct part_aos_f4_send pi, + struct part_aos_f4_send *__restrict__ parts_send, + struct part_aos_f4_recv *__restrict__ parts_recv, int pid, + const int cj_start, const int cj_end, float d_a, float d_H) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + int Found_neighbours = 0; + int count_i = cj_start; + + float4 res_rho = {0.0, 0.0, 0.0, 0.0}; + float4 res_rot = {0.0, 0.0, 0.0, 0.0}; + // const part_aos_f4_send pi = parts_send[pid]; + const float4 x_pi = pi.x_p_h; + const float4 ux_pi = pi.ux_m; + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + // last_part_in_task_blocks_ci); + // if (pid < ci_end) { + hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; + // } + + // printf("js %i je %i\n", cj_start, cj_end); + /*Particles copied in blocks to shared memory*/ + for (int j = cj_start; j < cj_end; j++) { + struct part_aos_f4_send pj = parts_send[j]; + + const float4 x_p_h_j = pj.x_p_h; + const float4 ux_m_j = pj.ux_m; + + const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, + zij = x_pi.z - x_p_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + // printf("r2 %f \n", r2); + if (r2 < hig2) { + /* Recover some data */ + const float mj = ux_m_j.w; + const float r = sqrt(r2); + /* Get the kernel for hi. 
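+       * res_rho accumulates (rho, rho_dh, wcount, wcount_dh) and res_rot
+       * accumulates (rot_ux, rot_uy, rot_uz, div_v); they are written back
+       * below as the packed rho_dh_wcount and rot_ux_div_v fields of the
+       * f4 recv buffer.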
*/ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ + res_rho.x += mj * wi; + res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); + res_rho.z += wi; + res_rho.w -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, + dvz = ux_pi.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + res_rot.x += faci * curlvrx; + res_rot.y += faci * curlvry; + res_rot.z += faci * curlvrz; + res_rot.w -= faci * dvdr; + } + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + // if (pid >= ci_start && pid < ci_end) { + parts_recv[pid].rho_dh_wcount = res_rho; + parts_recv[pid].rot_ux_div_v = res_rot; + // } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPUAOSG(struct part_aos_g *parts_aos, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, + float d_a, float d_H, + float *vars_pair_aosg, double *d_shift_x, + double *d_shift_y, double *d_shift_z, + const int task_id_tmp, int flip_order) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float div_vi = 0.0; + int Found_neighbours = 0; + float v_sig; + float u = 0.f; + float laplace_u = 0.0; + float alpha_visc_max_ngb = 0.0; + float ci = 0.0; + + int count_i = cj_start; + if (pid < ci_end) { + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + ci = parts_aos[pid].soundspeed; + v_sig = parts_aos[pid].v_sig; + u = parts_aos[pid].u; + laplace_u = parts_aos[pid].laplace_u; + alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; + + pix = parts_aos[pid].x_p - d_shift_x[task_id_tmp]; + piy = parts_aos[pid].y_p - d_shift_y[task_id_tmp]; + piz = parts_aos[pid].z_p - d_shift_z[task_id_tmp]; + } + + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars_pair_aosg[0]; + float *y_p_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE]; + float *z_p_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 7]; + float *cj_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 8]; + float *alpha_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 9]; + float *u_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 10]; + float *rho_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 11]; + int *timebin = (int *)&vars_pair_aosg[BLOCK_SIZE * 12]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; + alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + __syncthreads(); + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float pjx = x_p_tmp[j_block] - shift_x_j; + const float pjy = y_p_tmp[j_block] - shift_y_j; + const float pjz = z_p_tmp[j_block] - shift_z_j; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + float wi, wi_dx; + /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = + ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + /* Update if we need to */ + v_sig = max(v_sig, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. 
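+           * Accumulated below as the sum over neighbours of
+           * m_j * (u_i - u_j) / r * wi_dx / rho_j, an SPH estimate of the
+           * second derivative of u.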
*/
+          /* Need to get some kernel values F_ij = wi_dx */
+          const float ui = r * h_inv;
+          d_kernel_deval(ui, &wi, &wi_dx);
+
+          const float delta_u_factor = (u - u_tmp[j_block]) * r_inv;
+          laplace_u += mj * delta_u_factor * wi_dx / rho_tmp[j_block];
+
+          /* Set the maximal alpha from the previous step over the neighbours
+           * (this is used to limit the diffusion in hydro_prepare_force) */
+          const float alpha_j = alpha_tmp[j_block];
+          alpha_visc_max_ngb = max(alpha_visc_max_ngb, alpha_j);
+        }
+      } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/
+    }   /*End of looping through particles in shared memory---Shared arrays
+           zero'ed for next step in outer loop*/
+    __syncthreads();
+  } /*Loop through parts in cell j one BLOCK_SIZE at a time*/
+  if (pid >= ci_start && pid < ci_end) {
+    parts_aos[pid].v_sig = v_sig, parts_aos[pid].laplace_u = laplace_u;
+    parts_aos[pid].alpha_visc_max_ngb = alpha_visc_max_ngb;
+  }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__device__ void DOPAIR2NAIVEGPUAOSF4G(
+    const struct part_aos_f4_g_send pi,
+    struct part_aos_f4_g_send *__restrict__ parts_send,
+    struct part_aos_f4_g_recv *__restrict__ parts_recv, int pid,
+    const int cj_start, const int cj_end, float d_a, float d_H) {
+
+  float dx =
+      1.f / 64.f;  // Value used to avoid interacting parts with themselves
+
+  float hi = 0.0, hig2 = 0.0;
+
+  int Found_neighbours = 0;
+  int count_i = cj_start;
+
+  float4 res_rho = {0.0, 0.0, 0.0, 0.0};
+  float4 res_rot = {0.0, 0.0, 0.0, 0.0};
+  //  const part_aos_f4_send pi = parts_send[pid];
+  const float4 x_h_i = pi.x_h;
+  const float4 ux_m_i = pi.ux_m;
+  const float4 rho_avisc_u_c_i = pi.rho_avisc_u_c;
+  float3 vsig_lapu_aviscmax_i = {0.f, 0.f, 0.f};
+
+  //  printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i
+  //  last_part_in_task_blocks_ci %i\n",
+  //         first_part_in_task_blocks_cj, last_part_in_task_blocks_cj,
+  //         last_part_in_task_blocks_ci);
+  //  if (pid < ci_end) {
+  hi = x_h_i.w, hig2 = hi * hi * kernel_gamma2;
+  //  }
+
+  //  printf("js %i je %i\n", cj_start, cj_end);
+  /* Loop directly over the particles of cell j (no shared-memory staging in
+   * this naive variant) */
+  for (int j = cj_start; j < cj_end; j++) {
+    struct part_aos_f4_g_send pj = parts_send[j];
+
+    const float4 x_h_j = pj.x_h;
+    const float4 ux_m_j = pj.ux_m;
+    const float4 rho_avisc_u_c_j = pj.rho_avisc_u_c;
+    const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y,
+                zij = x_h_i.z - x_h_j.z;
+    const float r2 = xij * xij + yij * yij + zij * zij;
+    //    printf("r2 %f \n", r2);
+    if (r2 < hig2) {
+      const float r = sqrt(r2);
+      const float r_inv = 1.f / r;
+      /* Recover some data */
+      const float mj = ux_m_j.w;
+      /* Get the kernel for hi. */
+      const float h_inv = 1.f / hi;
+      float wi, wi_dx;
+      /* Cosmology terms for the signal velocity */
+      const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a);
+      const float a2_Hubble = d_a * d_a * d_H;
+      /* Compute dv dot r */
+      float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y,
+            dvz = ux_m_i.z - ux_m_j.z;
+      const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+      /* Add Hubble flow */
+      const float dvdr_Hubble = dvdr + a2_Hubble * r2;
+      /* Are the particles moving towards each other?
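+       * (In this packed variant rho_avisc_u_c holds (rho, alpha_visc, u,
+       * soundspeed) and vsig_lapu_aviscmax collects (v_sig, laplace_u,
+       * alpha_visc_max_ngb), mirroring the unpacked gradient kernel above.)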
*/ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = + rho_avisc_u_c_i.w + rho_avisc_u_c_j.w - const_viscosity_beta * mu_ij; + /* Update if we need to */ + vsig_lapu_aviscmax_i.x = fmaxf(vsig_lapu_aviscmax_i.x, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. */ + /* Need to get some kernel values F_ij = wi_dx */ + const float ui = r * h_inv; + d_kernel_deval(ui, &wi, &wi_dx); + + const float delta_u_factor = + (rho_avisc_u_c_i.z - rho_avisc_u_c_j.z) * r_inv; + vsig_lapu_aviscmax_i.y += mj * delta_u_factor * wi_dx / rho_avisc_u_c_j.x; + + /* Set the maximal alpha from the previous step over the neighbours + * (this is used to limit the diffusion in hydro_prepare_force) */ + const float alpha_j = rho_avisc_u_c_j.y; + vsig_lapu_aviscmax_i.z = fmaxf(vsig_lapu_aviscmax_i.z, alpha_j); + } + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + // if (pid >= ci_start && pid < ci_end) { + parts_recv[pid].vsig_lapu_aviscmax = vsig_lapu_aviscmax_i; + // } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPUAOSF(struct part_aos_f *parts_aos, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, + float d_a, float d_H, + float *vars_pair_aosf, double *d_shift_x, + double *d_shift_y, double *d_shift_z, + const int task_id_tmp, int flip_order) { + + float ci = 0.0, cj = 0.0; + float hi = 0.0, hig2 = 0.0; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float div_vi = 0.0; + int Found_neighbours = 0; + float v_sigi; + float ui = 0.f; + float u_dti = 0.f; + float laplace_ui = 0.0; + float alpha_visc_max_ngb = 0.0; + float pressurei = 0.0; + float alphavisci = 0.0; + float alphadiffi = 0.0; + float fi = 0.0; + float balsarai = 0.0; + float ahydroxi = 0.0; + float ahydroyi = 0.0; + float ahydrozi = 0.0; + float h_dti = 0.0; + int min_ngb_time_bin = 0; + if (pid < ci_end) { + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + ci = parts_aos[pid].soundspeed; + fi = parts_aos[pid].f; + v_sigi = parts_aos[pid].v_sig; + ui = parts_aos[pid].u; + rhoi = parts_aos[pid].rho; + pressurei = parts_aos[pid].pressure; + balsarai = parts_aos[pid].balsara; + alphavisci = parts_aos[pid].alpha_visc; + alphadiffi = parts_aos[pid].alpha_diff; + min_ngb_time_bin = parts_aos[pid].min_ngb_time_bin; + pix = parts_aos[pid].x_p - d_shift_x[task_id_tmp]; + piy = parts_aos[pid].y_p - d_shift_y[task_id_tmp]; + piz = parts_aos[pid].z_p - d_shift_z[task_id_tmp]; + } + // if (threadIdx.x == 0) { + // first_part_tid_0 = first_part; + // last_part_tid_0 = last_part; + // } + // __syncthreads(); + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars_pair_aosf[0]; + float *y_p_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE]; + float *z_p_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 7]; + float *cj_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 8]; + float *alphavisc_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 9]; + float *alphadiff_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 10]; + float *u_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 11]; + float *rho_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 12]; + float *pressure_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 13]; + float *f_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 14]; + float *balsara_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 15]; + int *timebin = (int *)&vars_pair_aosf[BLOCK_SIZE * 16]; + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; + // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + alphavisc_tmp[threadIdx.x] = parts_aos[j].alpha_visc; + alphadiff_tmp[threadIdx.x] = parts_aos[j].alpha_diff; + pressure_tmp[threadIdx.x] = parts_aos[j].pressure; + f_tmp[threadIdx.x] = parts_aos[j].f; + balsara_tmp[threadIdx.x] = parts_aos[j].balsara; + const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + __syncthreads(); + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp[j_block] - shift_x_j; + const float pjy = y_p_tmp[j_block] - shift_y_j; + const float pjz = z_p_tmp[j_block] - shift_z_j; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + // /* Recover some data */ + const float mj = mass_tmp[j_block]; + // /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = + d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. 
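+           * Both kernel gradients are needed in the force loop: wi_dr and
+           * wj_dr enter the pressure and viscosity terms below, weighted by
+           * the variable smoothing length factors f_ij and f_ji.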
*/ + const float hj = h_tmp[j_block]; + const float hj_inv = 1.0f / hj; + const float hjd_inv = + d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; + // /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + // /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + // /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + // + // /* Signal velocity */ + const float v_sig = + ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - fi / mj; + const float f_ji = 1.f - f_tmp[j_block] / mi; + + /* Balsara term */ + const float balsaraj = balsara_tmp[j_block]; + /* Construct the full viscosity term */ + const float rhoj = rho_tmp[j_block]; + const float pressurej = pressure_tmp[j_block]; + const float rho_ij = rhoi + rhoj; + const float alpha = alphavisci + alphavisc_tmp[j_block]; + const float visc = + -0.25f * alpha * v_sig * mu_ij * (balsarai + balsaraj) / rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float P_over_rho2_i = pressurei / (rhoi * rhoi) * f_ij; + const float P_over_rho2_j = pressurej / (rhoj * rhoj) * f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydroxi -= mj * acc * xij; + ahydroyi -= mj * acc * yij; + ahydrozi -= mj * acc * zij; + // if(rhoi == 0 || rhoj == 0 || pressurei == 0 || pressurej + // == 0)printf("ri %f rj %f pi %f pj %f\n", rhoi, rhoj, + // pressurei, pressurej); + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + const float press_sum = pressurei + pressurej; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows + * the alpha from the highest pressure particle to dominate, so that + * the diffusion limited particles always take precedence - another + * trick to allow the scheme to work with thermal feedback. */ + float alpha_diff = + (pressurei * alphadiffi + pressurej * alphadiff_tmp[j_block]) / + (press_sum); + if (fabsf(press_sum) < 1e-10) alpha_diff = 0.f; + const float v_diff = + alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = + v_diff * (ui - u_tmp[j_block]) * + (f_ij * wi_dr / rhoi + f_ji * wj_dr / rhoj); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + u_dti += du_dt_i * mj; + if (mj == 0.f) printf("zero mass mj %f\n", mj); + + /* Get the time derivative for h. */ + h_dti -= mj * dvdr * r_inv / rhoj * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient + * loop but due to some possible synchronisation problems this is here + * as a _quick fix_. 
Added: 14th August 2019. To be removed by 1st Jan + * 2020. (JB) */ + v_sigi = max(v_sigi, v_sig); + int time_bin_j = timebin[j_block]; + if (time_bin_j > 0) + min_ngb_time_bin = min(min_ngb_time_bin, time_bin_j); + // printf("Got in\n"); + } + } + } + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + parts_aos[pid].v_sig = v_sigi; + parts_aos[pid].h_dt = h_dti; + parts_aos[pid].u_dt = u_dti; + parts_aos[pid].a_hydrox = ahydroxi; + parts_aos[pid].a_hydroy = ahydroyi; + parts_aos[pid].a_hydroz = ahydrozi; + parts_aos[pid].min_ngb_time_bin = min_ngb_time_bin; + // printf("%f %f %f %f %f %f\n", v_sigi, h_dti, u_dti, ahydroxi, + // ahydroyi, ahydrozi); + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NAIVEGPUAOSF4F( + const struct part_aos_f4_f_send pi, + struct part_aos_f4_f_send *__restrict__ parts_send, + struct part_aos_f4_f_recv *__restrict__ parts_recv, int pid, + const int cj_start, const int cj_end, float d_a, float d_H) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + int Found_neighbours = 0; + + // const part_aos_f4_send pi = parts_send[pid]; + const float4 x_h_i = pi.x_h; + const float4 ux_m_i = pi.ux_m; + + float4 f_b_t_mintbinngb_i = pi.f_bals_timebin_mintimebin_ngb; + const float4 rho_p_c_vsig_i = pi.rho_p_c_vsigi; + const float3 u_avisc_adiff_i = pi.u_alphavisc_alphadiff; + + const float mi = ux_m_i.w; + const float pressurei = rho_p_c_vsig_i.y; + const float ci = rho_p_c_vsig_i.z; + float3 ahydro = {0.0, 0.0, 0.0}; + float4 udt_hdt_vsig_mintbinngb = {0.0, 0.0, 0.0, 0.0}; + udt_hdt_vsig_mintbinngb.z = rho_p_c_vsig_i.w; + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + + const float hi = x_h_i.w; + const float hig2 = hi * hi * kernel_gamma2; + + // printf("js %i je %i\n", cj_start, cj_end); + /*Particles copied in blocks to shared memory*/ + for (int j = cj_start; j < cj_end; j++) { + struct part_aos_f4_f_send pj = parts_send[j]; + const float4 x_h_j = pj.x_h; + const float4 ux_m_j = pj.ux_m; + const float4 f_b_t_mintbinngb_j = pj.f_bals_timebin_mintimebin_ngb; + const float4 rho_p_c_vsig_j = pj.rho_p_c_vsigi; + // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + const float3 u_avisc_adiff_j = pj.u_alphavisc_alphadiff; + const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, + zij = x_h_i.z - x_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + // printf("r2 %f \n", r2); + if (r2 < hig2) { + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + // /* Recover some data */ + const float mj = ux_m_j.w; + // /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. 
*/ + const float hj = x_h_j.w; + const float hj_inv = 1.0f / hj; + const float hjd_inv = d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; + // /* Compute dv dot r */ + float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, + dvz = ux_m_i.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + // /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + // /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + // + // /* Signal velocity */ + const float cj = rho_p_c_vsig_j.z; + const float v_sig = ci + cj - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - f_b_t_mintbinngb_i.x / mj; + const float f_ji = 1.f - f_b_t_mintbinngb_j.x / mi; + + /* Construct the full viscosity term */ + const float pressurej = rho_p_c_vsig_j.y; + const float rho_ij = rho_p_c_vsig_i.x + rho_p_c_vsig_j.x; + const float alpha = u_avisc_adiff_i.y + u_avisc_adiff_j.y; + const float visc = -0.25f * alpha * v_sig * mu_ij * + (f_b_t_mintbinngb_i.y + f_b_t_mintbinngb_j.y) / rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float rhoi2 = rho_p_c_vsig_i.x * rho_p_c_vsig_i.x; + const float rhoj2 = rho_p_c_vsig_j.x * rho_p_c_vsig_j.x; + const float P_over_rho2_i = pressurei / (rhoi2)*f_ij; + const float P_over_rho2_j = pressurej / (rhoj2)*f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydro.x -= mj * acc * xij; + ahydro.y -= mj * acc * yij; + ahydro.z -= mj * acc * zij; + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows the + * alpha from the highest pressure particle to dominate, so that the + * diffusion limited particles always take precedence - another trick to + * allow the scheme to work with thermal feedback. */ + float alpha_diff = + (pressurei * u_avisc_adiff_i.z + pressurej * u_avisc_adiff_j.z) / + (pressurei + pressurej); + if (fabsf(pressurei + pressurej) < 1e-10) alpha_diff = 0.f; + const float v_diff = alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = + v_diff * (u_avisc_adiff_i.x - u_avisc_adiff_j.x) * + (f_ij * wi_dr / rho_p_c_vsig_i.x + f_ji * wj_dr / rho_p_c_vsig_j.x); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + udt_hdt_vsig_mintbinngb.x += du_dt_i * mj; + + /* Get the time derivative for h. */ + udt_hdt_vsig_mintbinngb.y -= mj * dvdr * r_inv / rho_p_c_vsig_j.x * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient loop + * but due to some possible synchronisation problems this is here as a + * _quick fix_. Added: 14th August 2019. To be removed by 1st Jan 2020. 
+ * (JB) */ + udt_hdt_vsig_mintbinngb.z = fmaxf(udt_hdt_vsig_mintbinngb.z, v_sig); + unsigned int time_bin_j = (f_b_t_mintbinngb_j.z + 0.5f); + unsigned int min_tb_i = (f_b_t_mintbinngb_i.w + 0.5f); + if (time_bin_j > 0) f_b_t_mintbinngb_i.w = min(min_tb_i, time_bin_j); + // printf("Got in\n"); + } + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + // if (pid >= ci_start && pid < ci_end) { + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + parts_recv[pid].udt_hdt_vsig_mintimebin_ngb = udt_hdt_vsig_mintbinngb; + parts_recv[pid].a_hydro = ahydro; + // } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2GPU(struct part_soa parts_soa, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, float d_a, + float d_H, int time_bin_inhibited, float *vars_pair, + double *d_shift_x, double *d_shift_y, + double *d_shift_z, const int task_id_tmp) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float cellxj = 0.0, cellyj = 0.0, cellzj = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + double pix = 0.0; + double piy = 0.0; + double piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + int count_i = cj_start; + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + // last_part_in_task_blocks_ci); + + if (pid < ci_end) { + cellx = parts_soa.locx[pid]; + celly = parts_soa.locy[pid]; + cellz = parts_soa.locz[pid]; + const int j = cj_start; + cellxj = parts_soa.locx[j]; + cellyj = parts_soa.locy[j]; + cellzj = parts_soa.locz[j]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - d_shift_x[task_id_tmp]; + piy = parts_soa.y_p[pid] - d_shift_y[task_id_tmp]; + piz = parts_soa.z_p[pid] - d_shift_z[task_id_tmp]; + } + + int n_neighbours = 0; + float av_dist = 0.f; + float av_distx = 0.f; + float av_disty = 0.f; + float av_distz = 0.f; + float distby2h = 0.f; + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + double *x_p_tmp = (double *)&vars_pair[0]; + double *y_p_tmp = (double *)&x_p_tmp[BLOCK_SIZE]; + double *z_p_tmp = (double *)&y_p_tmp[BLOCK_SIZE]; + float *h_tmp = (float *)&z_p_tmp[BLOCK_SIZE]; + float *mass_tmp = (float *)&h_tmp[BLOCK_SIZE]; + float *ux_tmp = (float *)&mass_tmp[BLOCK_SIZE]; + float *uy_tmp = (float *)&ux_tmp[BLOCK_SIZE]; + float *uz_tmp = (float *)&uy_tmp[BLOCK_SIZE]; + timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE]; + float *rho_tmp = (float *)&timebin[BLOCK_SIZE]; + float *rho_dh_tmp = (float *)&rho_tmp[BLOCK_SIZE]; + float *wcount_tmp = (float *)&rho_dh_tmp[BLOCK_SIZE]; + float *wcount_dh_tmp = (float *)&wcount_tmp[BLOCK_SIZE]; + float *div_v_tmp = (float *)&wcount_dh_tmp[BLOCK_SIZE]; + float *rot_ux_tmp = (float *)&div_v_tmp[BLOCK_SIZE]; + float *rot_uy_tmp = (float *)&rot_ux_tmp[BLOCK_SIZE]; + float *rot_uz_tmp = (float *)&rot_uy_tmp[BLOCK_SIZE]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[tid_x] = parts_soa.x_p[j]; + y_p_tmp[tid_x] = parts_soa.y_p[j]; + z_p_tmp[tid_x] = parts_soa.z_p[j]; + h_tmp[tid_x] = parts_soa.h[j]; + mass_tmp[tid_x] = parts_soa.mass[j]; + ux_tmp[tid_x] = parts_soa.ux[j]; + uy_tmp[tid_x] = parts_soa.uy[j]; + uz_tmp[tid_x] = parts_soa.uz[j]; + timebin[tid_x] = parts_soa.time_bin[j]; + rho_tmp[tid_x] = 0.f; + rho_dh_tmp[tid_x] = 0.f; + wcount_tmp[tid_x] = 0.f; + wcount_dh_tmp[tid_x] = 0.f; + div_v_tmp[tid_x] = 0.f; + rot_ux_tmp[tid_x] = 0.f; + rot_uy_tmp[tid_x] = 0.f; + rot_uz_tmp[tid_x] = 0.f; + __syncthreads(); + const double shift_x_j = d_shift_x[task_id_tmp + 1]; + const double shift_y_j = d_shift_y[task_id_tmp + 1]; + const double shift_z_j = d_shift_z[task_id_tmp + 1]; + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const double pjx = x_p_tmp[j_block] - shift_x_j; + const double pjy = y_p_tmp[j_block] - shift_y_j; + const double pjz = z_p_tmp[j_block] - shift_z_j; + + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + // const float xij = pjx - pix, yij = pjy - piy, zij = pjz + //- piz; + const float r2 = xij * xij + yij * yij + zij * zij; + const float hj = h_tmp[j_block]; + const float hjg2 = hj * hj * kernel_gamma2; + // if(r2 > 32.f * hig2 && hig2 != 0.f) printf("x %f y %f z + //%f r %f hig2 %f\n", xij/dx, yij/dx, zij/dx, sqrt(r2)/dx); + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + const float r = sqrt(r2); + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
*/ + // if(hi<1.f/dx)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + div_vi -= faci * dvdr; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + // + } + if (r2 < hjg2) { + /* Recover some data */ + /* Get the kernel for hi. */ + const float hj_inv = 1.f / hj; + const float uj = r * hj_inv; + float wj, wj_dx; + + d_kernel_deval(uj, &wj, &wj_dx); + + // atomicAdd(&rho_tmp[j_block], mi * wj); + atomicAdd(&parts_soa.rho[j], mi * wj); + // atomicAdd(&rho_dh_tmp[j_block], -mi * (hydro_dimension + //* wj + uj * wj_dx)); + atomicAdd(&parts_soa.rho_dh[j], + -mi * (hydro_dimension * wj + uj * wj_dx)); + + // atomicAdd(&wcount_tmp[j_block], wj); + atomicAdd(&parts_soa.wcount[j], wj); + // atomicAdd(&wcount_dh_tmp[j_block], -(hydro_dimension * + // wj + uj * wj_dx)); + atomicAdd(&parts_soa.wcount_dh[j], + -(hydro_dimension * wj + uj * wj_dx)); + + const float r_inv = 1.f / r; + const float facj = mi * wj_dx * r_inv; + + // atomicAdd(&div_v_tmp[j_block], -facj * dvdr); + atomicAdd(&parts_soa.div_v[j], -facj * dvdr); + + // atomicAdd(&rot_ux_tmp[j_block], facj * curlvrx); + // atomicAdd(&rot_uy_tmp[j_block], facj * curlvry); + // atomicAdd(&rot_uz_tmp[j_block], facj * curlvrz); + atomicAdd(&parts_soa.rot_ux[j], facj * curlvrx); + atomicAdd(&parts_soa.rot_uy[j], facj * curlvry); + atomicAdd(&parts_soa.rot_uz[j], facj * curlvrz); + // printf("rho %f rho_dh %f wcount %f wcount_dh %f div_v + //%f rotux %f rotuy %f rotuz %f\n" ,rhoi, + // rho_dhi, wcounti, wcount_dhi, div_vi, rot_uxi, rot_uyi, rot_uzi); + } /*if r2= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); + // if(j < cj_end){ + // atomicAdd(&parts_soa.rho[j], rho_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.rho_dh[j], rho_dh_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.wcount[j], wcount_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.wcount_dh[j], wcount_dh_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.div_v[j], div_v_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.rot_ux[j], rot_ux_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.rot_uy[j], rot_uy_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.rot_uz[j], rot_uz_tmp[threadIdx.x]); + // } + // __syncthreads(); + // parts_soa.rho[j] += rho_tmp[threadIdx.x]; + // parts_soa.rho_dh[j] += rho_dh_tmp[threadIdx.x]; + // parts_soa.wcount[j] += wcount_tmp[threadIdx.x]; + // parts_soa.wcount_dh[j] =+ wcount_dh_tmp[threadIdx.x]; + // parts_soa.div_v[j] += div_v_tmp[threadIdx.x]; + // parts_soa.rot_ux[j] += rot_ux_tmp[threadIdx.x]; + // parts_soa.rot_uy[j] =+ rot_uy_tmp[threadIdx.x]; + // parts_soa.rot_uz[j] += rot_uz_tmp[threadIdx.x]; + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + // if(n_neighbours > 0){ + // distby2h = distby2h/n_neighbours; + // av_dist = av_dist/(n_neighbours*dx); + // } + // av_distx = av_distx/(n_neighbours*dx); + // av_disty = av_disty/(n_neighbours*dx); + // av_distz = av_distz/(n_neighbours*dx); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, 
parts_soa.rot_uy[pid] = rot_uyi; + parts_soa.rot_uz[pid] = rot_uzi; + // if(rhoi != 0.f)printf("rho i %f, rho_dh i %f\n", rhoi, rho_dhi); + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_density_GPU( + struct part_soa parts_soa, int *d_task_first_part_ci, + int *d_task_first_part_cj, int *d_task_last_part_ci, + int *d_task_last_part_cj, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int time_bin_inhibited) { + + extern __shared__ float vars[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + + first_part_in_task_blocks_ci = d_task_first_part_ci[task_id]; + last_part_in_task_blocks_ci = d_task_last_part_ci[task_id]; + first_part_in_task_blocks_cj = d_task_first_part_cj[task_id]; + last_part_in_task_blocks_cj = d_task_last_part_cj[task_id]; + + // Now we start calculations for particles in cell i + const int pid = threadid + first_part_in_task_blocks_ci; + + /*Don't ever put me in an if statement. I've got __syncthreads inside*/ + DOPAIRGPU(parts_soa, pid, last_part_in_task_blocks_ci, + first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, d_a, d_H, + time_bin_inhibited, vars); + // __syncthreads(); + // Now we start calculations for particles in cell i + const int pjd = threadid + last_part_in_task_blocks_ci; + /*Don't ever put me in an if statement. I've got __syncthreads inside*/ + DOPAIRGPU(parts_soa, pjd, last_part_in_task_blocks_cj, + first_part_in_task_blocks_ci, last_part_in_task_blocks_ci, d_a, d_H, + time_bin_inhibited, vars); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_sym_density_GPU( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int time_bin_inhibited, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + // Now we start calculations for particles in cell i + const int pid = threadid + ci_start; + + /*Don't ever put me in an if statement. 
I've got __syncthreads inside*/ + DOPAIR2GPU(parts_soa, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H, + time_bin_inhibited, vars_pair, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp); + // __syncthreads(); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_nonsym_density_GPU( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int time_bin_inhibited, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPU(parts_soa, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, + flip_i); + + /*Necessary evil to stop parts from j and i co-existing on shared memory for + * sums*/ + __syncthreads(); + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPU(parts_soa, pjd, cj_start, cj_end, ci_start, ci_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, + flip_j); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPU(parts_soa, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, + flip_i); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPU(parts_soa, pjd, cj_start, cj_end, ci_start, ci_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, + flip_j); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair_aos[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPUAOS(parts_aos, pid, ci_start, ci_end, cj_start, cj_end, d_a, + d_H, vars_pair_aos, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp, flip_i); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair_aos[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPUAOS(parts_aos, pjd, cj_start, cj_end, ci_start, ci_end, d_a, + d_H, vars_pair_aos, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp + 1, flip_j); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + int4 *fparti_fpartj_lparti_lpartj_dens, float d_a, float d_H, + int bundle_first_task) { + + extern __shared__ float4 vars_pair_i_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int ci_start = fparti_fpartj_lparti_lpartj_dens[task_id].x; + const int cj_start = fparti_fpartj_lparti_lpartj_dens[task_id].y; + const int ci_end = fparti_fpartj_lparti_lpartj_dens[task_id].z; + const int cj_end = fparti_fpartj_lparti_lpartj_dens[task_id].w; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + + DOPAIR2NONSYMGPUAOSF4(parts_send, parts_recv, pid, ci_start, ci_end, cj_start, + cj_end, d_a, d_H, vars_pair_i_f4); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + int4 *fparti_fpartj_lparti_lpartj_dens, float d_a, float d_H, + int bundle_first_task) { + + extern __shared__ float4 vars_pair_j_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + const int ci_start = fparti_fpartj_lparti_lpartj_dens[task_id].x; + const int cj_start = fparti_fpartj_lparti_lpartj_dens[task_id].y; + const int ci_end = fparti_fpartj_lparti_lpartj_dens[task_id].z; + const int cj_end = fparti_fpartj_lparti_lpartj_dens[task_id].w; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + DOPAIR2NONSYMGPUAOSF4(parts_send, parts_recv, pjd, cj_start, cj_end, ci_start, + ci_end, d_a, d_H, vars_pair_j_f4); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_density_GPU_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, int bundle_first_part, int bundle_n_parts) { + + // extern __shared__ float4 vars_pair_i_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int pid = bundle_first_part + threadid; + // const int task_id = bundle_first_part + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + if (pid < bundle_first_part + bundle_n_parts) { + const struct part_aos_f4_send pi = parts_send[pid]; + const int cj_start = pi.cjs_cje.x; + const int cj_end = pi.cjs_cje.y; + + /* Start calculations for particles in cell i*/ + DOPAIR2NAIVEGPUAOSF4(pi, parts_send, parts_recv, pid, cj_start, cj_end, d_a, + d_H); + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair_aosg[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPUAOSG(parts_aos, pid, ci_start, ci_end, cj_start, cj_end, d_a, + d_H, vars_pair_aosg, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp, flip_i); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair_aosg[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPUAOSG(parts_aos, pjd, cj_start, cj_end, ci_start, ci_end, d_a, + d_H, vars_pair_aosg, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp + 1, flip_j); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_gradient_GPU_aos_f4( + struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, + int bundle_first_part, int bundle_n_parts) { + + // extern __shared__ float4 vars_pair_i_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int pid = bundle_first_part + threadid; + // const int task_id = bundle_first_part + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + if (pid < bundle_first_part + bundle_n_parts) { + const struct part_aos_f4_g_send pi = parts_send[pid]; + const int cj_start = pi.cjs_cje.x; + const int cj_end = pi.cjs_cje.y; + /* Start calculations for particles in cell i*/ + DOPAIR2NAIVEGPUAOSF4G(pi, parts_send, parts_recv, pid, cj_start, cj_end, + d_a, d_H); + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair_aosf[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = 
d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPUAOSF(parts_aos, pid, ci_start, ci_end, cj_start, cj_end, d_a, + d_H, vars_pair_aosf, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp, flip_i); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair_aosf[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPUAOSF(parts_aos, pjd, cj_start, cj_end, ci_start, ci_end, d_a, + d_H, vars_pair_aosf, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp + 1, flip_j); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_force_GPU_aos_f4( + struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H, + int bundle_first_part, int bundle_n_parts) { + + // extern __shared__ float4 vars_pair_i_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int pid = bundle_first_part + threadid; + // const int task_id = bundle_first_part + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + if (pid < bundle_first_part + bundle_n_parts) { + const struct part_aos_f4_f_send pi = parts_send[pid]; + const int cj_start = pi.cjs_cje.x; + const int cj_end = pi.cjs_cje.y; + /* Start calculations for particles in cell i */ + DOPAIR2NAIVEGPUAOSF4F(pi, parts_send, parts_recv, pid, cj_start, cj_end, + d_a, d_H); + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopair1_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, int time_bin_inhibited, + double *d_shift_x, double *d_shift_y, double *d_shift_z) { + + int max_parts = max(max_parts_j, max_parts_i); + int numBlocks_x = (max_parts + 
BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + // fprintf(stderr,"max_parts %i, max_partsi %i, max_partsj %i\n, " + // "numBlocks_x %i, numBlocks_y %i, BLOCK_SIZE %i\n", max_parts, + // max_parts_i, max_parts_j, numBlocks_x, numBlocks_y, + // BLOCK_SIZE); + + /*Do ci & cj*/ + // fprintf(stderr, "BLOCK_SIZE %i max parts %i num idle threads %i\n", + // BLOCK_SIZE, max_parts, numBlocks_x * BLOCK_SIZE - max_parts); + + // runner_do_pair_sym_density_GPU<<>>( + // parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, + // d_a, d_H, bid, tid, count_tasks, tasksperbundle, + // nBlocks_per_task, bundle_first_task, time_bin_inhibited, d_shift_x, + // d_shift_y, d_shift_z); + + runner_do_pair_nonsym_density_GPU<<>>( + parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + time_bin_inhibited, d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_i; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_ci_density_GPU<<>>( + parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_j; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_cj_density_GPU<<>>( + parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_i; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_ci_density_GPU_aos<<>>( + parts_aos, 
d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_j; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_cj_density_GPU_aos<<>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_ci_density_GPU_aos_f4<<< + gridShape, BLOCK_SIZE, 2 * BLOCK_SIZE * sizeof(float4), stream>>>( + parts_send, parts_recv, fparti_fpartj_lparti_lpartj_dens, d_a, d_H, + bundle_first_task); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_cj_density_GPU_aos_f4<<< + gridShape, BLOCK_SIZE, 2 * BLOCK_SIZE * sizeof(float4), stream>>>( + parts_send, parts_recv, fparti_fpartj_lparti_lpartj_dens, d_a, d_H, + bundle_first_task); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopair_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + // fprintf(stderr, "nblocks %i\n", numBlocks_x); + runner_do_pair_density_GPU_aos_f4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_i; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + 
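+  /* Launch geometry: blockIdx.x walks the particles of the larger cell in
+   * BLOCK_SIZE-wide slabs (numBlocks_x is the ceiling of max_parts /
+   * BLOCK_SIZE), while blockIdx.y selects the task within this bundle, so
+   * the kernel reconstructs task_id = bundle_first_task + blockIdx.y and
+   * pid = blockDim.x * blockIdx.x + threadIdx.x + cell start index. */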
dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_ci_density_GPU_aos_g<<< + gridShape, BLOCK_SIZE, + 12 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_j; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_cj_density_GPU_aos_g<<< + gridShape, BLOCK_SIZE, + 12 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopair_branch_gradient_gpu_aos_f4( + struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + // fprintf(stderr, "nblocks %i\n", numBlocks_x); + runner_do_pair_gradient_GPU_aos_f4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_i; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_ci_density_GPU_aos_f<<< + gridShape, BLOCK_SIZE, + 17 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int 
max_parts = max_parts_j; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_cj_density_GPU_aos_f<<< + gridShape, BLOCK_SIZE, + 17 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopair_branch_force_gpu_aos_f4( + struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + // fprintf(stderr, "nblocks %i\n", numBlocks_x); + runner_do_pair_force_GPU_aos_f4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif + +__global__ void runner_do_self_density_GPU_naive( + struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int bid, int tid, int count_tasks, int tasksperbundle, + int nBlocks_per_task, int bundle_first_task, int max_parts, + int time_bin_inhibited) { + + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id]; + last_part_in_task_blocks = d_task_last_part[task_id]; + + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + + if (pid < last_part_in_task_blocks) { + ttid = parts_soa.tid_p[pid]; + first_part = d_task_first_part[ttid]; + last_part = d_task_last_part[ttid]; + count = last_part - first_part; + cellx = parts_soa.locx[pid], celly = parts_soa.locy[pid], + cellz = parts_soa.locz[pid]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - cellx; + piy = parts_soa.y_p[pid] - celly; + piz = parts_soa.z_p[pid] - cellz; + + int n_neighbours = 0; + + /*Naive loop over neighbours*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int j = j_block + b; + if (j < last_part_in_task_blocks) { + const float x_p_tmp = parts_soa.x_p[j]; + const float y_p_tmp = parts_soa.y_p[j]; + const float z_p_tmp = parts_soa.z_p[j]; + const float h_tmp = parts_soa.h[j]; + const float mass_tmp = parts_soa.mass[j]; + const float ux_tmp = parts_soa.ux[j]; + const float uy_tmp = parts_soa.uy[j]; + const float uz_tmp = 
parts_soa.uz[j]; + const timebin_t timebin = parts_soa.time_bin[j]; + + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp - cellx; + const float pjy = y_p_tmp - celly; + const float pjz = z_p_tmp - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + const float hj = h_tmp, hjg2 = hj * hj * kernel_gamma2; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + Found_neighbours = 1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp; + /* Get the kernel for hi. */ + if (hi < 1.f / 128.f) printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp, dvy = uyi - uy_tmp, dvz = uzi - uz_tmp; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + } + // float wi, wi_dx; + // d_kernel_deval(0.f, &wi, &wi_dx); + if (Found_neighbours == 0) + printf("Not sure what's going on but no neighbours found in GPU loop\n"); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi, + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_tester_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, int offset, + int bundle_first_task, int max_parts, + int time_bin_inhibited) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + tester<<>>(parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, + bid, tid, count_tasks, tasksperbundle, nBlocks_per_task, + bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + runner_do_self_density_GPU<<>>( + parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, count_tasks, + tasksperbundle, nBlocks_per_task, bundle_first_task, max_parts); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_gradient_aos(struct part_aos_g 
*parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts, double *d_cell_x, + double *d_cell_y, double *d_cell_z) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_G<<>>(parts_aos, d_task_first_part, d_task_last_part, + d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, + d_cell_x, d_cell_y, d_cell_z); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_gradient_aos_f4(struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F4_G<<>>(parts_send, parts_recv, d_a, d_H, + bundle_first_task, d_task_first_part_f4); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_force_aos(struct part_aos_f *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int bundle_first_task, + int max_parts, double *d_cell_x, double *d_cell_y, + double *d_cell_z) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F<<>>(parts_aos, d_task_first_part, d_task_last_part, + d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, + d_cell_x, d_cell_y, d_cell_z); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_force_aos_f4(struct part_aos_f4_f_send *d_parts_send, + struct part_aos_f4_f_recv *d_parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F4_F<<< + gridShape, BLOCK_SIZE, + 4 * BLOCK_SIZE * sizeof(float4) + BLOCK_SIZE * sizeof(float3), stream>>>( + d_parts_send, d_parts_recv, d_a, d_H, bundle_first_task, + d_task_first_part_f4); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif diff --git a/src/cuda/GPU_runner_functions.h b/src/cuda/GPU_runner_functions.h new file mode 100644 index 0000000000..27bbecdd92 --- /dev/null +++ 
b/src/cuda/GPU_runner_functions.h @@ -0,0 +1,148 @@ +#ifndef CUDA_HEADERS_H +#define CUDA_HEADERS_H +#define n_streams 1024 + +#ifdef __cplusplus +extern "C" { +#endif +#include "part_gpu.h" +void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts); +void launch_density_aos(struct part_aos *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int bundle_first_task, + int max_parts, double *d_cell_x, double *d_cell_y, + double *d_cell_z); +void launch_density_aos_f4(struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4); +void launch_gradient_aos(struct part_aos_g *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts, double *d_cell_x, + double *d_cell_y, double *d_cell_z); +void launch_gradient_aos_f4(struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4); +void launch_force_aos(struct part_aos_f *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int bundle_first_task, + int max_parts, double *d_cell_x, double *d_cell_y, + double *d_cell_z); +void launch_force_aos_f4(struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4); +void launch_density_pair_two_kernels( + struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, + int *d_task_first_part_ci, int *d_task_first_part_cj, + int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, int bid, int block_size, + int count_tasks, int tasksperbundle, int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, + int max_active_bin); +void runner_dopair1_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, int max_active_bin, + double *d_shift_x, double *d_shift_y, double *d_shift_z); +void runner_dopairci_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int 
bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopaircj_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopairci_branch_density_gpu_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopairci_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens); +void runner_dopaircj_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens); +void runner_dopair_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts); +void runner_dopaircj_branch_density_gpu_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopairci_branch_density_gpu_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopaircj_branch_density_gpu_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopair_branch_gradient_gpu_aos_f4( + struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts); +void runner_dopairci_branch_density_gpu_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int 
max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopaircj_branch_density_gpu_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopair_branch_force_gpu_aos_f4( + struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts); +#ifdef __cplusplus +} +#endif + +#endif // CUDA_HEADER_H diff --git a/src/cuda/Makefile.am b/src/cuda/Makefile.am new file mode 100644 index 0000000000..5fb5bbc34f --- /dev/null +++ b/src/cuda/Makefile.am @@ -0,0 +1,66 @@ +SOURCES_CUDA = GPU_runner_functions.cu tester.cu ../files_for_new_functions/arrays_malloc.cu ../files_for_new_functions/host_device_data_transfer.cu #../runner_main.cu +include_HEADERS = GPU_runner_functions.h device_functions.h BLOCK_SIZE.h tester.h ../files_for_new_functions/arrays_malloc.h ../files_for_new_functions/host_device_data_transfer.h +EXTRA_DIST = $(SOURCES_CUDA) $(include_HEADERS) + +if HAVECUDA + +AM_CFLAGS = -I.. $(HDF5_CPPFLAGS) +CUDA_MYFLAGS = -D_FORCE_INLINES -O4 -lineinfo -src-in-ptx --maxrregcount=64 -ftz=true -DWITH_CUDA --default-stream per-thread --use_fast_math -lcudadevrt #-dlink -ccbin=gcc +CUDA_MYFLAGS += -arch=sm_70 +CUDA_MYFLAGS += --extra-device-vectorization + +#CUDA_MYFLAGS = -D_FORCE_INLINES -O3 -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -ccbin=gcc -m64 --default-stream per-thread #-dlink +#CUDA_MYFLAGS += -arch=sm_80 \ +#-gencode=arch=compute_80,code=sm_80 \ +#-gencode=arch=compute_86,code=sm_86 \ +#-gencode=arch=compute_87,code=sm_87 \ +#-gencode=arch=compute_86,code=compute_86 +#CUDA_MYFLAGS += --extra-device-vectorization + +# Assign a "safe" version number +AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) -version-info 0:0:0 + +#bin_PROGRAMS = test_27_cells test_125_cells + +# Rules to compile CUDA code. +.cu.o: + $(NVCC) -c $(NVCCFLAGS) $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) $< -o $@ +.cu.lo: + PATH=$(top_srcdir):$(PATH) && cudalt.py $@ $(NVCC) -c $(NVCCFLAGS) $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) $< + +# The library. Dummy C library so that we get libtool linking setup. +lib_LTLIBRARIES = libswiftCUDA.la libswiftdummy.la + +# Special link command to avoid including CFLAGS which are not understood. 
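
[Editorial aside, not part of the patch] The wrappers declared in GPU_runner_functions.h above are plain-C entry points, which is why the library built here can be linked into the C-only SWIFT core: runner_main.c never needs to be compiled by nvcc. A minimal, hypothetical call site could look like the sketch below; the buffer allocation, host-to-device copies, and the block-count choices (128-thread blocks in x covering particles, one block row per task in y, matching num_cuda_threads in device_functions.h) are assumptions for illustration only.

    #include <cuda_runtime.h>            /* cudaStream_t, int2 */
    #include "GPU_runner_functions.h"

    /* Hypothetical helper: launch the f4 density kernel for one bundle of
     * tasks and wait for the results. Device buffers d_send/d_recv and
     * d_first_last_parts are assumed to be filled elsewhere. */
    void example_run_density_bundle(struct part_aos_f4_send *d_send,
                                    struct part_aos_f4_recv *d_recv,
                                    int2 *d_first_last_parts,
                                    cudaStream_t stream, float a, float H,
                                    int n_tasks, int bundle_first_task,
                                    int max_parts) {
      const int numBlocks_y = n_tasks;                 /* one row per task */
      const int numBlocks_x = (max_parts + 127) / 128; /* cover largest task */
      launch_density_aos_f4(d_send, d_recv, a, H, stream, numBlocks_x,
                            numBlocks_y, bundle_first_task,
                            d_first_last_parts);
      cudaStreamSynchronize(stream); /* results valid only after this */
    }
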
+libswiftCUDA_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(libswiftCUDA_la_LDFLAGS) \ + $(LDFLAGS) -o $@ + +libswiftCUDA_la_SOURCES = $(SOURCES_CUDA) +libswiftCUDA_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) ../libswiftsim_cuda.la -I../ +libswiftCUDA_la_CXXFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) ../libswiftsim_cuda.la -I../ +libswiftCUDA_la_LIBADD = ../.libs/libswiftsim_cuda.la +libswiftCUDA_la_LDFLAGS = $(AM_LDFLAGS) + +if HAVEMPI +libswiftCUDA_la_CFLAGS += ../libswiftsim_mpicuda.la +libswiftCUDA_la_CXXFLAGS += ../libswiftsim_mpicuda.la +libswiftCUDA_la_LIBADD += ../.libs/libswiftsim_mpicuda.la +endif + +libswiftdummy_la_SOURCES = dummy.c +libswiftdummy_la_CFLAGS = $(AM_CFLAGS) +libswiftdummy_la_LDFLAGS = $(AM_LDFLAGS) + +#test_27_cells_SOURCES=test27cells.c +#test_27_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS) +#test_27_cells_LDADD= ../.libs/libswiftsim_cuda.la ../.libs/libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart +#test_27_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS) + +#test_125_cells_SOURCES=test125cells.c +#test_125_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS) +#test_125_cells_LDADD= ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart +#test_125_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS) + +endif diff --git a/src/cuda/device_functions.h b/src/cuda/device_functions.h new file mode 100644 index 0000000000..afc4a1a5d8 --- /dev/null +++ b/src/cuda/device_functions.h @@ -0,0 +1,149 @@ +#ifndef DEVICE_FUNCTIONS_H +#define DEVICE_FUNCTIONS_H +#include "../../config.h" + +/* Local headers. */ +// #include "../dimension.h" +// #include "../error.h" +// #include "../inline.h" +// #include "../minmax.h" +// #include "../vector.h" + +// Is this even necessary? Probably not as our code will operate differently +#define num_cuda_threads 128 +#define hydro_dimension 3.f + +/// Here we define stuff from kernel_hydro.h when using cubic_spline_kernel. +/// Will worry about sorting 'if statements for different kernels later//// +/* First some powers of gamma = H/h */ +#define kernel_gamma ((float)(1.825742)) +#define kernel_gamma_inv ((float)(1. / kernel_gamma)) +#define kernel_gamma2 ((float)(kernel_gamma * kernel_gamma)) +#define kernel_ivals 2 +#define kernel_degree 3 /*!< Degree of the polynomial */ +#define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma)) +#define kernel_gamma_dim_plus_one \ + ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)) +#define kernel_gamma_inv_dim \ + ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma))) +#define kernel_gamma_inv_dim_plus_one \ + ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))) +#define kernel_ivals_f ((float)kernel_ivals) /*!< Number of branches */ +#define kernel_constant ((float)(16. * M_1_PI)) +/*! Cosmology default beta=3.0. + * Alpha can be set in the parameter file. + * Beta is defined as in e.g. Price (2010) Eqn (103) */ +#define const_viscosity_beta 3.0f +#ifdef WITH_CUDA +extern "C" { +#endif +/** + * @brief Returns the argument to the power given by the dimension plus one + * + * Computes \f$x^{d+1}\f$. 
+ */ +__device__ float d_pow_dimension_plus_one(float x) { + +#if defined(HYDRO_DIMENSION_3D) + + const float x2 = x * x; + return x2 * x2; + +#elif defined(HYDRO_DIMENSION_2D) + + return x * x * x; + +#elif defined(HYDRO_DIMENSION_1D) + + return x * x; + +#else + + error("The dimension is not defined !"); + return 0.f; + +#endif +} + +/** + * @brief Return the argument to the power three adiabatic index minus five over + * two. + * + * Computes \f$x^{(3\gamma - 5)/2}\f$. + * + * @param x Argument + */ +__device__ float d_pow_three_gamma_minus_five_over_two(float x) { +#if defined(HYDRO_GAMMA_5_3) + + return 1.f; /* x^(0) */ + +#elif defined(HYDRO_GAMMA_7_5) + + return powf(x, -0.4f); /* x^(-2/5) */ + +#elif defined(HYDRO_GAMMA_4_3) + + return 1.f / sqrtf(x); /* x^(-1/2) */ + +#elif defined(HYDRO_GAMMA_2_1) + + return sqrtf(x); /* x^(1/2) */ + +#else + + error("The adiabatic index is not defined !"); + return 0.f; + +#endif +} + +/** + * @brief Computes the kernel function and its derivative. + * + * The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient + * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem. + * + * Returns 0 if \f$u > \gamma = H/h\f$. + * + * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$. + * @param W (return) The value of the kernel function \f$W(x,h)\f$. + * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$. + */ +__device__ void d_kernel_deval(float u, float *__restrict__ W, + float *__restrict__ dW_dx) { + + /* Go to the range [0,1[ from [0,H[ */ + const float x = u * kernel_gamma_inv; + + /* Pick the correct branch of the kernel */ + const int temp = (int)(x * kernel_ivals_f); + const int ind = temp > kernel_ivals ? kernel_ivals : temp; + static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] = { + 3.f, -3.f, 0.f, 0.5f, /* 0 < u < 0.5 */ + -1.f, 3.f, -3.f, 1.f, /* 0.5 < u < 1 */ + 0.f, 0.f, 0.f, 0.f}; /* 1 < u */ + const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + /* First two terms of the polynomial ... */ + float w = coeffs[0] * x + coeffs[1]; + float dw_dx = coeffs[0]; + + /* ... and the rest of them */ + for (int k = 2; k <= kernel_degree; k++) { + dw_dx = dw_dx * x + w; + w = x * w + coeffs[k]; + } + + w = fmaxf(w, 0.f); + dw_dx = fminf(dw_dx, 0.f); + + /* Return everything */ + *W = w * kernel_constant * kernel_gamma_inv_dim; + *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one; +} + +#ifdef WITH_CUDA +} +#endif + +#endif // DEVICE_FUNCTIONS_H diff --git a/src/cuda/dummy.c b/src/cuda/dummy.c new file mode 100755 index 0000000000..c75d2d873c --- /dev/null +++ b/src/cuda/dummy.c @@ -0,0 +1,9 @@ +#include + +#ifdef __cplusplus +extern "C" { +#endif +void swiftcudadummy(void) {} +#ifdef __cplusplus +} +#endif diff --git a/src/cuda/kernel_definitions.cu b/src/cuda/kernel_definitions.cu new file mode 100644 index 0000000000..a272b7beee --- /dev/null +++ b/src/cuda/kernel_definitions.cu @@ -0,0 +1,114 @@ +/******************************************************************************* + * This file contains functions used to setup and execute GPU tasks from within + *runner_main.c. 
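
[Editorial aside, not part of the patch] The device helpers defined in device_functions.h above are meant to be called from inside kernels, with the caller applying the h^{-d} (and h^{-(d+1)}) normalisation, as the d_kernel_deval() comment states. The sketch below shows a minimal kernel doing exactly that for a precomputed pair distance; the array names and launch configuration are invented for the example, and device_functions.h is assumed to be in scope.

    /* Illustration only: evaluate the cubic-spline kernel for a stored
     * pair distance r[i] and apply the 3D normalisation h^-3 by hand. */
    __global__ void example_kernel_eval(const float *r, const float *h,
                                        const float *mass,
                                        float *rho_contrib, int n) {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i >= n) return;

      float W, dW_dx;
      const float h_inv = 1.f / h[i];
      d_kernel_deval(r[i] * h_inv, &W, &dW_dx); /* u = r / h */

      /* Caller applies h^-3 to W; a gradient term would need h^-4. */
      rho_contrib[i] = mass[i] * W * h_inv * h_inv * h_inv;
    }
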
Consider this a translator allowing .cu based functions to be + *called from within runner_main.c + ******************************************************************************/ +#ifdef WITH_CUDA +#ifndef static +#define static +#endif +// #ifndef restrict +// #define restrict __restrict__ +// #endif +#endif + +/* Required header files */ +#include +/*ifdef __cplusplus prevents name mangling. C code sees exact names + of functions rather than mangled template names produced by C++*/ +#ifdef __cplusplus +extern "C" { +#endif +#include "cell_gpu.h" +#include "cuda_headers.h" +#ifdef __cplusplus +} +#endif + +/* function to initialise and printout GPU name*/ +#ifdef __cplusplus +extern "C" { +#endif +void Initialise_GPU() { + int devId = 0; + // find and print device name + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, devId); + printf("Device : %s\n", prop.name); + cudaSetDevice(devId); + // cuda +} +#ifdef __cplusplus +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void CPU_runner_doself1_branch_gradient(struct cell_gpu *restrict ci_gpu) { + int id = ci_gpu->hydro.parts[0].id; + printf("id of first part %d\n", id); + // Do stuff here for interactions on CPU but using the temporary GPU arrays + // const int count_i = ci_gpu->hydro.count; + // const int count_j = cj_gpu->hydro.count; + // system("pause"); + /* Anything to do here? */ + // if (!cell_is_active_hydro(ci_gpu, e) && !cell_is_active_hydro(cj_gpu, + // e)) return; +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void GPU_runner_doself1_branch_gradient(struct cell_gpu *restrict ci_gpu) { + int count = ci_gpu->hydro.count; + int numBlocks = (count + BLOCK_SIZE - 1) / BLOCK_SIZE; + + struct cell_gpu *d_ci_gpu; + cudaMalloc((void **)&d_ci_gpu, sizeof(cell_gpu)); + + cudaMemcpy(d_ci_gpu, ci_gpu, sizeof(cell_gpu), cudaMemcpyHostToDevice); + SPH_Sum_Self<<>>(d_ci_gpu); + cudaMemcpy(ci_gpu, d_ci_gpu, sizeof(cell_gpu), cudaMemcpyDeviceToHost); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void SPH_Sum_Self(cell_gpu *d_ci_gpu) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int i = index; + float sumLoc, xi, yi, zi; + struct part_gpu *restrict parts = d_ci_gpu->hydro.parts; + xi = parts[i].x[0]; + yi = parts[i].x[1]; + zi = parts[i].x[2]; + sumLoc = 0.f; + float h = parts[i].h, mass = parts[i].mass, rho = parts[i].rho; + const int count = d_ci_gpu->hydro.count; + //__shared__ float sh_x[BLOCK_SIZE], sh_y[BLOCK_SIZE]; + // copy neighbour particles data to shared memory + // for (unsigned int j1=0; j1 + +#include + +typedef struct part_soa { + /*Task ID*/ + int *tid_p; + /*bundle ID*/ + int *bid_p; + /*! Particle unique ID. */ + long long *id; + /*! Pointer to corresponding gravity part. */ + // struct gpu_gpart* gpart; + /*! Particle position. */ + double *x_p; + double *y_p; + double *z_p; + /*! Particle predicted velocity. */ + float *ux; + float *uy; + float *uz; + /*! Particle acceleration. */ + float *a_hydrox; + float *a_hydroy; + float *a_hydroz; + /*! Particle mass. */ + float *mass; + /*! Particle smoothing length. */ + float *h; + /*! Particle internal energy. */ + float *u; + /*! Time derivative of the internal energy. */ + float *u_dt; + /*! Particle density. */ + float *rho; + /*! Kernel summation (For testing/debugging). */ + float *SPH_sum; + + /* Cell information */ + /*! The cell location on the grid (corner nearest to the origin). */ + float *locx; + float *locy; + float *locz; + /*! The cell dimensions. 
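
[Editorial aside, not part of the patch] The cudaMalloc()/cudaMemcpy()/kernel-launch sequence in GPU_runner_doself1_branch_gradient() above currently ignores the CUDA runtime return codes. A common pattern, shown here only as a sketch, is to wrap those calls in an error-check macro so a failed allocation or copy aborts immediately instead of surfacing later as corrupted particle data.

    #include <cuda_runtime.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Standard CUDA error-check helper (illustration only). */
    #define CUDA_CHECK(call)                                              \
      do {                                                                \
        const cudaError_t err_ = (call);                                  \
        if (err_ != cudaSuccess) {                                        \
          fprintf(stderr, "%s:%d: CUDA error: %s\n", __FILE__, __LINE__,  \
                  cudaGetErrorString(err_));                              \
          abort();                                                        \
        }                                                                 \
      } while (0)

    /* e.g. CUDA_CHECK(cudaMemcpy(d_ci_gpu, ci_gpu, sizeof(struct cell_gpu),
     *                            cudaMemcpyHostToDevice));               */
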
*/ + float *widthx; + float *widthy; + float *widthz; + float *h_max; + int *count_p; + int *count_test; + /* Density information */ + + /*! Neighbour number count. */ + float *wcount; + + /*! Derivative of the neighbour number with respect to h. */ + float *wcount_dh; + + /*! Derivative of density with respect to h */ + float *rho_dh; + + /*! Particle velocity curl. */ + float *rot_ux; + float *rot_uy; + float *rot_uz; + + /* viscosity information */ + + /*! Particle velocity divergence */ + float *div_v; + + /*! Particle velocity divergence from previous step */ + float *div_v_previous_step; + + /*! Artificial viscosity parameter */ + float *alpha_visc; + + /*! Signal velocity */ + float *v_sig; + + /* thermal diffusion information */ + + /*! del^2 u, a smoothed quantity */ + float *laplace_u; + + /*! Thermal diffusion coefficient */ + float *alpha_diff; + + /* force information */ + + /*! "Grad h" term -- only partial in P-U */ + float *f; + + /*! Particle soundspeed. */ + float *soundspeed; + + /*! Time derivative of smoothing length */ + float *h_dt; + + /*! Balsara switch */ + float *balsara; + + /*! Particle pressure. */ + float *pressure; + /*! Maximal alpha (viscosity) over neighbours */ + float *alpha_visc_max_ngb; + + /* timestep stuff */ + + /*! Time-step length */ + timebin_t *time_bin; + + /*all part of struct timestep_limiter_data, we had to destruct it + as GPUs don't like pointer chasing especially when memcpying*/ + /* Need waking-up ? */ + timebin_t *wakeup; + + /*! Minimal time-bin across all neighbours */ + timebin_t *min_ngb_time_bin; + + /* Do we want this particle to be synched back on the time-line? */ + char *to_be_synchronized; +} part_soa; +/*Container for particle data requierd for density calcs*/ +typedef struct part_aos { + + /*! Particle position. */ + double x_p; + double y_p; + double z_p; + + /*! Particle position. */ + double locx; + double locy; + double locz; + + /*! Particle predicted velocity. */ + float ux; + float uy; + float uz; + /*! Particle mass. */ + float mass; + /*! Particle smoothing length. */ + float h; + /*! Particle density. */ + float rho; + + /* Density information */ + /*! Neighbour number count. */ + float wcount; + /*! Derivative of the neighbour number with respect to h. */ + float wcount_dh; + /*! Derivative of density with respect to h */ + float rho_dh; + /*! Particle velocity curl. */ + float rot_ux; + float rot_uy; + float rot_uz; + + /* viscosity information */ + /*! Particle velocity divergence */ + float div_v; + + /* timestep stuff */ + /*! Time-step length */ + int time_bin; +} part_aos; + +/*Container for particle data requierd for density calcs*/ +typedef struct part_aos_f4_send { + /*! Particle position and h -> x, y, z, h */ + float4 x_p_h; + + /*! Particle predicted velocity and mass -> ux, uy, uz, m */ + float4 ux_m; + /*Markers for where neighbour cell j starts and stops in array indices for + * pair tasks*/ + int2 cjs_cje; +} part_aos_f4_send __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT))); + +typedef struct part_aos_f4_recv { + /* Density information; rho */ + /*! Derivative of density with respect to h; rho_dh, + * Neighbour number count; w_count + * * Derivative of the neighbour number with respect to h; w_count_dh */ + float4 rho_dh_wcount; + /*! Particle velocity curl; rot_ux and + * velocity divergence; div_v */ + float4 rot_ux_div_v; +} part_aos_f4_recv; + +/*Container for particle data required for density calcs*/ +typedef struct part_aos_f4 { + /*! 
Particle position and h -> x, y, z, h */ + float4 x_p_h; + + /*! Particle predicted velocity and mass -> ux, uy, uz, m */ + float4 ux_m; + /* Density information; rho */ + /*! Derivative of density with respect to h; rho_dh, + * Neighbour number count; w_count + * * Derivative of the neighbour number with respect to h; w_count_dh */ + float4 rho_dh_wcount; + + /*! Particle velocity curl; rot_ux and + * velocity divergence; div_v */ + float4 rot_ux_div_v; + +} part_aos_f4; + +/*Container for particle data required for force calcs*/ +typedef struct part_aos_f { + + /*! Particle position. */ + double x_p; + double y_p; + double z_p; + + /*! Particle predicted velocity. */ + float ux; + float uy; + float uz; + /*! Particle mass. */ + float mass; + /*! Particle smoothing length. */ + float h; + /*! Particle density. */ + float rho; + /*! Particle pressure. */ + float pressure; + + /* Density information */ + /*! Speed of sound. */ + float soundspeed; + /*! Variable smoothing length term */ + float f; + /*! Derivative of density with respect to h */ + float balsara; + /*! Particle velocity curl. */ + float alpha_visc; + float a_hydrox; + float a_hydroy; + float a_hydroz; + float alpha_diff; + + /* viscosity information */ + /*! Internal energy */ + float u; + float u_dt; + /*! h time derivative */ + float h_dt; + float v_sig; + + /* timestep stuff */ + /*! Time-step length */ + int time_bin; + int min_ngb_time_bin; +} part_aos_f; + +/*Container for particle data requierd for force calcs*/ +typedef struct part_aos_f4_f { + + /*Data required for the calculation: + Values read to local GPU memory*/ + /*! Particle position smoothing length */ + float4 x_h; + /*! Particle predicted velocity and mass */ + float4 ux_m; + /*! Variable smoothing length term f, balsara, timebin + * and initial value of min neighbour timebin */ + float4 f_bals_timebin_mintimebin_ngb; + /*! Particle density, pressure, speed of sound & v_sig to read*/ + float4 rho_p_c_vsigi; + /*! Particle Internal energy u, alpha constants for visc and diff */ + float3 u_alphavisc_alphadiff; + + /*Result: Values output to global GPU memory*/ + /* change of u and h with dt, v_sig and returned value of + * minimum neighbour timebin */ + float4 udt_hdt_vsig_mintimebin_ngb; + /*Particle acceleration vector*/ + float3 a_hydro; + +} part_aos_f4_f; + +/*Container for particle data requierd for force calcs*/ +typedef struct part_aos_f4_f_send { + + /*Data required for the calculation: + Values read to local GPU memory*/ + /*! Particle position smoothing length */ + float4 x_h; + /*! Particle predicted velocity and mass */ + float4 ux_m; + /*! Variable smoothing length term f, balsara, timebin + * and initial value of min neighbour timebin */ + float4 f_bals_timebin_mintimebin_ngb; + /*! Particle density, pressure, speed of sound & v_sig to read*/ + float4 rho_p_c_vsigi; + /*! Particle Internal energy u, alpha constants for visc and diff */ + float3 u_alphavisc_alphadiff; + + int2 cjs_cje; + +} part_aos_f4_f_send; + +/*Container for particle data requierd for force calcs*/ +typedef struct part_aos_f4_f_recv { + + /*Result: Values output to global GPU memory*/ + /* change of u and h with dt, v_sig and returned value of + * minimum neighbour timebin */ + float4 udt_hdt_vsig_mintimebin_ngb; + /*Particle acceleration vector*/ + float3 a_hydro; + +} part_aos_f4_f_recv; + +/*Container for particle data requierd for gradient calcs*/ +typedef struct part_aos_g { + + /*! Particle position. */ + double x_p; + double y_p; + double z_p; + + /*! 
Particle velocity. */ + float ux; + float uy; + float uz; + /*! Particle mass. */ + float mass; + /*! Particle smoothing length. */ + float h; + /*! Particle density. */ + float rho; + + /* viscosity information */ + float visc_alpha; + float laplace_u; + float alpha_visc_max_ngb; + float v_sig; + + float u; + + float soundspeed; + + /* timestep stuff */ + /*! Time-step length */ + int time_bin; +} part_aos_g; + +/*Container for particle data requierd for gradient calcs*/ +typedef struct part_aos_f4_g { + + /*! Particle position & smoothing length */ + float4 x_h; + + /*! Particle velocity and mass */ + float4 ux_m; + + /*! Particle density alpha visc internal energy u and speed of sound c */ + float4 rho_avisc_u_c; + + /* viscosity information results */ + float3 vsig_lapu_aviscmax_empty; + +} part_aos_f4_g; + +/*Container for particle data requierd for gradient calcs*/ +typedef struct part_aos_f4_g_send { + + /*! Particle position & smoothing length */ + float4 x_h; + + /*! Particle velocity and mass */ + float4 ux_m; + + /*! Particle density alpha visc internal energy u and speed of sound c */ + float4 rho_avisc_u_c; + + /* viscosity information results */ + float3 vsig_lapu_aviscmax; + + /*Data for cell start and end*/ + int2 cjs_cje; + +} part_aos_f4_g_send; + +/*Container for particle data requierd for gradient calcs*/ +typedef struct part_aos_f4_g_recv { + + /* viscosity information results */ + float3 vsig_lapu_aviscmax; + +} part_aos_f4_g_recv; + +#ifdef __WITH_CUDA +} +#endif + +#endif // PART_GPU_H diff --git a/src/cuda/tester.cu b/src/cuda/tester.cu new file mode 100644 index 0000000000..3ffaf9e10c --- /dev/null +++ b/src/cuda/tester.cu @@ -0,0 +1,21 @@ +#include "tester.h" + +#include +#include +#ifdef __cplusplus +extern "C" { +#endif +void testing_linkage(int a, float *b, float c) { + std::vector b_value_list; + b_value_list.reserve(a); + for (int i = 0; i < a; i++) { + (*b) = (*b) + c; + b_value_list.push_back((*b)); + std::cout << "Vector value is " << b_value_list[i] << " b value is " << (*b) + << std::endl; + } + std::cout << "Final value of b is " << (*b) << std::endl; +} +#ifdef __cplusplus +} +#endif diff --git a/src/cuda/tester.h b/src/cuda/tester.h new file mode 100755 index 0000000000..5729e66904 --- /dev/null +++ b/src/cuda/tester.h @@ -0,0 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif + +void testing_linkage(int a, float *b, float c); + +#ifdef __cplusplus +} +#endif diff --git a/src/engine.c b/src/engine.c index 6d1fa0e3f7..023885cb0c 100644 --- a/src/engine.c +++ b/src/engine.c @@ -1092,12 +1092,22 @@ int engine_estimate_nr_tasks(const struct engine *e) { */ n1 += 38; n2 += 2; +#ifdef WITH_CUDA // A. Nasar + n1 += 4; // Self force and density packs should be 2 but doubled to prevent + // code crash due to unpack tasks + n1 += 52; // Pair force and density packs should be 26 but doubled to + // prevent code crash due to unpack tasks +#endif #ifdef WITH_MPI n1 += 6; #endif #ifdef EXTRA_HYDRO_LOOP n1 += 15; +#ifdef WITH_CUDA + n1 += 1; // Self gradient packs + n1 += 13; // Pair gradient packs +#endif #ifdef WITH_MPI n1 += 2; #endif @@ -1750,9 +1760,13 @@ void engine_skip_force_and_kick(struct engine *e) { t->type == task_type_rt_ghost2 || t->type == task_type_rt_tchem || t->type == task_type_rt_advance_cell_time || t->type == task_type_neutrino_weight || t->type == task_type_csds || - t->subtype == task_subtype_force || + t->subtype == task_subtype_force || // A. 
Nasar + t->subtype == task_subtype_gpu_pack_f || + t->subtype == task_subtype_gpu_unpack_f || t->subtype == task_subtype_limiter || t->subtype == task_subtype_gradient || + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_unpack_g || t->subtype == task_subtype_stars_prep1 || t->subtype == task_subtype_stars_prep2 || t->subtype == task_subtype_stars_feedback || @@ -2192,7 +2206,25 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, } #endif + // scheduler_write_dependencies(&e->sched, e->verbose, e->step); // A. Nasar + // write deps before running first step /* Now, launch the calculation */ + // message("n tasks %i", e->sched.nr_tasks); + // for (int i = 0; i < e->sched.nr_tasks; i++){ + // struct task *tmp_t = &e->sched.tasks[i]; + // if(tmp_t->subtype == task_subtype_density){ + // if(tmp_t->skip == 1)error("inactive density task"); + // } + //// if(tmp_t->subtype == task_subtype_force){ + //// if(tmp_t->skip == 1)error("inactive force task"); + //// } + // if(tmp_t->subtype == task_subtype_gpu_pack_d){ + // if(tmp_t->skip == 1)error("inactive pack task"); + // } + // if(tmp_t->subtype == task_subtype_gpu_unpack_d){ + // if(tmp_t->skip == 1)error("inactive unpack task"); + // } + // } TIMER_TIC; engine_launch(e, "tasks"); TIMER_TOC(timer_runners); @@ -2280,6 +2312,22 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, scheduler_write_cell_dependencies(&e->sched, e->verbose, e->step); if (e->nodeID == 0) scheduler_write_task_level(&e->sched, e->step); + // for (int i = 0; i < e->sched.nr_tasks; i++){ + // struct task *tmp_t = &e->sched.tasks[i]; + // if(tmp_t->subtype == task_subtype_density){ + // if(tmp_t->skip == 1)error("inactive density task"); + // } + // if(tmp_t->subtype == task_subtype_force){ + // if(tmp_t->skip == 1)error("inactive force task"); + // } + // if(tmp_t->subtype == task_subtype_gpu_pack_d){ + // if(tmp_t->skip == 1)error("inactive pack task"); + // } + // if(tmp_t->subtype == task_subtype_gpu_unpack_d){ + // if(tmp_t->skip == 1)error("inactive unpack task"); + // } + // } + /* Run the 0th time-step */ TIMER_TIC2; engine_launch(e, "tasks"); diff --git a/src/engine_config.c b/src/engine_config.c index 5e6c4eb98c..4c0c4420c4 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -32,6 +32,19 @@ #include #endif +#ifdef WITH_CUDA +#include "runner_main_clean.cu" + +#include /* A. Nasar */ +#endif + +#ifdef WITH_HIP +// #include "/opt/rocm-5.1.0/hip/include/hip/hip_runtime.h" +#include "runner_main_clean.hip" + +#include +#endif + /* This object's header. */ #include "engine.h" @@ -909,9 +922,12 @@ void engine_config(int restart, int fof, struct engine *e, e->links_per_tasks = parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.); - /* Init the scheduler. */ + /* Init the scheduler. Allow stealing*/ scheduler_init(&e->sched, e->s, maxtasks, nr_queues, (e->policy & scheduler_flag_steal), e->nodeID, &e->threadpool); + /* Init the scheduler. NO stealing A. Nasar */ + // scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, + // &e->threadpool); /* Maximum size of MPI task messages, in KB, that should not be buffered, * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. 
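
[Editorial aside, not part of the patch] For scale, taking only the increments shown in engine_estimate_nr_tasks() above and assuming a non-MPI build with EXTRA_HYDRO_LOOP: the per-cell hydro estimate n1 grows from 38 + 15 = 53 to 38 + 4 + 52 + 15 + 1 + 13 = 123 once the CUDA pack/unpack tasks are counted, i.e. the GPU task machinery more than doubles the estimate used to size the scheduler's task array.
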
Can be @@ -981,9 +997,20 @@ void engine_config(int restart, int fof, struct engine *e, for (int k = 0; k < e->nr_threads; k++) { e->runners[k].id = k; e->runners[k].e = e; + +#ifdef WITH_CUDA + if (pthread_create(&e->runners[k].thread, NULL, &runner_main2, + &e->runners[k]) != 0) + error("Failed to create GPU runner thread."); +#elif WITH_HIP + if (pthread_create(&e->runners[k].thread, NULL, &runner_main_hip, + &e->runners[k]) != 0) + error("Failed to create runner thread."); +#else if (pthread_create(&e->runners[k].thread, NULL, &runner_main, &e->runners[k]) != 0) error("Failed to create runner thread."); +#endif /* Try to pin the runner to a given core */ if (with_aff && diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 1c5a65d88f..a0ff23b2be 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -583,8 +583,13 @@ void engine_addtasks_recv_hydro( /* Early abort (are we below the level where tasks are)? */ if (!cell_get_flag(c, cell_flag_has_tasks)) return; - /* Have we reached a level where there are any hydro tasks ? */ - if (t_xv == NULL && c->hydro.density != NULL) { + /* Have we reached a level where there are any hydro tasks ? */ +#ifdef WITH_CUDA // A. Nasar + if (t_xv == NULL && c->hydro.density != NULL && c->hydro.density_pack != NULL) +#else + if (t_xv == NULL && c->hydro.density != NULL) +#endif /*WITH_CUDA*/ + { #ifdef SWIFT_DEBUG_CHECKS /* Make sure this cell has a valid tag. */ @@ -711,6 +716,18 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, t_xv, l->t); scheduler_addunlock(s, l->t, t_rho); } +#ifdef WITH_CUDA /* A. Nasar POSSIBLE BUG HERE (More like PROBABLE) NOT \ + REQUIRED Ghost in for cell j is*/ + for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { + scheduler_addunlock(s, t_xv, l->t); + scheduler_addunlock(s, l->t, t_rho); + } + for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { + scheduler_addunlock(s, l->t, t_rho); + } + +#endif + #ifdef EXTRA_HYDRO_LOOP for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); @@ -720,12 +737,37 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, t_gradient, l->t); scheduler_addunlock(s, l->t, tend); } -#else +#ifdef WITH_CUDA + for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { + scheduler_addunlock(s, t_rho, l->t); + scheduler_addunlock(s, l->t, t_gradient); + } + for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) { + scheduler_addunlock(s, l->t, t_gradient); + } + + for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { + scheduler_addunlock(s, t_gradient, l->t); + scheduler_addunlock(s, l->t, tend); + } + for (struct link *l = c->hydro.force_unpack; l != NULL; l = l->next) { + scheduler_addunlock(s, l->t, tend); + } + +#endif /*WITH_CUDA*/ +#else /*EXTRA_HYDRO_LOOP*/ for (struct link *l = c->hydro.force; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); scheduler_addunlock(s, l->t, tend); } -#endif +#ifdef WITH_CUDA + for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { + scheduler_addunlock(s, t_rho, l->t); + // scheduler_addunlock(s, l->t, t_ti); + } + scheduler_addunlock(s, c->hydro.super->hydro.f_unpack, tend); +#endif /*WITH_CUDA*/ +#endif /*EXTRA_HYDRO_LOOP*/ if (with_limiter) { for (struct link *l = c->hydro.limiter; l != NULL; l = l->next) { @@ -2088,7 +2130,10 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, for (int ind = 0; ind < num_elements; ind++) { struct task *t = 
&((struct task *)map_data)[ind]; - + if (t->ci == NULL) { // Possible fix missing when moving code over. + // Prevents unpack tasks continuing past here + break; + } struct cell *ci = t->ci; struct cell *cj = t->cj; const enum task_types t_type = t->type; @@ -2116,6 +2161,12 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); + } else if (t_subtype == task_subtype_gpu_pack_d) { // A. Nasar + engine_addlink(e, &ci->hydro.density_pack, t); + // } else if (t_subtype == task_subtype_gpu_pack_f) { + // engine_addlink(e, &ci->hydro.force_pack, t); + // } else if (t_subtype == task_subtype_gpu_pack_g) { + // engine_addlink(e, &ci->hydro.gradient_pack, t); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); } else if (t_subtype == task_subtype_external_grav) { @@ -2130,6 +2181,15 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); engine_addlink(e, &cj->hydro.density, t); + } else if (t_subtype == task_subtype_gpu_pack_d) { // A. Nasar + engine_addlink(e, &ci->hydro.density_pack, t); + engine_addlink(e, &cj->hydro.density_pack, t); + // } else if (t_subtype == task_subtype_gpu_pack_f) { + // engine_addlink(e, &ci->hydro.force_pack, t); + // engine_addlink(e, &cj->hydro.force_pack, t); + // } else if (t_subtype == task_subtype_gpu_pack_g) { + // engine_addlink(e, &ci->hydro.gradient_pack, t); + // engine_addlink(e, &cj->hydro.gradient_pack, t); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); engine_addlink(e, &cj->grav.grav, t); @@ -2146,6 +2206,15 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); + } else if (t_subtype == task_subtype_gpu_pack_d) { // A. 
Nasar + engine_addlink(e, &ci->hydro.density_pack, t); + // error("Abouzied: you need to code this up!"); + } else if (t_subtype == task_subtype_gpu_pack_f) { + engine_addlink(e, &ci->hydro.force_pack, t); + // error("Abouzied: you need to code this up!"); + } else if (t_subtype == task_subtype_gpu_pack_g) { + engine_addlink(e, &ci->hydro.gradient_pack, t); + // error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); } else if (t_subtype == task_subtype_external_grav) { @@ -2160,6 +2229,18 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); engine_addlink(e, &cj->hydro.density, t); + } else if (t_subtype == task_subtype_gpu_pack_d) { + engine_addlink(e, &ci->hydro.density_pack, t); + engine_addlink(e, &cj->hydro.density_pack, t); + // error("Abouzied: you need to code this up!"); + } else if (t_subtype == task_subtype_gpu_pack_f) { + engine_addlink(e, &ci->hydro.force_pack, t); + engine_addlink(e, &cj->hydro.force_pack, t); + // error("Abouzied: you need to code this up!"); + } else if (t_subtype == task_subtype_gpu_pack_g) { + engine_addlink(e, &ci->hydro.gradient_pack, t); + engine_addlink(e, &cj->hydro.gradient_pack, t); + // error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); engine_addlink(e, &cj->grav.grav, t); @@ -2197,7 +2278,7 @@ void engine_link_gravity_tasks(struct engine *e) { /* Get a pointer to the task. */ struct task *t = &sched->tasks[k]; - if (t->type == task_type_none) continue; + if (t->type == task_type_none || t->ci == NULL) continue; /* Get the cells we act on */ struct cell *ci = t->ci; @@ -2425,12 +2506,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, const int with_sink = (e->policy & engine_policy_sinks); #ifdef EXTRA_HYDRO_LOOP struct task *t_gradient = NULL; + struct task *t_gradient_gpu = NULL; // A. Nasar #endif #ifdef EXTRA_STAR_LOOPS struct task *t_star_prep1 = NULL; struct task *t_star_prep2 = NULL; #endif struct task *t_force = NULL; + struct task *t_force_gpu = NULL; struct task *t_limiter = NULL; struct task *t_star_density = NULL; struct task *t_star_feedback = NULL; @@ -2466,6 +2549,33 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); } + /*Make packing depend on sorts and drift A. Nasar */ + else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_d) { + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + /* Task for the second GPU hydro loop A. Nasar */ + t_force_gpu = scheduler_addtask(sched, task_type_self, + task_subtype_gpu_pack_f, 0, 0, ci, NULL); + /* Link the tasks to the cells. Do the same for GPU tasks A. Nasar */ + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); +#ifdef EXTRA_HYDRO_LOOP + /* Same work for the additional GPU hydro loop A. Nasar */ + t_gradient_gpu = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_pack_g, 0, 0, ci, NULL); + /* Add the link between the new loops and the cell. Same for GPU task A. + * Nasar */ + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + // A. Nasar add unlocks for pack tasks here. 
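
/* [Editorial summary, not part of the patch] To keep the overall picture in
 * view while reading the unlock calls in this file: for a local self task
 * with EXTRA_HYDRO_LOOP, the intended per-cell chain wired up here and by
 * the unpack-task creation further down in engine_maketasks() is
 *
 *   drift -> pack_d -> unpack_d -> ghost_in ... ghost_out -> pack_g
 *         -> unpack_g -> extra_ghost -> pack_f -> unpack_f -> end_force
 *
 * where each unpack task is shared by a bundle of pack tasks. */
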
Unpacks depend on packs and + // will be used to create downstream deps later + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); +#else + /* Now, build all the dependencies for the hydro */ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); +#endif + } + /* Sort tasks depend on the drift of the cell (stars version). */ else if (t_type == task_type_stars_sort && ci->nodeID == nodeID) { scheduler_addunlock(sched, ci->hydro.super->stars.drift, t); @@ -2549,6 +2659,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Link the tasks to the cells */ engine_addlink(e, &ci->hydro.force, t_force); + if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); } @@ -2582,10 +2693,9 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Same work for the additional hydro loop */ t_gradient = scheduler_addtask(sched, task_type_self, task_subtype_gradient, flags, 0, ci, NULL); - - /* Add the link between the new loops and the cell */ + /* Add the link between the new loops and the cell. Same for GPU task A. + * Nasar */ engine_addlink(e, &ci->hydro.gradient, t_gradient); - /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, ci, with_cooling, @@ -2727,6 +2837,80 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, } } + /*Make packing depend on sorts and drift A. Nasar */ + else if (t_type == task_type_pair && t_subtype == task_subtype_gpu_pack_d) { + /* Make all density tasks depend on the drift */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t); + } + /* Make all density tasks depend on the sorts */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); + } + /* New task for the force A. Nasar */ + t_force_gpu = scheduler_addtask(sched, task_type_pair, + task_subtype_gpu_pack_f, 0, 0, ci, cj); +#ifdef MPI_SYMMETRIC_FORCE_INTERACTION + /* The order of operations for an inactive local cell interacting + * with an active foreign cell is not guaranteed because the density + * (and gradient) iact loops don't exist in that case. So we need + * an explicit dependency here to have sorted cells. */ + + /* Make GPU force tasks depend on the sorts A. Nasar */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); + } +#endif + /* Do teh same for GPU tasks A. Nasar*/ + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); +#ifdef EXTRA_HYDRO_LOOP + /* Start by constructing the task for the second and third GPU hydro loop + * A. 
Nasar */ + t_gradient_gpu = scheduler_addtask(sched, task_type_pair, + task_subtype_gpu_pack_g, 0, 0, ci, cj); + // /* Add the link between the new loop and both cells */ + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); + + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + if (ci->nodeID == nodeID) { + /*Same for GPU tasks*/ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + /*Same for GPU tasks*/ + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, + t_force_gpu); + } +#else + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + if (ci->nodeID == nodeID) { + // GPU tasks A. Nasar + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + // GPU tasks A. Nasar + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_force_gpu); + } +#endif + + } + /* Otherwise, pair interaction? */ else if (t_type == task_type_pair && t_subtype == task_subtype_density) { @@ -2849,6 +3033,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &ci->hydro.force, t_force); engine_addlink(e, &cj->hydro.force, t_force); + if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); engine_addlink(e, &cj->hydro.limiter, t_limiter); @@ -2931,6 +3116,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, with_cooling, with_timestep_limiter); } + #endif if (with_feedback) { @@ -3269,7 +3455,39 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, } } } + /*Make packing depend on sorts and drift A. 
Nasar */ + else if (t_type == task_type_sub_self && + t_subtype == task_subtype_gpu_pack_d) { + + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + /* Start by constructing the task for the second hydro loop */ + t_force_gpu = + scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_f, + flags, 0, ci, NULL); + /* Add the link between the new loop and the cell */ + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); +#ifdef EXTRA_HYDRO_LOOP + + /* Start by constructing the task for the second and third hydro loop */ + t_gradient_gpu = + scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_g, + flags, 0, ci, NULL); + /* Add the link between the new loop and the cell */ + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); +#else + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); +#endif + } /* Otherwise, sub-self interaction? */ else if (t_type == task_type_sub_self && t_subtype == task_subtype_density) { @@ -3355,6 +3573,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Add the link between the new loop and the cell */ engine_addlink(e, &ci->hydro.force, t_force); + if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); } @@ -3388,10 +3607,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_sub_self, task_subtype_gradient, flags, 0, ci, NULL); - /* Add the link between the new loop and the cell */ engine_addlink(e, &ci->hydro.gradient, t_gradient); - /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, @@ -3541,7 +3758,64 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Otherwise, sub-pair interaction? 
*/ else if (t_type == task_type_sub_pair && - t_subtype == task_subtype_density) { + t_subtype == task_subtype_gpu_pack_d) { + /* Make all density pack tasks depend on the drift */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t); + } + /* Make all density tasks depend on the sorts */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); + } + t_force_gpu = scheduler_addtask( + sched, task_type_sub_pair, task_subtype_gpu_pack_f, flags, 0, ci, cj); +#ifdef MPI_SYMMETRIC_FORCE_INTERACTION + /* Make all force tasks depend on the sorts */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); + } +#endif + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); +#ifdef EXTRA_HYDRO_LOOP + t_gradient_gpu = scheduler_addtask( + sched, task_type_sub_pair, task_subtype_gpu_pack_g, flags, 0, ci, cj); + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, + t_force_gpu); + } +#else + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_force_gpu); + } +#endif + + } else if (t_type == task_type_sub_pair && + t_subtype == task_subtype_density) { const int bcount_i = ci->black_holes.count; const int bcount_j = cj->black_holes.count; @@ -3724,11 +3998,9 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_sub_pair, task_subtype_gradient, flags, 0, ci, cj); - /* Add the link between the new loop and both cells */ engine_addlink(e, &ci->hydro.gradient, t_gradient); engine_addlink(e, &cj->hydro.gradient, t_gradient); - /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ if (ci->nodeID == nodeID) { @@ -4142,9 +4414,13 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, continue; /* If the cell is local build a self-interaction */ + // struct task *t_pack_self; // A. Nasar if (ci->nodeID == nodeID) { scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci, NULL); + // A. 
Nasar also add a pack task for GPU + scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_d, 0, 0, + ci, NULL); } /* Now loop over all the neighbours of this cell */ @@ -4178,6 +4454,8 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))]; scheduler_addtask(sched, task_type_pair, task_subtype_density, sid, 0, ci, cj); + scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_d, sid, + 0, ci, cj); // A. Nasar #ifdef SWIFT_DEBUG_CHECKS #ifdef WITH_MPI @@ -4600,7 +4878,6 @@ void engine_maketasks(struct engine *e) { struct cell *cells = s->cells_top; const int nr_cells = s->nr_cells; const ticks tic = getticks(); - /* Re-set the scheduler. */ scheduler_reset(sched, engine_estimate_nr_tasks(e)); @@ -4715,7 +4992,251 @@ void engine_maketasks(struct engine *e) { * sched->tasks, sched->nr_tasks, sizeof(struct task), * threadpool_auto_chunk_size, e); */ } + int unsplit = 0, split = 0; + /*These loops should really be threadmapped A. Nasar*/ + for (int i = 0; i < sched->nr_tasks; i++) { + struct task * t = &sched->tasks[i]; + if(t->type == task_type_sub_self && t->subtype == + task_subtype_gpu_pack_d){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == + task_subtype_gpu_pack_d){ + t->type = task_type_pair; + } + if(t->type == task_type_sub_self && t->subtype == + task_subtype_gpu_pack_g){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == + task_subtype_gpu_pack_g){ + t->type = task_type_pair; + } + if(t->type == task_type_sub_self && t->subtype == + task_subtype_gpu_pack_f){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == + task_subtype_gpu_pack_f){ + t->type = task_type_pair; + } + } + + /* Now, create unpack tasks based on the existing packs and create + * the dependencies pack->unpack->ghost_in A. Nasar */ + const int pack_size = sched->pack_size; + const int pack_size_pair = sched->pack_size_pair; + + int count_current_self = 0; + int count_current_pair = 0; + + struct task *last_created_self_unpack = NULL; + struct task *last_created_pair_unpack = NULL; + /* Loop over all the currently existing pack tasks + * These loops should be thread-mapped too but will be a bit more tricky: A. 
+ * Nasar*/ + for (int i = 0; i < sched->nr_tasks; i++) { + + struct task *t = &sched->tasks[i]; + if (t->subtype != task_subtype_gpu_pack_d) continue; + + if (t->type == task_type_self || t->type == task_type_sub_self) { + + if (count_current_self % pack_size == 0) { + last_created_self_unpack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_unpack_d, 0, 0, NULL, NULL); + last_created_self_unpack->gpu_done = 0; + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_self_unpack); + scheduler_addunlock(sched, last_created_self_unpack, + t->ci->hydro.super->hydro + .ghost_in); // Keep self_unpack dependency here, + // pairs added later using links + /*Creating links between each cell and its unpack task*/ + engine_addlink(e, &t->ci->hydro.density_unpack, last_created_self_unpack); + t->ci->hydro.d_unpack = last_created_self_unpack; + ++count_current_self; + } + + else if (t->type == task_type_pair || t->type == task_type_sub_pair) { + if (count_current_pair % pack_size_pair == 0) { + last_created_pair_unpack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_unpack_d, 0, 0, NULL, NULL); + } + + scheduler_addunlock(sched, t, last_created_pair_unpack); + if (t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.ghost_in); + if ((t->cj->nodeID == e->nodeID) && + (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.ghost_in); + + engine_addlink(e, &t->ci->hydro.density_unpack, last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack); + + ++count_current_pair; + } else { + /* Abouzied: I need to implement the sub-self and sub-pair version */ + error("Something bad happened"); + } + } +#ifdef SWIFT_DEBUG_CHECKS + if (count_current_self != sched->nr_self_pack_tasks_d) + error("We did not find the correct number of self pack tasks!!"); + if (count_current_pair != sched->nr_pair_pack_tasks_d) + error("We did not find the correct number of pair pack tasks!!"); +#endif + + /*Now create unpacks for all gpu_pack_g (gradient) tasks A. 
Nasar */ + count_current_self = 0; + count_current_pair = 0; + + last_created_self_unpack = NULL; + last_created_pair_unpack = NULL; + /* Loop over all the currently existing gradient pack tasks */ + for (int i = 0; i < sched->nr_tasks; i++) { + + struct task *t = &sched->tasks[i]; + if (t->subtype != task_subtype_gpu_pack_g) continue; + + if (t->type == task_type_self || t->type == task_type_sub_self) { + + if (count_current_self % pack_size == 0) { + last_created_self_unpack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); + last_created_self_unpack->gpu_done = 0; + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_self_unpack); + scheduler_addunlock(sched, last_created_self_unpack, + t->ci->hydro.super->hydro.extra_ghost); + /*Creating links between a each cell and its unpack task*/ + engine_addlink(e, &t->ci->hydro.gradient_unpack, + last_created_self_unpack); + t->ci->hydro.g_unpack = last_created_self_unpack; + + ++count_current_self; + } + + else if (t->type == task_type_pair || t->type == task_type_sub_pair) { + if (count_current_pair % pack_size_pair == 0) { + last_created_pair_unpack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_pair_unpack); + if (t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.extra_ghost); + if ((t->cj->nodeID == e->nodeID) && + (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.extra_ghost); + + engine_addlink(e, &t->ci->hydro.gradient_unpack, + last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.gradient_unpack, + last_created_pair_unpack); + + ++count_current_pair; + } else { + /* Abouzied: I need to implement the sub-self and sub-pair version */ + error("Something bad happened"); + } + } +#ifdef SWIFT_DEBUG_CHECKS + if (count_current_self != sched->nr_self_pack_tasks_g) + error( + "We did not find the correct number of G self pack tasks!! count %i " + "what it shoudl be %i", + count_current_self, sched->nr_self_pack_tasks_g); + if (count_current_pair != sched->nr_pair_pack_tasks_g) + error( + "We did not find the correct number of G pair pack tasks!! 
count %i " + "what it shoudl be %i", + count_current_pair, sched->nr_pair_pack_tasks_g); +#endif + + /*Now create unpacks for all gpu_pack_f (force) tasks*/ + count_current_self = 0; + count_current_pair = 0; + + last_created_self_unpack = NULL; + last_created_pair_unpack = NULL; + /* Loop over all the currently existing gradient pack tasks */ + for (int i = 0; i < sched->nr_tasks; i++) { + + struct task *t = &sched->tasks[i]; + if (t->subtype != task_subtype_gpu_pack_f) continue; + + if (t->type == task_type_self || t->type == task_type_sub_self) { + + if (count_current_self % pack_size == 0) { + last_created_self_unpack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_self_unpack); + scheduler_addunlock(sched, last_created_self_unpack, + t->ci->hydro.super->hydro.end_force); + /*Creating links between a each cell and its unpack task*/ + engine_addlink(e, &t->ci->hydro.force_unpack, last_created_self_unpack); + + ++count_current_self; + } + + else if (t->type == task_type_pair || t->type == task_type_sub_pair) { + if (count_current_pair % pack_size_pair == 0) { + last_created_pair_unpack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_pair_unpack); + if (t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.end_force); + if ((t->cj->nodeID == e->nodeID) && + (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.end_force); + + engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack); + + ++count_current_pair; + } else { + /* Abouzied: I need to implement the sub-self and sub-pair version */ + error("Something bad happened"); + } + } +#ifdef SWIFT_DEBUG_CHECKS + if (count_current_self != sched->nr_self_pack_tasks_f) + error("We did not find the correct number of F self pack tasks!!"); + if (count_current_pair != sched->nr_pair_pack_tasks_f) + error("We did not find the correct number of F pair pack tasks!!"); +#endif + /*Debug code to check if some tasks are not split to desired level in tree for + * GPU*/ + // for (int i = 0; i < sched->nr_tasks; i++) { + // struct task *t = &sched->tasks[i]; + // if(t->ci != NULL){ + //// if(t->type == task_type_pair && ((t->ci->split && !t->cj->split) || + ///(!t->ci->split && t->cj->split))) / error("one is split the other + /// isn't"); + // if(t->ci->hydro.count > 80 && t->type == task_type_self) + // error("Count is %i task subtype (%s)", + // t->ci->hydro.count, subtaskID_names[t->subtype]); + // } + // } if (e->verbose) message("Making extra hydroloop tasks took %.3f %s.", clocks_from_ticks(getticks() - tic2), clocks_getunit()); @@ -4866,4 +5387,39 @@ void engine_maketasks(struct engine *e) { if (e->verbose) message("took %.3f %s (including reweight).", clocks_from_ticks(getticks() - tic), clocks_getunit()); + + /* Loop over all the CPU hydro tasks to make implicit (needs threadmapping)*/ + for (int i = 0; i < sched->nr_tasks; i++) { + + struct task *t = &sched->tasks[i]; + if (t->subtype == task_subtype_density || + t->subtype == task_subtype_gradient || + t->subtype == task_subtype_force) { + t->implicit = 1; + } + // if (t->subtype == task_subtype_gpu_pack_d || + // 
+    //        t->subtype == task_subtype_gpu_pack_g ||
+    //        t->subtype == task_subtype_gpu_pack_f ||
+    //        t->subtype == task_subtype_gpu_unpack_d ||
+    //        t->subtype == task_subtype_gpu_unpack_g ||
+    //        t->subtype == task_subtype_gpu_unpack_f){
+    //      t->implicit = 1;
+    //    }
+    //    if (t->subtype == task_subtype_gpu_pack_g ||
+    //        t->subtype == task_subtype_gpu_pack_f ||
+    //        t->subtype == task_subtype_gpu_unpack_g ||
+    //        t->subtype == task_subtype_gpu_unpack_f){// ||
+    ////        (t->type == task_type_pair &&
+    ////         t->subtype == task_subtype_gpu_pack_d)){
+    //      t->implicit = 1;
+    //    }
+    //    if ((t->subtype == task_subtype_gpu_pack_d ||
+    //         t->subtype == task_subtype_gpu_pack_g ||
+    //         t->subtype == task_subtype_gpu_pack_f) &&
+    //        (t->type == task_type_sub_pair ||
+    //         t->type == task_type_sub_self)){
+    //      t->implicit = 1;
+    ////      error("STill have subs");
+    //    }
+  }
 }
diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c
index 27b31c99c4..89f5e41b74 100644
--- a/src/engine_marktasks.c
+++ b/src/engine_marktasks.c
@@ -86,6 +86,25 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
     const enum task_types t_type = t->type;
     const enum task_subtypes t_subtype = t->subtype;

+    // Activate GPU unpack tasks (cell-less dummy tasks, so they need
+    // activating separately)
+    if (t_type == task_type_self &&
+        (t_subtype == task_subtype_gpu_unpack_d ||
+         t_subtype == task_subtype_gpu_unpack_g ||
+         t_subtype == task_subtype_gpu_unpack_f)) {  // A. Nasar
+      scheduler_activate(s, t);
+      continue;
+    }
+
+    if (t_type == task_type_pair &&
+        (t_subtype == task_subtype_gpu_unpack_d ||
+         t_subtype == task_subtype_gpu_unpack_g ||
+         t_subtype == task_subtype_gpu_unpack_f)) {  // A. Nasar
+      scheduler_activate(s, t);
+      continue;
+      // fprintf(stderr,"activated pair unpack in marktasks\n");
+    }
+
     /* Single-cell task? */
     if (t_type == task_type_self || t_type == task_type_sub_self) {
@@ -93,7 +112,17 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
       struct cell *ci = t->ci;

 #ifdef SWIFT_DEBUG_CHECKS
+#ifndef WITH_CUDA  // A. Nasar
       if (ci->nodeID != nodeID) error("Non-local self task found");
+#else
+      if ((ci->nodeID != nodeID) && (t_subtype != task_subtype_gpu_unpack_d) &&
+          (t_subtype != task_subtype_gpu_unpack_f) &&
+          (t_subtype != task_subtype_gpu_unpack_g)) {
+        fprintf(stderr, "task is %s\n", subtaskID_names[t->subtype]);
+        error("Non-local self task found. Task subtype is %s",
+              subtaskID_names[t->subtype]);
+      }
+#endif
 #endif

       const int ci_active_hydro = cell_is_active_hydro(ci, e);
@@ -115,6 +144,39 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
         }
       }

+      /* Activate packing for GPU A. Nasar */
+      else if (t_type == task_type_self &&
+               t_subtype == task_subtype_gpu_pack_d) {
+        if (ci_active_hydro) {
+          scheduler_activate(s, t);
+          ci->pack_done = 0;
+          ci->gpu_done = 0;
+          ci->unpack_done = 0;
+        }
+      }
+
+      /* Activate packing for GPU */
+      else if (t_type == task_type_self &&
+               t_subtype == task_subtype_gpu_pack_g) {
+        if (ci_active_hydro) {
+          scheduler_activate(s, t);
+          ci->pack_done_g = 0;
+          ci->gpu_done_g = 0;
+          ci->unpack_done_g = 0;
+        }
+      }
+
+      /* Activate packing for GPU */
+      else if (t_type == task_type_self &&
+               t_subtype == task_subtype_gpu_pack_f) {
+        if (ci_active_hydro) {
+          scheduler_activate(s, t);
+          ci->pack_done_f = 0;
+          ci->gpu_done_f = 0;
+          ci->unpack_done_f = 0;
+        }
+      }
+
       /* Store current values of dx_max and h_max. */
       else if (t_type == task_type_sub_self &&
                t_subtype == task_subtype_density) {
@@ -125,12 +187,22 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
         }
       }

+      /* Activate GPU density packing for sub-self tasks. A. Nasar: Unsure if
+         we actually need this */
+      else if (t_type == task_type_sub_self &&
+               t_subtype == task_subtype_gpu_pack_d) {
+        if (ci_active_hydro) {
+          scheduler_activate(s, t);
+        }
+      }
+
       else if (t_type == task_type_self && t_subtype == task_subtype_force) {
         if (ci_active_hydro) scheduler_activate(s, t);
       }

       else if (t_type == task_type_sub_self &&
-               t_subtype == task_subtype_force) {
+               (t_subtype == task_subtype_force ||
+                t_subtype == task_subtype_gpu_pack_f)) {
         if (ci_active_hydro) scheduler_activate(s, t);
       }
@@ -149,7 +221,8 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
       }

       else if (t_type == task_type_sub_self &&
-               t_subtype == task_subtype_gradient) {
+               (t_subtype == task_subtype_gradient ||
+                t_subtype == task_subtype_gpu_pack_g)) {
         if (ci_active_hydro) scheduler_activate(s, t);
       }
@@ -409,7 +482,29 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
       const int ci_active_rt = cell_is_rt_active(ci, e);
       const int cj_active_rt = cell_is_rt_active(cj, e);

-      /* Only activate tasks that involve a local active cell. */
+      /* Activate packing for GPU A. Nasar */
+      if (t_subtype == task_subtype_gpu_pack_d &&
+          ((ci_active_hydro && ci_nodeID == nodeID) ||
+           (cj_active_hydro && cj_nodeID == nodeID))) {
+        scheduler_activate(s, t);
+        ci->gpu_done_pair = 0;
+        cj->gpu_done_pair = 0;
+      } else if (t_subtype == task_subtype_gpu_pack_g &&
+                 ((ci_active_hydro && ci_nodeID == nodeID) ||
+                  (cj_active_hydro && cj_nodeID == nodeID))) {
+        scheduler_activate(s, t);
+        ci->gpu_done_pair_g = 0;
+        cj->gpu_done_pair_g = 0;
+      } else if (t_subtype == task_subtype_gpu_pack_f &&
+                 ((ci_active_hydro && ci_nodeID == nodeID) ||
+                  (cj_active_hydro && cj_nodeID == nodeID))) {
+        scheduler_activate(s, t);
+        ci->gpu_done_pair_f = 0;
+        cj->gpu_done_pair_f = 0;
+      }
+
+      /* Only activate tasks that involve a local active cell. A. Nasar THIS
+       * COULD BE SOURCE OF BUG */
       if ((t_subtype == task_subtype_density ||
            t_subtype == task_subtype_gradient ||
            t_subtype == task_subtype_limiter ||
diff --git a/src/error.h b/src/error.h
index a9b7481cf4..806b74f123 100644
--- a/src/error.h
+++ b/src/error.h
@@ -22,7 +22,11 @@
 #define SWIFT_ERROR_H

 /* Config parameters. */
+#ifdef WITH_CUDA
+#include "../config.h"
+#else
+#include <config.h>
+#endif
 /* Some standard headers.
*/ #include diff --git a/src/files_for_new_functions/arrays_malloc.cu b/src/files_for_new_functions/arrays_malloc.cu new file mode 100644 index 0000000000..3bbf998231 --- /dev/null +++ b/src/files_for_new_functions/arrays_malloc.cu @@ -0,0 +1,363 @@ +#include "cuda/part_gpu.h" + +#include +#include +#include +#include + +#ifdef WITH_CUDA +extern "C" { +#endif + +#include "arrays_malloc.h" + +void allocate_host(struct part_soa *parts_soa, int count_max_parts_tmp) { + ///////////Malloc Host arrays + cudaMallocHost((void **)&parts_soa->tid_p, count_max_parts_tmp * sizeof(int)); + cudaMallocHost((void **)&parts_soa->id, + count_max_parts_tmp * sizeof(long long)); + cudaMallocHost((void **)&parts_soa->mass, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->h, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->u, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->u_dt, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rho, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->SPH_sum, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->x_p, + count_max_parts_tmp * sizeof(double)); + cudaMallocHost((void **)&parts_soa->y_p, + count_max_parts_tmp * sizeof(double)); + cudaMallocHost((void **)&parts_soa->z_p, + count_max_parts_tmp * sizeof(double)); + cudaMallocHost((void **)&parts_soa->ux, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->uy, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->uz, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->a_hydrox, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->a_hydroy, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->a_hydroz, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->locx, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->locy, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->locz, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->widthx, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->widthy, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->widthz, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->h_max, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->count_p, + count_max_parts_tmp * sizeof(int)); + cudaMallocHost((void **)&parts_soa->wcount, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->wcount_dh, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rho_dh, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rot_ux, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rot_uy, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rot_uz, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->div_v, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->div_v_previous_step, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->alpha_visc, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->v_sig, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->laplace_u, + count_max_parts_tmp * 
sizeof(float)); + cudaMallocHost((void **)&parts_soa->alpha_diff, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->f, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->soundspeed, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->h_dt, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->balsara, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->pressure, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->alpha_visc_max_ngb, + count_max_parts_tmp * sizeof(float)); + /* timestep stuff */ + cudaMallocHost((void **)&parts_soa->time_bin, + count_max_parts_tmp * sizeof(timebin_t)); + cudaMallocHost((void **)&parts_soa->wakeup, + count_max_parts_tmp * sizeof(timebin_t)); + cudaMallocHost((void **)&parts_soa->min_ngb_time_bin, + count_max_parts_tmp * sizeof(timebin_t)); + cudaMallocHost((void **)&parts_soa->to_be_synchronized, + count_max_parts_tmp * sizeof(char)); +} + +void allocate_device(struct part_soa d_parts_soa, int count_max_parts_tmp) { + ////////now malloc variables for particle data on the GPU. Sheesh + fprintf(stderr, "before malloc\n"); + cudaMalloc((void **)&(d_parts_soa.tid_p), sizeof(int) * count_max_parts_tmp); + fprintf(stderr, "after malloc\n"); + cudaMalloc((void **)&(d_parts_soa.id), + sizeof(long long) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.x_p), sizeof(double) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.y_p), sizeof(double) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.z_p), sizeof(double) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.ux), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.uy), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.uz), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.a_hydrox), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.a_hydroy), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.a_hydroz), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.mass), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.h), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.u), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.u_dt), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rho), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.SPH_sum), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.locx), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.locy), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.locz), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.widthx), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.widthy), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.widthz), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.h_max), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.count_p), + sizeof(int) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.wcount), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.wcount_dh), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rho_dh), + sizeof(float) * 
count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rot_ux), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rot_uy), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rot_uz), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.div_v), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.div_v_previous_step), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.alpha_visc), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.v_sig), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.laplace_u), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.alpha_diff), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.f), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.soundspeed), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.h_dt), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.balsara), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.pressure), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.alpha_visc_max_ngb), + sizeof(float) * count_max_parts_tmp); + /* timestep stuff */ + cudaMalloc((void **)&(d_parts_soa.time_bin), + sizeof(timebin_t) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.wakeup), + sizeof(timebin_t) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.min_ngb_time_bin), + sizeof(timebin_t) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.to_be_synchronized), + sizeof(char) * count_max_parts_tmp); +} + +cudaError_t cudaAllocInt(int **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(int)); +} +cudaError_t cudaAllocFloat(float **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(float)); +} +cudaError_t cudaAllocDouble(double **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(double)); +} +cudaError_t cudaAllocLonglong(long long **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(long long)); +} +cudaError_t cudaAllocChar(char **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(char)); +} +cudaError_t cudaAllocTimebin(timebin_t **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(timebin_t)); +} + +void allocate_device_dirty( + int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, + double **d_z_p, float **d_ux, float **d_uy, float **d_uz, + float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass, + float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx, + float **d_locy, float **d_locz, float **d_widthx, float **d_widthy, + float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount, + float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, + float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step, + float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, + float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb, + timebin_t **d_time_bin, timebin_t **d_wakeup, + timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized, + int count_max_parts_tmp) { + ////////Malloc variables for particle data on the GPU. 
Sheesh, that's a lot + + size_t free_byte; + size_t total_byte; + + cudaError_t cuda_status = cudaMemGetInfo(&free_byte, &total_byte); + double free = (double)free_byte; + double available = (double)total_byte; + double used = (available - free); + // message("free %lf used %lf", free/10.E8, used/10.E8); + + cudaError_t cu_error = cudaAllocInt(d_tid_p, count_max_parts_tmp); + cu_error = cudaAllocLonglong(d_id, count_max_parts_tmp); + cu_error = cudaAllocDouble(d_x_p, count_max_parts_tmp); + cu_error = cudaAllocDouble(d_y_p, count_max_parts_tmp); + cu_error = cudaAllocDouble(d_z_p, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_ux, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_uy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_uz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_a_hydrox, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_a_hydroy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_a_hydroz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_mass, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_h, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_u, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_u_dt, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rho, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_locx, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_locy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_locz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_widthx, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_widthy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_widthz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_h_max, count_max_parts_tmp); + cu_error = cudaAllocInt(d_count_p, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_wcount, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_wcount_dh, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rho_dh, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rot_ux, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rot_uy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rot_uz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_div_v, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_div_v_previous_step, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_alpha_visc, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_v_sig, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_laplace_u, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_alpha_diff, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_f, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_soundspeed, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_h_dt, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_balsara, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_pressure, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_alpha_visc_max_ngb, count_max_parts_tmp); + /* timestep stuff */ + cu_error = cudaAllocTimebin(d_time_bin, count_max_parts_tmp); + cu_error = cudaAllocTimebin(d_wakeup, count_max_parts_tmp); + cu_error = cudaAllocTimebin(d_min_ngb_time_bin, count_max_parts_tmp); + cu_error = cudaAllocChar(d_to_be_synchronized, count_max_parts_tmp); +// cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ; +// double free_end = (double)free_byte; +// available = (double)total_byte; +// double used_end = (available - free_end); +// message("cuda malloc self free %lf GB used %lf GB used to allocate +// self" +// " data %lf MB", free_end/10.E8, used_end/10.E8, +// (used_end - used)/10.E5); +// message("at end of malloc dirty: %s", +// cudaGetErrorString(cu_error)); 
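+  /* Possible refinement (sketch, not part of the original patch): cu_error is
+   * overwritten by every allocation call above, so only the status of the
+   * final cudaMalloc reaches the CUDA_DEBUG check below. Testing each call as
+   * it is made would report an out-of-memory failure immediately, e.g.
+   *
+   *   if (cudaAllocFloat(d_rho, count_max_parts_tmp) != cudaSuccess)
+   *     error("cudaMalloc of d_rho failed (%zu bytes free on device)",
+   *           free_byte);
+   *
+   * This assumes SWIFT's error() macro is visible in this compilation unit;
+   * otherwise an fprintf(stderr, ...) plus a non-zero exit would do. */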
+#ifdef CUDA_DEBUG + if (cu_error != cudaSuccess) { + fprintf(stderr, "CUDA error at end of malloc dirty: %s\n", + cudaGetErrorString(cu_error)); + exit(0); + } +#endif +} + +void allocate_device_test(int **tid_test, int count_max_parts_tmp) { + ////////now malloc variables for particle data on the GPU. Sheesh + + cudaMalloc((void **)tid_test, sizeof(int) * count_max_parts_tmp); + + cudaError_t cu_error = cudaPeekAtLastError(); // Get error code + fprintf(stderr, "malloc tid: %s\n", cudaGetErrorString(cu_error)); + + if (cu_error != cudaSuccess) { + fprintf(stderr, "CUDA error with malloc tid: %s\n", + cudaGetErrorString(cu_error)); + exit(0); + } +} +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_malloc(struct part_soa *parts_soa, int alloc_type, + int count_max_parts_tmp) { + allocate_host(parts_soa, count_max_parts_tmp); +} +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_malloc(struct part_soa d_parts_soa, int alloc_type, + int count_max_parts_tmp) { + allocate_device(d_parts_soa, count_max_parts_tmp); +} + +void device_malloc_dirty( + int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, + double **d_z_p, float **d_ux, float **d_uy, float **d_uz, + float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass, + float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx, + float **d_locy, float **d_locz, float **d_widthx, float **d_widthy, + float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount, + float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, + float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step, + float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, + float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb, + timebin_t **d_time_bin, timebin_t **d_wakeup, + timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized, + int count_max_parts_tmp) { + + allocate_device_dirty( + d_tid_p, d_id, d_x_p, d_y_p, d_z_p, d_ux, d_uy, d_uz, d_a_hydrox, + d_a_hydroy, d_a_hydroz, d_mass, d_h, d_u, d_u_dt, d_rho, d_locx, d_locy, + d_locz, d_widthx, d_widthy, d_widthz, d_h_max, d_count_p, d_wcount, + d_wcount_dh, d_rho_dh, d_rot_ux, d_rot_uy, d_rot_uz, d_div_v, + d_div_v_previous_step, d_alpha_visc, d_v_sig, d_laplace_u, d_alpha_diff, + d_f, d_soundspeed, d_h_dt, d_balsara, d_pressure, d_alpha_visc_max_ngb, + d_time_bin, d_wakeup, d_min_ngb_time_bin, d_to_be_synchronized, + count_max_parts_tmp); +} + +void device_malloc_test(int **tid_test, int count_max_parts_tmp) { + + allocate_device_test(tid_test, count_max_parts_tmp); +} + +#ifdef WITH_CUDA +} +#endif diff --git a/src/files_for_new_functions/arrays_malloc.h b/src/files_for_new_functions/arrays_malloc.h new file mode 100644 index 0000000000..1107b51444 --- /dev/null +++ b/src/files_for_new_functions/arrays_malloc.h @@ -0,0 +1,64 @@ +#include "cuda/part_gpu.h" + +#include +#include +#include +#include + +cudaError_t cudaAllocInt(int **d_var, int elements); +cudaError_t cudaAllocFloat(float **d_var, int elements); +cudaError_t cudaAllocDouble(double **d_var, int elements); +cudaError_t cudaAllocLonglong(long long **d_var, int elements); +cudaError_t cudaAllocChar(char **d_var, int elements); +cudaError_t 
cudaAllocTimebin(timebin_t **d_var, int elements); + +void allocate_host(struct part_soa *parts_soa, int count_max_parts_tmp); + +void allocate_device(struct part_soa d_parts_soa, int count_max_parts_tmp); + +void allocate_device_dirty( + int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, + double **d_z_p, float **d_ux, float **d_uy, float **d_uz, + float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass, + float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx, + float **d_locy, float **d_locz, float **d_widthx, float **d_widthy, + float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount, + float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, + float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step, + float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, + float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb, + timebin_t **d_time_bin, timebin_t **d_wakeup, + timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized, + int count_max_parts_tmp); + +void allocate_device_test(int **tid_test, int count_max_parts_tmp); +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_malloc(struct part_soa *parts_soa, int alloc_type, + int count_max_parts_tmp); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_malloc(struct part_soa d_parts_soa, int alloc_type, + int count_max_parts_tmp); + +void device_malloc_dirty( + int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, + double **d_z_p, float **d_ux, float **d_uy, float **d_uz, + float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass, + float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx, + float **d_locy, float **d_locz, float **d_widthx, float **d_widthy, + float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount, + float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, + float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step, + float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, + float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb, + timebin_t **d_time_bin, timebin_t **d_wakeup, + timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized, + int count_max_parts_tmp); + +void device_malloc_test(int **tid_test, int count_max_parts_tmp); diff --git a/src/files_for_new_functions/host_device_data_transfer.cu b/src/files_for_new_functions/host_device_data_transfer.cu new file mode 100644 index 0000000000..ede719529b --- /dev/null +++ b/src/files_for_new_functions/host_device_data_transfer.cu @@ -0,0 +1,566 @@ +#include "cuda/part_gpu.h" + +#include +#include +#include +#include + +#ifdef WITH_CUDA +extern "C" { +#endif + +void host2device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp) { + // int * tid_h; + // cudaMallocHost((void **)&tid_h, + // count_max_parts_tmp * sizeof(int)); + for (int i = 0; i < count_max_parts_tmp; i++) { + tid_h[i] = 100; + // fprintf(stderr,"tid_h %i\n", tid_h[i]); + } + + cudaMemcpy(d_tid_p, tid_h, count_max_parts_tmp * sizeof(int), + cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + // 
cudaFree(tid_h); +} + +void device2host_test(struct part_soa parts_soa, int *tid_h, + int count_max_parts_tmp) { + int *tid_p = parts_soa.tid_p; + cudaMemcpy(tid_h, tid_p, count_max_parts_tmp * sizeof(int), + cudaMemcpyDeviceToHost); + for (int i = 0; i < count_max_parts_tmp; i++) { + fprintf(stderr, "tid is %i\n", tid_h[i]); + } +} + +void device2device_test(int *tid_p, struct part_soa parts_soa, + int count_max_parts_tmp) { + cudaMemcpy(tid_p, parts_soa.tid_p, sizeof(int *), cudaMemcpyHostToDevice); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp) { + + host2device_test(d_tid_p, tid_h, count_max_parts_tmp); +} + +void device_host_test(struct part_soa parts_soa, int *tid_h, + int count_max_parts_tmp) { + + device2host_test(parts_soa, tid_h, count_max_parts_tmp); +} + +void device_device_test(int *tid_p, struct part_soa parts_soa, + int count_max_parts_tmp) { + + device2device_test(tid_p, parts_soa, count_max_parts_tmp); +} + +void device2host_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp) { + cudaMemcpy(parts_soa_buffer.tid_p, tid_p, count_max_parts_tmp * sizeof(int), + cudaMemcpyDeviceToHost); +} +void device_host_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp) { + + device2host_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, + a_hydrox, a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, + locy, locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, div_v, + div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, count_max_parts_tmp); +} + +void device2device_density( + struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float 
*a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp, cudaStream_t stream) { + + cudaMemcpyAsync(&(parts_soa_buffer->tid_p), &tid_p, sizeof(int *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->locx), &locx, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->locy), &locy, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->locz), &locz, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->h), &h, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->mass), &mass, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->x_p), &x_p, sizeof(double *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->y_p), &y_p, sizeof(double *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->z_p), &z_p, sizeof(double *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->ux), &ux, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->uy), &uy, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->uz), &uz, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->time_bin), &time_bin, sizeof(timebin_t *), + cudaMemcpyHostToDevice, stream); +} + +void host2device_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp) { + cudaError_t cu_error; + cudaMemcpy(&tid_p, &(parts_soa_buffer.tid_p), + count_max_parts_tmp * sizeof(int), cudaMemcpyHostToDevice); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int 
*count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp) { + + host2device_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, + a_hydrox, a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, + locy, locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, div_v, + div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, count_max_parts_tmp); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_device_bind( + struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp, cudaStream_t stream) { + + device2device_density( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, + count_max_parts_tmp, stream); +} + +void host2device_async_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + cudaError_t cu_error; + cudaMemcpyAsync(&tid_p[first_part_tmp], + &(parts_soa_buffer.tid_p[first_part_tmp]), + bundle_n_parts * sizeof(int), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync( + &locx[first_part_tmp], 
&(parts_soa_buffer.locx[first_part_tmp]), + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync( + &locy[first_part_tmp], &(parts_soa_buffer.locy[first_part_tmp]), + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&locz[first_part_tmp], &parts_soa_buffer.locz[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&h[first_part_tmp], &parts_soa_buffer.h[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&mass[first_part_tmp], &parts_soa_buffer.mass[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&x_p[first_part_tmp], &parts_soa_buffer.x_p[first_part_tmp], + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&y_p[first_part_tmp], &parts_soa_buffer.y_p[first_part_tmp], + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&z_p[first_part_tmp], &parts_soa_buffer.z_p[first_part_tmp], + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&ux[first_part_tmp], &parts_soa_buffer.ux[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&uy[first_part_tmp], &parts_soa_buffer.uy[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&uz[first_part_tmp], &parts_soa_buffer.uz[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync( + &time_bin[first_part_tmp], &parts_soa_buffer.time_bin[first_part_tmp], + bundle_n_parts * sizeof(timebin_t), cudaMemcpyHostToDevice, stream); +} + +void host2device_async_density_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + + // int bundle_n_parts = bundle_n_parts_i + bundle_n_parts_j; + cudaError_t cu_error; + // cudaMemcpyAsync(&tid_p[first_part_tmp], + // &(parts_soa_buffer.tid_p[first_part_tmp]), + // bundle_n_parts * sizeof(int), + // cudaMemcpyHostToDevice, stream); + // cudaMemcpyAsync(&locx[first_part_tmp], + // &(parts_soa_buffer.locx[first_part_tmp]), + // bundle_n_parts * sizeof(float), + // cudaMemcpyHostToDevice, stream); + // cudaMemcpyAsync(&locy[first_part_tmp], + // &(parts_soa_buffer.locy[first_part_tmp]), + // bundle_n_parts * sizeof(float), + // cudaMemcpyHostToDevice, stream); + // cudaMemcpyAsync(&locz[first_part_tmp], + // &parts_soa_buffer.locz[first_part_tmp], + // bundle_n_parts * sizeof(float), + // cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&h[first_part_tmp], &parts_soa_buffer.h[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + 
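+  /* Note: the same global offset first_part_tmp indexes both the pinned host
+   * buffers (parts_soa_buffer.*) and the device arrays, so each bundle of
+   * bundle_n_parts particles is copied straight into place on `stream` with
+   * no separate gather step. A caller is therefore assumed (not shown in this
+   * patch) to queue the pair density kernel and the matching
+   * device2host_async_density_pair() copies on the same stream before
+   * synchronising, e.g. with cudaStreamSynchronize(stream). */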
cudaMemcpyAsync(&mass[first_part_tmp], &parts_soa_buffer.mass[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&x_p[first_part_tmp], &parts_soa_buffer.x_p[first_part_tmp], + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&y_p[first_part_tmp], &parts_soa_buffer.y_p[first_part_tmp], + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&z_p[first_part_tmp], &parts_soa_buffer.z_p[first_part_tmp], + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&ux[first_part_tmp], &parts_soa_buffer.ux[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&uy[first_part_tmp], &parts_soa_buffer.uy[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&uz[first_part_tmp], &parts_soa_buffer.uz[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync( + &time_bin[first_part_tmp], &parts_soa_buffer.time_bin[first_part_tmp], + bundle_n_parts * sizeof(timebin_t), cudaMemcpyHostToDevice, stream); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_async_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + + host2device_async_density( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp, + bundle_n_parts, stream); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_async_cpy_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float 
*balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp_i, + int bundle_n_parts, cudaStream_t stream) { + + host2device_async_density_pair( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp_i, + bundle_n_parts, stream); +} + +void device2host_async_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + cudaError_t cu_error; + + cudaMemcpyAsync(&parts_soa_buffer.rho[first_part_tmp], &rho[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyDeviceToHost, + stream); + cudaMemcpyAsync(&parts_soa_buffer.rho_dh[first_part_tmp], + &rho_dh[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.wcount[first_part_tmp], + &wcount[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.wcount_dh[first_part_tmp], + &wcount_dh[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.div_v[first_part_tmp], + &div_v[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_ux[first_part_tmp], + &rot_ux[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_uy[first_part_tmp], + &rot_uy[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_uz[first_part_tmp], + &rot_uz[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); +} + +void device2host_async_density_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + 
float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + cudaError_t cu_error; + // fprintf(stderr, "parts i %i parts j %i\n", bundle_n_parts_i, + // bundle_n_parts_j); int bundle_n_parts = bundle_n_parts_i + + // bundle_n_parts_j; + + cudaMemcpyAsync(&parts_soa_buffer.rho[first_part_tmp], &rho[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyDeviceToHost, + stream); + cudaMemcpyAsync(&parts_soa_buffer.rho_dh[first_part_tmp], + &rho_dh[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.wcount[first_part_tmp], + &wcount[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.wcount_dh[first_part_tmp], + &wcount_dh[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.div_v[first_part_tmp], + &div_v[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_ux[first_part_tmp], + &rot_ux[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_uy[first_part_tmp], + &rot_uy[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_uz[first_part_tmp], + &rot_uz[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_host_async_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + + device2host_async_density( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp, + bundle_n_parts, stream); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_host_async_cpy_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + 
float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + + device2host_async_density_pair( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp, + bundle_n_parts, stream); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_device_async_bind( + struct part_soa *parts_soa, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized) { + + parts_soa->tid_p = tid_p; + parts_soa->locx = locx; + parts_soa->locy = locy; + parts_soa->locz = locz; + parts_soa->h = h; + parts_soa->mass = mass; + parts_soa->x_p = x_p; + parts_soa->y_p = y_p; + parts_soa->z_p = z_p; + parts_soa->rho = rho; + parts_soa->rho_dh = rho_dh; + parts_soa->wcount = wcount; + parts_soa->wcount_dh = wcount_dh; + parts_soa->ux = ux; + parts_soa->uy = uy; + parts_soa->uz = uz; + parts_soa->div_v = div_v; + parts_soa->rot_ux = rot_ux; + parts_soa->rot_uy = rot_uy; + parts_soa->rot_uz = rot_uz; + parts_soa->time_bin = time_bin; +} + +#ifdef WITH_CUDA +} +#endif diff --git a/src/files_for_new_functions/host_device_data_transfer.h b/src/files_for_new_functions/host_device_data_transfer.h new file mode 100644 index 0000000000..204afd51fa --- /dev/null +++ b/src/files_for_new_functions/host_device_data_transfer.h @@ -0,0 +1,234 @@ +#include "cuda/part_gpu.h" + +#include +#include +#include +#include + +void host2device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp); + +void device2host_test(struct part_soa parts_soa, int *tid_h, + int count_max_parts_tmp); + +void device2device_test(int *tid_p, struct part_soa parts_soa, + int count_max_parts_tmp); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp); + +void 
device_host_test(struct part_soa parts_soa, int *tid_h, + int count_max_parts_tmp); + +void device_device_test(int *tid_p, struct part_soa parts_soa, + int count_max_parts_tmp); + +void device2host_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp); + +void device_host_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp); + +void device2device_density( + struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp, cudaStream_t stream); + +void host2device_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp); + +/*Function to 
be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_device_bind( + struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp, cudaStream_t stream); + +void host2device_async_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_async_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float 
*div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +void device2host_async_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_host_async_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_device_async_bind( + struct part_soa *parts_soa, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized); + +void host_device_async_cpy_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, 
float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +void device_host_async_cpy_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +void device2host_async_density_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts_i, int bundle_n_parts_j, cudaStream_t stream); diff --git a/src/hip/BLOCK_SIZE.h b/src/hip/BLOCK_SIZE.h new file mode 100644 index 0000000000..d36e10b99b --- /dev/null +++ b/src/hip/BLOCK_SIZE.h @@ -0,0 +1,10 @@ +#ifndef BLOCK_SIZE_H +#define BLOCK_SIZE_H +#ifdef WITH_CUDA +// extern "C" { +#endif +#define BLOCK_SIZE 512 +#ifdef WITH_CUDA +//} +#endif +#endif // BLOCK_SIZE_H diff --git a/src/hip/Data_and_GPU_prep_functions.cu b/src/hip/Data_and_GPU_prep_functions.cu new file mode 100644 index 0000000000..57cbe0ad7c --- /dev/null +++ b/src/hip/Data_and_GPU_prep_functions.cu @@ -0,0 +1,229 @@ +/* + * Data_and_GPU_prep_functions.cu + * + * Created on: 17 Apr 2022 + * Author: abouzied + */ + +/*ifdef WITH_CUDA prevents name mangling. 
C code sees exact names + of functions rather than mangled template names produced by C++*/ +// #ifdef WITH_CUDA +// extern "C"{ +// #endif + +// #include "cuda/cuda_headers.h" +// #include "device_functions.h" +// #include "cuda/cell_gpu.h" +#include +#include +// #include "../config.h" + +void populate_parts_list(struct cell *ci, struct part_gpu *parts) { + //////////////////////////////////////////// + ///*****Copy variables for cell i (self interaction)*****/ + int count = ci->hydro.count; + + // fprintf(stderr,"Tester 111\n"); + for (int p = 0; p < count; p++) { + + parts[p].id = ci->hydro.parts[p].id; + + // fprintf(stderr,"Tester 222\n"); + parts[p].count = count; + parts[p].h_max = ci->hydro.h_max; + + for (int d = 0; d < 3; d++) { + parts[p].x[d] = ci->hydro.parts[p].x[d]; + parts[p].v[d] = ci->hydro.parts[p].v[d]; + parts[p].a_hydro[d] = ci->hydro.parts[p].a_hydro[d]; + parts[p].loc[d] = ci->loc[d]; + } + parts[p].mass = ci->hydro.parts[p].mass; + parts[p].h = ci->hydro.parts[p].h; + parts[p].u = ci->hydro.parts[p].u; + parts[p].u_dt = ci->hydro.parts[p].u_dt; + parts[p].rho = ci->hydro.parts[p].rho; + parts[p].div_v = ci->hydro.parts[p].viscosity.div_v; + parts[p].div_v_previous_step = + ci->hydro.parts[p].viscosity.div_v_previous_step; + parts[p].alpha_visc = ci->hydro.parts[p].viscosity.alpha; + parts[p].v_sig = ci->hydro.parts[p].viscosity.v_sig; + parts[p].laplace_u = ci->hydro.parts[p].diffusion.laplace_u; + parts[p].alpha_diff = ci->hydro.parts[p].diffusion.alpha; + parts[p].f = ci->hydro.parts[p].force.f; + parts[p].soundspeed = ci->hydro.parts[p].force.soundspeed; + parts[p].h_dt = ci->hydro.parts[p].force.h_dt; + parts[p].balsara = ci->hydro.parts[p].force.balsara; + parts[p].pressure = ci->hydro.parts[p].force.pressure; + parts[p].time_bin = ci->hydro.parts[p].time_bin; + parts[p].wakeup = ci->hydro.parts[p].limiter_data.wakeup; + parts[p].min_ngb_time_bin = + ci->hydro.parts[p].limiter_data.min_ngb_time_bin; + parts[p].to_be_synchronized = + ci->hydro.parts[p].limiter_data.to_be_synchronized; + parts[p].wcount = ci->hydro.parts[p].density.wcount; + parts[p].wcount_dh = ci->hydro.parts[p].density.wcount_dh; + parts[p].rho_dh = ci->hydro.parts[p].density.rho_dh; + parts[p].div_v = ci->hydro.parts[p].viscosity.div_v; + parts[p].rot_v[0] = ci->hydro.parts[p].density.rot_v[0]; + parts[p].rot_v[1] = ci->hydro.parts[p].density.rot_v[1]; + parts[p].rot_v[2] = ci->hydro.parts[p].density.rot_v[2]; + parts[p].SPH_sum = 0.f; + } +} + +void populate_parts_list_soa( + int count_all_parts, struct cell *ci, int first_part_tmp, int count, + int tid, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, + float *a_hydroz, float *mass, float *h, float *u, float *u_dt, float *rho, + float *SPH_sum, float *locx, float *locy, float *locz, float *widthx, + float *widthy, float *widthz, float *h_max, int *count_p, float *wcount, + float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, float *rot_w, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, float *soundspeed, + float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized) { + //////////////////////////////////////////// + struct part *ptmps; + ptmps = ci->hydro.parts; + // fprintf(stderr,"Tester 111\n"); +#pragma unroll + for (int p = 0; p < count; p++) { + int p_gid 
= p + first_part_tmp; + // if(p_gid>=count_all_parts){ + // fprintf(stderr,"p>all parts"); + // exit(0); + // } + id[p_gid] = ptmps[p].id; + count_p[p_gid] = count; + tid_p[p_gid] = tid; + h_max[p_gid] = ci->hydro.h_max; + x_p[p_gid] = ptmps[p].x[0]; + y_p[p_gid] = ptmps[p].x[1]; + z_p[p_gid] = ptmps[p].x[2]; + ux[p_gid] = ptmps[p].v[0]; + uy[p_gid] = ptmps[p].v[1]; + uz[p_gid] = ptmps[p].v[2]; + a_hydrox[p_gid] = ptmps[p].a_hydro[0]; + a_hydroy[p_gid] = ptmps[p].a_hydro[1]; + a_hydroz[p_gid] = ptmps[p].a_hydro[2]; + locx[p_gid] = ci->loc[0]; + locy[p_gid] = ci->loc[1]; + locz[p_gid] = ci->loc[2]; + + mass[p_gid] = ptmps[p].mass; + h[p_gid] = ptmps[p].h; + u[p_gid] = ptmps[p].u; + u_dt[p_gid] = ptmps[p].u_dt; + rho[p_gid] = ptmps[p].rho; + div_v[p_gid] = ptmps[p].viscosity.div_v; + div_v_previous_step[p_gid] = ptmps[p].viscosity.div_v_previous_step; + alpha_visc[p_gid] = ptmps[p].viscosity.alpha; + v_sig[p_gid] = ptmps[p].viscosity.v_sig; + laplace_u[p_gid] = ptmps[p].diffusion.laplace_u; + alpha_diff[p_gid] = ptmps[p].diffusion.alpha; + f[p_gid] = ptmps[p].force.f; + soundspeed[p_gid] = ptmps[p].force.soundspeed; + h_dt[p_gid] = ptmps[p].force.h_dt; + balsara[p_gid] = ptmps[p].force.balsara; + pressure[p_gid] = ptmps[p].force.pressure; + time_bin[p_gid] = ptmps[p].time_bin; + wakeup[p_gid] = ptmps[p].limiter_data.wakeup; + min_ngb_time_bin[p_gid] = ptmps[p].limiter_data.min_ngb_time_bin; + to_be_synchronized[p_gid] = ptmps[p].limiter_data.to_be_synchronized; + wcount[p_gid] = ptmps[p].density.wcount; + wcount_dh[p_gid] = ptmps[p].density.wcount_dh; + rho_dh[p_gid] = ptmps[p].density.rho_dh; + div_v[p_gid] = ptmps[p].viscosity.div_v; + rot_u[p_gid] = ptmps[p].density.rot_v[0]; + rot_v[p_gid] = ptmps[p].density.rot_v[1]; + rot_w[p_gid] = ptmps[p].density.rot_v[2]; + SPH_sum[p_gid] = 0.f; + // fprintf(stderr,"tid is %i\n",tid_p[p]); + // fprintf(stderr,"Tester 222, count=%i, p=%i\n", count, + // id[p_gid]); + } +} + +void pack_data_soa(int count_all_parts, struct cell *ci, int first_part_tmp, + int count, int tid, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, + float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h, float *u, float *u_dt, float *rho, + float *SPH_sum, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, + int *count_p, float *wcount, float *wcount_dh, float *rho_dh, + float *rot_u, float *rot_v, float *rot_w, float *div_v, + float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, + float *soundspeed, float *h_dt, float *balsara, + float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized) { + //////////////////////////////////////////// + struct part *ptmps; + ptmps = ci->hydro.parts; + // fprintf(stderr,"Tester 111\n"); +#pragma unroll + for (int p = 0; p < count; p++) { + int p_gid = p + first_part_tmp; + // if(p_gid>=count_all_parts){ + // fprintf(stderr,"p>all parts"); + // exit(0); + // } + id[p_gid] = ptmps[p].id; + count_p[p_gid] = count; + tid_p[p_gid] = tid; + h_max[p_gid] = ci->hydro.h_max; + x_p[p_gid] = ptmps[p].x[0]; + y_p[p_gid] = ptmps[p].x[1]; + z_p[p_gid] = ptmps[p].x[2]; + ux[p_gid] = ptmps[p].v[0]; + uy[p_gid] = ptmps[p].v[1]; + uz[p_gid] = ptmps[p].v[2]; + a_hydrox[p_gid] = ptmps[p].a_hydro[0]; + a_hydroy[p_gid] = ptmps[p].a_hydro[1]; + a_hydroz[p_gid] = ptmps[p].a_hydro[2]; + locx[p_gid] = 
ci->loc[0]; + locy[p_gid] = ci->loc[1]; + locz[p_gid] = ci->loc[2]; + + mass[p_gid] = ptmps[p].mass; + h[p_gid] = ptmps[p].h; + u[p_gid] = ptmps[p].u; + u_dt[p_gid] = ptmps[p].u_dt; + rho[p_gid] = ptmps[p].rho; + div_v[p_gid] = ptmps[p].viscosity.div_v; + div_v_previous_step[p_gid] = ptmps[p].viscosity.div_v_previous_step; + alpha_visc[p_gid] = ptmps[p].viscosity.alpha; + v_sig[p_gid] = ptmps[p].viscosity.v_sig; + laplace_u[p_gid] = ptmps[p].diffusion.laplace_u; + alpha_diff[p_gid] = ptmps[p].diffusion.alpha; + f[p_gid] = ptmps[p].force.f; + soundspeed[p_gid] = ptmps[p].force.soundspeed; + h_dt[p_gid] = ptmps[p].force.h_dt; + balsara[p_gid] = ptmps[p].force.balsara; + pressure[p_gid] = ptmps[p].force.pressure; + time_bin[p_gid] = ptmps[p].time_bin; + wakeup[p_gid] = ptmps[p].limiter_data.wakeup; + min_ngb_time_bin[p_gid] = ptmps[p].limiter_data.min_ngb_time_bin; + to_be_synchronized[p_gid] = ptmps[p].limiter_data.to_be_synchronized; + wcount[p_gid] = ptmps[p].density.wcount; + wcount_dh[p_gid] = ptmps[p].density.wcount_dh; + rho_dh[p_gid] = ptmps[p].density.rho_dh; + div_v[p_gid] = ptmps[p].viscosity.div_v; + rot_u[p_gid] = ptmps[p].density.rot_v[0]; + rot_v[p_gid] = ptmps[p].density.rot_v[1]; + rot_w[p_gid] = ptmps[p].density.rot_v[2]; + SPH_sum[p_gid] = 0.f; + // fprintf(stderr,"tid is %i\n",tid_p[p]); + // fprintf(stderr,"Tester 222, count=%i, p=%i\n", count, + // id[p_gid]); + } +} + +// #ifdef WITH_CUDA +// } +// #endif diff --git a/src/hip/HIP_runner_functions.h b/src/hip/HIP_runner_functions.h new file mode 100644 index 0000000000..43a52f96ed --- /dev/null +++ b/src/hip/HIP_runner_functions.h @@ -0,0 +1,22 @@ +#ifndef CUDA_HEADERS_H +#define CUDA_HEADERS_H +#define n_streams 1024 + +#ifdef __cplusplus +extern "C" { +#endif +#include "part_gpu.h" +void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, int *d_bundle_first_part, + int *d_bundle_last_part, float d_a, float d_H, + const char *loop_type, hipStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, + int offset, int bundle_first_task, int max_parts, + int max_active_bin); + +#ifdef __cplusplus +} +#endif + +#endif // CUDA_HEADER_H diff --git a/src/hip/HIP_runner_functions.hip b/src/hip/HIP_runner_functions.hip new file mode 100755 index 0000000000..634c67a9ad --- /dev/null +++ b/src/hip/HIP_runner_functions.hip @@ -0,0 +1,229 @@ +#include "hip/hip_runtime.h" +/******************************************************************************* + * This file contains functions used to setup and execute GPU tasks from within + *runner_main.c. Consider this a translator allowing .cu based functions to be + *called from within runner_main.c + ******************************************************************************/ + +/* Hacky method to make c++ compilers not die. */ +#ifdef WITH_HIP +#ifndef static +#define static +#endif +#ifndef restrict +#define restrict __restrict__ +#endif +#endif + +/* Required header files */ +#include +/*ifdef WITH_HIP prevents name mangling. 
C code sees exact names + of functions rather than mangled template names produced by C++*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../config.h" +#include "BLOCK_SIZE.h" +#include "HIP_runner_functions.h" +#include "hip/device_functions.h" +#include "part_gpu.h" + +void Initialise_GPU() { + int devId = 0; + // find and print device name + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, devId); + printf("Device : %s\n", prop.name); + hipSetDevice(devId); + // cuda +} +#ifdef __cplusplus +} +#endif + +__global__ void runner_do_self_density_GPU( + struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, + int *d_bundle_first_part, int *d_bundle_last_part, float d_a, float d_H, + int bid, int tid, int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, int time_bin_inhibited) { + extern __shared__ float vars[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + __syncthreads(); + const int b_first_part = d_bundle_first_part[bid]; + const int pid = threadid + first_part_in_task_blocks; + const int b_last_part = d_bundle_last_part[bid]; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + // if(pid (0.01f/128.f)*(0.01f/128.f)) { + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
*/ + if(hi<1.f/128.f)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + float wi, wi_dx; + d_kernel_deval(0.f, &wi, &wi_dx); +// printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi, + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef __cplusplus +extern "C" { +#endif +void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, int *d_bundle_first_part, + int *d_bundle_last_part, float d_a, float d_H, + const char *loop_type, hipStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, + int offset, int bundle_first_task, int max_parts, + int time_bin_inhibited) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + runner_do_self_density_GPU<<>>( + parts_soa, d_task_first_part, d_task_last_part, d_bundle_first_part, + d_bundle_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef __cplusplus +} +#endif diff --git a/src/hip/Makefile.am b/src/hip/Makefile.am new file mode 100755 index 0000000000..fc626b8831 --- /dev/null +++ b/src/hip/Makefile.am @@ -0,0 +1,55 @@ +SOURCES_HIP = HIP_runner_functions.hip +include_HEADERS = HIP_runner_functions.h device_functions.h BLOCK_SIZE.h tester.h +EXTRA_DIST = $(SOURCES_HIP) $(include_HEADERS) + +if HAVEHIP + +AM_CFLAGS = -I.. $(HDF5_CPPFLAGS) +HIP_MYFLAGS = -D_FORCE_INLINES -O3 -g -DWITH_HIP --offload-arch=gfx90a +#HIP_MYFLAGS = -D_FORCE_INLINES -O3 -g -v -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_HIP -ccbin=gcc -m64 --default-stream per-thread#-dlink + +# Assign a "safe" version number +AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) -version-info 0:0:0 + +#bin_PROGRAMS = test_27_cells test_125_cells + +# Rules to compile HIP code. +.hip.o: + $(HIPCC) -c $(HIPFLAGS) $(AM_CFLAGS) $(HIP_CFLAGS) $(HIP_MYFLAGS) $< -o $@ +.hip.lo: + PATH=$(top_srcdir):$(PATH) && cudalt.py $@ $(HIPCC) -c $(HIPFLAGS) $(AM_CFLAGS) $(HIP_CFLAGS) $(HIP_MYFLAGS) $< + +# The library. Dummy C library so that we get libtool linking setup. +lib_LTLIBRARIES = libswiftHIP.la libswiftdummy.la + +# Special link command to avoid including CFLAGS which are not understood. 
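+# (Likely rationale, for reference: automake's default link rule would also pass
+# the per-target CFLAGS above, including HIP-only options such as
+# --offload-arch, to the host link step; the explicit $(CCLD) command below
+# forwards only the library LDFLAGS.)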
+libswiftHIP_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(libswiftHIP_la_LDFLAGS) \ + $(LDFLAGS) -o $@ + +libswiftHIP_la_SOURCES = $(SOURCES_HIP) +libswiftHIP_la_CFLAGS = $(AM_CFLAGS) $(HIP_CFLAGS) $(HIP_MYFLAGS) ../libswiftsim_hip.la -I../ +libswiftHIP_la_LIBADD = ../.libs/libswiftsim_hip.la +libswiftHIP_la_LDFLAGS = $(AM_LDFLAGS) + +if HAVEMPI +libswiftHIP_la_CFLAGS += ../libswiftsim_mpihip.la +libswiftHIP_la_LIBADD += ../.libs/libswiftsim_mpihip.la +endif + +libswiftdummy_la_SOURCES = dummy.c +libswiftdummy_la_CFLAGS = $(AM_CFLAGS) +libswiftdummy_la_LDFLAGS = $(AM_LDFLAGS) + +#test_27_cells_SOURCES=test27cells.c +#test_27_cells_CFLAGS=$(AM_CFLAGS) -DWITH_HIP $(HIP_CFLAGS) +#test_27_cells_LDADD= ../.libs/libswiftsim_hip.la ../.libs/libswiftsim_mpihip.la libswiftHIP.la $(MPI_LIBS) $(EXTRA_LIBS) $(HIP_LIBS) +#test_27_cells_LDFLAGS = $(AM_LDFLAGS) $(HIP_CFLAGS) + +#test_125_cells_SOURCES=test125cells.c +#test_125_cells_CFLAGS=$(AM_CFLAGS) -DWITH_HIP $(HIP_CFLAGS) +#test_125_cells_LDADD= ../libswiftsim_hip.la ../libswiftsim_mpihip.la libswiftHIP.la $(MPI_LIBS) $(EXTRA_LIBS) $(HIP_LIBS) +#test_125_cells_LDFLAGS = $(AM_LDFLAGS) $(HIP_CFLAGS) + +endif diff --git a/src/hip/am--include-marker b/src/hip/am--include-marker new file mode 100644 index 0000000000..9ce06a81ea --- /dev/null +++ b/src/hip/am--include-marker @@ -0,0 +1 @@ +# dummy diff --git a/src/hip/cell_gpu.h b/src/hip/cell_gpu.h new file mode 100644 index 0000000000..dc8d9306f2 --- /dev/null +++ b/src/hip/cell_gpu.h @@ -0,0 +1,292 @@ +#ifndef CELL_GPU_H +#define CELL_GPU_H +/* Config parameters. */ +#include "../config.h" +typedef int8_t timebin_t; +struct xpart_gpu { + /*! Offset between current position and position at last tree rebuild. */ + float x_diff[3]; + /*! Offset between the current position and position at the last sort. */ + float x_diff_sort[3]; + /*! Velocity at the last full step. */ + float v_full[3]; + /*! Internal energy at the last full step. */ + float u_full; +}; +struct part_gpu { + /*Task ID*/ + int tid; + /*! Particle unique ID. */ + long long id; + /*! Pointer to corresponding gravity part. */ + // struct gpu_gpart* gpart; + /*! Particle position. */ + float x[3]; + /*! Particle predicted velocity. */ + float v[3]; + /*! Particle acceleration. */ + float a_hydro[3]; + /*! Particle mass. */ + float mass; + /*! Particle smoothing length. */ + float h; + /*! Particle internal energy. */ + float u; + /*! Time derivative of the internal energy. */ + float u_dt; + /*! Particle density. */ + float rho; + /*! Kernel summation (For testing/debugging). */ + float SPH_sum; + + /* Cell information */ + /*! The cell location on the grid (corner nearest to the origin). */ + float loc[3]; + /*! The cell dimensions. */ + float width[3]; + float h_max; + int count; + /* Density information */ + + /*! Neighbour number count. */ + float wcount; + + /*! Derivative of the neighbour number with respect to h. */ + float wcount_dh; + + /*! Derivative of density with respect to h */ + float rho_dh; + + /*! Particle velocity curl. */ + float rot_v[3]; + + /* viscosity information */ + + /*! Particle velocity divergence */ + float div_v; + + /*! Particle velocity divergence from previous step */ + float div_v_previous_step; + + /*! Artificial viscosity parameter */ + float alpha_visc; + + /*! Signal velocity */ + float v_sig; + + /* thermal diffusion information */ + + /*! del^2 u, a smoothed quantity */ + float laplace_u; + + /*! 
Thermal diffusion coefficient */ + float alpha_diff; + + /* force information */ + + /*! "Grad h" term -- only partial in P-U */ + float f; + + /*! Particle soundspeed. */ + float soundspeed; + + /*! Time derivative of smoothing length */ + float h_dt; + + /*! Balsara switch */ + float balsara; + + /*! Particle pressure. */ + float pressure; + /*! Maximal alpha (viscosity) over neighbours */ + float alpha_visc_max_ngb; + + /* timestep stuff */ + + /*! Time-step length */ + timebin_t time_bin; + + /*all part of struct timestep_limiter_data, we had to destruct it + as GPUs don't like pointer chasing especially when memcpying*/ + /* Need waking-up ? */ + timebin_t wakeup; + + /*! Minimal time-bin across all neighbours */ + timebin_t min_ngb_time_bin; + + /* Do we want this particle to be synched back on the time-line? */ + char to_be_synchronized; +}; + +typedef struct part_soa { + /*Task ID*/ + int *tid_p; + /*bundle ID*/ + int *bid_p; + /*! Particle unique ID. */ + long long *id; + /*! Pointer to corresponding gravity part. */ + // struct gpu_gpart* gpart; + /*! Particle position. */ + double *x_p; + double *y_p; + double *z_p; + /*! Particle predicted velocity. */ + float *ux; + float *uy; + float *uz; + /*! Particle acceleration. */ + float *a_hydrox; + float *a_hydroy; + float *a_hydroz; + /*! Particle mass. */ + float *mass; + /*! Particle smoothing length. */ + float *h; + /*! Particle internal energy. */ + float *u; + /*! Time derivative of the internal energy. */ + float *u_dt; + /*! Particle density. */ + float *rho; + /*! Kernel summation (For testing/debugging). */ + float *SPH_sum; + + /* Cell information */ + /*! The cell location on the grid (corner nearest to the origin). */ + float *locx; + float *locy; + float *locz; + /*! The cell dimensions. */ + float *widthx; + float *widthy; + float *widthz; + float *h_max; + int *count_p; + int *count_test; + /* Density information */ + + /*! Neighbour number count. */ + float *wcount; + + /*! Derivative of the neighbour number with respect to h. */ + float *wcount_dh; + + /*! Derivative of density with respect to h */ + float *rho_dh; + + /*! Particle velocity curl. */ + float *rot_ux; + float *rot_uy; + float *rot_uz; + + /* viscosity information */ + + /*! Particle velocity divergence */ + float *div_v; + + /*! Particle velocity divergence from previous step */ + float *div_v_previous_step; + + /*! Artificial viscosity parameter */ + float *alpha_visc; + + /*! Signal velocity */ + float *v_sig; + + /* thermal diffusion information */ + + /*! del^2 u, a smoothed quantity */ + float *laplace_u; + + /*! Thermal diffusion coefficient */ + float *alpha_diff; + + /* force information */ + + /*! "Grad h" term -- only partial in P-U */ + float *f; + + /*! Particle soundspeed. */ + float *soundspeed; + + /*! Time derivative of smoothing length */ + float *h_dt; + + /*! Balsara switch */ + float *balsara; + + /*! Particle pressure. */ + float *pressure; + /*! Maximal alpha (viscosity) over neighbours */ + float *alpha_visc_max_ngb; + + /* timestep stuff */ + + /*! Time-step length */ + timebin_t *time_bin; + + /*all part of struct timestep_limiter_data, we had to destruct it + as GPUs don't like pointer chasing especially when memcpying*/ + /* Need waking-up ? */ + timebin_t *wakeup; + + /*! Minimal time-bin across all neighbours */ + timebin_t *min_ngb_time_bin; + + /* Do we want this particle to be synched back on the time-line? 
*/ + char *to_be_synchronized; + +} part_soa; + +struct task_cell { + struct part_gpu *parts; +}; +// struct parts_gpu_SoA{ +// struct task_cell *tasks; +// }; + +struct cell_hydro_gpu { + // struct part_gpu *parts; + // struct xpart_gpu *xparts; + float h_max; + int count; +}; +struct cell_gpu { + /*! The cell location on the grid (corner nearest to the origin). */ + float loc[3]; + /*! The cell dimensions. */ + float width[3]; + /*Details of contents (particles) and properties*/ + struct cell_hydro_gpu hydro; +}; +struct cell_gpu_flat { + /*! The cell location on the grid (corner nearest to the origin). */ + float loc[3]; + /*! The cell dimensions. */ + float width[3]; + float h_max; + int count; +}; + +struct cells_gpu_flat { + float *locx; + float *locy; + float *locz; + /*! The cell dimensions. */ + float *widthx; + float *widthy; + float *widthz; + /*! The cell location on the grid (corner nearest to the origin). */ + /* float *loc[3];*/ + /*! The cell dimensions. */ + /* float *width[3];*/ + float *h_max; + int *count; +}; + +struct cells_gpu_flat_test { + float *locx; +}; + +#endif // CELL_GPU_H diff --git a/src/hip/cuda_headers.h b/src/hip/cuda_headers.h new file mode 100644 index 0000000000..2df61a53b5 --- /dev/null +++ b/src/hip/cuda_headers.h @@ -0,0 +1,63 @@ +#ifndef CUDA_HEADERS_H +#define CUDA_HEADERS_H +#define n_streams 1024 + +#ifdef WITH_CUDA +extern "C" { +#endif + +void GPU_runner_doself1_branch_gradient(struct cell_gpu *ci_gpu, + struct part_gpu *parts_gpu); +void cuda_tester(struct cell **ci_list_mgd, int numBlocksTest, + int block_size_test, int count_tasks); +void launch_cuda_kernel(struct cell_gpu *ci_gpu, struct part_gpu *parts, + int numBlocks, float d_a, float d_H, + const char *loop_type); +void launch_cuda_kernel_streams(struct part_gpu *d_parts, int numBlocks, + float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int tid, int count, + int max_count, float cellx, float celly, + float cellz, int first_part, int last_part); +void launch_cuda_kernel_bundles(struct cell_gpu *d_all_cells, + struct part_gpu **d_all_parts, int numBlocks, + float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, + int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, + int offset); +void launch_cuda_kernel_bundles_revised( + struct part_gpu *d_all_parts, int *d_task_first_part, int *d_task_last_part, + int *d_bundle_first_part, int *d_bundle_last_part, int numBlocks, float d_a, + float d_H, const char *loop_type, cudaStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, int numBlocks_x, + int numBlocks_y, int tid, int offset); +void launch_cuda_kernel_bundles_revised_soa( + struct part_soa parts_gpu_soa, int *d_task_first_part, + int *d_task_last_part, int *d_bundle_first_part, int *d_bundle_last_part, + int numBlocks, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int numBlocks_x, int numBlocks_y, int tid, int offset, + int bundle_first_task, int max_parts); +void launch_cuda_print_streams(int numBlocks, cudaStream_t stream, int tid); +void launch_cuda_kernel_tester(struct cell_gpu *d_ci_gpu, + struct part_gpu **d_parts, int numBlocks, + float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, + int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid); +void launch_cuda_kernel_bundles_test(struct cell_gpu *d_all_cells, + struct 
part_gpu **d_all_parts, + int numBlocks, float d_a, float d_H, + int count_tasks); +void mgd_mem_cuda_kernel_bundles(struct part_gpu **parts_gpu_list, + int numBlocks, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, + int tasksperbundle, int numBlocks_x, + int numBlocks_y, int tid, int offset); + +#ifdef WITH_CUDA +} +#endif + +#endif // CUDA_HEADER_H diff --git a/src/hip/device_functions.h b/src/hip/device_functions.h new file mode 100644 index 0000000000..237c87dec1 --- /dev/null +++ b/src/hip/device_functions.h @@ -0,0 +1,149 @@ +#ifndef DEVICE_FUNCTIONS_H +#define DEVICE_FUNCTIONS_H +#include "../../config.h" + +/* Local headers. */ +// #include "../dimension.h" +// #include "../error.h" +// #include "../inline.h" +// #include "../minmax.h" +// #include "../vector.h" + +// Is this even necessary? Probably not as our code will operate differently +#define num_cuda_threads 128 +#define hydro_dimension 3.f + +/// Here we define stuff from kernel_hydro.h when using cubic_spline_kernel. +/// Will worry about sorting 'if statements for different kernels later//// +/* First some powers of gamma = H/h */ +#define kernel_gamma ((float)(1.825742)) +#define kernel_gamma_inv ((float)(1. / kernel_gamma)) +#define kernel_gamma2 ((float)(kernel_gamma * kernel_gamma)) +#define kernel_ivals 2 +#define kernel_degree 3 /*!< Degree of the polynomial */ +#define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma)) +#define kernel_gamma_dim_plus_one \ + ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)) +#define kernel_gamma_inv_dim \ + ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma))) +#define kernel_gamma_inv_dim_plus_one \ + ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))) +#define kernel_ivals_f ((float)kernel_ivals) /*!< Number of branches */ +#define kernel_constant ((float)(16. * M_1_PI)) +/*! Cosmology default beta=3.0. + * Alpha can be set in the parameter file. + * Beta is defined as in e.g. Price (2010) Eqn (103) */ +#define const_viscosity_beta 3.0f +#ifdef WITH_CUDA +extern "C" { +#endif +/** + * @brief Returns the argument to the power given by the dimension plus one + * + * Computes \f$x^{d+1}\f$. + */ +__device__ float d_pow_dimension_plus_one(float x) { + +#if defined(HYDRO_DIMENSION_3D) + + const float x2 = x * x; + return x2 * x2; + +#elif defined(HYDRO_DIMENSION_2D) + + return x * x * x; + +#elif defined(HYDRO_DIMENSION_1D) + + return x * x; + +#else + + error("The dimension is not defined !"); + return 0.f; + +#endif +} + +/** + * @brief Return the argument to the power three adiabatic index minus five over + * two. + * + * Computes \f$x^{(3\gamma - 5)/2}\f$. + * + * @param x Argument + */ +__device__ float d_pow_three_gamma_minus_five_over_two(float x) { +#if defined(HYDRO_GAMMA_5_3) + + return 1.f; /* x^(0) */ + +#elif defined(HYDRO_GAMMA_7_5) + + return powf(x, -0.4f); /* x^(-2/5) */ + +#elif defined(HYDRO_GAMMA_4_3) + + return 1.f / sqrtf(x); /* x^(-1/2) */ + +#elif defined(HYDRO_GAMMA_2_1) + + return sqrtf(x); /* x^(1/2) */ + +#else + + error("The adiabatic index is not defined !"); + return 0.f; + +#endif +} + +/** + * @brief Computes the kernel function and its derivative. + * + * The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient + * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem. + * + * Returns 0 if \f$u > \gamma = H/h\f$. 
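+ *
+ * As a quick check of the Horner evaluation below: at \f$u = 0\f$ the first
+ * cubic-spline branch {3, -3, 0, 0.5} reduces to w = 0.5 and dw_dx = 0, so the
+ * returned W is 0.5 * kernel_constant * kernel_gamma_inv_dim (the caller still
+ * applies the \f$h^{-d}\f$ factor).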
+ * + * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$. + * @param W (return) The value of the kernel function \f$W(x,h)\f$. + * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$. + */ +__device__ void d_kernel_deval(float u, float *restrict W, + float *restrict dW_dx) { + + /* Go to the range [0,1[ from [0,H[ */ + const float x = u * kernel_gamma_inv; + + /* Pick the correct branch of the kernel */ + const int temp = (int)(x * kernel_ivals_f); + const int ind = temp > kernel_ivals ? kernel_ivals : temp; + static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] = { + 3.f, -3.f, 0.f, 0.5f, /* 0 < u < 0.5 */ + -1.f, 3.f, -3.f, 1.f, /* 0.5 < u < 1 */ + 0.f, 0.f, 0.f, 0.f}; /* 1 < u */ + const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + /* First two terms of the polynomial ... */ + float w = coeffs[0] * x + coeffs[1]; + float dw_dx = coeffs[0]; + + /* ... and the rest of them */ + for (int k = 2; k <= kernel_degree; k++) { + dw_dx = dw_dx * x + w; + w = x * w + coeffs[k]; + } + + w = max(w, 0.f); + dw_dx = min(dw_dx, 0.f); + + /* Return everything */ + *W = w * kernel_constant * kernel_gamma_inv_dim; + *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one; +} + +#ifdef WITH_CUDA +} +#endif + +#endif // DEVICE_FUNCTIONS_H diff --git a/src/hip/dummy.c b/src/hip/dummy.c new file mode 100755 index 0000000000..66ab4665f9 --- /dev/null +++ b/src/hip/dummy.c @@ -0,0 +1,2 @@ +#include +void swiftcudadummy() {} diff --git a/src/hip/dummy.cpp b/src/hip/dummy.cpp new file mode 100755 index 0000000000..66ab4665f9 --- /dev/null +++ b/src/hip/dummy.cpp @@ -0,0 +1,2 @@ +#include +void swiftcudadummy() {} diff --git a/src/hip/part_gpu.h b/src/hip/part_gpu.h new file mode 100644 index 0000000000..5d7e32c611 --- /dev/null +++ b/src/hip/part_gpu.h @@ -0,0 +1,137 @@ +#ifndef PART_GPU_H +#define PART_GPU_H +/* Config parameters. */ +#include "../../config.h" +typedef int8_t timebin_t; + +#ifdef __cplusplus +extern "C" { +#endif + +// extern "C" { + +typedef struct part_soa { + /*Task ID*/ + int *tid_p; + /*bundle ID*/ + int *bid_p; + /*! Particle unique ID. */ + long long *id; + /*! Pointer to corresponding gravity part. */ + // struct gpu_gpart* gpart; + /*! Particle position. */ + double *x_p; + double *y_p; + double *z_p; + /*! Particle predicted velocity. */ + float *ux; + float *uy; + float *uz; + /*! Particle acceleration. */ + float *a_hydrox; + float *a_hydroy; + float *a_hydroz; + /*! Particle mass. */ + float *mass; + /*! Particle smoothing length. */ + float *h; + /*! Particle internal energy. */ + float *u; + /*! Time derivative of the internal energy. */ + float *u_dt; + /*! Particle density. */ + float *rho; + /*! Kernel summation (For testing/debugging). */ + float *SPH_sum; + + /* Cell information */ + /*! The cell location on the grid (corner nearest to the origin). */ + float *locx; + float *locy; + float *locz; + /*! The cell dimensions. */ + float *widthx; + float *widthy; + float *widthz; + float *h_max; + int *count_p; + int *count_test; + /* Density information */ + + /*! Neighbour number count. */ + float *wcount; + + /*! Derivative of the neighbour number with respect to h. */ + float *wcount_dh; + + /*! Derivative of density with respect to h */ + float *rho_dh; + + /*! Particle velocity curl. */ + float *rot_ux; + float *rot_uy; + float *rot_uz; + + /* viscosity information */ + + /*! Particle velocity divergence */ + float *div_v; + + /*! 
Particle velocity divergence from previous step */ + float *div_v_previous_step; + + /*! Artificial viscosity parameter */ + float *alpha_visc; + + /*! Signal velocity */ + float *v_sig; + + /* thermal diffusion information */ + + /*! del^2 u, a smoothed quantity */ + float *laplace_u; + + /*! Thermal diffusion coefficient */ + float *alpha_diff; + + /* force information */ + + /*! "Grad h" term -- only partial in P-U */ + float *f; + + /*! Particle soundspeed. */ + float *soundspeed; + + /*! Time derivative of smoothing length */ + float *h_dt; + + /*! Balsara switch */ + float *balsara; + + /*! Particle pressure. */ + float *pressure; + /*! Maximal alpha (viscosity) over neighbours */ + float *alpha_visc_max_ngb; + + /* timestep stuff */ + + /*! Time-step length */ + timebin_t *time_bin; + + /*all part of struct timestep_limiter_data, we had to destruct it + as GPUs don't like pointer chasing especially when memcpying*/ + /* Need waking-up ? */ + timebin_t *wakeup; + + /*! Minimal time-bin across all neighbours */ + timebin_t *min_ngb_time_bin; + + /* Do we want this particle to be synched back on the time-line? */ + char *to_be_synchronized; +}; + +#ifdef __cplusplus +}; +#endif + +#endif // PART_GPU_H diff --git a/src/hip/print_something.cu b/src/hip/print_something.cu new file mode 100755 index 0000000000..b69ad05dd4 --- /dev/null +++ b/src/hip/print_something.cu @@ -0,0 +1,37 @@ +#ifdef WITH_CUDA +#ifndef static +#define static +#endif +#ifndef restrict +#define restrict __restrict__ +#endif +#endif + +#include + +#ifdef __cplusplus +extern "C" { +#endif +#include "cuda_headers.h" +#ifdef __cplusplus +} +#endif + +extern "C" { +void print_something_cu() { printf("In Here\n"); } +} + +#ifdef __cplusplus +extern "C" { +#endif +void Initialise_GPU() { + int devId = 0; + // find and print device name + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, devId); + printf("Device : %s\n", prop.name); + cudaSetDevice(devId); +} +#ifdef __cplusplus +} +#endif diff --git a/src/hip/tasks_gpu.h b/src/hip/tasks_gpu.h new file mode 100755 index 0000000000..a3912aee2c --- /dev/null +++ b/src/hip/tasks_gpu.h @@ -0,0 +1,74 @@ +/* Config parameters. */ +#include "../config.h" + +struct tasks_self_gpu { + struct task_gpu *tgpu; +}; + +/** + * @brief A task to be run by the #scheduler. + */ +struct task_gpu { + + /*! Pointers to the cells this task acts upon */ + struct cell *ci, *cj; + + /*! List of tasks unlocked by this one */ + struct task_gpu **unlock_tasks; + + /*! Flags used to carry additional information (e.g. sort directions) */ + long long flags; + +#ifdef WITH_MPI + + /*! Buffer for this task's communications */ + void *buff; + + /*! MPI request corresponding to this task */ + MPI_Request req; + +#endif + + /*! Rank of a task in the order */ + int rank; + + /*! Weight of the task */ + float weight; + + /*! Number of tasks unlocked by this one */ + int nr_unlock_tasks; + + /*! Number of unsatisfied dependencies */ + int wait; + + /*! Type of the task */ + enum task_types type; + + /*! Sub-type of the task (for the tasks that have one */ + enum task_subtypes subtype; + + /*! Should the scheduler skip this task ? */ + char skip; + + /*! Is this task implicit (i.e. does not do anything) ? */ + char implicit; + +#ifdef SWIFT_DEBUG_TASKS + /*! ID of the queue or runner owning this task */ + short int rid; + + /*! Information about the direction of the pair task */ + short int sid; +#endif + + /*! 
Start and end time of this task */ + ticks tic, toc; + + /* Total time spent running this task */ + ticks total_ticks; + +#ifdef SWIFT_DEBUG_CHECKS + /* When was this task last run? */ + integertime_t ti_run; +#endif /* SWIFT_DEBUG_CHECKS */ +}; diff --git a/src/hip/tester.cu b/src/hip/tester.cu new file mode 100644 index 0000000000..3ffaf9e10c --- /dev/null +++ b/src/hip/tester.cu @@ -0,0 +1,21 @@ +#include "tester.h" + +#include +#include +#ifdef __cplusplus +extern "C" { +#endif +void testing_linkage(int a, float *b, float c) { + std::vector b_value_list; + b_value_list.reserve(a); + for (int i = 0; i < a; i++) { + (*b) = (*b) + c; + b_value_list.push_back((*b)); + std::cout << "Vector value is " << b_value_list[i] << " b value is " << (*b) + << std::endl; + } + std::cout << "Final value of b is " << (*b) << std::endl; +} +#ifdef __cplusplus +} +#endif diff --git a/src/hip/tester.h b/src/hip/tester.h new file mode 100755 index 0000000000..5729e66904 --- /dev/null +++ b/src/hip/tester.h @@ -0,0 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif + +void testing_linkage(int a, float *b, float c); + +#ifdef __cplusplus +} +#endif diff --git a/src/memuse.h b/src/memuse.h index 5883e68684..d51ab4282d 100644 --- a/src/memuse.h +++ b/src/memuse.h @@ -20,8 +20,11 @@ #define SWIFT_MEMUSE_H /* Config parameters. */ +#ifdef WITH_CUDA +#include "../config.h" +#else #include - +#endif /* Includes. */ #include diff --git a/src/queue.c b/src/queue.c index 30601667cd..790b6b1335 100644 --- a/src/queue.c +++ b/src/queue.c @@ -178,7 +178,6 @@ void queue_insert(struct queue *q, struct task *t) { } } } - /* Increase the incoming count. */ atomic_inc(&q->count_incoming); } diff --git a/src/queue.h b/src/queue.h index 0576403bef..b90ca90b46 100644 --- a/src/queue.h +++ b/src/queue.h @@ -75,6 +75,28 @@ struct queue { int *tid_incoming; volatile unsigned int first_incoming, last_incoming, count_incoming; + /*Number of pack tasks left in queue A. Nasar */ + volatile int + n_packs_self_left_d; /*Number of density pack tasks left in queue*/ + volatile int n_packs_self_left_f; /*Number of force pack tasks left in queue*/ + volatile int + n_packs_self_left_g; /*Number of gradient pack tasks left in queue*/ + + volatile int n_packs_pair_left_d; + volatile int n_packs_pair_left_f; + volatile int n_packs_pair_left_g; + + volatile int + n_packs_self_stolen_d; /*Number of density pack tasks left in queue*/ + volatile int + n_packs_self_stolen_f; /*Number of force pack tasks left in queue*/ + volatile int + n_packs_self_stolen_g; /*Number of gradient pack tasks left in queue*/ + + volatile int n_packs_pair_stolen_d; + volatile int n_packs_pair_stolen_f; + volatile int n_packs_pair_stolen_g; + } __attribute__((aligned(queue_struct_align))); /* Function prototypes. 
*/ diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h new file mode 100644 index 0000000000..a78ec6409c --- /dev/null +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -0,0 +1,2116 @@ +#include "scheduler.h" +#include "runner_doiact_hydro.h" +#include "active.h" +#include +struct pack_vars_self { + /*List of tasks and respective cells to be packed*/ + struct task **task_list; + struct task **top_task_list; + struct cell **cell_list; + /*List of cell positions*/ + double *cellx; + double *celly; + double *cellz; + /*List of cell positions*/ + double *d_cellx; + double *d_celly; + double *d_cellz; + int bundle_size; + /*How many particles in a bundle*/ + int count_parts; + /**/ + int tasks_packed; + int top_tasks_packed; + int *task_first_part; + int *task_last_part; + int *d_task_first_part; + int *d_task_last_part; + int *bundle_first_part; + int *bundle_last_part; + int *bundle_first_task_list; + int count_max_parts; + int launch; + int launch_leftovers; + int target_n_tasks; + int nBundles; + int tasksperbundle; + +} pack_vars_self; +struct leaf_cell_list{ + struct cell **ci; + struct cell **cj; + int n_leaves; + int n_start; + int n_end; + int n_packed; +}; +struct pack_vars_pair { + /*List of tasks and respective cells to be packed*/ + struct task **task_list; + struct task **top_task_list; + struct leaf_cell_list * leaf_list; + struct cell **ci_list; + struct cell **cj_list; + /*List of cell shifts*/ + double *shiftx; + double *shifty; + double *shiftz; + /*List of cell shifts*/ + double *d_shiftx; + double *d_shifty; + double *d_shiftz; + int bundle_size; + /*How many particles in a bundle*/ + int count_parts; + /**/ + int tasks_packed; + int top_tasks_packed; + int *task_first_part; + int *task_last_part; + int *d_task_first_part; + int *d_task_last_part; + int *bundle_first_part; + int *bundle_last_part; + int *bundle_first_task_list; + int count_max_parts; + int launch; + int launch_leftovers; + int target_n_tasks; + int nBundles; + int tasksperbundle; + int task_locked; + +} pack_vars_pair; + +struct pack_vars_pair_f4 { + /*List of tasks and respective cells to be packed*/ + struct task **task_list; + struct cell **ci_list; + struct cell **cj_list; + /*List of cell shifts*/ + float3 *shift; + /*List of cell shifts*/ + float3 *d_shift; + int bundle_size; + /*How many particles in a bundle*/ + int count_parts; + /**/ + int tasks_packed; + int4 *fparti_fpartj_lparti_lpartj; + int4 *d_fparti_fpartj_lparti_lpartj; + int *bundle_first_part; + int *bundle_last_part; + int *bundle_first_task_list; + int count_max_parts; + int launch; + int launch_leftovers; + int target_n_tasks; + int nBundles; + int tasksperbundle; + +} pack_vars_pair_f4; + +#include "cuda/BLOCK_SIZE.h" +#include "cuda/GPU_runner_functions.h" +#include "runner_gpu_pack_functions.h" +#include "task.h" +#define CUDA_DEBUG + +double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, + struct part_aos_f4_send *parts_send, + int2 *task_first_part_f4) { + /* Timers for how long this all takes. 
+ * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + /* Find my queue for use later*/ + int qid = r->qid; + /*Place pointers to the task and cells packed in an array for use later + * when unpacking after the GPU offload*/ + int tasks_packed = pack_vars->tasks_packed; + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + /* Identify row in particle arrays where this task starts*/ + task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f4( + r, ci, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + /* Identify the row in the array where this task ends (row id of its + last particle)*/ + task_first_part_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done++; + /* Record that we have now done a packing (self) */ + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + + /*Get a lock to the queue so we can safely decrement counter and check for launch leftover condition*/ + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_self_left_d--; + if (s->queues[qid].n_packs_self_left_d < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); + /*Have we packed enough tasks to offload to GPU?*/ + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + + /*Record the end of packing time*/ + clock_gettime(CLOCK_REALTIME, &t1); + /* Release the lock on the cell */ + cell_unlocktree(ci); + t->gpu_done = 1; + /*Calculate time spent packing and return to runner_main*/ + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, + struct part_aos_f4_g_send *parts_send, + int2 *task_first_part_f4) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + /* Find my queue for use later*/ + int qid = r->qid; + /*Place pointers to the task and cells packed in an array for use later + * when unpacking after the GPU offload*/ + int tasks_packed = pack_vars->tasks_packed; + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + /* Identify row in particle arrays where this task starts*/ + task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f4_g( + r, ci, parts_send, 0 /*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + /* identify the row in the array where this task ends (row id of its + last particle)*/ + task_first_part_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done_g++; + /* Record that we have now done a packing (self) */ + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + /*Get a lock to the queue so we can safely decrement counter and check for launch leftover condition*/ + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_self_left_g--; + if (s->queues[qid].n_packs_self_left_g < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); + + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + /* Release the lock on the cell */ + cell_unlocktree(ci); + /*Calculate time spent packing and return to runner_main*/ + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, + struct part_aos_f4_f_send *parts_send, + int2 *task_first_part_f4) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + /* Find my queue for use later*/ + int qid = r->qid; + /*Place pointers to the task and cells packed in an array for use later + * when unpacking after the GPU offload*/ + int tasks_packed = pack_vars->tasks_packed; + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + /* Identify row in particle arrays where this task starts*/ + task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f4_f( + r, ci, parts_send, 0 /*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + /* Identify the row in the array where this task ends (row id of its + last particle) */ + task_first_part_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done_f++; + /* Record that we have now done a packing (self) */ + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + /*Get a lock to the queue so we can safely decrement counter and check for launch leftover condition*/ + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_self_left_f--; + if (s->queues[qid].n_packs_self_left_f < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); + /*Have we packed enough tasks to offload to GPU?*/ + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + + /*Record the end of packing time*/ + clock_gettime(CLOCK_REALTIME, &t1); + /* Release the lock on the cell */ + cell_unlocktree(ci); + /*Calculate time spent packing and return to runner_main*/ + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +void runner_recurse_gpu(struct runner *r, struct scheduler *s, + struct pack_vars_pair *restrict pack_vars, + struct cell *ci, struct cell *cj, struct task *t, + struct part_aos_f4_send *parts_send, + struct engine *e, + int4 *fparti_fpartj_lparti_lpartj, int *n_leafs_found, + int depth, int n_expected_tasks) { + + /* Should we even bother? A. Nasar: For GPU code we need to be clever about this */ + if (!CELL_IS_ACTIVE(ci, e) && !CELL_IS_ACTIVE(cj, e)) return; + if (ci->hydro.count == 0 || cj->hydro.count == 0) return; + + /* Get the type of pair and flip ci/cj if needed. */ + double shift[3]; + const int sid = space_getsid_and_swap_cells(s, &ci, &cj, shift); + + /* Recurse? */ + if (cell_can_recurse_in_pair_hydro_task(ci) && + cell_can_recurse_in_pair_hydro_task(cj)) { + struct cell_split_pair *csp = &cell_split_pairs[sid]; + for (int k = 0; k < csp->count; k++) { + const int pid = csp->pairs[k].pid; + const int pjd = csp->pairs[k].pjd; + /*Do we want to do anything before we recurse?*/ + + /*We probably want to record */ + if (ci->progeny[pid] != NULL && cj->progeny[pjd] != NULL){ + runner_recurse_gpu(r, s, pack_vars, ci->progeny[pid], cj->progeny[pjd], t, parts_send, e, fparti_fpartj_lparti_lpartj, + n_leafs_found, depth + 1, n_expected_tasks); +// message("recursing to depth %i", depth + 1); + } + } + } + else if (CELL_IS_ACTIVE(ci, e) || CELL_IS_ACTIVE(cj, e)) { + /* if any cell empty: skip */ + if(ci->hydro.count == 0 || cj->hydro.count == 0) return; + int leafs_found = *n_leafs_found; + /*for all leafs to be sent add to cell list */ +// cells_left[leafs_found] = ci; +// cells_right[leafs_found] = cj; + /*Add leaf cells to list for each top_level task*/ + pack_vars->leaf_list[pack_vars->top_tasks_packed].ci[leafs_found] = ci; + pack_vars->leaf_list[pack_vars->top_tasks_packed].cj[leafs_found] = cj; + pack_vars->leaf_list[pack_vars->top_tasks_packed].n_leaves++; +// error("stop"); + *n_leafs_found = leafs_found + 1; + if(*n_leafs_found >= n_expected_tasks) + error("Created %i more than expected leaf cells. 
depth %i", *n_leafs_found, depth); + } + +}; + +double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, + struct pack_vars_pair *restrict pack_vars, + struct cell *ci, struct cell *cj, struct task *t, + struct part_aos_f4_send *parts_send, + struct engine *e, + int4 *fparti_fpartj_lparti_lpartj) { + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + int qid = r->qid; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + struct cell *citmp, *cjtmp; + citmp=ci; + cjtmp=cj; + /* Get the type of pair and flip ci/cj if needed. */ + double shift[3]; + const int sid = space_getsid_and_swap_cells(s, &citmp, &cjtmp, shift); + if(citmp != ci) error("I'm flipped"); + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ +// pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; + fparti_fpartj_lparti_lpartj[tasks_packed].y = + pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f4( + r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, + shift_tmp); + /* Find last parts in task for ci and cj*/ + fparti_fpartj_lparti_lpartj[tasks_packed].z = + pack_vars->count_parts - count_cj; + fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = + fparti_fpartj_lparti_lpartj[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Record that we have now done a packing (self) */ + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + pack_vars->leaf_list[pack_vars->top_tasks_packed - 1].n_packed++; + + //A. Nasar: Need to come back to this at some point! + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_pair_left_d--; + if (s->queues[qid].n_packs_pair_left_d < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); + if (pack_vars->tasks_packed == pack_vars->target_n_tasks){ + pack_vars->launch = 1; + } + /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +}; + +double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, + struct pack_vars_pair *restrict pack_vars, + struct cell *ci, struct cell *cj, + struct task *t, + struct part_aos_f4_g_send *parts_send, + struct engine *e, + int4 *fparti_fpartj_lparti_lpartj) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + + int qid = r->qid; + // pthread_mutex_lock(&s->sleep_mutex); + // atomic_dec(&(s->p_g_left[qid])); + // pthread_cond_broadcast(&s->sleep_cond); + // pthread_mutex_unlock(&s->sleep_mutex); + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; + // pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + + // count_ci; + + fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; + fparti_fpartj_lparti_lpartj[tasks_packed].y = + pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + // if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, + // pack_vars->count_parts); + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f4_g( + r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, + shift_tmp); + // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no + // timing, 1 for timing*/, count_parts, tasks_packed, + // pack_vars->count_max_parts); //This may cause an issue. Be sure to test + // that + // pack_vars->count_parts is actually increment here + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. 
+ * packed_tmp+1 is index for cell j */ + + // if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count + // %i\n", r->cpuid, *count_parts, pack_vars->count_parts); + fparti_fpartj_lparti_lpartj[tasks_packed].z = + pack_vars->count_parts - count_cj; + fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; + // pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - + // count_cj; pack_vars->task_last_part[packed_tmp + 1] = + // pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done_g++; + cj->pack_done_g++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = + fparti_fpartj_lparti_lpartj[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + t->done = 1; + /* Copies done. Release the lock ! */ + cell_unlocktree(ci); + cell_unlocktree(cj); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + /* Record that we have now done a packing (self) */ + // int qid = r->qid; + // atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); + + lock_lock(&s->queues[qid].lock); + + s->queues[qid].n_packs_pair_left_g--; + + if (s->queues[qid].n_packs_pair_left_g < 1) pack_vars->launch_leftovers = 1; + + lock_unlock(&s->queues[qid].lock); + + // if ((s->p_g_left[qid] < 1)) + // pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, + struct pack_vars_pair *restrict pack_vars, + struct cell *ci, struct cell *cj, + struct task *t, + struct part_aos_f4_f_send *parts_send, + struct engine *e, + int4 *fparti_fpartj_lparti_lpartj) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + // pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&(s->p_f_left[qid])); + // pthread_cond_broadcast(&s->sleep_cond); + // pthread_mutex_unlock(&s->sleep_mutex); + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. 
+ * packed_tmp+1 is index for cell j */ + // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; + // pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + + // count_ci; + + fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; + fparti_fpartj_lparti_lpartj[tasks_packed].y = + pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + // if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, + // pack_vars->count_parts); + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f4_f( + r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, + shift_tmp); + // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no + // timing, 1 for timing*/, count_parts, tasks_packed, + // pack_vars->count_max_parts); //This may cause an issue. Be sure to test + // that + // pack_vars->count_parts is actually increment here + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + + // if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count + // %i\n", r->cpuid, *count_parts, pack_vars->count_parts); + fparti_fpartj_lparti_lpartj[tasks_packed].z = + pack_vars->count_parts - count_cj; + fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; + // pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - + // count_cj; pack_vars->task_last_part[packed_tmp + 1] = + // pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done_f++; + cj->pack_done_f++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = + fparti_fpartj_lparti_lpartj[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + t->done = 1; + /* Copies done. Release the lock ! */ + cell_unlocktree(ci); + cell_unlocktree(cj); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + + lock_lock(&s->queues[qid].lock); + + s->queues[qid].n_packs_pair_left_f--; + + if (s->queues[qid].n_packs_pair_left_f < 1) pack_vars->launch_leftovers = 1; + + lock_unlock(&s->queues[qid].lock); + + // if ((s->p_f_left[qid] < 1)) + // pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +void runner_doself1_launch_f4( + struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int devId, + int2 *task_first_part_f4, int2 *d_task_first_part_f4, + cudaEvent_t *self_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + // pack_vars->task_first_part[tasks_packed - 1]; + pack_vars->bundle_first_part[nBundles_temp] = + task_first_part_f4[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + // clock_gettime(CLOCK_REALTIME, &t0hmemcpy); + /*Copy arrays containing first and last part for each task to GPU*/ + // cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + // tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + // tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + // cudaMemPrefetchAsync(d_task_first_part_self_dens_f4, tasks_packed * + // sizeof(int2), devId, NULL); + /*Copy cell shifts to device*/ + // cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_celly, pack_vars->celly, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + // clock_gettime(CLOCK_REALTIME, &t1hmemcpy); + // *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + + // (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0; + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of 
parts per cell in the bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count = task_first_part_f4[tid].y - task_first_part_f4[tid].x; + parts_in_bundle += count; + max_parts = max(max_parts, count); + last_task = tid; + } + } + // const int n_tasks = last_task - first_task; + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp; + // clock_gettime(CLOCK_REALTIME, &t0hmemcpy); + // cudaMemPrefetchAsync(&d_task_first_part_self_dens_f4[first_task], + // (last_task - first_task) * sizeof(int2), + // devId, stream[bid]); + cudaMemcpyAsync(&d_task_first_part_f4[first_task], + &task_first_part_f4[first_task], + (last_task + 1 - first_task) * sizeof(int2), + cudaMemcpyHostToDevice, stream[bid]); + // cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); + //// if (cu_error != cudaSuccess) { fprintf( + /// stderr, "CUDA error in density + // self host 2 device memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); + // exit(0); + // } + // clock_gettime(CLOCK_REALTIME, &t1hmemcpy); + // *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + + // (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / + // 1000000000.0; + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_send), + cudaMemcpyHostToDevice, stream[bid]); + + // #ifdef CUDA_DEBUG + // cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); + //// + // // + // Get error code if (cu_error != cudaSuccess) { fprintf( + // stderr, "CUDA error in density self host 2 device + // memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); + // exit(0); + // } + // #endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // const char *loop_type = "density"; + // struct first_part first_parts; + // for(int i = 0; i < numBlocks_y; i++) first_parts.list[i] = + // pack_vars->task_first_part[i]; fprintf(stderr, "Launching kernel with + // %i tasks leftovers %i\n", tasks_packed, + // pack_vars->launch_leftovers); + // Launch the kernel + launch_density_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + numBlocks_x, numBlocks_y, bundle_first_task, + d_task_first_part_f4); + // #ifdef CUDA_DEBUG + // cu_error = cudaPeekAtLastError(); // Get error code + // if (cu_error != cudaSuccess) { + // fprintf(stderr, + // "CUDA error with self density kernel launch: %s + // cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); exit(0); + // } + // #endif + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); + // #ifdef CUDA_DEBUG + // cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // // + // Get error code if (cu_error != cudaSuccess) { + // fprintf(stderr, "CUDA error with self density + // D2H memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), + // r->cpuid); error("Something's up with 
your cuda code"); + // } + // #endif + } /*End of looping over bundles to launch in streams*/ + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; + for (int bid = 0; bid < nBundles_temp; bid++) { + + clock_gettime(CLOCK_REALTIME, &t0); + + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(self_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; + + clock_gettime(CLOCK_REALTIME, &tp0); + + // clock_gettime(CLOCK_REALTIME, &t0hmemcpy); + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + // clock_gettime(CLOCK_REALTIME, &t1hmemcpy); + // *hmemcpy_time += (t1hmemcpy.tv_sec - + // t0hmemcpy.tv_sec) + (t1hmemcpy.tv_nsec - + // t0hmemcpy.tv_nsec) / 1000000000.0; + const ticks tic = getticks(); + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f4(r, cii, parts_recv, 0, + &pack_length_unpack, tid, + pack_vars->count_max_parts, e); + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + /* Record things for debugging */ + cii->gpu_done++; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + // MATTHIEU signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp1); + // *hmemcpy_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + // *packing_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + +} /*End of GPU work Self*/ + +void runner_doself1_launch_f4_g( + struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, + struct part_aos_f4_g_send *d_parts_send, + struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + int2 *task_first_part_f4, int2 *d_task_first_part_f4, cudaEvent_t *self_end, + double *unpack_time) { + + struct timespec t0, t1, tp0, tp1; + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = 
pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + // if(tasks_packed == 0) error("zero tasks packed but somehow got into + // GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = + task_first_part_f4[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count = task_first_part_f4[tid].y - task_first_part_f4[tid].x; + parts_in_bundle += count; + max_parts = max(max_parts, count); + last_task = tid; + } + } + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp; + + cudaMemcpyAsync(&d_task_first_part_f4[first_task], + &task_first_part_f4[first_task], + (last_task + 1 - first_task) * sizeof(int2), + cudaMemcpyHostToDevice, stream[bid]); + + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_g_send), + cudaMemcpyHostToDevice, stream[bid]); + // fprintf(stderr, "bid %i first_part %i nparts %i\n", bid, + // first_part_tmp, bundle_n_parts); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error in gradient self host 2 device memcpy: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // const char *loop_type = "density"; + // Launch the kernel + launch_gradient_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + numBlocks_x, numBlocks_y, bundle_first_task, + d_task_first_part_f4); +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with self gradient kernel 
launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_g_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self gradient D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } /*End of looping over bundles to launch in streams*/ + // exit(0); + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; + for (int bid = 0; bid < nBundles_temp; bid++) { + + clock_gettime(CLOCK_REALTIME, &t0); + + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(self_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; + + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp0); + const ticks tic = getticks(); + + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f4_g(r, cii, parts_recv, 0, + &pack_length_unpack, tid, + pack_vars->count_max_parts, e); + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /* Record things for debugging */ + cii->gpu_done_g++; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + // MATTHIEU signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp1); + // *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + // *packing_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + +} /*End of GPU work Self Gradient*/ + +void runner_doself1_launch_f4_f( + struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, + struct 
part_aos_f4_f_send *d_parts_send, + struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + int2 *task_first_part_f4_f, int2 *d_task_first_part_f4_f, + cudaEvent_t *self_end, double *unpack_time) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = + task_first_part_f4_f[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ + // cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + // tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + // tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + // cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_celly, pack_vars->celly, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count = task_first_part_f4_f[tid].y - task_first_part_f4_f[tid].x; + parts_in_bundle += count; + max_parts = max(max_parts, count); + last_task = tid; + } + } + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp; + cudaMemcpyAsync(&d_task_first_part_f4_f[first_task], + &task_first_part_f4_f[first_task], + (last_task + 1 - first_task) * sizeof(int2), + cudaMemcpyHostToDevice, stream[bid]); + + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_f_send), + cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error in force self host 2 device memcpy: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // Launch the kernel + launch_force_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + numBlocks_x, numBlocks_y, bundle_first_task, + d_task_first_part_f4_f); +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self force kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_f_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self force D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; + for (int bid = 0; bid < nBundles_temp; bid++) { + + clock_gettime(CLOCK_REALTIME, &t0); + + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(self_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid];
+ + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; + + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + clock_gettime(CLOCK_REALTIME, &tp0); + const ticks tic = getticks(); + + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f4_f(r, cii, parts_recv, 0, + &pack_length_unpack, tid, + pack_vars->count_max_parts, e); + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + /* Record things for debugging */ + cii->gpu_done_f++; + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + // MATTHIEU signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp1); + // *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; +} /*End of GPU work Self Gradient*/ + +void runner_dopair1_launch_f4_one_memcpy( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, + cudaEvent_t *pair_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + // pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = + fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate 
for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - + fparti_fpartj_lparti_lpartj_dens[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - + fparti_fpartj_lparti_lpartj_dens[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + // if(count_i > 100 || count_j > 100) + // error("Sending data for excessive n parts %i %i", + // count_i, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_send), + cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code first_part %i bundle size %i", + first_part_tmp_i, bundle_n_parts); + } +#endif + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_density_gpu_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, + numBlocks_y, bundle_part_0, bundle_n_parts); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + error("Something's up with kernel launch."); + } +#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0; + + for (int bid = 0; bid < nBundles_temp; bid++) { + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + + // cudaStreamSynchronize(stream[bid]); + 
cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + //////////// + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + +// for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { +// +// if (tid < tasks_packed) { +// clock_gettime(CLOCK_REALTIME, &tp0); +// /*grab cell and task pointers*/ +// struct cell *cii = pack_vars->ci_list[tid]; +// struct cell *cjj = pack_vars->cj_list[tid]; +// struct task *tii = pack_vars->task_list[tid]; +// +//// if(!pack_vars->task_locked){ +//// /*Let's lock ci*/ +//// while (cell_locktree(cii)) { +//// ; /* spin until we acquire the lock */ +//// } +//// /*Let's lock cj*/ +//// while (cell_locktree(cjj)) { +//// ; /* spin until we acquire the lock */ +//// } +//// pack_vars->task_locked = 1; +//// } +// +// const ticks tic = getticks(); +// +// /* Do the copy */ +// runner_do_ci_cj_gpu_unpack_neat_aos_f4( +// r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, +// 2 * pack_vars->count_max_parts, e); +// +// const ticks toc = getticks(); +// +// total_cpu_unpack_ticks += toc - tic; +// +// /* Record things for debugging */ +// cii->gpu_done_pair++; +// cjj->gpu_done_pair++; +// +//// if(pack_vars->task_locked){ +//// /* Release the locks */ +//// cell_unlocktree(cii); +//// /* Release the locks */ +//// cell_unlocktree(cjj); +// pack_vars->task_locked = 0; +//// } +// +// /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp1); +// *unpack_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; +// /*Signal sleeping runners*/ +// // MATTHIEU signal_sleeping_runners(s, tii); +// +// tii->gpu_done = 1; +// } +// } + } + + /* Zero counters for the next pack operations */ +// pack_vars->count_parts = 0; +// pack_vars->tasks_packed = 0; + + // /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Write the timers back to the task */ + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + +} /*End of GPU work*/ + +void runner_dopair1_unpack_f4( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, + cudaEvent_t *pair_end, int cstart, int n_leaves_found){ + + int topid; + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0; + /*Loop over top level tasks*/ + for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { + const ticks tic = getticks(); + /* Loop through each daughter task */ + int n_leaves_in_task = pack_vars->leaf_list[topid].n_packed; + int nstart = pack_vars->leaf_list[topid].n_start; + for(int tid = nstart; tid < n_leaves_in_task + nstart; tid++){ + /*Get pointers to the leaf cells. 
SEEMS I'm NOT GETTING A CORRECT POINTER + *but likely due to incorrect book keeping*/ + struct cell * cii_l = pack_vars->leaf_list[topid].ci[tid]; + struct cell * cjj_l = pack_vars->leaf_list[topid].cj[tid]; + message("loc %f %f %f topid %i tid %i nleaves %i", pack_vars->leaf_list[topid].ci[tid]->loc[0] + , pack_vars->leaf_list[topid].ci[tid]->loc[1] + , pack_vars->leaf_list[topid].ci[tid]->loc[2] + , topid, tid, n_leaves_in_task); +// if(*cii_l == NULL || *cjj_l == NULL)error("stop"); + runner_do_ci_cj_gpu_unpack_neat_aos_f4( + r, cii_l, cjj_l, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + } + + const ticks toc = getticks(); + total_cpu_unpack_ticks += toc - tic; + pack_vars->count_parts = 0; + /*For some reason the code fails if we get a leaf pair task + *this if statement stops the code from trying to unlock same cells twice*/ + if(topid == pack_vars->top_tasks_packed -1 && cstart != n_leaves_found) + continue; + enqueue_dependencies(s, pack_vars->top_task_list[topid]); + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + } +} +void runner_dopair1_launch_f4_g_one_memcpy( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, + struct part_aos_f4_g_send *d_parts_send, + struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, + cudaEvent_t *pair_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was + * incremented in runner_dopair1_pack*/ + // const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + // pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = + fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + // int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; 
tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z - + fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w - + fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_g_send), + cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + + // const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; + // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", + // bundle_part_0, bundle_first_task); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_gradient_gpu_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, + numBlocks_y, bundle_part_0, bundle_n_parts); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + exit(0); + } +#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_g_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + + ticks total_cpu_unpack_ticks = 0.; + + for (int bid = 0; bid < nBundles_temp; bid++) { + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + 
+ clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + clock_gettime(CLOCK_REALTIME, &tp0); + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + + const ticks tic = getticks(); + + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( + r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + + /* Record things for debugging */ + cii->gpu_done_pair_g++; + cjj->gpu_done_pair_g++; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + // MATTHIEU signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + + /* Write the timers back to the task */ + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + // /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ + +void runner_dopair1_launch_f4_f_one_memcpy( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, + struct part_aos_f4_f_send *d_parts_send, + struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, + cudaEvent_t *pair_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was + * incremented in runner_dopair1_pack*/ + // const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + // 
pack_vars->task_first_part[packed_tmp - 2];
+    pack_vars->bundle_first_part[nBundles_temp] =
+        fparti_fpartj_lparti_lpartj[tasks_packed - 1].x;
+  }
+  /* Identify the last particle for each bundle of tasks */
+  for (int bid = 0; bid < nBundles_temp - 1; bid++) {
+    pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1];
+  }
+  /* Special treatment for the last bundle */
+  if (nBundles_temp > 1)
+    pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts;
+  else
+    pack_vars->bundle_last_part[0] = pack_vars->count_parts;
+
+  /* Launch the copies for each bundle and run the GPU kernel */
+  /* We do not enter this loop if tasks_left_self == 1, as nBundles_temp
+   * will be zero in that case. */
+  // int max_parts = 0;
+  for (int bid = 0; bid < nBundles_temp; bid++) {
+
+    int max_parts_i = 0;
+    int max_parts_j = 0;
+    int parts_in_bundle_ci = 0;
+    int parts_in_bundle_cj = 0;
+    // const int first_task = bid * pack_vars->bundle_size;
+    // int last_task = (bid + 1) * bundle_size;
+    for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+      if (tid < tasks_packed) {
+        /* Get an estimate for the max number of parts per cell in each
+         * bundle. Used for determining the number of GPU CUDA blocks. */
+        int count_i = fparti_fpartj_lparti_lpartj[tid].z -
+                      fparti_fpartj_lparti_lpartj[tid].x;
+        parts_in_bundle_ci += count_i;
+        max_parts_i = max(max_parts_i, count_i);
+        int count_j = fparti_fpartj_lparti_lpartj[tid].w -
+                      fparti_fpartj_lparti_lpartj[tid].y;
+        parts_in_bundle_cj += count_j;
+        max_parts_j = max(max_parts_j, count_j);
+
+        // last_task = tid;
+      }
+    }
+    const int first_part_tmp_i = pack_vars->bundle_first_part[bid];
+    const int bundle_n_parts =
+        pack_vars->bundle_last_part[bid] - first_part_tmp_i;
+
+    cudaMemcpyAsync(&d_parts_send[first_part_tmp_i],
+                    &parts_send[first_part_tmp_i],
+                    bundle_n_parts * sizeof(struct part_aos_f4_f_send),
+                    cudaMemcpyHostToDevice, stream[bid]);
+
+#ifdef CUDA_DEBUG
+    cudaError_t cu_error = cudaPeekAtLastError(); /* Get error code */
+    if (cu_error != cudaSuccess) {
+      fprintf(stderr,
+              "CUDA error with pair force H2D async memcpy ci: %s cpuid id "
+              "is: %i\n ",
+              cudaGetErrorString(cu_error), r->cpuid);
+      error("CUDA error in pair force H2D async memcpy");
+    }
+#endif
+
+    // const int tasksperbundle = pack_vars->tasksperbundle;
+    /* LAUNCH THE GPU KERNELS for ci & cj */
+    // int tid = 0;
+    // int offset = bid * tasksperbundle;
+    // int tasks_left = tasksperbundle;
+    // if (bid == nBundles_temp - 1) {
+    //   tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle;
+    // }
+
+    // Setup a 2d grid of GPU thread blocks for ci (the number of tasks is
+    // the y dimension and max_parts is the x dimension)
+    int numBlocks_y = 0;  // tasks_left;
+    int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    int bundle_part_0 = pack_vars->bundle_first_part[bid];
+    // int bundle_first_task = pack_vars->bundle_first_task_list[bid];
+    // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n",
+    //         bundle_part_0, bundle_first_task);
+
+    /* Launch the kernel for ci using data for ci and cj */
+    runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H,
+                                          stream[bid], numBlocks_x,
+                                          numBlocks_y, bundle_part_0,
+                                          bundle_n_parts);
+
+#ifdef CUDA_DEBUG
+    cu_error = cudaPeekAtLastError(); /* Get error code */
+    if (cu_error != cudaSuccess) {
+      fprintf(
+          stderr,
+          "CUDA error with pair force kernel launch: %s cpuid id is: %i\n "
+          "nbx %i nby %i max_parts_i %i max_parts_j %i\n",
+
cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + exit(0); + } +#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_f_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; + for (int bid = 0; bid < nBundles_temp; bid++) { + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + clock_gettime(CLOCK_REALTIME, &tp0); + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + + const ticks tic = getticks(); + + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( + r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + + /* Record things for debugging */ + cii->gpu_done_pair_f++; + cjj->gpu_done_pair_f++; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + // /* Release the locks */ + cell_unlocktree(cii); + // /* Release the locks */ + cell_unlocktree(cjj); + + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + // MATTHIEU signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + + /* Write the timers back to the task */ + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + // /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ diff --git 
a/src/runner_gpu_pack_functions.c b/src/runner_gpu_pack_functions.c new file mode 100644 index 0000000000..af743e6172 --- /dev/null +++ b/src/runner_gpu_pack_functions.c @@ -0,0 +1,813 @@ +// #include "active.h" +// #include +// #include +// #include "cuda/cell_gpu.h" +// #include "runner_gpu_functions.cuh" +/* This object's header. */ +#include "runner.h" +/* Local headers. */ +#include "active.h" +#include "engine.h" +#include "runner_gpu_pack_functions.h" +#include "scheduler.h" +#include "space_getsid.h" +#include "timers.h" +#include "runner_doiact_hydro.h" + +void runner_doself1_gpu_pack_neat_aos_f4( + struct runner *r, struct cell *__restrict__ c, + struct part_aos_f4_send *__restrict__ parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! count_max %i " + "count %i\n", + count_max_parts_tmp, local_pack_position + count); + error("0"); + } +#endif + int2 frst_lst_prts = {local_pack_position, local_pack_position + count}; + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_f4(c, parts_aos_buffer, tid, local_pack_position, count, + frst_lst_prts); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_doself1_gpu_pack_neat_aos_f4_g( + struct runner *r, struct cell *c, + struct part_aos_f4_g_send *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); + exit(0); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_f4_g(c, parts_aos_buffer, tid, local_pack_position, count); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_doself1_gpu_pack_neat_aos_f4_f( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. 
Make arrays bigger!\n"); + exit(0); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_f4_f(c, parts_aos_buffer, tid, local_pack_position, count); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) TIMER_TOC(timer_doself_gpu_pack); +} + +extern inline void pack_neat_pair_aos_f4( + struct cell *__restrict c, + struct part_aos_f4_send *__restrict parts_aos_buffer, int tid, + const int local_pack_position, const int count, const float3 shift, + const int2 cstarts) { + /*Data to be copied to GPU*/ + for (int i = 0; i < count; i++) { + const int id_in_pack = i + local_pack_position; + parts_aos_buffer[id_in_pack].x_p_h.x = c->hydro.parts[i].x[0] - shift.x; + parts_aos_buffer[id_in_pack].x_p_h.y = c->hydro.parts[i].x[1] - shift.y; + parts_aos_buffer[id_in_pack].x_p_h.z = c->hydro.parts[i].x[2] - shift.z; + parts_aos_buffer[id_in_pack].x_p_h.w = c->hydro.parts[i].h; + parts_aos_buffer[id_in_pack].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos_buffer[id_in_pack].ux_m.y = c->hydro.parts[i].v[1]; + parts_aos_buffer[id_in_pack].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos_buffer[id_in_pack].ux_m.w = c->hydro.parts[i].mass; + parts_aos_buffer[id_in_pack].cjs_cje.x = cstarts.x; + parts_aos_buffer[id_in_pack].cjs_cje.y = cstarts.y; + } +} + +void pack_neat_aos_f4(struct cell *__restrict__ c, + struct part_aos_f4_send *__restrict__ parts_aos_buffer, + int tid, int local_pack_position, int count, + int2 frst_lst_prts) { + + struct part ptmps[count]; + memcpy(ptmps, (c->hydro.parts), count * sizeof(struct part)); + // ptmps = c->hydro.parts; + const float cellx = c->loc[0], celly = c->loc[1], cellz = c->loc[2]; + for (int i = 0; i < count; i++) { + const int id_in_pack = i + local_pack_position; + // const struct part p = ptmps[i]; + /*Data to be copied to GPU*/ + parts_aos_buffer[id_in_pack].x_p_h.x = ptmps[i].x[0] - cellx; + parts_aos_buffer[id_in_pack].x_p_h.y = ptmps[i].x[1] - celly; + parts_aos_buffer[id_in_pack].x_p_h.z = ptmps[i].x[2] - cellz; + parts_aos_buffer[id_in_pack].x_p_h.w = ptmps[i].h; + parts_aos_buffer[id_in_pack].ux_m.x = ptmps[i].v[0]; + parts_aos_buffer[id_in_pack].ux_m.y = ptmps[i].v[1]; + parts_aos_buffer[id_in_pack].ux_m.z = ptmps[i].v[2]; + parts_aos_buffer[id_in_pack].ux_m.w = ptmps[i].mass; + // /*Initialise sums to zero before CPU/GPU copy*/ + // const float4 zeroes = {0.0, 0.0, 0.0, 0.0}; + // parts_aos_buffer[id_in_pack].rho_dh_wcount = zeroes; + // parts_aos_buffer[id_in_pack].rot_ux_div_v = zeroes; + } +} + +void pack_neat_aos_f4_g(struct cell *c, + struct part_aos_f4_g_send *parts_aos_buffer, int tid, + int local_pack_position, int count) { + + const struct part *ptmps; + ptmps = c->hydro.parts; + const float cellx = c->loc[0], celly = c->loc[1], cellz = c->loc[2]; + for (int i = 0; i < count; i++) { + int id_in_pack = i + local_pack_position; + const struct part p = ptmps[i]; + /*Data to be copied to GPU*/ + parts_aos_buffer[id_in_pack].x_h.x = p.x[0] - cellx; + parts_aos_buffer[id_in_pack].x_h.y = p.x[1] - celly; + parts_aos_buffer[id_in_pack].x_h.z = p.x[2] - cellz; + parts_aos_buffer[id_in_pack].x_h.w = p.h; + parts_aos_buffer[id_in_pack].ux_m.x = p.v[0]; + parts_aos_buffer[id_in_pack].ux_m.y = p.v[1]; + parts_aos_buffer[id_in_pack].ux_m.z = p.v[2]; + parts_aos_buffer[id_in_pack].ux_m.w = p.mass; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.x = p.rho; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.y = p.viscosity.alpha; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.z = p.u; // p.density.rot_v[0]; + 
parts_aos_buffer[id_in_pack].rho_avisc_u_c.w = + p.force.soundspeed; // p.density.rot_v[0]; + } +} + +extern inline void pack_neat_pair_aos_f4_g( + struct cell *__restrict c, + struct part_aos_f4_g_send *__restrict parts_aos_buffer, int tid, + const int local_pack_position, const int count, const float3 shift, + const int2 cstarts) { + /*Data to be copied to GPU*/ + for (int i = 0; i < count; i++) { + const int id_in_pack = i + local_pack_position; + parts_aos_buffer[id_in_pack].x_h.x = c->hydro.parts[i].x[0] - shift.x; + parts_aos_buffer[id_in_pack].x_h.y = c->hydro.parts[i].x[1] - shift.y; + parts_aos_buffer[id_in_pack].x_h.z = c->hydro.parts[i].x[2] - shift.z; + parts_aos_buffer[id_in_pack].x_h.w = c->hydro.parts[i].h; + parts_aos_buffer[id_in_pack].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos_buffer[id_in_pack].ux_m.y = c->hydro.parts[i].v[1]; + parts_aos_buffer[id_in_pack].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos_buffer[id_in_pack].ux_m.w = c->hydro.parts[i].mass; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.x = c->hydro.parts[i].rho; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.y = + c->hydro.parts[i].viscosity.alpha; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.z = + c->hydro.parts[i].u; // p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.w = + c->hydro.parts[i].force.soundspeed; // p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].cjs_cje.x = cstarts.x; + parts_aos_buffer[id_in_pack].cjs_cje.y = cstarts.y; + } +} + +void pack_neat_aos_f4_f(const struct cell *restrict c, + struct part_aos_f4_f_send *restrict parts_aos, int tid, + int local_pack_position, int count) { + + // const struct part *restrict ptmps; + // ptmps = c->hydro.parts; + const int pp = local_pack_position; + const float cellx = c->loc[0]; + const float celly = c->loc[1]; + const float cellz = c->loc[2]; + /*Data to be copied to GPU local memory*/ + for (int i = 0; i < count; i++) { + parts_aos[i + pp].x_h.x = c->hydro.parts[i].x[0] - cellx; + parts_aos[i + pp].x_h.y = c->hydro.parts[i].x[1] - celly; + parts_aos[i + pp].x_h.z = c->hydro.parts[i].x[2] - cellz; + parts_aos[i + pp].x_h.w = c->hydro.parts[i].h; + } + for (int i = 0; i < count; i++) { + parts_aos[i + pp].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos[i + pp].ux_m.y = c->hydro.parts[i].v[1]; + parts_aos[i + pp].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos[i + pp].ux_m.w = c->hydro.parts[i].mass; + } + for (int i = 0; i < count; i++) { + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.x = + c->hydro.parts[i].force.f; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.y = + c->hydro.parts[i].force.balsara; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.z = + c->hydro.parts[i].time_bin; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.w = + c->hydro.parts[i].limiter_data.min_ngb_time_bin; + } + for (int i = 0; i < count; i++) { + parts_aos[i + pp].rho_p_c_vsigi.x = c->hydro.parts[i].rho; + parts_aos[i + pp].rho_p_c_vsigi.y = c->hydro.parts[i].force.pressure; + parts_aos[i + pp].rho_p_c_vsigi.z = c->hydro.parts[i].force.soundspeed; + parts_aos[i + pp].rho_p_c_vsigi.w = c->hydro.parts[i].viscosity.v_sig; + } + for (int i = 0; i < count; i++) { + parts_aos[i + pp].u_alphavisc_alphadiff.x = c->hydro.parts[i].u; + parts_aos[i + pp].u_alphavisc_alphadiff.y = + c->hydro.parts[i].viscosity.alpha; + parts_aos[i + pp].u_alphavisc_alphadiff.z = + c->hydro.parts[i].diffusion.alpha; + } +} + +extern inline void pack_neat_pair_aos_f4_f( + struct cell *__restrict c, struct part_aos_f4_f_send *__restrict parts_aos, + int tid, const int 
local_pack_position, const int count, const float3 shift, + const int2 cstarts) { + // const struct part *restrict ptmps; + // ptmps = c->hydro.parts; + const int pp = local_pack_position; + /*Data to be copied to GPU local memory*/ + for (int i = 0; i < count; i++) { + const int id = i + pp; + parts_aos[id].x_h.x = c->hydro.parts[i].x[0] - shift.x; + parts_aos[id].x_h.y = c->hydro.parts[i].x[1] - shift.y; + parts_aos[id].x_h.z = c->hydro.parts[i].x[2] - shift.z; + parts_aos[id].x_h.w = c->hydro.parts[i].h; + parts_aos[id].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos[id].ux_m.y = c->hydro.parts[i].v[1]; + parts_aos[id].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos[id].ux_m.w = c->hydro.parts[i].mass; + parts_aos[id].f_bals_timebin_mintimebin_ngb.x = c->hydro.parts[i].force.f; + parts_aos[id].f_bals_timebin_mintimebin_ngb.y = + c->hydro.parts[i].force.balsara; + parts_aos[id].f_bals_timebin_mintimebin_ngb.z = c->hydro.parts[i].time_bin; + parts_aos[id].f_bals_timebin_mintimebin_ngb.w = + c->hydro.parts[i].limiter_data.min_ngb_time_bin; + parts_aos[id].rho_p_c_vsigi.x = c->hydro.parts[i].rho; + parts_aos[id].rho_p_c_vsigi.y = c->hydro.parts[i].force.pressure; + parts_aos[id].rho_p_c_vsigi.z = c->hydro.parts[i].force.soundspeed; + parts_aos[id].rho_p_c_vsigi.w = c->hydro.parts[i].viscosity.v_sig; + parts_aos[id].u_alphavisc_alphadiff.x = c->hydro.parts[i].u; + parts_aos[id].u_alphavisc_alphadiff.y = c->hydro.parts[i].viscosity.alpha; + parts_aos[id].u_alphavisc_alphadiff.z = c->hydro.parts[i].diffusion.alpha; + parts_aos[id].cjs_cje.x = cstarts.x; + parts_aos[id].cjs_cje.y = cstarts.y; + } +} + +void runner_doself1_gpu_unpack_neat_aos_f4( + struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, + struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat_aos_f4(c, parts_aos_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + +void runner_doself1_gpu_unpack_neat_aos_f4_g( + struct runner *r, struct cell *c, + struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat_aos_f4_g(c, parts_aos_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + +void runner_doself1_gpu_unpack_neat_aos_f4_f( + struct runner *r, struct cell *c, + struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat_aos_f4_f(c, parts_aos_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + +#include +void unpack_neat_aos_f4(struct cell *c, + struct part_aos_f4_recv *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e) { + + struct part_aos_f4_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + + struct part_aos_f4_recv p_tmp = parts_tmp[i]; + float4 rho_dh_wcount = p_tmp.rho_dh_wcount; + float4 rot_ux_div_v = p_tmp.rot_ux_div_v; + struct part *p = &c->hydro.parts[i]; + if(!PART_IS_ACTIVE(p, e))continue; + p->rho += rho_dh_wcount.x; + p->density.rho_dh += rho_dh_wcount.y; + p->density.wcount += rho_dh_wcount.z; + p->density.wcount_dh += rho_dh_wcount.w; + p->density.rot_v[0] += rot_ux_div_v.x; + p->density.rot_v[1] += rot_ux_div_v.y; + p->density.rot_v[2] += rot_ux_div_v.z; + p->viscosity.div_v += rot_ux_div_v.w; + } +} + +void unpack_neat_aos_f4_g(struct cell *c, + struct part_aos_f4_g_recv *parts_aos_buffer, int tid, + int local_pack_position, int count, + struct engine *e) { + + struct part_aos_f4_g_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_f4_g_recv p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + if(!PART_IS_ACTIVE(p, e))continue; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = fmaxf(p_tmp.vsig_lapu_aviscmax.x, v_sig); + p->diffusion.laplace_u += p_tmp.vsig_lapu_aviscmax.y; + const float max_ngb = p->force.alpha_visc_max_ngb; + p->force.alpha_visc_max_ngb = fmaxf(p_tmp.vsig_lapu_aviscmax.z, max_ngb); + } +} + +void unpack_neat_aos_f4_f(struct cell *restrict c, + struct part_aos_f4_f_recv *restrict parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e) { + int pp = local_pack_position; + for (int i = 0; i < count; i++) { + if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue; + c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[i + pp].a_hydro.x; + c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[i + pp].a_hydro.y; + c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[i + pp].a_hydro.z; + } + for (int i = 0; i < count; i++) { + if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue; + c->hydro.parts[i].viscosity.v_sig = + 
fmaxf(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.z, + c->hydro.parts[i].viscosity.v_sig); + c->hydro.parts[i].limiter_data.min_ngb_time_bin = + (int)(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.w + 0.5f); + } + for (int i = 0; i < count; i++) { + if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue; + c->hydro.parts[i].u_dt += + parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.x; + c->hydro.parts[i].force.h_dt += + parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.y; + } +} + +void unpack_neat_pair_aos_f4(struct runner *r, struct cell *restrict c, + struct part_aos_f4_recv *restrict parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e) { + + // struct part_aos_f4_recv * restrict parts_tmp = + // &parts_aos_buffer[local_pack_position]; + if (cell_is_active_hydro(c, e)) { + int pp = local_pack_position; + for (int i = 0; i < count; i++) { + int j = i + pp; + c->hydro.parts[i].rho += parts_aos_buffer[j].rho_dh_wcount.x; + c->hydro.parts[i].density.rho_dh += parts_aos_buffer[j].rho_dh_wcount.y; + c->hydro.parts[i].density.wcount += parts_aos_buffer[j].rho_dh_wcount.z; + c->hydro.parts[i].density.wcount_dh += + parts_aos_buffer[j].rho_dh_wcount.w; + c->hydro.parts[i].density.rot_v[0] += parts_aos_buffer[j].rot_ux_div_v.x; + c->hydro.parts[i].density.rot_v[1] += parts_aos_buffer[j].rot_ux_div_v.y; + c->hydro.parts[i].density.rot_v[2] += parts_aos_buffer[j].rot_ux_div_v.z; + c->hydro.parts[i].viscosity.div_v += parts_aos_buffer[j].rot_ux_div_v.w; + } + } +} + +void unpack_neat_pair_aos_f4_g( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_g_recv *restrict parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e) { + // struct part_aos_f4_recv * restrict parts_tmp = + // &parts_aos_buffer[local_pack_position]; int pp = local_pack_position; for + // (int i = 0; i < count; i++) { + // int j = i + pp; + // c->hydro.parts[i].viscosity.v_sig = + // parts_aos_buffer[j].vsig_lapu_aviscmax.x; + // c->hydro.parts[i].diffusion.laplace_u += + // parts_aos_buffer[j].vsig_lapu_aviscmax.y; + // c->hydro.parts[i].force.alpha_visc_max_ngb = + // parts_aos_buffer[j].vsig_lapu_aviscmax.z; + // } + if (cell_is_active_hydro(c, e)) { + + struct part_aos_f4_g_recv *parts_tmp = + &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_f4_g_recv p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = fmaxf(p_tmp.vsig_lapu_aviscmax.x, v_sig); + p->diffusion.laplace_u += p_tmp.vsig_lapu_aviscmax.y; + const float max_ngb = p->force.alpha_visc_max_ngb; + p->force.alpha_visc_max_ngb = fmaxf(p_tmp.vsig_lapu_aviscmax.z, max_ngb); + } + } +} + +void unpack_neat_pair_aos_f4_f( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_f_recv *restrict parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e) { + // struct part_aos_f4_f_recv *restrict parts_tmp = + //&parts_aos_buffer[local_pack_position]; + if (cell_is_active_hydro(c, e)) { + int pp = local_pack_position; + for (int i = 0; i < count; i++) { + // struct part_aos_f4_f_recv p_tmp = parts_tmp[i]; + // struct part *restrict p = &c->hydro.parts[i]; + int j = i + pp; + c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[j].a_hydro.x; + c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[j].a_hydro.y; + c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[j].a_hydro.z; + c->hydro.parts[i].viscosity.v_sig = + 
fmaxf(parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.z, + c->hydro.parts[i].viscosity.v_sig); + c->hydro.parts[i].limiter_data.min_ngb_time_bin = + (int)(parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.w + 0.5f); + c->hydro.parts[i].u_dt += + parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.x; + c->hydro.parts[i].force.h_dt += + parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.y; + } + } +} + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { + + /* Anything to do here? */ +// if (ci->hydro.count == 0 || cj->hydro.count == 0) +// return; + if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) { + message("Inactive cell\n"); + return; + } + int count_ci = ci->hydro.count; + int count_cj = cj->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count_ci, e); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4(r, ci, parts_aos_buffer, tid, local_pack_position, + count_ci, e); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4(r, cj, parts_aos_buffer, tid, local_pack_position, + count_cj, e); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + // if(r->cpuid == 0)exit(0); +} + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { + + /* Anything to do here? */ + // if (c->hydro.count == 0) + // return; + if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) { + message("Inactive cell\n"); + return; + } + int count_ci = ci->hydro.count; + int count_cj = cj->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count_ci, e); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_g(r, ci, parts_aos_buffer, tid, local_pack_position, + count_ci, e); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_g(r, cj, parts_aos_buffer, tid, local_pack_position, + count_cj, e); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + // if(r->cpuid == 0)exit(0); +} + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { + + /* Anything to do here? */ + // if (c->hydro.count == 0) + // return; + if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) { + message("Inactive cell\n"); + return; + } + int count_ci = ci->hydro.count; + int count_cj = cj->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count_ci, e); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_f(r, ci, parts_aos_buffer, tid, local_pack_position, + count_ci, e); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_f(r, cj, parts_aos_buffer, tid, local_pack_position, + count_cj, e); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + // if(r->cpuid == 0)exit(0); +} + +void runner_do_ci_cj_gpu_pack_neat_aos_f4( + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) return; + + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
Pack pos %i" + "ci %i cj %i count_max %i\n", + local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], + shift_tmp.z + cj->loc[2]}; + const int lpp1 = local_pack_position; + + const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; + + const int2 cjs_cje = {local_pack_position + count_ci, + local_pack_position + count_ci + count_cj}; + + pack_neat_pair_aos_f4(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, + cjs_cje); + + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const int lpp2 = local_pack_position; + + pack_neat_pair_aos_f4(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, + cis_cie); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_do_ci_cj_gpu_pack_neat_aos_f4_g( + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_g_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) return; + + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" + "ci %i cj %i count_max %i\n", + local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], + shift_tmp.z + cj->loc[2]}; + const int lpp1 = local_pack_position; + + const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; + + const int2 cjs_cje = {local_pack_position + count_ci, + local_pack_position + count_ci + count_cj}; + + pack_neat_pair_aos_f4_g(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, + cjs_cje); + + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const int lpp2 = local_pack_position; + + pack_neat_pair_aos_f4_g(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, + cis_cie); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_do_ci_cj_gpu_pack_neat_aos_f4_f( + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) return; + + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
Pack pos %i" + "ci %i cj %i count_max %i\n", + local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], + shift_tmp.z + cj->loc[2]}; + const int lpp1 = local_pack_position; + + const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; + + const int2 cjs_cje = {local_pack_position + count_ci, + local_pack_position + count_ci + count_cj}; + + pack_neat_pair_aos_f4_f(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, + cjs_cje); + + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const int lpp2 = local_pack_position; + + pack_neat_pair_aos_f4_f(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, + cis_cie); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) TIMER_TOC(timer_doself_gpu_pack); +} +// #ifdef WITHCUDA +// } +// #endif diff --git a/src/runner_gpu_pack_functions.h b/src/runner_gpu_pack_functions.h new file mode 100644 index 0000000000..8730219711 --- /dev/null +++ b/src/runner_gpu_pack_functions.h @@ -0,0 +1,246 @@ +#include "cuda/part_gpu.h" +void runner_doself1_gpu_pack( + struct runner *r, struct cell *c, int timer, int *pack_length, double *x_p, + double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux, + float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, float *wcount_dh, + float *rho_dh, float *rot_u, float *rot_v, float *rot_w, float *div_v, + float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, float *soundspeed, + float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat(struct runner *r, struct cell *c, + struct part_soa parts_soa, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f4( + struct runner *r, struct cell *__restrict__ c, + struct part_aos_f4_send *__restrict__ parts_aos, int timer, + int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_g(struct runner *r, struct cell *c, + struct part_aos_g *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f4_g(struct runner *r, struct cell *c, + struct part_aos_f4_g_send *parts_aos, + int timer, int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f(struct runner *r, struct cell *c, + struct part_aos_f *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f4_f( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_f_send *restrict parts_aos, int timer, int *pack_length, + int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_forc_aos(struct runner *r, struct cell *c, + struct 
part_aos *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_grad_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_unpack_neat(struct runner *r, struct cell *c, + struct part_soa parts_soa, int timer, + int *pack_length, int tid, + int count_max_parts_tmp, struct engine *e); +void runner_doself1_gpu_unpack_neat_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f4( + struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, + struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_g(struct runner *r, struct cell *c, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f4_g( + struct runner *r, struct cell *c, + struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f(struct runner *r, struct cell *c, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f4_f( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_f_recv *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); +void pack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, + int *tid_p, long long *id, float *ux, float *uy, float *uz, + float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass, + float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, + float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, + float *rot_w, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, + float *pressure, float *alpha_visc_max_ngb, timebin_t *time_bin, + timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int local_pack_position, int count); +void pack_neat(struct cell *c, struct part_soa parts_soa, int tid, + int local_pack_position, int count); +void pack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, + int local_pack_position, int count); +void pack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, + int tid, int local_pack_position, int count); +void pack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos, int tid, + int local_pack_position, int count); +void pack_neat_aos_f4(struct cell *c, struct part_aos_f4_send *parts_aos_buffer, + int tid, int local_pack_position, int count, + int2 frst_lst_prts); +void pack_neat_aos_f4_g(struct cell *c, + struct part_aos_f4_g_send *parts_aos_buffer, int tid, + int local_pack_position, int count); +void pack_neat_aos_f4_f(const struct cell *restrict c, + struct part_aos_f4_f_send *restrict parts_aos, int tid, + int local_pack_position, int count); +void unpack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, + int local_pack_position, 
int count, struct engine *e); +void unpack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_f4(struct cell *c, + struct part_aos_f4_recv *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e); +void unpack_neat_aos_f4_g(struct cell *c, + struct part_aos_f4_g_recv *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e); +void unpack_neat_aos_f4_f(struct cell *restrict c, + struct part_aos_f4_f_recv *restrict parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e); +void unpack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, + int *tid_p, long long *id, float *ux, float *uy, float *uz, + float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass, + float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, + float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, + float *rot_w, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, + float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int local_pack_position, int count, + struct engine *e); +void runner_doself1_gpu_unpack( + struct runner *r, struct cell *c, int timer, int *pack_length, double *x_p, + double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux, + float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, float *wcount_dh, + float *rho_dh, float *rot_u, float *rot_v, float *rot_w, float *div_v, + float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, float *soundspeed, + float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_pack_neat(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_soa parts_soa_buffer, int timer, + int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj); + +void runner_do_ci_cj_gpu_pack_neat_aos(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_pack_neat_aos_f4( + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_pack_neat_aos_g(struct runner *r, struct cell 
*ci, + struct cell *cj, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj); + +void runner_do_ci_cj_gpu_pack_neat_aos_f4_g( + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_g_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_pack_neat_aos_f(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj); + +void runner_do_ci_cj_gpu_pack_neat_aos_f4_f( + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_unpack_neat(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_soa parts_soa_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_g(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e); diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu new file mode 100644 index 0000000000..2376aafba7 --- /dev/null +++ b/src/runner_main_clean.cu @@ -0,0 +1,1864 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * Matthieu Schaller (matthieu.schaller@durham.ac.uk) + * 2015 Peter W. Draper (p.w.draper@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + * + ******************************************************************************/ +/* Config parameters. */ +#define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU +#define GPUOFFLOAD_GRADIENT 1 // off-load hydro gradient to GPU +#define GPUOFFLOAD_FORCE 1 // off-load hydro force to GPU + +// #define DUMP_TIMINGS 1 +#include "../config.h" + +/* MPI headers. */ +#ifdef WITH_MPI +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Config parameters. */ +#include + +/* MPI headers. */ +#ifdef WITH_MPI +#include +#endif + +/* This object's header. */ +#include "runner.h" + +/* Local headers. */ +#include "engine.h" +#include "feedback.h" +#include "runner_doiact_sinks.h" +#include "scheduler.h" +#include "space_getsid.h" +#include "timers.h" + +/* Import the gravity loop functions. */ +#include "runner_doiact_grav.h" + +/* Import the density loop functions. */ +#define FUNCTION density +#define FUNCTION_TASK_LOOP TASK_LOOP_DENSITY +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" + +/* Import the gradient loop functions (if required). */ +#ifdef EXTRA_HYDRO_LOOP +#define FUNCTION gradient +#define FUNCTION_TASK_LOOP TASK_LOOP_GRADIENT +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" +#endif + +/* Import the force loop functions. */ +#define FUNCTION force +#define FUNCTION_TASK_LOOP TASK_LOOP_FORCE +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" + +/* Import the limiter loop functions. */ +#define FUNCTION limiter +#define FUNCTION_TASK_LOOP TASK_LOOP_LIMITER +#include "runner_doiact_limiter.h" +#include "runner_doiact_undef.h" + +/* Import the stars density loop functions. */ +#define FUNCTION density +#define FUNCTION_TASK_LOOP TASK_LOOP_DENSITY +#include "runner_doiact_stars.h" +#include "runner_doiact_undef.h" + +#ifdef EXTRA_STAR_LOOPS + +/* Import the stars prepare1 loop functions. */ +#define FUNCTION prep1 +#define FUNCTION_TASK_LOOP TASK_LOOP_STARS_PREP1 +#include "runner_doiact_stars.h" +#include "runner_doiact_undef.h" + +/* Import the stars prepare2 loop functions. */ +#define FUNCTION prep2 +#define FUNCTION_TASK_LOOP TASK_LOOP_STARS_PREP2 +#include "runner_doiact_stars.h" +#include "runner_doiact_undef.h" + +#endif /* EXTRA_STAR_LOOPS */ + +/* Import the stars feedback loop functions. */ +#define FUNCTION feedback +#define FUNCTION_TASK_LOOP TASK_LOOP_FEEDBACK +#include "runner_doiact_stars.h" +#include "runner_doiact_undef.h" + +/* Import the black hole density loop functions. */ +#define FUNCTION density +#define FUNCTION_TASK_LOOP TASK_LOOP_DENSITY +#include "runner_doiact_black_holes.h" +#include "runner_doiact_undef.h" + +/* Import the black hole feedback loop functions. */ +#define FUNCTION swallow +#define FUNCTION_TASK_LOOP TASK_LOOP_SWALLOW +#include "runner_doiact_black_holes.h" +#include "runner_doiact_undef.h" + +/* Import the black hole feedback loop functions. */ +#define FUNCTION feedback +#define FUNCTION_TASK_LOOP TASK_LOOP_FEEDBACK +#include "runner_doiact_black_holes.h" +#include "runner_doiact_undef.h" + +/* Import the RT gradient loop functions */ +#define FUNCTION rt_gradient +#define FUNCTION_TASK_LOOP TASK_LOOP_RT_GRADIENT +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" + +/* Import the RT transport (force) loop functions. 
*/ +#define FUNCTION rt_transport +#define FUNCTION_TASK_LOOP TASK_LOOP_RT_TRANSPORT +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" + +#ifdef __cplusplus +} +#endif +/** + * @brief The #runner main thread routine. + * + * @param data A pointer to this thread's data. + **/ + +/* CUDA Header. Wrap in extern "C" to prevent C++ function name mangling */ +#ifdef WITH_CUDA +#ifdef __cplusplus +extern "C" { +#endif + +#include "cuda/part_gpu.h" +#include +#include +#include +#include "runner_doiact_functions_hydro_gpu.h" +#include "runner_gpu_pack_functions.h" +#include "cuda/GPU_runner_functions.h" + +#ifdef __cplusplus +} +#endif + +void *runner_main2(void *data) { + struct runner *r = (struct runner *)data; + struct engine *e = r->e; + struct scheduler *sched = &e->sched; + struct space *space = e->s; + + //////////Declare and allocate GPU launch control data structures///////// + /*pack_vars contain data required for self and pair packing tasks destined + * for the GPU*/ + //A. N: Needed + struct pack_vars_self *pack_vars_self_dens; + struct pack_vars_self *pack_vars_self_forc; + struct pack_vars_self *pack_vars_self_grad; + struct pack_vars_pair *pack_vars_pair_dens; + struct pack_vars_pair *pack_vars_pair_forc; + struct pack_vars_pair *pack_vars_pair_grad; + + cudaMallocHost((void **)&pack_vars_self_dens, + sizeof(struct pack_vars_self *)); + cudaMallocHost((void **)&pack_vars_self_forc, + sizeof(struct pack_vars_self *)); + cudaMallocHost((void **)&pack_vars_self_grad, + sizeof(struct pack_vars_self *)); + + cudaMallocHost((void **)&pack_vars_pair_dens, + sizeof(struct pack_vars_pair *)); + cudaMallocHost((void **)&pack_vars_pair_forc, + sizeof(struct pack_vars_pair *)); + cudaMallocHost((void **)&pack_vars_pair_grad, + sizeof(struct pack_vars_pair *)); + /////////////////////////////////////////////////////////////////////////// + /*Find and print GPU name(s)*/ + int devId = 0; //gpu device name + struct cudaDeviceProp prop; + int nDevices; + int maxBlocksSM; + int nSMs; + /*Get my rank*/ + int mpi_rank = 0; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); +#endif + cudaGetDeviceCount(&nDevices); + //If running on MPI we set code to use one MPI rank per GPU + //This was found to work very well and simplifies writing slurm scipts + if (nDevices == 1) cudaSetDevice(devId); +#ifdef WITH_MPI + else { + cudaSetDevice(mpi_rank); + devId = mpi_rank; + } +#endif + //Now tell me some info about my device + cudaGetDeviceProperties(&prop, devId); + cudaDeviceGetAttribute(&maxBlocksSM, cudaDevAttrMaxBlocksPerMultiprocessor, + devId); + cudaDeviceGetAttribute(&nSMs, cudaDevAttrMultiProcessorCount, devId); + int nPartsPerCell = space->nr_parts / space->tot_cells; + + if (r->cpuid == 0 && mpi_rank == 0) { + message("%i devices available device id is %i\n", nDevices, devId); + message("Device : %s\n", prop.name); + message("nSMs %i max blocks per SM %i maxnBlocks per stream %i\n", + nSMs, maxBlocksSM, nSMs * maxBlocksSM); + message("Target nBlocks per kernel is %i\n", + N_TASKS_BUNDLE_SELF * nPartsPerCell / BLOCK_SIZE); + message("Target nBlocks per stream is %i\n", + N_TASKS_PER_PACK_SELF * nPartsPerCell / BLOCK_SIZE); + } + + cudaError_t cu_error; + size_t free_mem, total_mem; + cudaMemGetInfo(&free_mem, &total_mem); + + message("free mem %lu, total mem %lu", free_mem, total_mem); + // how many tasks do we want for each launch of GPU kernel + const int target_n_tasks = sched->pack_size; + const int target_n_tasks_pair = sched->pack_size_pair; + 
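/* Editor's note (illustrative sketch, not part of the patch): the
 * cudaMallocHost calls for the pack_vars structures above pass
 * sizeof(struct pack_vars_self *) / sizeof(struct pack_vars_pair *), i.e. the
 * size of a pointer; allocating sizeof(struct pack_vars_self) and
 * sizeof(struct pack_vars_pair) is presumably what is intended, since the
 * struct members are written through these pointers immediately below. A
 * minimal checked pinned-allocation helper of the kind that could be used
 * here is sketched below; the helper name and error handling are the
 * editor's, not SWIFT's. */

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Allocate `bytes` of pinned (page-locked) host memory and abort on failure,
 * printing which allocation failed and why. */
static void *sketch_pinned_alloc(size_t bytes, const char *what) {
  void *ptr = NULL;
  const cudaError_t err = cudaMallocHost(&ptr, bytes);
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaMallocHost(%s, %zu bytes) failed: %s\n", what, bytes,
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
  return ptr;
}

/* Example use, mirroring the allocations above:
 *   struct pack_vars_self *pv =
 *       sketch_pinned_alloc(sizeof(struct pack_vars_self), "pack_vars_self");
 * Pinned memory is what allows the later cudaMemcpyAsync calls to overlap
 * host-device transfers with kernel execution in different streams. */
/* End of editor's sketch. */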
pack_vars_self_dens->target_n_tasks = target_n_tasks; + pack_vars_pair_dens->target_n_tasks = target_n_tasks_pair; + pack_vars_self_forc->target_n_tasks = target_n_tasks; + pack_vars_pair_forc->target_n_tasks = target_n_tasks_pair; + pack_vars_self_grad->target_n_tasks = target_n_tasks; + pack_vars_pair_grad->target_n_tasks = target_n_tasks_pair; + // how many tasks we want in each bundle (used for launching kernels in + // different streams) + const int bundle_size = N_TASKS_BUNDLE_SELF; + const int bundle_size_pair = N_TASKS_BUNDLE_PAIR; + pack_vars_self_dens->bundle_size = bundle_size; + pack_vars_pair_dens->bundle_size = bundle_size_pair; + pack_vars_self_forc->bundle_size = bundle_size; + pack_vars_pair_forc->bundle_size = bundle_size_pair; + pack_vars_self_grad->bundle_size = bundle_size; + pack_vars_pair_grad->bundle_size = bundle_size_pair; + // Keep track of first and last particles for each task (particle data is + // arranged in long arrays containing particles from all the tasks we will + // work with) + /* A. N.: Needed for offloading self tasks as we use these to sort through + * which parts need to interact with which */ + int2 *task_first_part_f4; + int2 *task_first_part_f4_f; + int2 *task_first_part_f4_g; + int2 *d_task_first_part_f4; + int2 *d_task_first_part_f4_f; + int2 *d_task_first_part_f4_g; + cudaMallocHost((void **)&task_first_part_f4, target_n_tasks * sizeof(int2)); + cudaMalloc((void **)&d_task_first_part_f4, target_n_tasks * sizeof(int2)); + cudaMallocHost((void **)&task_first_part_f4_f, target_n_tasks * sizeof(int2)); + cudaMalloc((void **)&d_task_first_part_f4_f, target_n_tasks * sizeof(int2)); + cudaMallocHost((void **)&task_first_part_f4_g, target_n_tasks * sizeof(int2)); + cudaMalloc((void **)&d_task_first_part_f4_g, target_n_tasks * sizeof(int2)); + + /*A. N.: Needed but only for small part in launch functions. Might + be useful for recursion on the GPU so keep for now */ + int4 *fparti_fpartj_lparti_lpartj_dens; + int4 *fparti_fpartj_lparti_lpartj_forc; + int4 *fparti_fpartj_lparti_lpartj_grad; + cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_dens, + target_n_tasks * sizeof(int4)); + cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_forc, + target_n_tasks * sizeof(int4)); + cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_grad, + target_n_tasks * sizeof(int4)); + + /* nBundles is the number of task bundles each + thread has ==> Used to loop through bundles */ + int nBundles = (target_n_tasks + bundle_size - 1) / bundle_size; + int nBundles_pair = + (target_n_tasks_pair + bundle_size_pair - 1) / bundle_size_pair; + + if (r->cpuid == 0) { + fprintf(stderr, "engine_rank %i cpuid %i nBundles/nStreams %i\n", + engine_rank, r->cpuid, nBundles); + fprintf(stderr, "nBundles/nStreams Pair %i\n", nBundles_pair); + } + + pack_vars_self_dens->nBundles = nBundles; + pack_vars_pair_dens->nBundles = nBundles_pair; + pack_vars_self_forc->nBundles = nBundles; + pack_vars_pair_forc->nBundles = nBundles_pair; + pack_vars_self_grad->nBundles = nBundles; + pack_vars_pair_grad->nBundles = nBundles_pair; + + // first part and last part are the first and last particle ids (locally + // within this thread). A. 
Nasar: All these are used in GPU offload setup + + cudaMallocHost((void **)&pack_vars_self_dens->bundle_first_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_dens->bundle_last_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_dens->bundle_first_task_list, + nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_pair_dens->bundle_first_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_dens->bundle_last_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_dens->bundle_first_task_list, + 2 * nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_self_forc->bundle_first_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_forc->bundle_last_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_forc->bundle_first_task_list, + nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_pair_forc->bundle_first_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_forc->bundle_last_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_forc->bundle_first_task_list, + 2 * nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_self_grad->bundle_first_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_grad->bundle_last_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_grad->bundle_first_task_list, + nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_grad->bundle_last_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_task_list, + 2 * nBundles * sizeof(int)); + + /*Create streams so that we can off-load different batches of work in + * different streams and get some con-CURRENCY! 
Events used to maximise + * asynchrony further*/ + + cudaStream_t stream[nBundles]; + cudaStream_t stream_pairs[nBundles_pair]; + + cudaEvent_t self_end[nBundles]; + for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end[i]); + cudaEvent_t self_end_g[nBundles]; + for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end_g[i]); + cudaEvent_t self_end_f[nBundles]; + for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end_f[i]); + + cudaEvent_t pair_end[nBundles]; + for (int i = 0; i < nBundles; i++) cudaEventCreate(&pair_end[i]); + cudaEvent_t pair_end_g[nBundles]; + for (int i = 0; i < nBundles; i++) cudaEventCreate(&pair_end_g[i]); + cudaEvent_t pair_end_f[nBundles]; + for (int i = 0; i < nBundles; i++) cudaEventCreate(&pair_end_f[i]); + + int tasksperbundle = (target_n_tasks + nBundles - 1) / nBundles; + int tasksperbundle_pair = + (target_n_tasks_pair + nBundles_pair - 1) / nBundles_pair; + + pack_vars_self_dens->tasksperbundle = tasksperbundle; + pack_vars_pair_dens->tasksperbundle = tasksperbundle_pair; + pack_vars_self_forc->tasksperbundle = tasksperbundle; + pack_vars_pair_forc->tasksperbundle = tasksperbundle_pair; + pack_vars_self_grad->tasksperbundle = tasksperbundle; + pack_vars_pair_grad->tasksperbundle = tasksperbundle_pair; + + for (int i = 0; i < nBundles; ++i) + cudaStreamCreateWithFlags(&stream[i], cudaStreamNonBlocking); + for (int i = 0; i < nBundles_pair; ++i) + cudaStreamCreateWithFlags(&stream_pairs[i], cudaStreamNonBlocking); + + pack_vars_self_dens->count_parts = 0; + pack_vars_pair_dens->count_parts = 0; + pack_vars_self_forc->count_parts = 0; + pack_vars_pair_forc->count_parts = 0; + pack_vars_self_grad->count_parts = 0; + pack_vars_pair_grad->count_parts = 0; + + /*Estimate how many particles to pack for GPU for each GPU launch + * instruction*/ + int nr_nodes = 1, res = 0; +#ifdef WITH_MPI + if ((res = MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes)) != MPI_SUCCESS) + error("MPI_Comm_size failed with error %i.", res); +#endif + int parts_per_top_level_cell = + space->nr_local_cells_with_particles / + space->nr_parts; /*A. Nasar: What I think is a good approximation for + average N particles in each top level cell*/ + float eta_neighbours = e->s->eta_neighbours; + int np_per_cell = ceil(2.0 * eta_neighbours); + np_per_cell *= np_per_cell * np_per_cell; + /*A. Nasar: Increase parts per recursed task-level cell by buffer to + ensure we allocate enough memory*/ + int buff = ceil(0.5 * np_per_cell); + /*A. Nasar: Multiplication by 2 is also to ensure we do not over-run + * the allocated memory on buffers and GPU. 
This can happen if calculated h + * is larger than cell width and splitting makes bigger than target cells*/ + int count_max_parts_tmp = 64 * 8 * target_n_tasks * (np_per_cell + buff); + + pack_vars_self_dens->count_max_parts = count_max_parts_tmp; + pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; + pack_vars_self_forc->count_max_parts = count_max_parts_tmp; + pack_vars_pair_forc->count_max_parts = count_max_parts_tmp; + pack_vars_self_grad->count_max_parts = count_max_parts_tmp; + pack_vars_pair_grad->count_max_parts = count_max_parts_tmp; + + /*Declare Buffer and GPU particle arrays*/ + struct part_aos_f4_send *parts_aos_f4_send; + struct part_aos_f4_recv *parts_aos_f4_recv; + + struct part_aos_f4_f_send *parts_aos_forc_f4_send; + struct part_aos_f4_f_recv *parts_aos_forc_f4_recv; + + struct part_aos_f4_g_send *parts_aos_grad_f4_send; + struct part_aos_f4_g_recv *parts_aos_grad_f4_recv; + + struct part_aos_f4_send *d_parts_aos_f4_send; + struct part_aos_f4_recv *d_parts_aos_f4_recv; + + struct part_aos_f4_f_send *d_parts_aos_forc_f4_send; + struct part_aos_f4_f_recv *d_parts_aos_forc_f4_recv; + + struct part_aos_f4_g_send *d_parts_aos_grad_f4_send; + struct part_aos_f4_g_recv *d_parts_aos_grad_f4_recv; + + struct part_aos_f4_send *parts_aos_pair_f4_send; + struct part_aos_f4_recv *parts_aos_pair_f4_recv; + + struct part_aos_f4_send *d_parts_aos_pair_f4_send; + struct part_aos_f4_recv *d_parts_aos_pair_f4_recv; + + struct part_aos_f4_f_send *parts_aos_pair_f4_f_send; + struct part_aos_f4_f_recv *parts_aos_pair_f4_f_recv; + + struct part_aos_f4_f_send *d_parts_aos_pair_f4_f_send; + struct part_aos_f4_f_recv *d_parts_aos_pair_f4_f_recv; + + struct part_aos_f4_g_send *parts_aos_pair_f4_g_send; + struct part_aos_f4_g_recv *parts_aos_pair_f4_g_recv; + + struct part_aos_f4_g_send *d_parts_aos_pair_f4_g_send; + struct part_aos_f4_g_recv *d_parts_aos_pair_f4_g_recv; + + /*Now allocate memory for Buffer and GPU particle arrays*/ + cudaMalloc((void **)&d_parts_aos_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMalloc((void **)&d_parts_aos_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + + cudaMalloc((void **)&d_parts_aos_forc_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMalloc((void **)&d_parts_aos_forc_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + + cudaMalloc((void **)&d_parts_aos_grad_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMalloc((void **)&d_parts_aos_grad_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + cudaMallocHost((void **)&parts_aos_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMallocHost((void **)&parts_aos_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + + cudaMallocHost((void **)&parts_aos_forc_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMallocHost((void **)&parts_aos_forc_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + + cudaMallocHost((void **)&parts_aos_grad_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMallocHost((void **)&parts_aos_grad_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + cudaMalloc((void **)&d_parts_aos_pair_f4_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMalloc((void **)&d_parts_aos_pair_f4_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + + cudaMalloc((void **)&d_parts_aos_pair_f4_f_send, + 2 * 
count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMalloc((void **)&d_parts_aos_pair_f4_f_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + + cudaMalloc((void **)&d_parts_aos_pair_f4_g_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMalloc((void **)&d_parts_aos_pair_f4_g_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + cudaMallocHost((void **)&parts_aos_pair_f4_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMallocHost((void **)&parts_aos_pair_f4_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + + cudaMallocHost((void **)&parts_aos_pair_f4_g_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMallocHost((void **)&parts_aos_pair_f4_g_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + cudaMallocHost((void **)&parts_aos_pair_f4_f_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMallocHost((void **)&parts_aos_pair_f4_f_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + + /*Declare some global variables*/ + float d_a = e->cosmology->a; + float d_H = e->cosmology->H; + int step = 0; + + // a list of the cells and tasks the GPU will work on + pack_vars_self_dens->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_self_dens->cell_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_pair_dens->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_pair_dens->top_task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + int n_leaves_max = 4096; + /*Allocate target_n_tasks for top level tasks. This is a 2D array with length target_n_tasks and width n_leaves_max*/ + struct leaf_cell_list l_list[target_n_tasks]; + pack_vars_pair_dens->leaf_list = (struct leaf_cell_list *)calloc(target_n_tasks, sizeof(struct leaf_cell_list)); + for (int i = 0; i < target_n_tasks; i++){ +// l_list[i].ci = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); +// l_list[i].cj = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); +// l_list[i].n_leaves = 0; + pack_vars_pair_dens->leaf_list[i].ci = malloc(n_leaves_max * sizeof(struct cell *)); + pack_vars_pair_dens->leaf_list[i].cj = malloc(n_leaves_max * sizeof(struct cell *)); + pack_vars_pair_dens->leaf_list[i].n_leaves = 0; + pack_vars_pair_dens->leaf_list[i].n_packed = 0; +// for (int j = 0; j < n_leaves_max; j++){ +// pack_vars_pair_dens->leaf_list[i].ci[j] = l_list[i].ci[j]; +// pack_vars_pair_dens->leaf_list[i].cj[j] = l_list[i].cj[j]; +// } + } +// pack_vars_pair_dens->leaf_list = l_list; +// pack_vars_pair_dens->leaf_list->ci = +// (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); +// pack_vars_pair_dens->leaf_list->cj = +// (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); + /*Allocate memory for n_leaves_max task pointers per top level task*/ + + pack_vars_pair_dens->ci_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + pack_vars_pair_dens->cj_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_self_forc->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_self_forc->cell_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_pair_forc->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_pair_forc->ci_list = + (struct 
cell **)calloc(target_n_tasks, sizeof(struct cell *)); + pack_vars_pair_forc->cj_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_self_grad->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_self_grad->cell_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_pair_grad->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_pair_grad->ci_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + pack_vars_pair_grad->cj_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + // number of density self tasks executed + int tasks_done_cpu = 0; + int tasks_done_gpu = 0; + int tasks_done_gpu_inc = 0; + + /* Main loop. */ + while (1) { + /*Stuff for debugging*/ + int n_full_d_bundles = 0, n_full_g_bundles = 0, n_full_f_bundles = 0; + int n_full_p_d_bundles = 0, n_full_p_g_bundles = 0, n_full_p_f_bundles = 0; + int n_partial_d_bundles = 0, n_partial_g_bundles = 0, + n_partial_f_bundles = 0; + int n_partial_p_d_bundles = 0, n_partial_p_g_bundles = 0, + n_partial_p_f_bundles = 0; + int output = 0; + int packed_self = 0; + int packed_pair = 0; + int packed_self_f = 0; + int packed_pair_f = 0; + int packed_self_g = 0; + int packed_pair_g = 0; + int density = 0; + int density_sub = 0; + int unpacked = 0; + int unpacked_f = 0; + int unpacked_g = 0; + int unpacked_pair = 0; + int unpacked_pair_f = 0; + int unpacked_pair_g = 0; + int ghost_in = 0; + int cpu_self = 0; + int cpu_self_f = 0; + int cpu_self_g = 0; + int cpu_pair = 0; + int cpu_pair_f = 0; + int cpu_pair_g = 0; + int n_leafs_total = 0; + // Initialise timers to zero + double time_for_density_cpu = 0.0; + double time_for_density_cpu_pair = 0.0; + double time_for_cpu_g = 0.0; + double time_for_cpu_pair_g = 0.0; + double time_for_cpu_f = 0.0; + double time_for_cpu_pair_f = 0.0; + double time_for_density_cpu_sub = 0.0; + double time_for_density_gpu = 0.0; + double time_for_density_gpu_pair = 0.0; + double time_for_gpu_f = 0.0; + double time_for_gpu_pair_f = 0.0; + double time_for_gpu_g = 0.0; + double time_for_gpu_pair_g = 0.0; + double unpack_time_self_g = 0.0; + double unpack_time_self_f = 0.0; + double unpack_time_self = 0.0; + double time_for_gpu_pair = 0.0; + int nr_cells = space->nr_cells; + /* Wait at the barrier. 
*/ + engine_barrier(e); + // Initialise packing counters + pack_vars_self_dens->tasks_packed = 0; + pack_vars_pair_dens->tasks_packed = 0; + pack_vars_self_dens->count_parts = 0; + pack_vars_pair_dens->count_parts = 0; + pack_vars_pair_dens->task_locked = 0; + pack_vars_pair_dens->top_tasks_packed = 0; + // Initialise packing counters + pack_vars_self_forc->tasks_packed = 0; + pack_vars_pair_forc->tasks_packed = 0; + pack_vars_self_forc->count_parts = 0; + pack_vars_pair_forc->count_parts = 0; + // Initialise packing counters + pack_vars_self_grad->tasks_packed = 0; + pack_vars_pair_grad->tasks_packed = 0; + pack_vars_self_grad->count_parts = 0; + pack_vars_pair_grad->count_parts = 0; + for(int i = 0; i < target_n_tasks; i++) + pack_vars_pair_dens->leaf_list[i].n_leaves = 0; + + int total_tasks_packed_this_time_pair = 0; + double packing_time = 0.0; + double packing_time_f = 0.0; + double packing_time_g = 0.0; + double unpacking_time = 0.0; + double unpacking_time_f = 0.0; + double unpacking_time_g = 0.0; + double packing_time_pair = 0.0; + double packing_time_pair_f = 0.0; + double packing_time_pair_g = 0.0; + double unpacking_time_pair = 0.0; + double unpacking_time_pair_f = 0.0; + double unpacking_time_pair_g = 0.0; + double time_for_copy_to_struct = 0.0; + double tot_time_for_hard_memcpys = 0.0; + /* Can we go home yet? */ + if (e->step_props & engine_step_prop_done) break; + /* Re-set the pointer to the previous task, as there is none. */ + struct task *t = NULL; + struct task *prev = NULL; + /*Some bits for output in case of debug*/ + char buf5[20]; + snprintf(buf5, sizeof(buf5), "t%dr%dstep%d", r->cpuid, engine_rank, step); +#ifdef DUMP_TIMINGS + FILE *fgpu_steps; + fgpu_steps = fopen(buf5, "w"); +#endif + // if (step == 0) cudaProfilerStart(); + step++; + + sched->nr_packs_self_dens_done = 0; + sched->nr_packs_pair_dens_done = 0; + sched->nr_packs_self_forc_done = 0; + sched->nr_packs_pair_forc_done = 0; + sched->nr_packs_self_grad_done = 0; + sched->nr_packs_pair_grad_done = 0; + int n_cells_d = 0; + int n_cells_g = 0; + int n_cells_f = 0; + int n_cells_p_d = 0; + int n_cells_p_g = 0; + int n_cells_p_f = 0; + int n_w_prts_gtr_target_d = 0; + int n_w_prts_gtr_target_g = 0; + int n_w_prts_gtr_target_f = 0; + int n_w_prts_gtr_target_p_d = 0; + int n_w_prts_gtr_target_p_g = 0; + int n_w_prts_gtr_target_p_f = 0; + int g100 = 0; + int l100 = 0; + int maxcount = 0; + /* Loop while there are tasks... */ + tasks_done_gpu_inc = 0; + ticks hang_time = getticks(); + struct task * ttop_prev; + while (1) { + // A. Nasar: Get qid for re-use later + int qid = r->qid; + /* If there's no old task, try to get a new one. */ + if (t == NULL) { + /* Get the task. */ + TIMER_TIC + t = scheduler_gettask(sched, qid, prev); + TIMER_TOC(timer_gettask); + /* Did I get anything? */ + if (t == NULL) break; + } + /* Get the cells. 
*/ + struct cell *ci = t->ci; + struct cell *cj = t->cj; + + struct task * ttop = t; + + if (ci == NULL && (t->subtype != task_subtype_gpu_unpack_d + && t->subtype != task_subtype_gpu_unpack_g + && t->subtype != task_subtype_gpu_unpack_f)) error("This cannot be"); + +#ifdef SWIFT_DEBUG_TASKS + /* Mark the thread we run on */ + t->rid = r->cpuid; + + /* And recover the pair direction */ + if (t->type == task_type_pair || t->type == task_type_sub_pair) { + struct cell *ci_temp = ci; + struct cell *cj_temp = cj; + double shift[3]; + if (t->subtype != task_subtype_gpu_unpack_d && + t->subtype != task_subtype_gpu_unpack_g && + t->subtype != task_subtype_gpu_unpack_f) + t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift); + } else { + t->sid = -1; + } +#endif + +#ifdef SWIFT_DEBUG_CHECKS + /* Check that we haven't scheduled an inactive task */ + t->ti_run = e->ti_current; + /* Store the task that will be running (for debugging only) */ + r->t = t; +#endif + + const ticks task_beg = getticks(); + /* Different types of tasks... */ + switch (t->type) { + case task_type_self: + if (t->subtype == task_subtype_gpu_unpack_d) { + unpacked++; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + unpacked_g++; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + unpacked_f++; + } else if (t->subtype == task_subtype_density) { + cpu_self++; +#ifndef GPUOFFLOAD_DENSITY + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself1_branch_density(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + density++; +#endif + /* GPU WORK */ + } else if (t->subtype == task_subtype_gpu_pack_d) { + packed_self++; +#ifdef GPUOFFLOAD_DENSITY + ticks tic_cpu_pack = getticks(); + packing_time += + runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, + parts_aos_f4_send, task_first_part_f4); + //Record times for task analysis + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_dens->launch_leftovers; + /*Packed enough tasks. Let's go*/ + int launch = pack_vars_self_dens->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_self_dens->tasks_packed; + runner_doself1_launch_f4( + r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, + parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, + stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, + &unpack_time_self, devId, + task_first_part_f4, d_task_first_part_f4, self_end); + } /*End of GPU work Self*/ +#endif + } /* self / pack */ + else if (t->subtype == task_subtype_gpu_pack_g) { + packed_self_g++; +#ifdef GPUOFFLOAD_GRADIENT + ticks tic_cpu_pack = getticks(); + packing_time_g += runner_doself1_pack_f4_g( + r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, + task_first_part_f4_g); + //Record times for task analysis + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_grad->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_grad->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_self_grad->tasks_packed; + runner_doself1_launch_f4_g( + r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, + parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, + d_parts_aos_grad_f4_recv, stream, d_a, d_H, e, + &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, + d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); + } /*End of GPU work Self*/ +#endif // GPUGRADSELF + } else if (t->subtype == task_subtype_gpu_pack_f) { + packed_self_f++; +#ifdef GPUOFFLOAD_FORCE + ticks tic_cpu_pack = getticks(); + packing_time_f += runner_doself1_pack_f4_f( + r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, + task_first_part_f4_f); + //Record times for task analysis + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_forc->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_self_forc->tasks_packed; + runner_doself1_launch_f4_f( + r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, + parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, + d_parts_aos_forc_f4_recv, stream, d_a, d_H, e, + &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, + d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); + } /*End of GPU work Self*/ +#endif + } +#ifdef EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_gradient) { + cpu_self_g++; +#ifndef GPUOFFLOAD_GRADIENT + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself1_branch_gradient(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_g += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif + } +#endif + else if (t->subtype == task_subtype_force) { + cpu_self_f++; +#ifndef GPUOFFLOAD_FORCE + struct timespec t0, t1; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself2_branch_force(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_f += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif + } else if (t->subtype == task_subtype_limiter) + runner_doself1_branch_limiter(r, ci); + else if (t->subtype == task_subtype_grav) + runner_doself_recursive_grav(r, ci, 1); + else if (t->subtype == task_subtype_external_grav) + runner_do_grav_external(r, ci, 1); + else if (t->subtype == task_subtype_stars_density) + runner_doself_branch_stars_density(r, ci); +#ifdef EXTRA_STAR_LOOPS + else if (t->subtype == task_subtype_stars_prep1) + runner_doself_branch_stars_prep1(r, ci); + else if (t->subtype == task_subtype_stars_prep2) + runner_doself_branch_stars_prep2(r, ci); +#endif + else if (t->subtype == task_subtype_stars_feedback) + runner_doself_branch_stars_feedback(r, ci); + else if (t->subtype == task_subtype_bh_density) + runner_doself_branch_bh_density(r, ci); + else if (t->subtype == task_subtype_bh_swallow) + runner_doself_branch_bh_swallow(r, ci); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_doself_branch_bh_feedback(r, ci); + else if (t->subtype == task_subtype_rt_gradient) + runner_doself1_branch_rt_gradient(r, ci); + else if 
(t->subtype == task_subtype_rt_transport) + runner_doself2_branch_rt_transport(r, ci); + else if (t->subtype == task_subtype_sink_swallow) + runner_doself_branch_sinks_swallow(r, ci); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_self(r, ci, 1); + else + error("Unknown/invalid task subtype (%s).", + subtaskID_names[t->subtype]); + break; + + case task_type_pair: + if (t->subtype == task_subtype_density) { + cpu_pair++; +#ifndef GPUOFFLOAD_DENSITY + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_density(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif + } + /* GPU WORK */ + else if (t->subtype == task_subtype_gpu_pack_d) { + packed_pair++; +#ifdef GPUOFFLOAD_DENSITY + + ticks tic_cpu_pack = getticks(); + + /////////////////////W.I.P!!!//////////////////////////////////////////////////////// + /*Call recursion here. This will be a function in runner_doiact_functions_hydro_gpu.h. + * We are recursing separately to find out how much work we have before offloading*/ + //We need to allocate a list to put cell pointers into for each new task + int n_expected_tasks = 4096; //A. Nasar: Need to come up with a good estimate for this + int n_leaves_found = 0; + int top_tasks_packed = pack_vars_pair_dens->top_tasks_packed; + int depth = 0; + + pack_vars_pair_dens->leaf_list[top_tasks_packed].n_leaves = 0; + pack_vars_pair_dens->leaf_list[top_tasks_packed].n_start = 0; + pack_vars_pair_dens->leaf_list[top_tasks_packed].n_packed = 0; + + runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leaves_found, depth, n_expected_tasks); + + n_leafs_total += n_leaves_found; + int cstart = 0, cid = 0; + + pack_vars_pair_dens->top_task_list[top_tasks_packed] = t; + + pack_vars_pair_dens->top_tasks_packed++; + pack_vars_pair_dens->task_locked = 1; + int t_s, t_e; + t_s = 0; + int n_t_tasks = pack_vars_pair_dens->target_n_tasks; + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + int ntop_packed = pack_vars_pair_dens->top_tasks_packed; + + while(cstart < n_leaves_found){ + tic_cpu_pack = getticks(); + +// if(pack_vars_pair_dens->top_task_list[0] == ttop_prev) +// error("Working on prev top level task"); + pack_vars_pair_dens->launch_leftovers = 0; + pack_vars_pair_dens->launch = 0; + /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ + while(cstart < n_leaves_found && pack_vars_pair_dens->tasks_packed < n_t_tasks){ + // n_start is incremented in pack. However, for cases where we have launched + // but there are still some daughters left unpacked, we need to restart the + // count from zero for the packed arrays as the daughters we previously worked on are no longer necessary. + // Thus, the counter for cii and cjj should remain cstart but counter for packing/unpacking arrays + // should be n_start which is set to zero after launch. 
count_parts should also be zero ater launch + struct cell * cii = pack_vars_pair_dens->leaf_list[ntop_packed - 1].ci[cstart]; + struct cell * cjj = pack_vars_pair_dens->leaf_list[ntop_packed - 1].cj[cstart]; + packing_time_pair += runner_dopair1_pack_f4( + /////////////////////////////Are we sure we should use + /////////////////////////////cells_left/cells right and not + /////////////////////////////pack_vars_pair_dens->leaf_list[top_tasks_packed].ci & cj? + r, sched, pack_vars_pair_dens, cii, cjj, t, + ///////////////////////////// HERE ////////////////////////////////////////// + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); + if(pack_vars_pair_dens->count_parts > count_max_parts_tmp) + error("Packed more parts than possible"); + cstart++; + } + /* Copies done. Release the lock ! */ + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* Packed enough tasks or no pack tasks left in queue, flag that + * we want to run */ + int launch = pack_vars_pair_dens->launch; + int launch_leftovers = pack_vars_pair_dens->launch_leftovers; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_pair_dens->tasks_packed; + runner_dopair1_launch_f4_one_memcpy( + r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, + parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, + d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, + pair_end); + //A. Nasar: Unpack data and zero count_parts counter + runner_dopair1_unpack_f4( + r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, + parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, + d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, + pair_end, cstart, n_leaves_found); + /*This ensure that if we still have leaves left we start at index 1. + Otherwise, reset the index since we will be grabbing a new task*/ + int n_packed = pack_vars_pair_dens->tasks_packed; + //A. Nasar: We've packed all daughters and have launched --> one way or the other + if(cstart == n_leaves_found){ + pack_vars_pair_dens->top_tasks_packed = 0; +// for(int i = 0; i < ntop_packed; i++){ +// pack_vars_pair_dens->leaf_list[i].n_leaves = 0; +// pack_vars_pair_dens->leaf_list[i].n_packed = 0; +// pack_vars_pair_dens->leaf_list[i].n_start = 0; +// } + } + // A. Nasar: We've launched but we have not packed all daughters. + // Need to set counters so we start from the last top-task packed + // and it's last packed daughter-task and start packing to the beginning of GPU arrays + // which is reset to zero (count_parts) in "....unpack_f4()" + else{ + for(int i = 1; i < pack_vars_pair_dens->top_tasks_packed; i++) + pack_vars_pair_dens->leaf_list[i].n_start = 0; + pack_vars_pair_dens->top_tasks_packed = 1; + pack_vars_pair_dens->top_task_list[0]= t; + // A. Nasar: We've launched so need to restart counting tasks + // from zero and need to reset tasks_packed to zero. + // However, the counter for + pack_vars_pair_dens->leaf_list[0].n_start = cstart; + + pack_vars_pair_dens->leaf_list[0].n_packed = 0; + //A. Nasar: We have packed all daughter tasks in this parent task + /*This makes it such that the remaining leaf tasks are packed starting from a + fresh list since we are still in the while cstart < n_leaves_found loop**/ + } + // A. 
Nasar: These need to be reset to zero either way as our GPU array counters + // need to re-start from zero + pack_vars_pair_dens->tasks_packed = 0; + pack_vars_pair_dens->launch_leftovers = 0; + pack_vars_pair_dens->launch = 0; + } + /////////////////////////////////////////////////////////////////////// + } + ttop_prev = t; + cell_unlocktree(ci); + cell_unlocktree(cj); +// pack_vars_pair_dens->launch_leftovers = 0; +// pack_vars_pair_dens->launch = 0; + /////////////////////W.I.P!!!//////////////////////////////////////////////////////// + +#endif // GPUOFFLOAD_DENSITY + } /* pair / pack */ + else if (t->subtype == task_subtype_gpu_pack_g) { + packed_pair_g++; +#ifdef GPUOFFLOAD_GRADIENT + ticks tic_cpu_pack = getticks(); + packing_time_pair_g += + runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, + cj, t, parts_aos_pair_f4_g_send, e, + fparti_fpartj_lparti_lpartj_grad); + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_pair_grad->launch_leftovers; + /*Packed enough tasks, let's go*/ + int launch = pack_vars_pair_grad->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_pair_grad->tasks_packed; + // signal_sleeping_runners(sched, t, t_packed); + runner_dopair1_launch_f4_g_one_memcpy( + r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, + parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, + d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair_g, &time_for_gpu_pair_g, + &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, + pair_end_g); + } + pack_vars_pair_grad->launch_leftovers = 0; +#endif // GPUOFFLOAD_GRADIENT + } else if (t->subtype == task_subtype_gpu_pack_f) { + packed_pair_f++; +#ifdef GPUOFFLOAD_FORCE + ticks tic_cpu_pack = getticks(); + /*Pack data and increment counters checking if we should run on the GPU after packing this task*/ + packing_time_pair_f += + runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, + cj, t, parts_aos_pair_f4_f_send, e, + fparti_fpartj_lparti_lpartj_forc); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_pair_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_pair_forc->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_pair_forc->tasks_packed; + // signal_sleeping_runners(sched, t, t_packed); + runner_dopair1_launch_f4_f_one_memcpy( + r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, + parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, + d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair_f, &time_for_gpu_pair_f, + &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, + pair_end_f); + + pack_vars_pair_forc->launch_leftovers = 0; + } /* End of GPU work Pairs */ +#endif // GPUOFFLOAD_FORCE + } else if (t->subtype == task_subtype_gpu_unpack_d) { + unpacked_pair++; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + unpacked_pair_g++; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + unpacked_pair_f++; + } +#ifdef EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_gradient) { + int Do_nothing = 0; +#ifndef GPUOFFLOAD_GRADIENT + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_gradient(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_pair_g += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif + } +#endif // EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_force) { + int Do_nothing = 0; +#ifndef GPUOFFLOAD_FORCE + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair2_branch_force(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_pair_f += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif // GPUOFFLOAD_FORCE + } else if (t->subtype == task_subtype_limiter) + runner_dopair1_branch_limiter(r, ci, cj); + else if (t->subtype == task_subtype_grav) + runner_dopair_recursive_grav(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_density) + runner_dopair_branch_stars_density(r, ci, cj); +#ifdef EXTRA_STAR_LOOPS + else if (t->subtype == task_subtype_stars_prep1) + runner_dopair_branch_stars_prep1(r, ci, cj); + else if (t->subtype == task_subtype_stars_prep2) + runner_dopair_branch_stars_prep2(r, ci, cj); +#endif + else if (t->subtype == task_subtype_stars_feedback) + runner_dopair_branch_stars_feedback(r, ci, cj); + else if (t->subtype == task_subtype_bh_density) + runner_dopair_branch_bh_density(r, ci, cj); + else if (t->subtype == task_subtype_bh_swallow) + runner_dopair_branch_bh_swallow(r, ci, cj); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_dopair_branch_bh_feedback(r, ci, cj); + else if (t->subtype == task_subtype_rt_gradient) + runner_dopair1_branch_rt_gradient(r, ci, cj); + else if (t->subtype == task_subtype_rt_transport) + runner_dopair2_branch_rt_transport(r, ci, cj); + else if (t->subtype == task_subtype_sink_swallow) + runner_dopair_branch_sinks_swallow(r, ci, cj); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_pair(r, ci, cj, 1); + else + error("Unknown/invalid task subtype (%s/%s).", + taskID_names[t->type], subtaskID_names[t->subtype]); + break; + + case task_type_sub_self: + if (t->subtype == task_subtype_density) { + struct timespec t0, t1, dt; + const int count = ci->hydro.count; + 
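+            /* The CPU fallback branches in this loop all repeat the same
+             * CLOCK_REALTIME arithmetic; for reference, a small helper of the
+             * form (sketch only, not used in this patch)
+             *
+             *   static double timespec_diff(const struct timespec t0,
+             *                               const struct timespec t1) {
+             *     return (t1.tv_sec - t0.tv_sec) +
+             *            (t1.tv_nsec - t0.tv_nsec) / 1e9;
+             *   }
+             *
+             * computes the elapsed seconds that are accumulated into
+             * time_for_density_cpu_sub just below. */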
density_sub++; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dosub_self1_density(r, ci, 1); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu_sub += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } +#ifdef EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_gradient) { + runner_dosub_self1_gradient(r, ci, 1); + } +#endif + else if (t->subtype == task_subtype_force) { + runner_dosub_self2_force(r, ci, 1); + } else if (t->subtype == task_subtype_limiter) + runner_dosub_self1_limiter(r, ci, 1); + else if (t->subtype == task_subtype_stars_density) + runner_dosub_self_stars_density(r, ci, 1); +#ifdef EXTRA_STAR_LOOPS + else if (t->subtype == task_subtype_stars_prep1) + runner_dosub_self_stars_prep1(r, ci, 1); + else if (t->subtype == task_subtype_stars_prep2) + runner_dosub_self_stars_prep2(r, ci, 1); +#endif + else if (t->subtype == task_subtype_stars_feedback) + runner_dosub_self_stars_feedback(r, ci, 1); + else if (t->subtype == task_subtype_bh_density) + runner_dosub_self_bh_density(r, ci, 1); + else if (t->subtype == task_subtype_bh_swallow) + runner_dosub_self_bh_swallow(r, ci, 1); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_dosub_self_bh_feedback(r, ci, 1); + else if (t->subtype == task_subtype_rt_gradient) + runner_dosub_self1_rt_gradient(r, ci, 1); + else if (t->subtype == task_subtype_rt_transport) + runner_dosub_self2_rt_transport(r, ci, 1); + else if (t->subtype == task_subtype_sink_swallow) + runner_dosub_self_sinks_swallow(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_self(r, ci, 1); + else + error("Unknown/invalid task subtype (%s/%s).", + taskID_names[t->type], subtaskID_names[t->subtype]); + break; + + case task_type_sub_pair: + if (t->subtype == task_subtype_density) { + int nothing = 0; + runner_dosub_pair1_density(r, ci, cj, 1); + } +#ifdef EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_gradient) { + runner_dosub_pair1_gradient(r, ci, cj, 1); + } +#endif + else if (t->subtype == task_subtype_force) { + runner_dosub_pair2_force(r, ci, cj, 1); + } else if (t->subtype == task_subtype_limiter) + runner_dosub_pair1_limiter(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_density) + runner_dosub_pair_stars_density(r, ci, cj, 1); +#ifdef EXTRA_STAR_LOOPS + else if (t->subtype == task_subtype_stars_prep1) + runner_dosub_pair_stars_prep1(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_prep2) + runner_dosub_pair_stars_prep2(r, ci, cj, 1); +#endif + else if (t->subtype == task_subtype_stars_feedback) + runner_dosub_pair_stars_feedback(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_density) + runner_dosub_pair_bh_density(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_swallow) + runner_dosub_pair_bh_swallow(r, ci, cj, 1); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_dosub_pair_bh_feedback(r, ci, cj, 1); + else if (t->subtype == task_subtype_rt_gradient) + runner_dosub_pair1_rt_gradient(r, ci, cj, 1); + else if 
(t->subtype == task_subtype_rt_transport) + runner_dosub_pair2_rt_transport(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_swallow) + runner_dosub_pair_sinks_swallow(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_pair(r, ci, cj, 1); + else + error("Unknown/invalid task subtype (%s/%s).", + taskID_names[t->type], subtaskID_names[t->subtype]); + break; + + case task_type_sort: + /* Cleanup only if any of the indices went stale. */ + runner_do_hydro_sort( + r, ci, t->flags, + ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, + cell_get_flag(ci, cell_flag_rt_requests_sort), 1); + /* Reset the sort flags as our work here is done. */ + t->flags = 0; + break; + case task_type_rt_sort: + /* Cleanup only if any of the indices went stale. + * NOTE: we check whether we reset the sort flags when the + * recv tasks are running. Cells without an RT recv task + * don't have rt_sort tasks. */ + runner_do_hydro_sort( + r, ci, t->flags, + ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, 1, 1); + /* Reset the sort flags as our work here is done. */ + t->flags = 0; + break; + case task_type_stars_sort: + /* Cleanup only if any of the indices went stale. */ + runner_do_stars_sort( + r, ci, t->flags, + ci->stars.dx_max_sort_old > space_maxreldx * ci->dmin, 1); + /* Reset the sort flags as our work here is done. */ + t->flags = 0; + break; + case task_type_init_grav: + runner_do_init_grav(r, ci, 1); + break; + case task_type_ghost: + runner_do_ghost(r, ci, 1); + break; +#ifdef EXTRA_HYDRO_LOOP + case task_type_extra_ghost: + runner_do_extra_ghost(r, ci, 1); + break; +#endif + case task_type_stars_ghost: + runner_do_stars_ghost(r, ci, 1); + break; + case task_type_bh_density_ghost: + runner_do_black_holes_density_ghost(r, ci, 1); + break; + case task_type_bh_swallow_ghost3: + runner_do_black_holes_swallow_ghost(r, ci, 1); + break; + case task_type_drift_part: + runner_do_drift_part(r, ci, 1); + break; + case task_type_drift_spart: + runner_do_drift_spart(r, ci, 1); + break; + case task_type_drift_sink: + runner_do_drift_sink(r, ci, 1); + break; + case task_type_drift_bpart: + runner_do_drift_bpart(r, ci, 1); + break; + case task_type_drift_gpart: + runner_do_drift_gpart(r, ci, 1); + break; + case task_type_kick1: + runner_do_kick1(r, ci, 1); + break; + case task_type_kick2: + runner_do_kick2(r, ci, 1); + break; + case task_type_end_hydro_force: + runner_do_end_hydro_force(r, ci, 1); + break; + case task_type_end_grav_force: + runner_do_end_grav_force(r, ci, 1); + break; + case task_type_csds: + runner_do_csds(r, ci, 1); + break; + case task_type_timestep: + runner_do_timestep(r, ci, 1); + break; + case task_type_timestep_limiter: + runner_do_limiter(r, ci, 0, 1); + break; + case task_type_timestep_sync: + runner_do_sync(r, ci, 0, 1); + break; + case task_type_collect: + runner_do_timestep_collect(r, ci, 1); + break; + case task_type_rt_collect_times: + runner_do_collect_rt_times(r, ci, 1); + break; +#ifdef WITH_MPI + case task_type_send: + if (t->subtype == task_subtype_tend) { + free(t->buff); + } else if (t->subtype == task_subtype_sf_counts) { + free(t->buff); + } else if (t->subtype == task_subtype_part_swallow) { + free(t->buff); + } else if (t->subtype == task_subtype_bpart_merger) { + free(t->buff); + } else if (t->subtype == task_subtype_limiter) { + free(t->buff); + } + break; + case task_type_recv: + if (t->subtype 
== task_subtype_tend) { + cell_unpack_end_step(ci, (struct pcell_step *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_sf_counts) { + cell_unpack_sf_counts(ci, (struct pcell_sf *)t->buff); + cell_clear_stars_sort_flags(ci, /*clear_unused_flags=*/0); + free(t->buff); + } else if (t->subtype == task_subtype_xv) { + runner_do_recv_part(r, ci, 1, 1); + } else if (t->subtype == task_subtype_rho) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_gradient) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_rt_gradient) { + runner_do_recv_part(r, ci, 2, 1); + } else if (t->subtype == task_subtype_rt_transport) { + runner_do_recv_part(r, ci, -1, 1); + } else if (t->subtype == task_subtype_part_swallow) { + cell_unpack_part_swallow(ci, + (struct black_holes_part_data *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_bpart_merger) { + cell_unpack_bpart_swallow(ci, + (struct black_holes_bpart_data *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_limiter) { + /* Nothing to do here. Unpacking done in a separate task */ + } else if (t->subtype == task_subtype_gpart) { + runner_do_recv_gpart(r, ci, 1); + } else if (t->subtype == task_subtype_spart_density) { + runner_do_recv_spart(r, ci, 1, 1); + } else if (t->subtype == task_subtype_part_prep1) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_spart_prep2) { + runner_do_recv_spart(r, ci, 0, 1); + } else if (t->subtype == task_subtype_bpart_rho) { + runner_do_recv_bpart(r, ci, 1, 1); + } else if (t->subtype == task_subtype_bpart_feedback) { + runner_do_recv_bpart(r, ci, 0, 1); + } else { + error("Unknown/invalid task subtype (%d).", t->subtype); + } + break; + + case task_type_pack: + runner_do_pack_limiter(r, ci, &t->buff, 1); + task_get_unique_dependent(t)->buff = t->buff; + break; + case task_type_unpack: + runner_do_unpack_limiter(r, ci, t->buff, 1); + break; +#endif + case task_type_grav_down: + runner_do_grav_down(r, t->ci, 1); + break; + case task_type_grav_long_range: + runner_do_grav_long_range(r, t->ci, 1); + break; + case task_type_grav_mm: + runner_dopair_grav_mm_progenies(r, t->flags, t->ci, t->cj); + break; + case task_type_cooling: + runner_do_cooling(r, t->ci, 1); + break; + case task_type_star_formation: + runner_do_star_formation(r, t->ci, 1); + break; + case task_type_star_formation_sink: + runner_do_star_formation_sink(r, t->ci, 1); + break; + case task_type_stars_resort: + runner_do_stars_resort(r, t->ci, 1); + break; + case task_type_sink_formation: + runner_do_sink_formation(r, t->ci); + break; + case task_type_fof_self: + runner_do_fof_search_self(r, t->ci, 1); + break; + case task_type_fof_pair: + runner_do_fof_search_pair(r, t->ci, t->cj, 1); + break; + case task_type_fof_attach_self: + runner_do_fof_attach_self(r, t->ci, 1); + break; + case task_type_fof_attach_pair: + runner_do_fof_attach_pair(r, t->ci, t->cj, 1); + break; + case task_type_neutrino_weight: + runner_do_neutrino_weighting(r, ci, 1); + break; + case task_type_rt_ghost1: + runner_do_rt_ghost1(r, t->ci, 1); + break; + case task_type_rt_ghost2: + runner_do_rt_ghost2(r, t->ci, 1); + break; + case task_type_rt_tchem: + runner_do_rt_tchem(r, t->ci, 1); + break; + case task_type_rt_advance_cell_time: + runner_do_rt_advance_cell_time(r, t->ci, 1); + break; + default: + error("Unknown/invalid task type (%d).", t->type); + } + r->active_time += (getticks() - task_beg); + +/* Mark that we have run this task on these cells */ +#ifdef 
SWIFT_DEBUG_CHECKS + if (ci != NULL) { + ci->tasks_executed[t->type]++; + ci->subtasks_executed[t->subtype]++; + } + if (cj != NULL) { + cj->tasks_executed[t->type]++; + cj->subtasks_executed[t->subtype]++; + } + /* This runner is not doing a task anymore */ + r->t = NULL; +#endif + + /* We're done with this task, see if we get a next one. */ + prev = t; + + if (t->subtype == task_subtype_gpu_pack_d) { +#ifdef GPUOFFLOAD_DENSITY + /* Don't enqueue unpacks yet. Just signal the runners */ + t->skip = 1; + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; + t = NULL; +#else + t = scheduler_done(sched, t); +#endif + } + + else if (t->subtype == task_subtype_gpu_pack_g) { +#ifdef GPUOFFLOAD_GRADIENT + /* Don't enqueue unpacks yet. Just signal the runners */ + t->skip = 1; + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; + t = NULL; +#else + t = scheduler_done(sched, t); +#endif + } + + else if (t->subtype == task_subtype_gpu_pack_f) { +#ifdef GPUOFFLOAD_FORCE + /* Don't enqueue unpacks yet. Just signal the runners */ + t->skip = 1; + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; + t = NULL; +#else + t = scheduler_done(sched, t); +#endif + } + + else if (t->subtype != task_subtype_gpu_pack_d && + t->subtype != task_subtype_gpu_pack_g && + t->subtype != task_subtype_gpu_pack_f) { + t = scheduler_done(sched, t); + } + } /* main loop. */ + + message("n_leafs found %i", n_leafs_total); +// message("cpu %i packed %i cells with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_d, n_w_prts_gtr_target_d, np_per_cell, maxcount); +// message("cpu %i packed %i cells_G with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_g, n_w_prts_gtr_target_g, np_per_cell, maxcount); +// message("cpu %i packed %i cells_F with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_f, n_w_prts_gtr_target_f, np_per_cell, maxcount); +// message("cpu %i packed %i pairs_D with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_p_d, n_w_prts_gtr_target_p_d, np_per_cell, maxcount); +// message("cpu %i packed %i pairs_G with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_p_g, n_w_prts_gtr_target_p_g, np_per_cell, maxcount); +// message("cpu %i packed %i pairs_F with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_p_f, n_w_prts_gtr_target_p_f, np_per_cell, maxcount); + + // message("Worked on %i supers w more than 100 parts", g100); + // Stuff for writing debug data to file for validation + //// if (step % 10 == 0 || step == 1) { + // if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z, + // rho, rhodh, v_sig, lap_u, a_visc_max, ax, ay, az\n"); for (int tid + // = 0; tid < space->nr_local_cells; + // tid++) { /* This should indeed be tasks_done_gpu as they are + // the only + //// tasks which have been done*/ + // struct cell *ctemp = &(space->cells_top[tid]); + // for (int i = 0; i < ctemp->hydro.count; i++) { + // fprintf(fgpu_steps, "%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, + // %f, %f\n", + // ctemp->hydro.parts[i].x[0], + // ctemp->hydro.parts[i].x[1], + // ctemp->hydro.parts[i].x[2], ctemp->hydro.parts[i].rho, + // ctemp->hydro.parts[i].density.rho_dh, + // ctemp->hydro.parts[i].viscosity.v_sig, + // ctemp->hydro.parts[i].diffusion.laplace_u, + // ctemp->hydro.parts[i].force.alpha_visc_max_ngb, + // ctemp->hydro.parts[i].a_hydro[0], + // ctemp->hydro.parts[i].a_hydro[1], + // 
ctemp->hydro.parts[i].a_hydro[2]); + //// message("wcount %f density %f", + /// ctemp->hydro.parts[i].density.wcount, ctemp->hydro.parts[i].rho); / + /// message("wcount is %f\n", ctemp->hydro.parts[i].density.wcount); + // } + // } + //// } + /*Output compute times to separate files. cat later into one file*/ +// if (step % 11 == 0 || step == 1) { +#ifdef DUMP_TIMINGS +#if defined(GPUOFFLOAD_DENSITY) || defined(GPUOFFLOAD_GRADIENT) || \ + defined(GPUOFFLOAD_FORCE) + // char buffer[30]; + // snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", + // r->cpuid, step); FILE *fullbundles = fopen(buffer, "w"); + // if(r->cpuid == 0)fprintf(fullbundles, "nfull, npartial, + // nfullpair, npartialpair\n"); else fprintf(fullbundles, "%i, %i, + // %i, %i\n", n_full_d_bundles, n_partial_d_bundles, + // n_full_p_d_bundles, n_partial_p_d_bundles); fflush(fullbundles); + + /////////////////////////////////////////////////////////////// + /// to ooutput timings uncomment this + /////////////////////////////////////////////////////////////// + if (r->cpuid == 0 && engine_rank == 0) + fprintf(fgpu_steps, + "GPU_SD, P_SD, U_SD, GPU_PD, P_PD, U_PD, " + "GPU_SF, P_SF, U_SF, GPU_PF, P_PF, U_PF, GPU_SG, P_SG, U_SG, " + "GPU_PG, P_PG, U_PG\n " + "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " + "%e, %e\n", + time_for_density_gpu, packing_time, unpack_time_self, + time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, + time_for_gpu_f, packing_time_f, unpack_time_self_f, + time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, + time_for_gpu_g, packing_time_g, unpack_time_self_g, + time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); + + else + fprintf(fgpu_steps, + "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " + "%e, %e\n", + time_for_density_gpu, packing_time, unpack_time_self, + time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, + time_for_gpu_f, packing_time_f, unpack_time_self_f, + time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, + time_for_gpu_g, packing_time_g, unpack_time_self_g, + time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); + ////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////// + +#else // No GPU offload + if (r->cpuid == 0 && engine_rank == 0) + fprintf(fgpu_steps, + "CPU TIME SELF, CPU TIME PAIR, " + "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME " + "PAIR G\n " + "%e, %e, %e, %e, %e, %e\n", + time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f, + time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); + + else + fprintf(fgpu_steps, "%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, + time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, + time_for_cpu_g, time_for_cpu_pair_g); +#endif + // } + fflush(fgpu_steps); + fclose(fgpu_steps); +#endif // DUMPTIMINGS + time_for_density_cpu = 0.0; + time_for_density_gpu = 0.0; + time_for_density_cpu_pair = 0.0; + time_for_density_gpu_pair = 0.0; + time_for_density_cpu_sub = 0.0; + tot_time_for_hard_memcpys = 0.0; + tasks_done_gpu = 0; + tasks_done_cpu = 0; + tasks_done_gpu_inc = 0; + if (ghost_in > 0) + fprintf(stderr, "total tasks not done on GPU %i is %i\n", r->cpuid, + ghost_in); + packed_self = 0; + packed_pair = 0; + packed_self_f = 0; + packed_pair_f = 0; + packed_self_g = 0; + packed_pair_g = 0; + density = 0; + density_sub = 0; + unpacked = 0; + 
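+    /* When DUMP_TIMINGS is enabled, each runner writes one timing file per
+     * step, named by the snprintf() format "t%dr%dstep%d" above (e.g.
+     * "t3r0step12" for thread 3 of rank 0 at step 12).  As the comment above
+     * notes, these can then simply be concatenated into a single table, e.g.
+     * with "cat t*r*step* > gpu_timings.csv" (shell example, not part of the
+     * code). */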
// if(step == 2)cudaProfilerStop(); + // if(step == 2)exit(0); + // size_t free_byte ; + // size_t total_byte ; + // cudaError_t cuda_status = cudaMemGetInfo( &free_byte, + //&total_byte ) ; double free = (double)free_byte; double + // available = (double)total_byte; double used = (available - free); + // fprintf(stderr, "Used %f GB GPU memory\n", used/1e9); + /* Wait at the wait barrier. */ + // swift_barrier_wait(&e->wait_barrier); + } + // Free all data + // cudaFree(d_tid_p); + // cudaFree(d_id); + // cudaFree(d_x_p); + // cudaFree(d_y_p); + // cudaFree(d_z_p); + // cudaFree(d_ux); + // cudaFree(d_uy); + // cudaFree(d_uz); + // cudaFree(d_a_hydrox); + // cudaFree(d_a_hydroy); + // cudaFree(d_a_hydroz); + // cudaFree(d_mass); + // cudaFree(d_h); + // cudaFree(d_u); + // cudaFree(d_u_dt); + // cudaFree(d_rho); + // cudaFree(d_SPH_sum); + // cudaFree(d_locx); + // cudaFree(d_locy); + // cudaFree(d_locz); + // cudaFree(d_widthx); + // cudaFree(d_widthy); + // cudaFree(d_widthz); + // cudaFree(d_h_max); + // cudaFree(d_count_p); + // cudaFree(d_wcount); + // cudaFree(d_wcount_dh); + // cudaFree(d_rho_dh); + // cudaFree(d_rot_ux); + // cudaFree(d_rot_uy); + // cudaFree(d_rot_uz); + // cudaFree(d_div_v); + // cudaFree(d_div_v_previous_step); + // cudaFree(d_alpha_visc); + // cudaFree(d_v_sig); + // cudaFree(d_laplace_u); + // cudaFree(d_alpha_diff); + // cudaFree(d_f); + // cudaFree(d_soundspeed); + // cudaFree(d_h_dt); + // cudaFree(d_balsara); + // cudaFree(d_pressure); + // cudaFree(d_alpha_visc_max_ngb); + // cudaFree(d_time_bin); + // cudaFree(d_wakeup); + // cudaFree(d_min_ngb_time_bin); + // cudaFree(d_to_be_synchronized); + // cudaFree(tid_p); + // cudaFree(id); + // cudaFree(mass); + // cudaFree(h); + // cudaFree(u); + // cudaFree(u_dt); + // cudaFree(rho); + // cudaFree(SPH_sum); + // cudaFree(x_p); + // cudaFree(y_p); + // cudaFree(z_p); + // cudaFree(ux); + // cudaFree(uy); + // cudaFree(uz); + // cudaFree(a_hydrox); + // cudaFree(a_hydroy); + // cudaFree(a_hydroz); + // cudaFree(locx); + // cudaFree(locy); + // cudaFree(locz); + // cudaFree(widthx); + // cudaFree(widthy); + // cudaFree(widthz); + // cudaFree(h_max); + // cudaFree(count_p); + // cudaFree(wcount); + // cudaFree(wcount_dh); + // cudaFree(rho_dh); + // cudaFree(rot_ux); + // cudaFree(rot_uy); + // cudaFree(rot_uz); + // cudaFree(div_v); + // cudaFree(div_v_previous_step); + // cudaFree(alpha_visc); + // cudaFree(v_sig); + // cudaFree(laplace_u); + // cudaFree(alpha_diff); + // cudaFree(f); + // cudaFree(soundspeed); + // cudaFree(h_dt); + // cudaFree(balsara); + // cudaFree(pressure); + // cudaFree(alpha_visc_max_ngb); + // cudaFree(time_bin); + // cudaFree(wakeup); + // cudaFree(min_ngb_time_bin); + // cudaFree(to_be_synchronized); + // cudaFree(partid_p); + // cudaFree(d_task_first_part); + // cudaFree(d_task_last_part); + // cudaFree(task_first_part_self_dens); + // cudaFree(task_last_part_self_dens); + // cudaFree(task_first_part_pair_ci); + // cudaFree(task_last_part_pair_ci); + // cudaFree(task_first_part_pair_cj); + // cudaFree(task_last_part_pair_cj); + // cudaFree(d_bundle_first_part_self_dens); + // cudaFree(d_bundle_last_part_self_dens); + // cudaFree(bundle_first_part_self_dens); + // cudaFree(bundle_last_part_self_dens); + // cudaFree(bundle_first_part_pair_ci); + // cudaFree(bundle_last_part_pair_ci); + // cudaFree(bundle_first_part_pair_cj); + // cudaFree(bundle_last_part_pair_cj); + // free(ci_list_self_dens); + // free(ci_list_pair); + // free(cj_list_pair); + + /* Be kind, rewind. 
*/ + return NULL; +} + +#endif // WITH_CUDA + diff --git a/src/runner_others.c b/src/runner_others.c index cbace92a63..914b1f47a3 100644 --- a/src/runner_others.c +++ b/src/runner_others.c @@ -381,7 +381,7 @@ void runner_do_star_formation(struct runner *r, struct cell *c, int timer) { /* Loop over the gas particles in this cell. */ for (int k = 0; k < count; k++) { - + continue; //A. Nasar: Commented out to try without inhibited particles /* Get a handle on the part. */ struct part *restrict p = &parts[k]; struct xpart *restrict xp = &xparts[k]; diff --git a/src/scheduler.c b/src/scheduler.c index 2b156f8250..69203e37b6 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -61,6 +61,7 @@ int activate_by_unskip = 1; #endif +#include "cuda/BLOCK_SIZE.h" /** * @brief Re-set the list of active tasks. */ @@ -900,7 +901,9 @@ void scheduler_write_cell_dependencies(struct scheduler *s, int verbose, int local_count = 0; for (int i = 0; i < s->nr_tasks; i++) { const struct task *ta = &s->tasks[i]; - + // if(ta->subtype == task_subtype_gpu_unpack_d + // || ta->subtype == task_subtype_gpu_unpack_f + // || ta->subtype == task_subtype_gpu_unpack_g)continue; /* Are we using this task? * For the 0-step, we wish to show all the tasks (even the inactives). */ if (step != 0 && ta->skip) continue; @@ -952,7 +955,10 @@ void scheduler_write_cell_dependencies(struct scheduler *s, int verbose, /* and their dependencies */ for (int j = 0; j < ta->nr_unlock_tasks; j++) { const struct task *tb = ta->unlock_tasks[j]; - + if (tb->subtype == task_subtype_gpu_unpack_d || + tb->subtype == task_subtype_gpu_unpack_f || + tb->subtype == task_subtype_gpu_unpack_g) + continue; /* Are we using this task? * For the 0-step, we wish to show all the tasks (even the inactive). */ if (step != 0 && tb->skip) continue; @@ -1167,6 +1173,237 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { const int with_black_holes = (s->space->e->policy & engine_policy_black_holes); + /* Iterate on this task until we're done with it. */ + int redo = 1; + while (redo) { + /* Reset the redo flag. */ + redo = 0; + + /* Is this a non-empty self-task? */ + const int is_self = + (t->type == task_type_self) && (t->ci != NULL) && + ((t->ci->hydro.count > 0) || (with_stars && t->ci->stars.count > 0) || + (with_sinks && t->ci->sinks.count > 0) || + (with_black_holes && t->ci->black_holes.count > 0)); + + /* Is this a non-empty pair-task? */ + const int is_pair = (t->type == task_type_pair) && (t->ci != NULL) && + (t->cj != NULL) && + ((t->ci->hydro.count > 0) || + (with_feedback && t->ci->stars.count > 0) || + (with_sinks && t->ci->sinks.count > 0) || + (with_black_holes && t->ci->black_holes.count > 0)) && + ((t->cj->hydro.count > 0) || + (with_feedback && t->cj->stars.count > 0) || + (with_sinks && t->cj->sinks.count > 0) || + (with_black_holes && t->cj->black_holes.count > 0)); + + /* Empty task? */ + if (!is_self && !is_pair) { + t->type = task_type_none; + t->subtype = task_subtype_none; + t->ci = NULL; + t->cj = NULL; + t->skip = 1; + break; + } + + /* Self-interaction? */ + if (t->type == task_type_self) { + /* Get a handle on the cell involved. */ + struct cell *ci = t->ci; + + /* Foreign task? */ + if (ci->nodeID != s->nodeID) { + t->skip = 1; + break; + } + + /* Is this cell even split and the task does not violate h ? */ + if (cell_can_split_self_hydro_task(ci)) { + /* Make a sub? 
*/ + if (scheduler_dosub && (ci->hydro.count < space_subsize_self_hydro_default) && + (ci->stars.count < space_subsize_self_stars)) { + /* convert to a self-subtask. */ + t->type = task_type_sub_self; + + /* Otherwise, make tasks explicitly. */ + } else { + /* Take a step back (we're going to recycle the current task)... */ + redo = 1; + + /* Add the self tasks. */ + int first_child = 0; + while (ci->progeny[first_child] == NULL) first_child++; + + t->ci = ci->progeny[first_child]; + cell_set_flag(t->ci, cell_flag_has_tasks); + + for (int k = first_child + 1; k < 8; k++) { + /* Do we have a non-empty progenitor? */ + if (ci->progeny[k] != NULL && + (ci->progeny[k]->hydro.count || + (with_stars && ci->progeny[k]->stars.count))) { + scheduler_splittask_hydro( + scheduler_addtask(s, task_type_self, t->subtype, 0, 0, + ci->progeny[k], NULL), + s); + } + } + + /* Make a task for each pair of progeny */ + for (int j = 0; j < 8; j++) { + /* Do we have a non-empty progenitor? */ + if (ci->progeny[j] != NULL && + (ci->progeny[j]->hydro.count || + (with_feedback && ci->progeny[j]->stars.count))) { + for (int k = j + 1; k < 8; k++) { + /* Do we have a second non-empty progenitor? */ + if (ci->progeny[k] != NULL && + (ci->progeny[k]->hydro.count || + (with_feedback && ci->progeny[k]->stars.count))) { + scheduler_splittask_hydro( + scheduler_addtask(s, task_type_pair, t->subtype, + sub_sid_flag[j][k], 0, ci->progeny[j], + ci->progeny[k]), + s); + } + } + } + } + } + + } /* Cell is split */ + + } /* Self interaction */ + + /* Pair interaction? */ + else if (t->type == task_type_pair) { + /* Get a handle on the cells involved. */ + struct cell *ci = t->ci; + struct cell *cj = t->cj; + + /* Foreign task? */ + if (ci->nodeID != s->nodeID && cj->nodeID != s->nodeID) { + t->skip = 1; + break; + } + + /* Get the sort ID, use space_getsid_and_swap_cells and not t->flags + to make sure we get ci and cj swapped if needed. */ + double shift[3]; + const int sid = space_getsid_and_swap_cells(s->space, &ci, &cj, shift); + +#ifdef SWIFT_DEBUG_CHECKS + if (sid != t->flags) + error("Got pair task with incorrect flags: sid=%d flags=%lld", sid, + t->flags); +#endif + + /* Should this task be split-up? */ + if (cell_can_split_pair_hydro_task(ci) && + cell_can_split_pair_hydro_task(cj)) { + + const int h_count_i = ci->hydro.count; + const int h_count_j = cj->hydro.count; + + const int s_count_i = ci->stars.count; + const int s_count_j = cj->stars.count; + + int do_sub_hydro = 1; + int do_sub_stars_i = 1; + int do_sub_stars_j = 1; + if (h_count_i > 0 && h_count_j > 0) { + + /* Note: Use division to avoid integer overflow. */ + do_sub_hydro = + h_count_i * sid_scale[sid] < space_subsize_pair_hydro_default / h_count_j; + } + if (s_count_i > 0 && h_count_j > 0) { + + /* Note: Use division to avoid integer overflow. */ + do_sub_stars_i = + s_count_i * sid_scale[sid] < space_subsize_pair_stars / h_count_j; + } + if (s_count_j > 0 && h_count_i > 0) { + + /* Note: Use division to avoid integer overflow. */ + do_sub_stars_j = + s_count_j * sid_scale[sid] < space_subsize_pair_stars / h_count_i; + } + + /* Replace by a single sub-task? */ + if (scheduler_dosub && + (do_sub_hydro && do_sub_stars_i && do_sub_stars_j) && + !sort_is_corner(sid)) { + + /* Make this task a sub task. */ + t->type = task_type_sub_pair; + + /* Otherwise, split it. */ + } else { + /* Take a step back (we're going to recycle the current task)... */ + redo = 1; + + /* Loop over the sub-cell pairs for the current sid and add new tasks + * for them. 
*/ + struct cell_split_pair *csp = &cell_split_pairs[sid]; + + t->ci = ci->progeny[csp->pairs[0].pid]; + t->cj = cj->progeny[csp->pairs[0].pjd]; + if (t->ci != NULL) cell_set_flag(t->ci, cell_flag_has_tasks); + if (t->cj != NULL) cell_set_flag(t->cj, cell_flag_has_tasks); + + t->flags = csp->pairs[0].sid; + for (int k = 1; k < csp->count; k++) { + scheduler_splittask_hydro( + scheduler_addtask(s, task_type_pair, t->subtype, + csp->pairs[k].sid, 0, + ci->progeny[csp->pairs[k].pid], + cj->progeny[csp->pairs[k].pjd]), + s); + } + } + + /* Otherwise, break it up if it is too large? */ + } else if (scheduler_doforcesplit && ci->split && cj->split && + (ci->hydro.count > space_maxsize / cj->hydro.count)) { + + /* Replace the current task. */ + t->type = task_type_none; + + for (int j = 0; j < 8; j++) + if (ci->progeny[j] != NULL && ci->progeny[j]->hydro.count) + for (int k = 0; k < 8; k++) + if (cj->progeny[k] != NULL && cj->progeny[k]->hydro.count) { + struct task *tl = + scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, + ci->progeny[j], cj->progeny[k]); + scheduler_splittask_hydro(tl, s); + tl->flags = space_getsid_and_swap_cells(s->space, &t->ci, + &t->cj, shift); + } + } + } /* pair interaction? */ + } /* iterate over the current task. */ +} + +/** + * @brief Split a hydrodynamic task if too large. + * + * @param t The #task + * @param s The #scheduler we are working in. + */ +static void scheduler_splittask_hydro_GPU(struct task *t, struct scheduler *s) { + /* Are we considering both stars and hydro when splitting? */ + /* Note this is not very clean as the scheduler should not really + access the engine... */ + const int with_feedback = (s->space->e->policy & engine_policy_feedback); + const int with_stars = (s->space->e->policy & engine_policy_stars); + const int with_sinks = (s->space->e->policy & engine_policy_sinks); + const int with_black_holes = + (s->space->e->policy & engine_policy_black_holes); + /* Iterate on this task until we're done with it. */ int redo = 1; while (redo) { @@ -1362,8 +1599,6 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { /* Otherwise, break it up if it is too large? */ } else if (scheduler_doforcesplit && ci->split && cj->split && (ci->hydro.count > space_maxsize / cj->hydro.count)) { - // message( "force splitting pair with %i and %i parts." , - // ci->hydro.count , cj->hydro.count ); /* Replace the current task. */ t->type = task_type_none; @@ -1651,6 +1886,19 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements, scheduler_splittask_gravity(t, s); } else if (t->subtype == task_subtype_grav) { scheduler_splittask_gravity(t, s); + // if task is gpu task do not split A. Nasar + } else if (t->subtype == task_subtype_gpu_pack_d || + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_pack_f) { + scheduler_splittask_hydro_GPU(t, s); + } else if (t->subtype == task_subtype_gpu_unpack_d || + t->subtype == task_subtype_gpu_unpack_g || + t->subtype == task_subtype_gpu_unpack_f) { + /*Do nothing and grab next task to split. + *These tasks are cell-less so cannot split. 
+ *Will remove this if statement if set on splitting + *b4 creating unpack tasks*/ + continue; } else { #ifdef SWIFT_DEBUG_CHECKS error("Unexpected task sub-type %s/%s", taskID_names[t->type], @@ -1740,6 +1988,8 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, t->tic = 0; t->toc = 0; t->total_ticks = 0; + t->total_cpu_pack_ticks = 0; + t->total_cpu_unpack_ticks = 0; #ifdef SWIFT_DEBUG_CHECKS t->activated_by_unskip = 0; t->activated_by_marktask = 0; @@ -1748,6 +1998,26 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, if (ci != NULL) cell_set_flag(ci, cell_flag_has_tasks); if (cj != NULL) cell_set_flag(cj, cell_flag_has_tasks); + // #ifdef WITH_CUDA A. Nasar + if (t->subtype == task_subtype_gpu_pack_d) { + if (t->type == task_type_self || t->type == task_type_sub_self) + atomic_inc(&s->nr_self_pack_tasks_d); + if (t->type == task_type_pair || t->type == task_type_sub_pair) + atomic_inc(&s->nr_pair_pack_tasks_d); + } + if (t->subtype == task_subtype_gpu_pack_f) { + if (t->type == task_type_self || t->type == task_type_sub_self) + atomic_inc(&s->nr_self_pack_tasks_f); + if (t->type == task_type_pair || t->type == task_type_sub_pair) + atomic_inc(&s->nr_pair_pack_tasks_f); + } + if (t->subtype == task_subtype_gpu_pack_g) { + if (t->type == task_type_self || t->type == task_type_sub_self) + atomic_inc(&s->nr_self_pack_tasks_g); + if (t->type == task_type_pair || t->type == task_type_sub_pair) + atomic_inc(&s->nr_pair_pack_tasks_g); + } + // #endif /* Add an index for it. */ // lock_lock( &s->lock ); s->tasks_ind[atomic_inc(&s->nr_tasks)] = ind; @@ -1833,6 +2103,13 @@ void scheduler_set_unlocks(struct scheduler *s) { struct task *t = &s->tasks[k]; for (int i = 0; i < t->nr_unlock_tasks; i++) { for (int j = i + 1; j < t->nr_unlock_tasks; j++) { + /*Fix for the case when one unpack task works over the same cell + * connected to two pair pack tasks*/ + if (t->subtype == task_subtype_gpu_unpack_d || + t->subtype == task_subtype_gpu_unpack_g || + t->subtype == task_subtype_gpu_unpack_f) { + continue; + } if (t->unlock_tasks[i] == t->unlock_tasks[j]) error("duplicate unlock! t->type=%s/%s unlocking type=%s/%s", taskID_names[t->type], subtaskID_names[t->subtype], @@ -1940,13 +2217,20 @@ void scheduler_reset(struct scheduler *s, int size) { /* Reset the counters. */ s->size = size; s->nr_tasks = 0; + s->nr_self_pack_tasks_d = 0; // A. Nasar + s->nr_pair_pack_tasks_d = 0; + s->nr_self_pack_tasks_f = 0; + s->nr_pair_pack_tasks_f = 0; + s->nr_self_pack_tasks_g = 0; + s->nr_pair_pack_tasks_g = 0; s->tasks_next = 0; s->waiting = 0; s->nr_unlocks = 0; s->completed_unlock_writes = 0; s->active_count = 0; s->total_ticks = 0; - + s->pack_size = N_TASKS_PER_PACK_SELF; + s->pack_size_pair = N_TASKS_PER_PACK_PAIR; /* Set the task pointers in the queues. */ for (int k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks; } @@ -2007,6 +2291,24 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 1.f * (wscale * gcount_i) * gcount_i; } else if (t->subtype == task_subtype_external_grav) cost = 1.f * wscale * gcount_i; + else if (t->subtype == task_subtype_gpu_pack_d) // A. 
Nasar + cost = 1.f * (wscale * count_i * count_i); // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_f) + cost = 1.f * (wscale * count_i * count_i); // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_g) + cost = 1.f * (wscale * count_i * count_i); // * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_d) + //cost = wscale * s->pack_size; + cost = (wscale * count_i) * count_i * s->pack_size; + // cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_f) + cost = (wscale * count_i) * count_i * s->pack_size; +// cost = wscale * s->pack_size; +// cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_g) + cost = (wscale * count_i) * count_i * s->pack_size; +// cost = wscale * s->pack_size; +// cost = 1.f * wscale * s->pack_size; else if (t->subtype == task_subtype_stars_density || t->subtype == task_subtype_stars_prep1 || t->subtype == task_subtype_stars_prep2 || @@ -2045,7 +2347,36 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 3.f * (wscale * gcount_i) * gcount_j; else cost = 2.f * (wscale * gcount_i) * gcount_j; - + // Abouzied: Think about good cost (for rainy days) A. Nasar + } else if (t->subtype == task_subtype_gpu_pack_d) { + // cost = 2.f * (wscale * count_i) * count_i; + if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) + cost = 3.f * (wscale * count_i * count_i); + else + cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags]; + } else if (t->subtype == task_subtype_gpu_pack_f) { +// cost = 2.f * (wscale * count_i) * count_i; + if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) + cost = 3.f * (wscale * count_i * count_i) * sid_scale[t->flags]; + else + cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags]; + + } else if (t->subtype == task_subtype_gpu_pack_g) { + if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) + cost = 3.f * (wscale * count_i * count_i) * sid_scale[t->flags]; + else + cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags]; + +// cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_unpack_d) { + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; } else if (t->subtype == task_subtype_stars_density || t->subtype == task_subtype_stars_prep1 || t->subtype == task_subtype_stars_prep2 || @@ -2177,7 +2508,21 @@ void scheduler_reweight(struct scheduler *s, int verbose) { } else if (t->subtype == task_subtype_do_bh_swallow) { cost = 1.f * wscale * (bcount_i + bcount_j); - + } else if (t->subtype == task_subtype_gpu_pack_d) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_pack_f) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_pack_g) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_unpack_d) { + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; } else if (t->subtype 
== task_subtype_density || t->subtype == task_subtype_gradient || t->subtype == task_subtype_force || @@ -2216,10 +2561,25 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 1.f * wscale * count_i; } else if (t->subtype == task_subtype_do_bh_swallow) { cost = 1.f * wscale * bcount_i; - } else if (t->subtype == task_subtype_density || - t->subtype == task_subtype_gradient || - t->subtype == task_subtype_force || - t->subtype == task_subtype_limiter) { + } else if (t->subtype == task_subtype_gpu_pack_d) // A. Nasar + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_f) + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_g) + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_d) + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_f) + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_g) + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_density || + t->subtype == task_subtype_gradient || + t->subtype == task_subtype_force || + t->subtype == task_subtype_limiter) { cost = 1.f * (wscale * count_i) * count_i; } else if (t->subtype == task_subtype_rt_gradient) { cost = 1.f * wscale * scount_i * count_i; @@ -2231,10 +2591,10 @@ void scheduler_reweight(struct scheduler *s, int verbose) { } break; case task_type_ghost: - if (t->ci == t->ci->hydro.super) cost = wscale * count_i; + if (t->ci == t->ci->hydro.super) cost = wscale * count_i * count_i; break; case task_type_extra_ghost: - if (t->ci == t->ci->hydro.super) cost = wscale * count_i; + if (t->ci == t->ci->hydro.super) cost = wscale * count_i * count_i; break; case task_type_stars_ghost: if (t->ci == t->ci->hydro.super) cost = wscale * scount_i; @@ -2246,7 +2606,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) { if (t->ci == t->ci->hydro.super) cost = wscale * bcount_i; break; case task_type_drift_part: - cost = wscale * count_i; + cost = wscale * count_i * count_i; break; case task_type_drift_gpart: cost = wscale * gcount_i; @@ -2273,7 +2633,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = wscale * (gcount_i + gcount_j); break; case task_type_end_hydro_force: - cost = wscale * count_i; + cost = wscale * count_i * count_i; break; case task_type_end_grav_force: cost = wscale * gcount_i; @@ -2309,15 +2669,15 @@ void scheduler_reweight(struct scheduler *s, int verbose) { break; case task_type_kick1: cost = - wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i); + wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i) * count_i; break; case task_type_kick2: cost = - wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i); + wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i) * count_i; break; case task_type_timestep: cost = - wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i); + wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i) * count_i; break; case task_type_timestep_limiter: cost = wscale * count_i; @@ -2374,6 +2734,27 @@ void scheduler_rewait_mapper(void *map_data, int num_elements, /* Increment the task's own wait counter for the enqueueing. 
*/ atomic_inc(&t->wait); + t->done = 0; + t->gpu_done = 0; + + // if (t->type == task_type_self){ // A. Nasar increment number of + // waiting tasks + // if(t->subtype == task_subtype_gpu_pack_d) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left); + // if (t->subtype == task_subtype_gpu_pack_f) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_f); + // if (t->subtype == task_subtype_gpu_pack_g) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_g); + // } + // + // if (t->type == task_type_pair){ + // if(t->subtype == task_subtype_gpu_pack_d) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left); + // if (t->subtype == task_subtype_gpu_pack_f) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_f); + // if (t->subtype == task_subtype_gpu_pack_g) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_g); + // } #ifdef SWIFT_DEBUG_CHECKS /* Check that we don't have more waits that what can be stored. */ @@ -2411,7 +2792,26 @@ void scheduler_enqueue_mapper(void *map_data, int num_elements, * @param s The #scheduler. */ void scheduler_start(struct scheduler *s) { - + for (int i = 0; i < s->nr_queues; i++) { // A. Nasar + s->queues[i].n_packs_self_left_d = 0; + s->queues[i].n_packs_pair_left_d = 0; + s->queues[i].n_packs_self_left_f = 0; + s->queues[i].n_packs_pair_left_f = 0; + s->queues[i].n_packs_self_left_g = 0; + s->queues[i].n_packs_pair_left_g = 0; + s->queues[i].n_packs_self_stolen_d = 0; + s->queues[i].n_packs_pair_stolen_d = 0; + s->queues[i].n_packs_self_stolen_f = 0; + s->queues[i].n_packs_pair_stolen_f = 0; + s->queues[i].n_packs_self_stolen_g = 0; + s->queues[i].n_packs_pair_stolen_g = 0; + s->s_d_left[i] = 0; + s->s_g_left[i] = 0; + s->s_f_left[i] = 0; + s->p_d_left[i] = 0; + s->p_g_left[i] = 0; + s->p_f_left[i] = 0; + } /* Re-wait the tasks. */ if (s->active_count > 1000) { threadpool_map(s->threadpool, scheduler_rewait_mapper, s->tid_active, @@ -2487,6 +2887,21 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { t->subtype == task_subtype_external_grav) { qid = t->ci->grav.super->owner; owner = &t->ci->grav.super->owner; + } else if (t->subtype == task_subtype_gpu_pack_d) { // A. 
Nasar + qid = t->ci->hydro.super->owner; + owner = &t->ci->hydro.super->owner; + } else if (t->subtype == task_subtype_gpu_pack_f) { + qid = t->ci->hydro.super->owner; + owner = &t->ci->hydro.super->owner; + } else if (t->subtype == task_subtype_gpu_pack_g) { + qid = t->ci->hydro.super->owner; + owner = &t->ci->hydro.super->owner; + } else if (t->subtype == task_subtype_gpu_unpack_d) { + qid = -1; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + qid = -1; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + qid = -1; } else { qid = t->ci->hydro.super->owner; owner = &t->ci->hydro.super->owner; @@ -2513,13 +2928,19 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { break; case task_type_pair: case task_type_sub_pair: - qid = t->ci->super->owner; - owner = &t->ci->super->owner; - if ((qid < 0) || - ((t->cj->super->owner > -1) && - (s->queues[qid].count > s->queues[t->cj->super->owner].count))) { - qid = t->cj->super->owner; - owner = &t->cj->super->owner; + if (t->subtype == task_subtype_gpu_unpack_d || + t->subtype == task_subtype_gpu_unpack_f || + t->subtype == task_subtype_gpu_unpack_g) { + qid = -1; + } else { + qid = t->ci->super->owner; + owner = &t->ci->super->owner; + if ((qid < 0) || + ((t->cj->super->owner > -1) && + (s->queues[qid].count > s->queues[t->cj->super->owner].count))) { + qid = t->cj->super->owner; + owner = &t->cj->super->owner; + } } break; case task_type_recv: @@ -2729,12 +3150,83 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { /* Save qid as owner for next time a task accesses this cell. */ if (owner != NULL) *owner = qid; - +// if (t->type == task_type_self || t->type == task_type_sub_self) { +// if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0) { +// return; +// } +// if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0) { +// return; +// } +// if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0) { +// return; +// } +// } +// /* A. Nasar NEED to think about how to do this with +// MPI where ci may not be on this node/rank */ +// if (t->type == task_type_pair || t->type == task_type_sub_pair) { +// if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { +// return; +// } +// if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { +// return; +// } +// if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { +// return; +// } +// } /* Increase the waiting counter. */ atomic_inc(&s->waiting); - /* Insert the task into that queue. */ queue_insert(&s->queues[qid], t); + /* A. 
Nasar: Increment counters required for the pack tasks */ + if (t->type == task_type_self || t->type == task_type_sub_self) { + if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0) { + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_self_left_d++; + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); + atomic_inc(&s->s_d_left[qid]); + } + if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0) { + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_self_left_f++; + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); + atomic_inc(&s->s_f_left[qid]); + } + if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0) { + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_self_left_g++; + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); + atomic_inc(&s->s_g_left[qid]); + } + } + /* A. Nasar NEED to think about how to do this with + MPI where ci may not be on this node/rank */ + if (t->type == task_type_pair || t->type == task_type_sub_pair) { + if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_pair_left_d++; + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); + atomic_inc(&s->p_d_left[qid]); + } + if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_pair_left_f++; + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); + atomic_inc(&s->p_f_left[qid]); + } + if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_pair_left_g++; + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); + atomic_inc(&s->p_g_left[qid]); + } + } } } @@ -2778,12 +3270,48 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) { /* Mark the task as skip. */ t->skip = 1; + t->done = 1; + /* Return the next best task. Note that we currently do not implement anything that does this, as getting it to respect priorities is too tricky and currently unnecessary. */ return NULL; } +struct task *signal_sleeping_runners(struct scheduler *s, struct task *t, + int tasks_packed) { + /* Mark the task as skip. */ + // t->skip = 1; + + /* Task definitely done, signal any sleeping runners. */ + if (!t->implicit) { + pthread_mutex_lock(&s->sleep_mutex); + atomic_sub(&s->waiting, tasks_packed); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + } + return NULL; +} + +struct task *enqueue_dependencies(struct scheduler *s, struct task *t) { + + /* Loop through the dependencies and add them to a queue if + they are ready. */ + for (int k = 0; k < t->nr_unlock_tasks; k++) { + struct task *t2 = t->unlock_tasks[k]; + if (t2->skip) continue; + + const int res = atomic_dec(&t2->wait); + if (res < 1) { + error("Negative wait!"); + } else if (res == 1) { + scheduler_enqueue(s, t2); + } + } + + return NULL; +} + /** * @brief Resolve a single dependency by hand. * @@ -2911,10 +3439,12 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Check qid. */ if (qid >= nr_queues || qid < 0) error("Bad queue ID."); + /*Get a pointer to our queue for re-use*/ + struct queue *q = &s->queues[qid]; /* Loop as long as there are tasks... 
*/
  while (s->waiting > 0 && res == NULL) {
    /* Try more than once before sleeping. */
-    for (int tries = 0; res == NULL && s->waiting && tries < scheduler_maxtries;
+    for (int tries = 0; res == NULL && s->waiting && tries < scheduler_maxtries * 100;
         tries++) {
      /* Try to get a task from the suggested queue. */
      if (s->queues[qid].count > 0 || s->queues[qid].count_incoming > 0) {
@@ -2926,21 +3456,109 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
      /* If unsuccessful, try stealing from the other queues. */
      if (s->flags & scheduler_flag_steal) {
+
        int count = 0, qids[nr_queues];
-        for (int k = 0; k < nr_queues; k++)
+
+        /* Make a list of queues that have 1 or more tasks in them */
+        for (int k = 0; k < nr_queues; k++) {
+          if (k == qid) continue;
          if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) {
            qids[count++] = k;
          }
+        }
+
        for (int k = 0; k < scheduler_maxsteal && count > 0; k++) {
+
+          /* Pick a queue at random among the non-empty ones */
          const int ind = rand_r(&seed) % count;
-          TIMER_TIC
-          res = queue_gettask(&s->queues[qids[ind]], prev, 0);
+          /* Get a pointer to the queue we're stealing from */
+          int qstl_id = qids[ind];
+
+          /* If we got the queue we already have, abort */
+          if (qid == qstl_id) {
+            /* Reduce the size of the list of non-empty queues */
+            qids[ind] = qids[--count];
+            continue;
+          }
+
+          /* The queue we are stealing from */
+          struct queue *q_stl = &s->queues[qstl_id];
+
+          /* Can we lock our own queue? */
+          if (lock_trylock(&q->lock) != 0) {
+
+            /* No --> continue and try a different queue */
+            continue;
+
+          } else {
+
+            /* Yes --> Try locking the queue we steal from */
+            if (lock_trylock(&q_stl->lock) != 0) {
+
+              /* Failed? --> Unlock the 1st queue and
+                 try again */
+              if (lock_unlock(&q->lock) != 0)
+                error("Unlocking our queue failed");
+              continue;
+            }
+          }
+
+          /* We now have locked q and q_stl */
+
+          /* Try to get a task from that random queue */
+          TIMER_TIC;
+          res = queue_gettask(q_stl, prev, 0);
          TIMER_TOC(timer_qsteal);
+
+          /* Lucky? i.e. did we actually get a task? */
          if (res != NULL) {
+
+            /* A. Nasar: Get the task type */
+            enum task_types type = res->type;
+            enum task_subtypes subtype = res->subtype;
+
+            /* Move the counter from the robbed to the robber */
+            if ((type == task_type_self || type == task_type_sub_self) &&
+                subtype == task_subtype_gpu_pack_d) {
+              q->n_packs_self_left_d--;
+              q_stl->n_packs_self_left_d--;
+            }
+            if ((type == task_type_self || type == task_type_sub_self) &&
+                subtype == task_subtype_gpu_pack_g) {
+              q->n_packs_self_left_g--;
+              q_stl->n_packs_self_left_g--;
+            }
+            if ((type == task_type_self || type == task_type_sub_self) &&
+                subtype == task_subtype_gpu_pack_f) {
+              q->n_packs_self_left_f--;
+              q_stl->n_packs_self_left_f--;
+            }
+            if ((type == task_type_pair || type == task_type_sub_pair) &&
+                subtype == task_subtype_gpu_pack_d) {
+              q->n_packs_pair_left_d--;
+              q_stl->n_packs_pair_left_d--;
+            }
+            if ((type == task_type_pair || type == task_type_sub_pair) &&
+                subtype == task_subtype_gpu_pack_g) {
+              q->n_packs_pair_left_g--;
+              q_stl->n_packs_pair_left_g--;
+            }
+            if ((type == task_type_pair || type == task_type_sub_pair) &&
+                subtype == task_subtype_gpu_pack_f) {
+              q->n_packs_pair_left_f--;
+              q_stl->n_packs_pair_left_f--;
+            }
+            /* Run with the task */
            break;
          } else {
+
+            /* Reduce the size of the list of non-empty queues */
            qids[ind] = qids[--count];
          }
+
+          if (lock_unlock(&q->lock) != 0) error("Unlocking our queue failed");
+          if (lock_unlock(&q_stl->lock) != 0)
+            error("Unlocking the stealing queue failed");
        }
        if (res != NULL) break;
      }
@@ -2956,6 +3574,11 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
        pthread_mutex_lock(&s->sleep_mutex);
        res = queue_gettask(&s->queues[qid], prev, 1);
        if (res == NULL && s->waiting > 0) {
+          // struct queue qq = s->queues[qid];
+          // message("s->waiting %i self_stolen %i, self_left %i, pair_stolen
+          // %i, pair_left %i", s->waiting,
+          // qq.n_packs_self_stolen_f, qq.n_packs_self_left_f,
+          // qq.n_packs_pair_stolen_f, qq.n_packs_pair_left_f);
          pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
        }
        pthread_mutex_unlock(&s->sleep_mutex);
@@ -3002,6 +3625,14 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
  /* Initialize each queue. */
  for (int k = 0; k < nr_queues; k++) queue_init(&s->queues[k], NULL);
+  /* Allocate the per-queue GPU pack-task counters; scheduler_start() resets them. */
+  s->s_d_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->s_g_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->s_f_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->p_d_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->p_g_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->p_f_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+
  /* Init the sleep mutex and cond.
*/ if (pthread_cond_init(&s->sleep_cond, NULL) != 0 || pthread_mutex_init(&s->sleep_mutex, NULL) != 0) @@ -3090,6 +3723,13 @@ void scheduler_free_tasks(struct scheduler *s) { } s->size = 0; s->nr_tasks = 0; + // reset GPU task counters too + s->nr_self_pack_tasks_d = 0; + s->nr_self_pack_tasks_f = 0; + s->nr_self_pack_tasks_g = 0; + s->nr_pair_pack_tasks_d = 0; + s->nr_pair_pack_tasks_f = 0; + s->nr_pair_pack_tasks_g = 0; } /** @@ -3207,6 +3847,19 @@ void scheduler_report_task_times_mapper(void *map_data, int num_elements, const float total_time = clocks_from_ticks(t->total_ticks); const enum task_categories cat = task_get_category(t); time_local[cat] += total_time; + + if (t->subtype == task_subtype_gpu_pack_d || + t->subtype == task_subtype_gpu_pack_f || + t->subtype == task_subtype_gpu_pack_g) { + time_local[task_category_gpu_pack] += + clocks_from_ticks(t->total_cpu_pack_ticks); + time_local[task_category_gpu] -= + clocks_from_ticks(t->total_cpu_pack_ticks); + time_local[task_category_gpu] -= + clocks_from_ticks(t->total_cpu_unpack_ticks); + time_local[task_category_gpu_unpack] += + clocks_from_ticks(t->total_cpu_unpack_ticks); + } } /* Update the global counters */ diff --git a/src/scheduler.h b/src/scheduler.h index 6ea7b41d58..b7f8b9f2ad 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -60,6 +60,35 @@ extern int activate_by_unskip; /* Data of a scheduler. */ struct scheduler { + + int nr_packs_self_dens_done; // A. Nasar + int nr_packs_pair_dens_done; + int nr_packs_self_forc_done; + int nr_packs_pair_forc_done; + int nr_packs_self_grad_done; + int nr_packs_pair_grad_done; + + volatile int *s_d_left; + volatile int *s_g_left; + volatile int *s_f_left; + volatile int *p_d_left; + volatile int *p_g_left; + volatile int *p_f_left; + /* Actual number of GPU tasks. */ + int nr_gpu_tasks; + /* Number of tasks we want*/ + int target_gpu_tasks; + /* Actual number of density pack tasks. */ + int nr_self_pack_tasks_d, nr_pair_pack_tasks_d; + /* Actual number of force pack tasks. */ + int nr_self_pack_tasks_f, nr_pair_pack_tasks_f; + /* Actual number of gradient pack tasks. */ + int nr_self_pack_tasks_g, nr_pair_pack_tasks_g; + + /*how many tasks we want to try and work on at once on the GPU*/ + int pack_size; + int pack_size_pair; + /* Scheduler flags. */ unsigned int flags; @@ -323,5 +352,8 @@ void scheduler_write_task_level(const struct scheduler *s, int step); void scheduler_dump_queues(struct engine *e); void scheduler_report_task_times(const struct scheduler *s, const int nr_threads); +struct task *enqueue_dependencies(struct scheduler *s, struct task *t); +struct task *signal_sleeping_runners(struct scheduler *s, struct task *t, + int tasks_packed); #endif /* SWIFT_SCHEDULER_H */ diff --git a/src/space.h b/src/space.h index 4e0e849d64..a5358c913c 100644 --- a/src/space.h +++ b/src/space.h @@ -48,7 +48,7 @@ struct hydro_props; #define space_cellallocchunk 1000 #define space_splitsize_default 400 #define space_maxsize_default 8000000 -#define space_grid_split_threshold_default 400 +#define space_grid_split_threshold_default 100 #define space_extra_parts_default 0 #define space_extra_gparts_default 0 #define space_extra_sparts_default 100 @@ -94,6 +94,9 @@ extern double engine_foreign_alloc_margin; */ struct space { + /*Used to define GPU task memory allocation*/ + float eta_neighbours; + /*! Spatial extent. 
*/ double dim[3]; diff --git a/src/space_getsid.h b/src/space_getsid.h index df81615d3c..f5e0101d30 100644 --- a/src/space_getsid.h +++ b/src/space_getsid.h @@ -46,7 +46,6 @@ __attribute__((always_inline, nonnull)) INLINE static int space_getsid_and_swap_cells(const struct space *s, struct cell **ci, struct cell **cj, double shift[3]) { - /* Get the relative distance between the pairs, wrapping. */ const int periodic = s->periodic; double dx[3]; @@ -79,4 +78,89 @@ space_getsid_and_swap_cells(const struct space *s, struct cell **ci, return sid; } +__attribute__((always_inline, nonnull)) +INLINE static int // A. Nasar Same as usual but only used to pack GPU cells +space_getsid_GPU(const struct space *s, struct cell **ci, struct cell **cj, + double *shift_x, double *shift_y, double *shift_z) { + /* Get the relative distance between the pairs, wrapping. */ + const int periodic = s->periodic; + double dx[3]; + for (int k = 0; k < 3; k++) dx[k] = (*cj)->loc[k] - (*ci)->loc[k]; + + if (periodic && dx[0] < -s->dim[0] / 2) + *(shift_x) = s->dim[0]; + else if (periodic && dx[0] > s->dim[0] / 2) + *(shift_x) = -s->dim[0]; + else + *(shift_x) = 0.0; + + dx[0] += *(shift_x); + + if (periodic && dx[1] < -s->dim[1] / 2) + *(shift_y) = s->dim[1]; + else if (periodic && dx[1] > s->dim[1] / 2) + *(shift_y) = -s->dim[1]; + else + *(shift_y) = 0.0; + + dx[1] += *(shift_y); + + if (periodic && dx[2] < -s->dim[2] / 2) + *(shift_z) = s->dim[2]; + else if (periodic && dx[2] > s->dim[2] / 2) + *(shift_z) = -s->dim[2]; + else + *(shift_z) = 0.0; + + dx[2] += *(shift_z); + + /* Get the sorting index. */ + int sid = 0; + for (int k = 0; k < 3; k++) + sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1)); + + /* Switch the cells around? */ + if (runner_flip[sid]) { + struct cell *temp = *ci; + *ci = *cj; + *cj = temp; + *(shift_x) = -*(shift_x); + *(shift_y) = -*(shift_y); + *(shift_z) = -*(shift_z); + } + sid = sortlistID[sid]; + + /* Return the sort ID. */ + return sid; +} + +__attribute__((always_inline, nonnull)) INLINE static int space_getsid_filter( + const struct space *s, struct cell **ci, struct cell **cj, + double shift[3]) { + + /* Get the relative distance between the pairs, wrapping. */ + const int periodic = s->periodic; + double dx[3]; + for (int k = 0; k < 3; k++) { + dx[k] = (*cj)->loc[k] - (*ci)->loc[k]; + if (periodic && dx[k] < -s->dim[k] / 2) + shift[k] = s->dim[k]; + else if (periodic && dx[k] > s->dim[k] / 2) + shift[k] = -s->dim[k]; + else + shift[k] = 0.0; + dx[k] += shift[k]; + } + + /* Get the sorting index. */ + int sid = 0; + for (int k = 0; k < 3; k++) + sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1)); + + sid = sortlistID[sid]; + + /* Return the sort ID. */ + return sid; +} + #endif /* SWIFT_SPACE_GETSID_H */ diff --git a/src/space_recycle.c b/src/space_recycle.c index cf84227302..0b915ac7a2 100644 --- a/src/space_recycle.c +++ b/src/space_recycle.c @@ -232,6 +232,12 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements, c->mpi.recv = NULL; c->mpi.send = NULL; #endif + c->hydro.density_pack = NULL; // A. Nasar + c->hydro.density_unpack = NULL; + c->hydro.gradient_pack = NULL; + c->hydro.gradient_unpack = NULL; + c->hydro.force_pack = NULL; + c->hydro.force_unpack = NULL; } } diff --git a/src/task.c b/src/task.c index 3b504a79e6..cbe9547e9d 100644 --- a/src/task.c +++ b/src/task.c @@ -164,15 +164,22 @@ const char *subtaskID_names[task_subtype_count] = { "sink_do_gas_swallow", "rt_gradient", "rt_transport", + "gpu_pack", // A. 
Nasar + "gpu_pack_g", + "gpu_pack_f", + "gpu_unpack", + "gpu_unpack_g", + "gpu_unpack_f", }; const char *task_category_names[task_category_count] = { - "drift", "sorts", "resort", - "hydro", "gravity", "feedback", - "black holes", "cooling", "star formation", - "limiter", "sync", "time integration", - "mpi", "pack", "fof", - "others", "neutrino", "sink", + "drift", "sorts", "resort", + "hydro", "gravity", "feedback", + "black holes", "cooling", "star formation", + "limiter", "sync", "time integration", + "mpi", "pack", "gpu", + "gpu_pack", "gpu_unpack", "fof", + "others", "neutrino", "sink", "RT", "CSDS"}; #ifdef WITH_MPI @@ -598,6 +605,22 @@ void task_unlock(struct task *t) { #ifdef SWIFT_TASKS_WITHOUT_ATOMICS cell_unlocktree(ci); #endif + } else if (subtype == task_subtype_gpu_unpack_d) { + // for(int pp = 0; pp < 128 /*should be sched->pack_size*/; + // pp++){ + // cell_unlocktree(t->ci_unpack[pp]); + // } + /*Do nothing and be on your way*/ + } else if (subtype == task_subtype_gpu_unpack_f) { + /*Do nothing and be on your way*/ + } else if (subtype == task_subtype_gpu_unpack_g) { + /*Do nothing and be on your way*/ + } else if (subtype == task_subtype_gpu_pack_d) { + cell_unlocktree(ci); + } else if (subtype == task_subtype_gpu_pack_f) { + cell_unlocktree(ci); + } else if (subtype == task_subtype_gpu_pack_g) { + cell_unlocktree(ci); } else { /* hydro */ cell_unlocktree(ci); } @@ -645,6 +668,21 @@ void task_unlock(struct task *t) { cell_unlocktree(ci); cell_unlocktree(cj); #endif + } else if (subtype == task_subtype_gpu_pack_d) { + cell_unlocktree(ci); + cell_unlocktree(cj); + } else if (subtype == task_subtype_gpu_pack_f) { + cell_unlocktree(ci); + cell_unlocktree(cj); + } else if (subtype == task_subtype_gpu_pack_g) { + cell_unlocktree(ci); + cell_unlocktree(cj); + } else if (subtype == task_subtype_gpu_unpack_d) { + /* Nothing to do */ + } else if (subtype == task_subtype_gpu_unpack_f) { + /* Nothing to do */ + } else if (subtype == task_subtype_gpu_unpack_g) { + /* Nothing to do */ } else { /* hydro */ cell_unlocktree(ci); cell_unlocktree(cj); @@ -848,6 +886,38 @@ int task_lock(struct task *t) { if (ci->hydro.hold) return 0; if (cell_locktree(ci) != 0) return 0; #endif + } else if (subtype == task_subtype_gpu_pack_d) { + /* Attempt to lock the cell */ + if (ci->hydro.hold) return 0; + if (cell_locktree(ci) != 0) return 0; + } else if (subtype == task_subtype_gpu_pack_f) { + /* Attempt to lock the cell */ + if (ci->hydro.hold) return 0; + if (cell_locktree(ci) != 0) return 0; + } else if (subtype == task_subtype_gpu_pack_g) { + /* Attempt to lock the cell */ + if (ci->hydro.hold) return 0; + if (cell_locktree(ci) != 0) return 0; + } else if (subtype == task_subtype_gpu_unpack_d) { + // for(int pp = 0; pp < 128 /*should be sched->pack_size*/; + // pp++){ + // if (t->ci_unpack[pp]->gpu_done == 0){ + // message("trying to queue an unpack before all packs + // done on GPU"); return 0; + // } + //// if (t->ci_unpack[pp]->hydro.hold) + //// return 0; + //// if (cell_locktree(t->ci_unpack[pp]) != 0) + //// return 0; + // } + /* Nothing to do here */ + return 1; + } else if (subtype == task_subtype_gpu_unpack_f) { + /* Nothing to do here */ + return 1; + } else if (subtype == task_subtype_gpu_unpack_g) { + /* Nothing to do here */ + return 1; } else { /* subtype == hydro */ if (ci->hydro.hold) return 0; if (cell_locktree(ci) != 0) return 0; @@ -964,6 +1034,39 @@ int task_lock(struct task *t) { return 0; } #endif + } else if (subtype == task_subtype_gpu_pack_d) { + /* Lock the parts in both 
cells */
+      if (ci->hydro.hold || cj->hydro.hold) return 0;
+      if (cell_locktree(ci) != 0) return 0;
+      if (cell_locktree(cj) != 0) {
+        cell_unlocktree(ci);
+        return 0;
+      }
+    } else if (subtype == task_subtype_gpu_pack_f) {
+      /* Lock the parts in both cells */
+      if (ci->hydro.hold || cj->hydro.hold) return 0;
+      if (cell_locktree(ci) != 0) return 0;
+      if (cell_locktree(cj) != 0) {
+        cell_unlocktree(ci);
+        return 0;
+      }
+    } else if (subtype == task_subtype_gpu_pack_g) {
+      /* Lock the parts in both cells */
+      if (ci->hydro.hold || cj->hydro.hold) return 0;
+      if (cell_locktree(ci) != 0) return 0;
+      if (cell_locktree(cj) != 0) {
+        cell_unlocktree(ci);
+        return 0;
+      }
+    } else if (subtype == task_subtype_gpu_unpack_d) {
+      /* Nothing to do here. */
+      return 1;
+    } else if (subtype == task_subtype_gpu_unpack_f) {
+      /* Nothing to do here. */
+      return 1;
+    } else if (subtype == task_subtype_gpu_unpack_g) {
+      /* Nothing to do here. */
+      return 1;
    } else { /* subtype == hydro */
      /* Lock the parts in both cells */
      if (ci->hydro.hold || cj->hydro.hold) return 0;
@@ -1127,6 +1230,19 @@ void task_get_group_name(int type, int subtype, char *cluster) {
  }

  switch (subtype) {
+    /* A. Nasar */
+    case task_subtype_gpu_pack_d:
+    case task_subtype_gpu_unpack_d:
+      strcpy(cluster, "Density");
+      break;
+    case task_subtype_gpu_pack_f:
+    case task_subtype_gpu_unpack_f:
+      strcpy(cluster, "Force");
+      break;
+    case task_subtype_gpu_pack_g:
+    case task_subtype_gpu_unpack_g:
+      strcpy(cluster, "Gradient");
+      break;
    case task_subtype_density:
      strcpy(cluster, "Density");
      break;
@@ -1629,8 +1745,16 @@ void task_dump_active(struct engine *e) {
      /* Get destination rank of MPI requests. */
      int paired = (t->cj != NULL);
-      int otherrank = t->ci->nodeID;
-      if (paired) otherrank = t->cj->nodeID;
+      int otherrank = 0;
+      // A. N.: Mods required to stop the code crashing when debugging GPU tasks
+      if (t->subtype != task_subtype_gpu_unpack_d &&
+          t->subtype != task_subtype_gpu_unpack_f &&
+          t->subtype != task_subtype_gpu_unpack_g)
+        otherrank = t->ci->nodeID;
+      if (paired && t->subtype != task_subtype_gpu_unpack_d &&
+          t->subtype != task_subtype_gpu_unpack_f &&
+          t->subtype != task_subtype_gpu_unpack_g)
+        otherrank = t->cj->nodeID;

      fprintf(file_thread, "%i %i %s %s %i %i %lli %lli %i %i %i %i %lli\n",
              engine_rank, otherrank, taskID_names[t->type],
@@ -1757,6 +1881,14 @@ enum task_categories task_get_category(const struct task *t) {
    case task_subtype_force:
      return task_category_hydro;

+    case task_subtype_gpu_pack_d:  // A. Nasar
+    case task_subtype_gpu_unpack_d:
+    case task_subtype_gpu_pack_f:
+    case task_subtype_gpu_unpack_f:
+    case task_subtype_gpu_pack_g:
+    case task_subtype_gpu_unpack_g:
+      return task_category_gpu;
+
    case task_subtype_limiter:
      return task_category_limiter;

diff --git a/src/task.h b/src/task.h
index b405a0795f..c6991751b5 100644
--- a/src/task.h
+++ b/src/task.h
@@ -160,6 +160,12 @@ enum task_subtypes {
  task_subtype_sink_do_gas_swallow,
  task_subtype_rt_gradient,
  task_subtype_rt_transport,
+  task_subtype_gpu_pack_d,  // A. Nasar
+  task_subtype_gpu_pack_g,
+  task_subtype_gpu_pack_f,
+  task_subtype_gpu_unpack_d,
+  task_subtype_gpu_unpack_g,
+  task_subtype_gpu_unpack_f,
  task_subtype_count
} __attribute__((packed));

@@ -196,6 +202,9 @@ enum task_categories {
  task_category_time_integration,
  task_category_mpi,
  task_category_pack,
+  task_category_gpu,
+  task_category_gpu_pack,
+  task_category_gpu_unpack,
  task_category_fof,
  task_category_others,
  task_category_neutrino,
@@ -235,6 +244,15 @@ struct task {

  /*!
Pointers to the cells this task acts upon */ struct cell *ci, *cj; + int done; // A. Nasar + + int gpu_done; + + int corner_pair; + + /*! Pointers to the cells this task acts upon */ + struct cell **ci_unpack; //, **cj; + /*! List of tasks unlocked by this one */ struct task **unlock_tasks; @@ -286,6 +304,9 @@ struct task { /*! Start and end time of this task */ ticks tic, toc; + ticks total_cpu_pack_ticks; + ticks total_cpu_unpack_ticks; + /* Total time spent running this task */ ticks total_ticks; diff --git a/swift.c b/swift.c index b63941cd63..7a9277ae5c 100644 --- a/swift.c +++ b/swift.c @@ -1108,7 +1108,7 @@ int main(int argc, char *argv[]) { hydro_props_init(&hydro_properties, &prog_const, &us, params); else bzero(&hydro_properties, sizeof(struct hydro_props)); - + float eta_neighbours = hydro_properties.eta_neighbours; /* Initialise the equation of state */ if (with_hydro) eos_init(&eos, &prog_const, &us, params); @@ -1388,7 +1388,7 @@ int main(int argc, char *argv[]) { with_self_gravity, with_star_formation, with_sinks, with_DM_particles, with_DM_background_particles, with_neutrinos, talking, dry_run, nr_nodes); - + s.eta_neighbours = eta_neighbours; /* Initialise the line of sight properties. */ if (with_line_of_sight) los_init(s.dim, &los_properties, params);
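/* Hypothetical helpers (not in the patch): the three-way comparisons against
 * the task_subtype_gpu_pack_{d,g,f} and task_subtype_gpu_unpack_{d,g,f}
 * subtypes added in task.h are repeated in scheduler_write_cell_dependencies(),
 * scheduler_set_unlocks(), scheduler_enqueue(), task_lock()/task_unlock() and
 * task_dump_active() above; a pair of predicates like these would collapse
 * each check to a single call. Assumes only the enum values introduced above. */
#include "task.h"

static inline int task_subtype_is_gpu_pack(enum task_subtypes subtype) {
  return (subtype == task_subtype_gpu_pack_d ||
          subtype == task_subtype_gpu_pack_g ||
          subtype == task_subtype_gpu_pack_f);
}

static inline int task_subtype_is_gpu_unpack(enum task_subtypes subtype) {
  return (subtype == task_subtype_gpu_unpack_d ||
          subtype == task_subtype_gpu_unpack_g ||
          subtype == task_subtype_gpu_unpack_f);
}

/* Usage example: the dependency-plot filter above would then read
 *   if (task_subtype_is_gpu_unpack(tb->subtype)) continue;                  */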
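/* Condensed sketch (not part of the patch) of the two-queue locking
 * discipline used in the stealing path of scheduler_gettask() above: try-lock
 * the runner's own queue, then the victim queue, and back out releasing
 * whatever is held if either attempt fails. The per-queue n_packs_*_left
 * bookkeeping done in the patch between queue_gettask() and the unlocks is
 * omitted; unlike the patch's success branch, every exit path here releases
 * both locks before returning. Uses only the queue/scheduler API already
 * referenced above. */
#include "queue.h"
#include "scheduler.h"
#include "task.h"

static struct task *steal_from(struct scheduler *s, int qid, int victim,
                               const struct task *prev) {
  struct queue *q = &s->queues[qid];        /* our own queue */
  struct queue *q_stl = &s->queues[victim]; /* the queue we steal from */

  /* Never block on either lock: give up and let the caller pick another
   * victim instead. */
  if (lock_trylock(&q->lock) != 0) return NULL;
  if (lock_trylock(&q_stl->lock) != 0) {
    lock_unlock(&q->lock);
    return NULL;
  }

  /* Both queues are locked: safe to pull a task (and, in the patch, to move
   * the pack counters from the robbed queue to the robber). */
  struct task *res = queue_gettask(q_stl, prev, 0);

  lock_unlock(&q_stl->lock);
  lock_unlock(&q->lock);
  return res;
}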
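/* Self-contained illustration (made-up numbers, not part of the patch) of the
 * pair-splitting test used in scheduler_splittask_hydro() above: the code
 * compares count_i * sid_scale[sid] against threshold / count_j instead of
 * forming count_i * count_j, because the raw product can overflow a 32-bit
 * int for well-populated cells. */
#include <stdio.h>

int main(void) {
  /* Hypothetical cell populations and an illustrative sid_scale-like factor. */
  const int count_i = 60000;
  const int count_j = 60000;
  const float sid_scale = 0.4025f;
  const int threshold = 256000000; /* a space_subsize_pair_hydro-style limit */

  /* count_i * count_j = 3.6e9 would overflow a 32-bit int, so the comparison
   * is rearranged to divide by count_j, as in the patch. */
  const int do_sub = (count_i * sid_scale < threshold / count_j);

  printf("collapse into a single sub-pair task? %s\n", do_sub ? "yes" : "no");
  return 0;
}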
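/* Minimal sketch (not part of the patch) of folding the long, commented-out
 * run of individual cudaFree() calls in the CUDA runner above into one loop
 * over a caller-supplied list of device allocations. Buffers obtained with
 * cudaMallocHost() would need cudaFreeHost() instead; which buffers go in the
 * list is whatever the runner actually allocated. */
#include <cuda_runtime.h>
#include <stddef.h>

static void free_device_buffers(void **bufs, size_t n) {
  for (size_t i = 0; i < n; i++) {
    if (bufs[i] != NULL) {
      cudaFree(bufs[i]);
      bufs[i] = NULL; /* guard against double frees on a second call */
    }
  }
}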