diff --git a/.gitignore b/.gitignore index 46ef541ee9..0e3cb19964 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,8 @@ swift swift_mpi fof fof_mpi +swift_cuda +swift_mpicuda src/version_string.h swift*.tar.gz diff --git a/Makefile.am b/Makefile.am index b5ede6fd97..51f34ac1ed 100644 --- a/Makefile.am +++ b/Makefile.am @@ -74,6 +74,23 @@ bin_PROGRAMS += fof_mpi endif endif +# BUILD CUDA versions as well? +if HAVECUDA +bin_PROGRAMS += swift_cuda +if HAVEMPI +bin_PROGRAMS += swift_mpicuda +endif +endif + + +# BUILD HIP versions as well? +if HAVEHIP +bin_PROGRAMS += swift_hip +if HAVEMPI +bin_PROGRAMS += swift_mpihip +endif +endif + # engine_policy_setaffinity is available? if HAVESETAFFINITY ENGINE_POLICY_SETAFFINITY=| engine_policy_setaffinity @@ -91,6 +108,28 @@ swift_mpi_SOURCES = swift.c swift_mpi_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" swift_mpi_LDADD = src/libswiftsim_mpi.la argparse/libargparse.la $(MPI_LIBS) $(VELOCIRAPTOR_MPI_LIBS) $(EXTRA_LIBS) $(LD_CSDS) +# Sources for swift_cuda +swift_cuda_SOURCES = swift.c dummy.C +swift_cuda_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(CUDA_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_CUDA +swift_cuda_LDADD = src/.libs/libswiftsim_cuda.a src/cuda/.libs/libswiftCUDA.a $(EXTRA_LIBS) $(CUDA_LIBS) -lcudart argparse/.libs/libargparse.a src/.libs/libgrav.la + +# Sources for swift_hip +swift_hip_SOURCES = swift.c dummy.C +swift_hip_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(HIP_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_HIP +swift_hip_LDADD = src/.libs/libswiftsim_hip.a src/hip/.libs/libswiftHIP.a $(EXTRA_LIBS) $(HIP_LIBS) -lamdhip64 -L/opt/rocm-5.1.0/lib -lhsa-runtime64 -L/opt/rocm-5.1.0/lib64 -lamd_comgr argparse/.libs/libargparse.a src/.libs/libgrav.la + +# Sources for swift_mpicuda, do we need an affinity policy for MPI? +swift_mpicuda_SOURCES = swift.c dummy.C +swift_mpicuda_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_CUDA +swift_mpicuda_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_CUDA +swift_mpicuda_LDADD = src/.libs/libswiftsim_mpicuda.a argparse/.libs/libargparse.a src/.libs/libgrav.la src/cuda/.libs/libswiftCUDA.a $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -lcudart + +# Sources for swift_mpihip, do we need an affinity policy for MPI? +swift_mpihip_SOURCES = swift.c dummy.C +swift_mpihip_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(HIP_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_HIP +swift_mpihip_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(HIP_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_HIP +swift_mpihip_LDADD = src/.libs/libswiftsim_mpihip.a argparse/.libs/libargparse.a src/.libs/libgrav.la src/hip/.libs/libswiftHIP.a $(MPI_LIBS) $(EXTRA_LIBS) $(HIP_LIBS) -lamdhip64 + # Sources for fof fof_SOURCES = swift_fof.c fof_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" diff --git a/configure.ac b/configure.ac index b0173c6954..59fc40aba5 100644 --- a/configure.ac +++ b/configure.ac @@ -41,6 +41,10 @@ AC_USE_SYSTEM_EXTENSIONS AC_PROG_CC AM_PROG_CC_C_O +# Find and test the C++ compiler. +AC_PROG_CXX +AC_PROG_CXX_C_O + # We need this for compilation hints and possibly FFTW. 
AX_OPENMP @@ -995,6 +999,78 @@ AH_VERBATIM([__STDC_FORMAT_MACROS], #define __STDC_FORMAT_MACROS 1 #endif]) + + +# Check for CUDA +have_cuda="no" +AC_ARG_WITH([cuda], + [AS_HELP_STRING([--with-cuda=PATH], + [root directory where CUDA is installed @<:@yes/no@:>@] + )], + [], + [with_cuda="no"] +) +if test "x$with_cuda" != "xno"; then + if test "x$with_cuda" != "xyes"; then + CUDA_CFLAGS="-I$with_cuda/include" + CUDA_LIBS="-L$with_cuda/lib -L$with_cuda/lib64 -lcudart" + NVCC="$with_cuda/bin/nvcc" + have_cuda="yes" + else + AC_PATH_PROG([NVCC],[nvcc]) + echo "Found nvcc = $NVCC" + if test -n "$NVCC"; then + CUDA_ROOT="`dirname $NVCC`/.." + CUDA_CFLAGS="-I${CUDA_ROOT}/include" + CUDA_LIBS="-L${CUDA_ROOT}/lib -L${CUDA_ROOT}/lib64 -lcudart" + have_cuda="yes" + fi + fi + if test "x$have_cuda" != "xno"; then + AC_DEFINE([HAVE_CUDA], 1, [The CUDA compiler is installed.]) + fi + CFLAGS="${CFLAGS} " +fi +AC_SUBST(CUDA_CFLAGS) +AC_SUBST(CUDA_LIBS) +AC_SUBST(NVCC) +AM_CONDITIONAL([HAVECUDA],[test -n "$NVCC"]) + +# Check for HIP +have_hip="no" +AC_ARG_WITH([hip], + [AS_HELP_STRING([--with-hip=PATH], + [root directory where HIP is installed @<:@yes/no@:>@] + )], + [], + [with_hip="no"] +) +if test "x$with_hip" != "xno"; then + if test "x$with_hip" != "xyes"; then + HIP_CFLAGS="-I$with_hip/include" + HIP_LIBS="-L$with_hip/lib -L$with_hip/lib64" + HIPCC="$with_hip/bin/hipcc" + have_hip="yes" + else + AC_PATH_PROG([HIPCC],[hipcc]) + echo "Found hipcc = $HIPCC" + if test -n "$HIPCC"; then + HIP_ROOT="`dirname $HIPCC`/.." + HIP_CFLAGS="-I${HIP_ROOT}/include" + HIP_LIBS="-L${HIP_ROOT}/lib -L${HIP_ROOT}/lib64" + have_hip="yes" + fi + fi + if test "x$have_hip" != "xno"; then + AC_DEFINE([HAVE_HIP], 1, [The HIP compiler is installed.]) + fi + CFLAGS="${CFLAGS} " +fi +AC_SUBST(HIP_CFLAGS) +AC_SUBST(HIP_LIBS) +AC_SUBST(HIPCC) +AM_CONDITIONAL([HAVEHIP],[test -n "$HIPCC"]) + # Check for FFTW. We test for this in the standard directories by default, # and only disable if using --with-fftw=no or --without-fftw. When a value # is given FFTW must be found. 
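Note: the configure.ac hunk above adds optional --with-cuda and --with-hip switches, and the Makefile.am hunk builds swift_cuda/swift_mpicuda and swift_hip/swift_mpihip binaries when the corresponding compiler is detected. A minimal usage sketch follows; the install prefixes shown are placeholders and not part of the patch:

    # CUDA build: pass the toolkit root, or plain --with-cuda to pick up nvcc from PATH
    ./configure --with-cuda=/usr/local/cuda
    make
    # produces swift_cuda (and swift_mpicuda when MPI support is also enabled)

    # HIP/ROCm build: pass the ROCm root, or plain --with-hip to pick up hipcc from PATH
    ./configure --with-hip=/opt/rocm
    make
    # produces swift_hip (and swift_mpihip when MPI support is also enabled)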
@@ -3246,6 +3322,10 @@ AC_CONFIG_FILES([tests/testSelectOutput.sh], [chmod +x tests/testSelectOutput.sh AC_CONFIG_FILES([tests/testFormat.sh], [chmod +x tests/testFormat.sh]) AC_CONFIG_FILES([tests/testNeutrinoCosmology.sh], [chmod +x tests/testNeutrinoCosmology.sh]) AC_CONFIG_FILES([tests/output_list_params.yml]) +# cuda .in file +AC_CONFIG_FILES([src/cuda/Makefile]) +# hip .in file +AC_CONFIG_FILES([src/hip/Makefile]) # Save the compilation options AC_DEFINE_UNQUOTED([SWIFT_CONFIG_FLAGS],["$swift_config_flags"],[Flags passed to configure]) @@ -3276,6 +3356,8 @@ AC_MSG_RESULT([ HDF5 enabled : $with_hdf5 - parallel : $have_parallel_hdf5 METIS/ParMETIS : $have_metis / $have_parmetis + CUDA enabled : $have_cuda + HIP enabled : $have_hip FFTW3 enabled : $have_fftw - threaded/openmp : $have_threaded_fftw / $have_openmp_fftw - MPI : $have_mpi_fftw diff --git a/cudalt.py b/cudalt.py new file mode 100755 index 0000000000..e8643cd1e6 --- /dev/null +++ b/cudalt.py @@ -0,0 +1,80 @@ +#!/usr/bin/python3 +# libtoolish hack: compile a .cu file like libtool does +import sys +import os + +lo_filepath = sys.argv[1] +o_filepath = lo_filepath.replace(".lo", ".o") + +try: + i = o_filepath.rindex("/") + lo_dir = o_filepath[0:i+1] + o_filename = o_filepath[i+1:] + +except ValueError: + lo_dir = "" + o_filename = o_filepath + +local_pic_dir = ".libs/" +local_npic_dir = "" +pic_dir = lo_dir + local_pic_dir +npic_dir = lo_dir + local_npic_dir + +pic_filepath = pic_dir + o_filename +npic_filepath = npic_dir + o_filename +local_pic_filepath = local_pic_dir + o_filename +local_npic_filepath = local_npic_dir + o_filename + +# Make lib dir +try: + os.mkdir(pic_dir) +except OSError: + pass + +# generate the command to compile the .cu for shared library +args = sys.argv[2:] +args.extend(["-Xcompiler","-fPIC"]) +# position indep code +args.append("-o") +args.append(pic_filepath) +command = " ".join(args) +print (command) + +# compile the .cu +rv = os.system(command) +if rv != 0: + sys.exit(1) + +# generate the command to compile the .cu for static library +args = sys.argv[2:] +args.append("-o") +args.append(npic_filepath) +command = " ".join(args) +print (command) + +# compile the .cu +rv = os.system(command) +if rv != 0: + sys.exit(1) + +# get libtool version +fd = os.popen("libtool --version") +libtool_version = fd.readline() +fd.close() + +# generate the .lo file +f = open(lo_filepath, "w") +f.write("# " + lo_filepath + " - a libtool object file\n") +f.write("# Generated by " + libtool_version + "\n") +f.write("#\n") +f.write("# Please DO NOT delete this file!\n") +f.write("# It is necessary for linking the library.\n\n") + +f.write("# Name of the PIC object.\n") +f.write("pic_object='" + local_pic_filepath + "'\n\n") + +f.write("# Name of the non-PIC object.\n") +f.write("non_pic_object='" + local_npic_filepath + "'\n") +f.close() + +sys.exit(0) diff --git a/dummy.C b/dummy.C new file mode 100755 index 0000000000..bbf68f8cea --- /dev/null +++ b/dummy.C @@ -0,0 +1,3 @@ +void dummy(){ + +} diff --git a/examples/HydroTests/GreshoVortex_3D/getGlass.sh b/examples/HydroTests/GreshoVortex_3D/getGlass.sh index d5c5f590ac..068986fc10 100755 --- a/examples/HydroTests/GreshoVortex_3D/getGlass.sh +++ b/examples/HydroTests/GreshoVortex_3D/getGlass.sh @@ -1,2 +1,2 @@ #!/bin/bash -wget http://virgodb.cosma.dur.ac.uk/swift-webstorage/ICs/glassCube_64.hdf5 +wget http://virgodb.cosma.dur.ac.uk/swift-webstorage/ICs/glassCube_128.hdf5 diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml 
b/examples/HydroTests/GreshoVortex_3D/gresho.yml index a95a0eae32..6c945e7473 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,21 +7,25 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 15 - + max_top_level_cells: 8 + tasks_per_cell: 200 + # deadlock_waiting_time_s: 10 + # cell_split_size: 100 + # cell_sub_size_pair_hydro: 10000 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). + # cell_sub_size_self_hydro: 100 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). time_end: 1. # The end time of the simulation (in internal units). dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units). - dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units). + dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units). # Parameters governing the snapshots Snapshots: basename: gresho # Common part of the name of output files time_first: 0. # Time of the first output (in internal units) - delta_time: 1e-1 # Time difference between consecutive outputs (in internal units) - compression: 1 + delta_time: 1e-3 # Time difference between consecutive outputs (in internal units) + # compression: 1 # Parameters governing the conserved quantities statistics Statistics: @@ -29,10 +33,11 @@ Statistics: # Parameters for the hydrodynamics scheme SPH: - resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). + resolution_eta: 1.9 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. # Parameters related to the initial conditions InitialConditions: - file_name: ./greshoVortex.hdf5 # The file to read - periodic: 1 \ No newline at end of file + file_name: greshoVortex.hdf5 + periodic: 1 + # replicate: 2 diff --git a/examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml b/examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml new file mode 100644 index 0000000000..3105787d75 --- /dev/null +++ b/examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml @@ -0,0 +1,42 @@ +# Define the system of units to use internally. +InternalUnitSystem: + UnitMass_in_cgs: 1 # Grams + UnitLength_in_cgs: 1 # Centimeters + UnitVelocity_in_cgs: 1 # Centimeters per second + UnitCurrent_in_cgs: 1 # Amperes + UnitTemp_in_cgs: 1 # Kelvin + +Scheduler: + max_top_level_cells: 16 + tasks_per_cell: 200 + cell_split_size: 700 + cell_sub_size_pair_hydro: 49000 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). + cell_sub_size_self_hydro: 700 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks +# Parameters governing the time integration +TimeIntegration: + time_begin: 0. # The starting time of the simulation (in internal units). + time_end: 1. # The end time of the simulation (in internal units). + dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units). 
+ dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units). + +# Parameters governing the snapshots +Snapshots: + basename: gresho # Common part of the name of output files + time_first: 0. # Time of the first output (in internal units) + delta_time: 1e-3 # Time difference between consecutive outputs (in internal units) + # compression: 1 + +# Parameters governing the conserved quantities statistics +Statistics: + delta_time: 1e-2 # Time between statistics output + +# Parameters for the hydrodynamics scheme +SPH: + resolution_eta: 1.9 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). + CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. + +# Parameters related to the initial conditions +InitialConditions: + file_name: greshoVortex.hdf5 + periodic: 1 + replicate: 8 diff --git a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml index 8717af63bd..bcabd810dd 100644 --- a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml +++ b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml @@ -10,6 +10,13 @@ InternalUnitSystem: UnitCurrent_in_cgs: 1 # Amperes UnitTemp_in_cgs: 1 # Kelvin + + + + + + + # Parameters for the self-gravity scheme Gravity: eta: 0.025 # Constant dimensionless multiplier for time integration. @@ -24,7 +31,7 @@ TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). time_end: 0.1 # The end time of the simulation (in internal units). dt_min: 1e-9 # The minimal time-step size of the simulation (in internal units). - dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units). + dt_max: 1e-6 # The maximal time-step size of the simulation (in internal units). # Parameters governing the snapshots Snapshots: diff --git a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh index 6931897b2c..6a2fa4d897 100755 --- a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh +++ b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh @@ -30,7 +30,7 @@ then ./getEaglePhotometryTable.sh fi -../../../swift --threads=16 --feedback --external-gravity --self-gravity --stars --star-formation --cooling --hydro --limiter --sync isolated_galaxy.yml 2>&1 | tee output.log +../../../swift_mpicuda --threads=16 --feedback --external-gravity --self-gravity --stars --star-formation --cooling --hydro --limiter --sync isolated_galaxy.yml 2>&1 | tee output.log # Kennicutt-Schmidt law plot python3 plotSolution.py 100 diff --git a/src/Makefile.am b/src/Makefile.am index 8099524651..99092acde4 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -16,7 +16,10 @@ # along with this program. If not, see . # Add the non-standard paths to the included library headers -AM_CFLAGS = $(HDF5_CPPFLAGS) $(GSL_INCS) $(FFTW_INCS) $(NUMA_INCS) $(GRACKLE_INCS) $(SUNDIALS_INCS) $(CHEALPIX_CFLAGS) +AM_CFLAGS = $(HDF5_CPPFLAGS) $(GSL_INCS) $(FFTW_INCS) $(NUMA_INCS) $(GRACKLE_INCS) $(SUNDIALS_INCS) $(CHEALPIX_CFLAGS) -O0 + +# Add HIP Path +AM_CFLAGS += -D__HIP_PLATFORM_AMD__ # Assign a "safe" version number AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) @@ -40,6 +43,22 @@ lib_LTLIBRARIES += libswiftsim_mpi.la noinst_LTLIBRARIES += libgrav_mpi.la endif +# Build a cuda version too? 
+if HAVECUDA +lib_LTLIBRARIES += libswiftsim_cuda.la +if HAVEMPI +lib_LTLIBRARIES += libswiftsim_mpicuda.la +endif +endif + +# Build a hip version too? +if HAVEHIP +lib_LTLIBRARIES += libswiftsim_hip.la +if HAVEMPI +lib_LTLIBRARIES += libswiftsim_mpihip.la +endif +endif + # List required headers include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h include_HEADERS += cell_hydro.h cell_stars.h cell_grav.h cell_sinks.h cell_black_holes.h cell_rt.h cell_grid.h @@ -161,7 +180,7 @@ endif AM_SOURCES = space.c space_rebuild.c space_regrid.c space_unique_id.c AM_SOURCES += space_sort.c space_split.c space_extras.c space_first_init.c space_init.c AM_SOURCES += space_cell_index.c space_recycle.c -AM_SOURCES += runner_main.c runner_doiact_hydro.c runner_doiact_limiter.c +AM_SOURCES += runner_main.c runner_doiact_hydro.c runner_doiact_limiter.c runner_gpu_pack_functions.c AM_SOURCES += runner_doiact_stars.c runner_doiact_black_holes.c runner_ghost.c AM_SOURCES += runner_recv.c runner_pack.c AM_SOURCES += runner_sort.c runner_drift.c runner_black_holes.c runner_time_integration.c @@ -208,7 +227,7 @@ AM_SOURCES += $(SPHM1RT_RT_SOURCES) AM_SOURCES += $(GEAR_RT_SOURCES) # Include files for distribution, not installation. -nobase_noinst_HEADERS = align.h approx_math.h atomic.h barrier.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h +nobase_noinst_HEADERS = align.h approx_math.h atomic.h barrier.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h runner_gpu_pack_functions.h nobase_noinst_HEADERS += gravity_iact.h kernel_long_gravity.h vector.h accumulate.h cache.h exp.h log.h nobase_noinst_HEADERS += runner_doiact_nosort.h runner_doiact_hydro.h runner_doiact_stars.h runner_doiact_black_holes.h runner_doiact_grav.h nobase_noinst_HEADERS += runner_doiact_functions_hydro.h runner_doiact_functions_stars.h runner_doiact_functions_black_holes.h @@ -526,6 +545,33 @@ libswiftsim_mpi_la_LDFLAGS = $(AM_LDFLAGS) $(MPI_LIBS) $(EXTRA_LIBS) -version-in libswiftsim_mpi_la_SHORTNAME = mpi libswiftsim_mpi_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav_mpi.la +# Sources and flags for regular CUDA library +libswiftsim_cuda_la_SOURCES = $(AM_SOURCES) +libswiftsim_cuda_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) -DWITH_CUDA +libswiftsim_cuda_la_CXXFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) -DWITH_CUDA +libswiftsim_cuda_la_LDFLAGS = $(AM_LDFLAGS) $(EXTRA_LIBS) $(CUDA_LIBS) +libswiftsim_cuda_la_SHORTNAME = cuda +libswiftsim_cuda_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav.la + +# Sources and flags for regular HIP library +libswiftsim_hip_la_SOURCES = $(AM_SOURCES) +libswiftsim_hip_la_CFLAGS = $(AM_CFLAGS) $(HIP_CFLAGS) -DWITH_HIP +libswiftsim_hip_la_LDFLAGS = $(AM_LDFLAGS) $(EXTRA_LIBS) $(HIP_LIBS) -lamdhip64 +libswiftsim_hip_la_SHORTNAME = hip +libswiftsim_hip_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav.la + +# Sources and flags for MPI CUDA library +libswiftsim_mpicuda_la_SOURCES = $(AM_SOURCES) +libswiftsim_mpicuda_la_CFLAGS = $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DWITH_CUDA +libswiftsim_mpicuda_la_CXXFLAGS = $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DWITH_CUDA +libswiftsim_mpicuda_la_LDFLAGS = $(AM_LDFLAGS) $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) +libswiftsim_mpicuda_la_SHORTNAME = mpicuda +libswiftsim_mpicuda_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav_mpi.la + +#subdir +SUBDIRS = . cuda +SUBDIRS += . hip + # Versioning. 
If any sources change then update the version_string.h file with # the current git revision and package version. # May have a checkout without a version_string.h file and no git command (tar/zip diff --git a/src/cell.h b/src/cell.h index cac5c49878..1d2aa0d7e1 100644 --- a/src/cell.h +++ b/src/cell.h @@ -360,6 +360,39 @@ enum cell_flags { */ struct cell { + /*Marks a cell for GPU execution A. Nasar */ + bool is_gpu_cell; + + int unpacker_cell; + + /*Marks a cell as having done its pack task 0->not 1-> yes*/ + int pack_done; + /*Marks a cell as having done its pack task 0->not 1-> yes*/ + int pack_done_g; + /*Marks a cell as having done its pack task 0->not 1-> yes*/ + int pack_done_f; + + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done; + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_g; + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_f; + + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int unpack_done; + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int unpack_done_g; + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int unpack_done_f; + + /*Has the pair task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_pair; + /*Has the pair task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_pair_g; + /*Has the pair task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_pair_f; + /*! The cell location on the grid (corner nearest to the origin). */ double loc[3]; diff --git a/src/cell_hydro.h b/src/cell_hydro.h index 39db7bc219..14b37dcd6d 100644 --- a/src/cell_hydro.h +++ b/src/cell_hydro.h @@ -61,6 +61,25 @@ struct cell_hydro { /*! Linked list of the tasks computing this cell's hydro density. */ struct link *density; + /*! Linked list of the tasks computing this cell's hydro density pack. A. + * Nasar */ + struct link *density_pack; + struct link *density_unpack; + /*! Linked list of the tasks computing this cell's hydro force pack. */ + struct link *force_pack; + struct link *force_unpack; + /*! Linked list of the tasks computing this cell's hydro gradient pack. */ + struct link *gradient_pack; + struct link *gradient_unpack; + + struct task *d_pack; + struct task *g_pack; + struct task *f_pack; + + struct task *d_unpack; + struct task *g_unpack; + struct task *f_unpack; + /* Linked list of the tasks computing this cell's hydro gradients. */ struct link *gradient; diff --git a/src/cell_unskip.c b/src/cell_unskip.c index 6ad14a3560..a9572ea3bc 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -884,7 +884,7 @@ void cell_activate_subcell_hydro_tasks(struct cell *ci, struct cell *cj, cell_activate_hydro_sorts(ci, sid, s); cell_activate_hydro_sorts(cj, sid, s); } - } /* Otherwise, pair interation */ + } /* Otherwise, pair interaction */ } /** @@ -1657,7 +1657,6 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { if ((ci_active && ci_nodeID == nodeID) || (cj_active && cj_nodeID == nodeID)) { scheduler_activate(s, t); - /* Activate hydro drift */ if (t->type == task_type_self) { if (ci_nodeID == nodeID) cell_activate_drift_part(ci, s); @@ -1903,19 +1902,94 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { #endif } } - /* Unskip all the other task types. */ int c_active = cell_is_active_hydro(c, e); if (c->nodeID == nodeID && c_active) { + for (struct link *l = c->hydro.density_pack; l != NULL; + l = l->next) { /* A. 
Nasar */ + if(l->t->type == task_type_self && l->t->ci->hydro.count > 0) + scheduler_activate(s, l->t); + else if(l->t->type == task_type_pair && l->t->ci->hydro.count > 0 && l->t->cj->hydro.count > 0) + scheduler_activate(s, l->t); +#ifdef SWIFT_DEBUG_CHECKS + if (l->t->ci != NULL) { + l->t->ci->pack_done = 0; + l->t->ci->gpu_done = 0; + l->t->ci->unpack_done = 0; + } + if (l->t->cj != NULL) { + l->t->cj->pack_done = 0; + l->t->cj->gpu_done = 0; + l->t->cj->unpack_done = 0; + } +#endif + } + for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { + scheduler_activate(s, l->t); +#ifdef SWIFT_DEBUG_CHECKS + l->t->gpu_done = 0; +#endif + } for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) { scheduler_activate(s, l->t); } for (struct link *l = c->hydro.force; l != NULL; l = l->next) { scheduler_activate(s, l->t); } - for (struct link *l = c->hydro.limiter; l != NULL; l = l->next) scheduler_activate(s, l->t); + // A. Nasar activate force and gradient packing tasks + for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { + if(l->t->type == task_type_self && l->t->ci->hydro.count > 0) + scheduler_activate(s, l->t); + else if(l->t->type == task_type_pair && l->t->ci->hydro.count > 0 && l->t->cj->hydro.count > 0) + scheduler_activate(s, l->t); +#ifdef SWIFT_DEBUG_CHECKS + if (l->t->ci != NULL) { + l->t->ci->pack_done_f = 0; + l->t->ci->gpu_done_f = 0; + l->t->ci->unpack_done_f = 0; + } + if (l->t->cj != NULL) { + l->t->cj->pack_done_f = 0; + l->t->cj->gpu_done_f = 0; + l->t->cj->unpack_done_f = 0; + } +#endif + } + for (struct link *l = c->hydro.force_unpack; l != NULL; l = l->next) { + scheduler_activate(s, l->t); +#ifdef SWIFT_DEBUG_CHECKS + l->t->gpu_done = 0; +#endif + } + +#ifdef EXTRA_HYDRO_LOOP + for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { + if(l->t->type == task_type_self && l->t->ci->hydro.count > 0) + scheduler_activate(s, l->t); + else if(l->t->type == task_type_pair && l->t->ci->hydro.count > 0 && l->t->cj->hydro.count > 0) + scheduler_activate(s, l->t); +#ifdef SWIFT_DEBUG_CHECKS + if (l->t->ci != NULL) { + l->t->ci->pack_done_g = 0; + l->t->ci->gpu_done_g = 0; + l->t->ci->unpack_done_g = 0; + } + if (l->t->cj != NULL) { + l->t->cj->pack_done_g = 0; + l->t->cj->gpu_done_g = 0; + l->t->cj->unpack_done_g = 0; + } +#endif + } + for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) { + scheduler_activate(s, l->t); +#ifdef SWIFT_DEBUG_CHECKS + l->t->gpu_done = 0; +#endif + } +#endif if (c->hydro.extra_ghost != NULL) scheduler_activate(s, c->hydro.extra_ghost); diff --git a/src/clocks.h b/src/clocks.h index e39d8e8195..4cc7cdaac7 100644 --- a/src/clocks.h +++ b/src/clocks.h @@ -20,8 +20,11 @@ #define SWIFT_CLOCKS_H /* Config parameters. */ +#ifdef WITH_CUDA +#include "../config.h" +#else #include - +#endif /* System includes. 
*/ #include diff --git a/src/cuda/BLOCK_SIZE.h b/src/cuda/BLOCK_SIZE.h new file mode 100644 index 0000000000..2d5dda1af2 --- /dev/null +++ b/src/cuda/BLOCK_SIZE.h @@ -0,0 +1,12 @@ +#ifndef BLOCK_SIZE_H +#define BLOCK_SIZE_H + +#define BLOCK_SIZE 64 +#define N_TASKS_PER_PACK_SELF 8 +#define N_TASKS_BUNDLE_SELF 2 + +#define BLOCK_SIZE_PAIR 64 +#define N_TASKS_PER_PACK_PAIR 4 +#define N_TASKS_BUNDLE_PAIR 1 + +#endif // BLOCK_SIZE_H diff --git a/src/cuda/GPU_runner_functions.cu b/src/cuda/GPU_runner_functions.cu new file mode 100644 index 0000000000..d3c08c10ae --- /dev/null +++ b/src/cuda/GPU_runner_functions.cu @@ -0,0 +1,4323 @@ +/******************************************************************************* + * This file contains functions used to setup and execute GPU tasks from within + *runner_main.c. Consider this a translator allowing .cu based functions to be + *called from within runner_main.c + ******************************************************************************/ + +/* Hacky method to make c++ compilers not die. */ +#ifdef WITH_CUDA +#ifndef static +#define static +#endif +#ifndef restrict +#define restrict __restrict__ +#endif +#endif + +/* Required header files */ +#include +/*ifdef WITH_CUDA prevents name mangling. C code sees exact names + of functions rather than mangled template names produced by C++*/ +#ifdef WITH_CUDA +extern "C" { +#endif + +#include "../../config.h" + +#ifndef BLOCK_SIZE_H +#include "BLOCK_SIZE.h" +#endif + +#include "GPU_runner_functions.h" +#include "device_functions.h" +#include "part_gpu.h" + +#include + +#ifdef WITH_CUDA +} +#endif + +/* function to initialise GPU and printout GPU name*/ +#ifdef WITH_CUDA +extern "C" { +#endif +void Initialise_GPU() { + int devId = 0; + // find and print device name + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, devId); + printf("Device : %s\n", prop.name); + cudaSetDevice(devId); + // cuda +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void tester(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, int bid, + int tid, int count_tasks, int tasksperbundle, + int nBlocks_per_task, int bundle_first_task, + int max_parts, int time_bin_inhibited) { + extern __shared__ float vars[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + if (pid < last_part_in_task_blocks) { + parts_soa.tid_p[pid] = 1; + } + // if(parts_soa.tid_p[pid] == 1 && pid < last_part_in_task_blocks) + // printf("tid %i last_part_in_blocks %i\n", parts_soa.tid_p[pid], + // last_part_in_task_blocks); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_self_density_GPU( + struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int count_tasks, int tasksperbundle, + int nBlocks_per_task, int bundle_first_task, int max_parts) { + extern __shared__ float vars[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + 
blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + // __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + // if(pid (0.01f / 128.f) * (0.01f / 128.f)) { + // if (r2 < hig2 && r2 > (0.01f/256.f)*(0.01f/256.f)) { + Found_neighbours = 1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. */ + if (hi < 1.f / 256.f) printf("h < dx\n"); + // if(hi<1.f/256.f)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + // float wi, wi_dx; + // d_kernel_deval(0.f, &wi, &wi_dx); + // printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); + // if(Found_neighbours == 0) printf("Not sure what's going on but no + // neighbours found in GPU loop\n"); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi, + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS(struct part_aos *parts_aos, + int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int count_tasks, + int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, + double *d_cell_x, double *d_cell_y, + double *d_cell_z) { + extern __shared__ float vars[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + // 
__syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + struct part_aos ipart = parts_aos[pid]; + // if(pid (0.01f / 128.f) * (0.01f / 128.f)) { + Found_neighbours = 1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + // float wi, wi_dx; + // d_kernel_deval(0.f, &wi, &wi_dx); + // printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); + // if(Found_neighbours == 0) printf("Not sure what's going on but no + // neighbours found in GPU loop\n"); + parts_aos[pid].rho = rhoi, parts_aos[pid].rho_dh = rho_dhi; + parts_aos[pid].wcount = wcounti, parts_aos[pid].wcount_dh = wcount_dhi; + parts_aos[pid].div_v = div_vi; + parts_aos[pid].rot_ux = rot_uxi, parts_aos[pid].rot_uy = rot_uyi, + parts_aos[pid].rot_uz = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +// template + +#ifdef WITH_CUDA +extern "C" { +#endif +// #include +__global__ void DOSELF_GPU_AOS_F4( + struct part_aos_f4_send *__restrict__ parts_send, + struct part_aos_f4_recv *__restrict__ parts_recv, const float d_a, + const float d_H, const int bundle_first_task, + const int2 *__restrict__ d_task_first_part_f4) { + + extern __shared__ float4 vars_f4[]; + + // auto group = cooperative_groups::this_thread_block(); + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + // cuda::barrier bar; + + int first_part_in_task_blocks, last_part_in_task_blocks; + int2 first_last_parts = d_task_first_part_f4[task_id]; + first_part_in_task_blocks = first_last_parts.x; + last_part_in_task_blocks = first_last_parts.y; + + const int pid = threadid + first_part_in_task_blocks; + + float4 res_rho = {0.0, 0.0, 0.0, 0.0}; + float4 res_rot = {0.0, 0.0, 0.0, 0.0}; + const part_aos_f4_send pi = parts_send[pid]; + const float4 x_pi = pi.x_p_h; + const float4 ux_pi = pi.ux_m; + const float hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float4 *__restrict__ x_p_h_tmp = (float4 *)&vars_f4[0]; + float4 *__restrict__ ux_m_tmp = (float4 *)&vars_f4[BLOCK_SIZE]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + struct part_aos_f4_send pj = parts_send[j]; + x_p_h_tmp[threadIdx.x] = pj.x_p_h; + ux_m_tmp[threadIdx.x] = pj.ux_m; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks) { + /* Compute the pairwise distance. */ + const float4 x_p_h_j = x_p_h_tmp[j_block]; + const float4 ux_m_j = ux_m_tmp[j_block]; + const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, + zij = x_pi.z - x_p_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + const float r = sqrtf(r2); + /* Recover some data */ + const float mj = ux_m_j.w; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ + res_rho.x += mj * wi; + res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); + res_rho.z += wi; + res_rho.w -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, + dvz = ux_pi.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + /*Add to sums of rot_u and div_v*/ + res_rot.x += faci * curlvrx; + res_rot.y += faci * curlvry; + res_rot.z += faci * curlvrz; + res_rot.w -= faci * dvdr; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + parts_recv[pid].rho_dh_wcount = res_rho; + parts_recv[pid].rot_ux_div_v = res_rot; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_density_aos(struct part_aos *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int bundle_first_task, + int max_parts, double *d_cell_x, double *d_cell_y, + double *d_cell_z) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS<<>>(parts_aos, d_task_first_part, d_task_last_part, + d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, + d_cell_x, d_cell_y, d_cell_z); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +struct first_part { + int list[32]; +}; +void launch_density_aos_f4(struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int 
nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F4<<>>(parts_send, parts_recv, d_a, d_H, + bundle_first_task, d_task_first_part_f4); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS_G(struct part_aos_g *parts_aos, + int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int count_tasks, + int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, + double *d_cell_x, double *d_cell_y, + double *d_cell_z) { + extern __shared__ float varsg[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + // __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float ci = 0.0, cj = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float div_vi = 0.0; + int Found_neighbours = 0; + float v_sig; + float u = 0.f; + float laplace_u = 0.0; + float alpha_visc_max_ngb = 0.0; + if (pid < last_part_in_task_blocks) { + ttid = task_id; + first_part = d_task_first_part[ttid]; + last_part = d_task_last_part[ttid]; + count = last_part - first_part; + cellx = d_cell_x[ttid], celly = d_cell_y[ttid], cellz = d_cell_z[ttid]; + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + pix = parts_aos[pid].x_p - cellx; + piy = parts_aos[pid].y_p - celly; + piz = parts_aos[pid].z_p - cellz; + ci = parts_aos[pid].soundspeed; + v_sig = parts_aos[pid].v_sig; + u = parts_aos[pid].u; + laplace_u = parts_aos[pid].laplace_u; + alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; + } + // if (threadIdx.x == 0) { + // first_part_tid_0 = first_part; + // last_part_tid_0 = last_part; + // } + // __syncthreads(); + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&varsg[0]; + float *y_p_tmp = (float *)&varsg[BLOCK_SIZE]; + float *z_p_tmp = (float *)&varsg[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&varsg[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&varsg[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&varsg[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&varsg[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&varsg[BLOCK_SIZE * 7]; + float *cj_tmp = (float *)&varsg[BLOCK_SIZE * 8]; + float *alpha_tmp = (float *)&varsg[BLOCK_SIZE * 9]; + float *u_tmp = (float *)&varsg[BLOCK_SIZE * 10]; + float *rho_tmp = (float *)&varsg[BLOCK_SIZE * 11]; + int *timebin = (int *)&varsg[BLOCK_SIZE * 12]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; + alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + // if ((j != pid) && (j < last_part_in_task_blocks) && + // timebin[j_block] != time_bin_inhibited) { + // if ((j < last_part_in_task_blocks) && + // timebin[j_block] != time_bin_inhibited) { + if (j < last_part_in_task_blocks) { + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp[j_block] - cellx; + const float pjy = y_p_tmp[j_block] - celly; + const float pjz = z_p_tmp[j_block] - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + Found_neighbours = 1; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + float wi, wi_dx; + /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = + ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + /* Update if we need to */ + v_sig = max(v_sig, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. 
*/ + /* Need to get some kernel values F_ij = wi_dx */ + const float ui = r * h_inv; + d_kernel_deval(ui, &wi, &wi_dx); + + const float delta_u_factor = (u - u_tmp[j_block]) * r_inv; + laplace_u += mj * delta_u_factor * wi_dx / rho_tmp[j_block]; + + /* Set the maximal alpha from the previous step over the neighbours + * (this is used to limit the diffusion in hydro_prepare_force) */ + const float alpha_j = alpha_tmp[j_block]; + alpha_visc_max_ngb = max(alpha_visc_max_ngb, alpha_j); + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + parts_aos[pid].v_sig = v_sig, parts_aos[pid].laplace_u = laplace_u; + parts_aos[pid].alpha_visc_max_ngb = alpha_visc_max_ngb; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS_F4_G( + struct part_aos_f4_g_send *__restrict__ parts_send, + struct part_aos_f4_g_recv *__restrict__ parts_recv, const float d_a, + const float d_H, const int bundle_first_task, + const int2 *__restrict__ d_task_first_part_f4) { + + extern __shared__ float4 varsf4_g[]; + + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + int2 first_last_parts = d_task_first_part_f4[task_id]; + int first_part_in_task_blocks = first_last_parts.x; + int last_part_in_task_blocks = first_last_parts.y; + // __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + /*Keep this*/ + float v_sig = 0.f; + float alpha_visc_max_ngb = 0.f; + ///////////// + + struct part_aos_f4_g_send pi = parts_send[pid]; + float4 x_h_i = pi.x_h; + float4 ux_m_i = pi.ux_m; + float4 rho_avisc_u_c_i = pi.rho_avisc_u_c; + float3 vsig_lapu_aviscmax_i = {0.f, 0.f, 0.f}; + + const float hi = x_h_i.w, hig2 = hi * hi * kernel_gamma2; + + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float4 *__restrict__ x_h_tmp = (float4 *)&varsf4_g[0]; + float4 *__restrict__ ux_m_tmp = (float4 *)&varsf4_g[BLOCK_SIZE]; + float4 *__restrict__ rho_avisc_u_c_tmp = (float4 *)&varsf4_g[BLOCK_SIZE * 2]; + + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + + int j = b + threadIdx.x; + + struct part_aos_f4_g_send pj = parts_send[j]; + x_h_tmp[threadIdx.x] = pj.x_h; + ux_m_tmp[threadIdx.x] = pj.ux_m; + rho_avisc_u_c_tmp[threadIdx.x] = pj.rho_avisc_u_c; + + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks) { + float4 x_h_j = x_h_tmp[j_block]; + float4 ux_m_j = ux_m_tmp[j_block]; + float4 rho_avisc_u_c_j = rho_avisc_u_c_tmp[j_block]; + /* Compute the pairwise distance. */ + const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, + zij = x_h_i.z - x_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + const float r = sqrt(r2); + const float r_inv = 1.f / r; + /* Recover some data */ + const float mj = ux_m_j.w; + /* Get the kernel for hi. 
*/ + const float h_inv = 1.f / hi; + float wi, wi_dx; + /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + /* Compute dv dot r */ + float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, + dvz = ux_m_i.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = rho_avisc_u_c_i.w + rho_avisc_u_c_j.w - + const_viscosity_beta * mu_ij; + /* Update if we need to */ + vsig_lapu_aviscmax_i.x = fmaxf(vsig_lapu_aviscmax_i.x, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. */ + /* Need to get some kernel values F_ij = wi_dx */ + const float ui = r * h_inv; + d_kernel_deval(ui, &wi, &wi_dx); + + const float delta_u_factor = + (rho_avisc_u_c_i.z - rho_avisc_u_c_j.z) * r_inv; + vsig_lapu_aviscmax_i.y += + mj * delta_u_factor * wi_dx / rho_avisc_u_c_j.x; + + /* Set the maximal alpha from the previous step over the neighbours + * (this is used to limit the diffusion in hydro_prepare_force) */ + const float alpha_j = rho_avisc_u_c_j.y; + vsig_lapu_aviscmax_i.z = fmaxf(vsig_lapu_aviscmax_i.z, alpha_j); + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + // printf("v %f lap %f maxvisc %f\n", vsig_lapu_aviscmax_empty_i.x, + // vsig_lapu_aviscmax_empty_i.y, vsig_lapu_aviscmax_empty_i.z); + parts_recv[pid].vsig_lapu_aviscmax = vsig_lapu_aviscmax_i; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS_F(struct part_aos_f *parts_aos, + int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int count_tasks, + int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, + double *d_cell_x, double *d_cell_y, + double *d_cell_z) { + extern __shared__ float varsf[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float ci = 0.0, cj = 0.0; + float hi = 0.0, hig2 = 0.0; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float div_vi = 0.0; + int Found_neighbours = 0; + float v_sigi; + float ui = 0.f; + float u_dti = 0.f; + float laplace_ui = 0.0; + float alpha_visc_max_ngb = 0.0; + float pressurei = 0.0; + float alphavisci = 0.0; + float alphadiffi = 0.0; + float fi = 0.0; + float balsarai = 0.0; + float ahydroxi = 0.0; + float ahydroyi = 0.0; + float ahydrozi = 0.0; + float h_dti = 0.0; + int min_ngb_time_bin = 0; + if (pid < last_part_in_task_blocks) { + ttid = task_id; + first_part = d_task_first_part[ttid]; + last_part = d_task_last_part[ttid]; + count = last_part - first_part; + cellx = d_cell_x[ttid], celly = d_cell_y[ttid], cellz = d_cell_z[ttid]; + hi = parts_aos[pid].h, 
hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + pix = parts_aos[pid].x_p - cellx; + piy = parts_aos[pid].y_p - celly; + piz = parts_aos[pid].z_p - cellz; + ci = parts_aos[pid].soundspeed; + fi = parts_aos[pid].f; + v_sigi = parts_aos[pid].v_sig; + ui = parts_aos[pid].u; + rhoi = parts_aos[pid].rho; + pressurei = parts_aos[pid].pressure; + balsarai = parts_aos[pid].balsara; + alphavisci = parts_aos[pid].alpha_visc; + alphadiffi = parts_aos[pid].alpha_diff; + min_ngb_time_bin = parts_aos[pid].min_ngb_time_bin; + // laplace_u = parts_aos[pid].laplace_u; + // alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; + } + // if (threadIdx.x == 0) { + // first_part_tid_0 = first_part; + // last_part_tid_0 = last_part; + // } + // __syncthreads(); + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&varsf[0]; + float *y_p_tmp = (float *)&varsf[BLOCK_SIZE]; + float *z_p_tmp = (float *)&varsf[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&varsf[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&varsf[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&varsf[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&varsf[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&varsf[BLOCK_SIZE * 7]; + float *cj_tmp = (float *)&varsf[BLOCK_SIZE * 8]; + float *alphavisc_tmp = (float *)&varsf[BLOCK_SIZE * 9]; + float *alphadiff_tmp = (float *)&varsf[BLOCK_SIZE * 10]; + float *u_tmp = (float *)&varsf[BLOCK_SIZE * 11]; + float *rho_tmp = (float *)&varsf[BLOCK_SIZE * 12]; + float *pressure_tmp = (float *)&varsf[BLOCK_SIZE * 13]; + float *f_tmp = (float *)&varsf[BLOCK_SIZE * 14]; + float *balsara_tmp = (float *)&varsf[BLOCK_SIZE * 15]; + int *timebin = (int *)&varsf[BLOCK_SIZE * 16]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; + // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + alphavisc_tmp[threadIdx.x] = parts_aos[j].alpha_visc; + alphadiff_tmp[threadIdx.x] = parts_aos[j].alpha_diff; + pressure_tmp[threadIdx.x] = parts_aos[j].pressure; + f_tmp[threadIdx.x] = parts_aos[j].f; + balsara_tmp[threadIdx.x] = parts_aos[j].balsara; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks) { + /* Compute the pairwise distance. 
*/ + const float pjx = x_p_tmp[j_block] - cellx; + const float pjy = y_p_tmp[j_block] - celly; + const float pjz = z_p_tmp[j_block] - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + // /* Recover some data */ + const float mj = mass_tmp[j_block]; + // /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = + d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. */ + const float hj = h_tmp[j_block]; + const float hj_inv = 1.0f / hj; + const float hjd_inv = + d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; + // /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + // /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + // /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + // + // /* Signal velocity */ + const float v_sig = + ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - fi / mj; + const float f_ji = 1.f - f_tmp[j_block] / mi; + + /* Balsara term */ + const float balsaraj = balsara_tmp[j_block]; + /* Construct the full viscosity term */ + const float rhoj = rho_tmp[j_block]; + const float pressurej = pressure_tmp[j_block]; + const float rho_ij = rhoi + rhoj; + const float alpha = alphavisci + alphavisc_tmp[j_block]; + const float visc = + -0.25f * alpha * v_sig * mu_ij * (balsarai + balsaraj) / rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float P_over_rho2_i = pressurei / (rhoi * rhoi) * f_ij; + const float P_over_rho2_j = pressurej / (rhoj * rhoj) * f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydroxi -= mj * acc * xij; + ahydroyi -= mj * acc * yij; + ahydrozi -= mj * acc * zij; + // if(rhoi == 0 || rhoj == 0 || pressurei == 0 || pressurej + // == 0)printf("ri %f rj %f pi %f pj %f\n", rhoi, rhoj, + // pressurei, pressurej); + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + const float press_sum = pressurei + pressurej; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows + * the alpha from the highest pressure particle to dominate, so that + * the diffusion limited particles always take precedence - another + * trick to allow the scheme to work with thermal feedback. 
*/ + float alpha_diff = + (pressurei * alphadiffi + pressurej * alphadiff_tmp[j_block]) / + (press_sum); + if (fabsf(press_sum) < 1e-10) alpha_diff = 0.f; + const float v_diff = + alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = + v_diff * (ui - u_tmp[j_block]) * + (f_ij * wi_dr / rhoi + f_ji * wj_dr / rhoj); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + u_dti += du_dt_i * mj; + if (mj == 0.f) printf("zero mass mj %f\n", mj); + + /* Get the time derivative for h. */ + h_dti -= mj * dvdr * r_inv / rhoj * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient + * loop but due to some possible synchronisation problems this is here + * as a _quick fix_. Added: 14th August 2019. To be removed by 1st Jan + * 2020. (JB) */ + v_sigi = max(v_sigi, v_sig); + int time_bin_j = timebin[j_block]; + if (time_bin_j > 0) + min_ngb_time_bin = min(min_ngb_time_bin, time_bin_j); + // printf("Got in\n"); + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + parts_aos[pid].v_sig = v_sigi; + parts_aos[pid].h_dt = h_dti; + parts_aos[pid].u_dt = u_dti; + parts_aos[pid].a_hydrox = ahydroxi; + parts_aos[pid].a_hydroy = ahydroyi; + parts_aos[pid].a_hydroz = ahydrozi; + parts_aos[pid].min_ngb_time_bin = min_ngb_time_bin; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS_F4_F( + struct part_aos_f4_f_send *__restrict__ parts_send, + struct part_aos_f4_f_recv *__restrict__ parts_recv, const float d_a, + const float d_H, const int bundle_first_task, + const int2 *__restrict__ d_task_first_part_f4) { + + extern __shared__ float4 varsf4_f[]; + + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + int first_part_in_task_blocks, last_part_in_task_blocks; + // first_part_in_task_blocks = d_task_first_part[task_id], + // last_part_in_task_blocks = d_task_last_part[task_id]; + int2 first_last_parts = d_task_first_part_f4[task_id]; + first_part_in_task_blocks = first_last_parts.x; + last_part_in_task_blocks = first_last_parts.y; + + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + const part_aos_f4_f_send pi = parts_send[pid]; + float4 x_h_i = pi.x_h; + float4 ux_m_i = pi.ux_m; + float4 f_b_t_mintbinngb_i = pi.f_bals_timebin_mintimebin_ngb; + float4 rho_p_c_vsig_i = pi.rho_p_c_vsigi; + float3 u_avisc_adiff_i = pi.u_alphavisc_alphadiff; + + const float mi = ux_m_i.w; + int Found_neighbours = 0; + float pressurei = rho_p_c_vsig_i.y; + const float ci = rho_p_c_vsig_i.z; + float3 ahydro = {0.0, 0.0, 0.0}; + float4 udt_hdt_vsig_mintbinngb = {0.0, 0.0, 0.0, 0.0}; + udt_hdt_vsig_mintbinngb.z = rho_p_c_vsig_i.w; + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + + float hi = x_h_i.w; + float hig2 = hi * hi * kernel_gamma2; + + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float4 *__restrict__ x_h_tmp = (float4 *)&varsf4_f[0]; + float4 *__restrict__ ux_m_tmp = (float4 *)&varsf4_f[BLOCK_SIZE]; + float4 *__restrict__ f_b_t_mintbinngb_tmp = + (float4 *)&varsf4_f[BLOCK_SIZE * 2]; + float4 *__restrict__ rho_p_c_vsig_tmp = (float4 *)&varsf4_f[BLOCK_SIZE * 3]; + float3 *__restrict__ u_avisc_adiff_tmp = (float3 *)&varsf4_f[BLOCK_SIZE * 4]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + struct part_aos_f4_f_send pj = parts_send[j]; + x_h_tmp[threadIdx.x] = pj.x_h; + ux_m_tmp[threadIdx.x] = pj.ux_m; + f_b_t_mintbinngb_tmp[threadIdx.x] = pj.f_bals_timebin_mintimebin_ngb; + rho_p_c_vsig_tmp[threadIdx.x] = pj.rho_p_c_vsigi; + // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_avisc_adiff_tmp[threadIdx.x] = pj.u_alphavisc_alphadiff; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks) { + /* Compute the pairwise distance. */ + float4 x_h_j = x_h_tmp[j_block]; + float4 ux_m_j = ux_m_tmp[j_block]; + float4 f_b_t_mintbinngb_j = f_b_t_mintbinngb_tmp[j_block]; + float4 rho_p_c_vsig_j = rho_p_c_vsig_tmp[j_block]; + float3 u_avisc_adiff_j = u_avisc_adiff_tmp[j_block]; + const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, + zij = x_h_i.z - x_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + // /* Recover some data */ + const float mj = ux_m_j.w; + // /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = + d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. */ + const float hj = x_h_j.w; + const float hj_inv = 1.0f / hj; + const float hjd_inv = + d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; + // /* Compute dv dot r */ + float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, + dvz = ux_m_i.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + // /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + // /* Are the particles moving towards each others ? 
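+           * omega_ij keeps only the approaching part of dv.r (it is <= 0), so
+           * mu_ij, and with it the viscosity term constructed below, vanishes
+           * for receding pairs.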
*/ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + // + // /* Signal velocity */ + const float cj = rho_p_c_vsig_j.z; + const float v_sig = ci + cj - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - f_b_t_mintbinngb_i.x / mj; + const float f_ji = 1.f - f_b_t_mintbinngb_j.x / mi; + + /* Construct the full viscosity term */ + const float pressurej = rho_p_c_vsig_j.y; + const float rho_ij = rho_p_c_vsig_i.x + rho_p_c_vsig_j.x; + const float alpha = u_avisc_adiff_i.y + u_avisc_adiff_j.y; + const float visc = -0.25f * alpha * v_sig * mu_ij * + (f_b_t_mintbinngb_i.y + f_b_t_mintbinngb_j.y) / + rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float rhoi2 = rho_p_c_vsig_i.x * rho_p_c_vsig_i.x; + const float rhoj2 = rho_p_c_vsig_j.x * rho_p_c_vsig_j.x; + const float P_over_rho2_i = pressurei / (rhoi2)*f_ij; + const float P_over_rho2_j = pressurej / (rhoj2)*f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydro.x -= mj * acc * xij; + ahydro.y -= mj * acc * yij; + ahydro.z -= mj * acc * zij; + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows + * the alpha from the highest pressure particle to dominate, so that + * the diffusion limited particles always take precedence - another + * trick to allow the scheme to work with thermal feedback. */ + float alpha_diff = + (pressurei * u_avisc_adiff_i.z + pressurej * u_avisc_adiff_j.z) / + (pressurei + pressurej); + if (fabsf(pressurei + pressurej) < 1e-10) alpha_diff = 0.f; + const float v_diff = + alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = v_diff * + (u_avisc_adiff_i.x - u_avisc_adiff_j.x) * + (f_ij * wi_dr / rho_p_c_vsig_i.x + + f_ji * wj_dr / rho_p_c_vsig_j.x); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + udt_hdt_vsig_mintbinngb.x += du_dt_i * mj; + + /* Get the time derivative for h. */ + udt_hdt_vsig_mintbinngb.y -= + mj * dvdr * r_inv / rho_p_c_vsig_j.x * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient + * loop but due to some possible synchronisation problems this is here + * as a _quick fix_. Added: 14th August 2019. To be removed by 1st Jan + * 2020. 
(JB) */ + udt_hdt_vsig_mintbinngb.z = fmaxf(udt_hdt_vsig_mintbinngb.z, v_sig); + unsigned int time_bin_j = (f_b_t_mintbinngb_j.z + 0.5f); + unsigned int min_tb_i = (f_b_t_mintbinngb_i.w + 0.5f); + if (time_bin_j > 0) f_b_t_mintbinngb_i.w = min(min_tb_i, time_bin_j); + // printf("Got in\n"); + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + parts_recv[pid].udt_hdt_vsig_mintimebin_ngb = udt_hdt_vsig_mintbinngb; + parts_recv[pid].a_hydro = ahydro; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_density_GPU_naive( + struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, + int *d_task_first_part_ci, int *d_task_first_part_cj, + int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, + int bid, int tid, int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int time_bin_inhibited) { + + extern __shared__ float vars[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + __shared__ int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + __shared__ int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + + first_part_in_task_blocks_ci = d_task_first_part_ci[task_id]; + last_part_in_task_blocks_ci = d_task_last_part_ci[task_id]; + first_part_in_task_blocks_cj = d_task_first_part_cj[task_id]; + last_part_in_task_blocks_cj = d_task_last_part_cj[task_id]; + + __syncthreads(); + // Now we start calculations for particles in cell i + const int pid = threadid + first_part_in_task_blocks_ci; + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + // if(pid (0.01f/128.f)*(0.01f/128.f)) { + if (r2 < hig2 && r2 > (0.01f / dx) * (0.01f / dx)) { + Found_neighbours = 1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
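+           * (The "h < dx" printf below is only a debugging check for
+           * suspiciously small smoothing lengths; it does not alter the
+           * interaction itself.)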
*/ + if (hi < 1.f / dx) printf("h < dx\n"); + // if(hi<1.f/256.f)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks_ci) { + parts_soa_ci.rho[pid] = rhoi, parts_soa_ci.rho_dh[pid] = rho_dhi; + parts_soa_ci.wcount[pid] = wcounti, + parts_soa_ci.wcount_dh[pid] = wcount_dhi; + parts_soa_ci.div_v[pid] = div_vi; + parts_soa_ci.rot_ux[pid] = rot_uxi, parts_soa_ci.rot_uy[pid] = rot_uyi; + parts_soa_ci.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_density_pair_two_kernels( + struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, + int *d_task_first_part_ci, int *d_task_first_part_cj, + int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, int bid, int block_size, + int count_tasks, int tasksperbundle, int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, + int time_bin_inhibited) { + + int max_parts = max(max_parts_j, max_parts_i); + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + /*Do ci*/ + runner_do_pair_density_GPU_naive<<>>( + parts_soa_ci, parts_soa_cj, d_task_first_part_ci, d_task_first_part_cj, + d_task_last_part_ci, d_task_last_part_cj, d_a, d_H, bid, tid, count_tasks, + tasksperbundle, nBlocks_per_task, bundle_first_task, time_bin_inhibited); + + // numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; + // gridShape = dim3(numBlocks_x, numBlocks_y); + // nBlocks_per_task = numBlocks_x; + /*Now do cj*/ + runner_do_pair_density_GPU_naive<<>>( + parts_soa_cj, parts_soa_ci, d_task_first_part_cj, d_task_first_part_ci, + d_task_last_part_cj, d_task_last_part_ci, d_a, d_H, bid, tid, count_tasks, + tasksperbundle, nBlocks_per_task, bundle_first_task, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIRGPU(struct part_soa parts_soa, int pid, + int last_part_in_task_blocks_ci, + int first_part_in_task_blocks_cj, + int last_part_in_task_blocks_cj, float d_a, float d_H, + int time_bin_inhibited, float *vars) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + + if 
(pid < last_part_in_task_blocks_ci) { + cellx = parts_soa.locx[pid], celly = parts_soa.locy[pid], + cellz = parts_soa.locz[pid]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - cellx; + piy = parts_soa.y_p[pid] - celly; + piz = parts_soa.z_p[pid] - cellz; + } + + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars[0]; + float *y_p_tmp = (float *)&vars[BLOCK_SIZE]; + float *z_p_tmp = (float *)&vars[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&vars[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&vars[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&vars[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&vars[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&vars[BLOCK_SIZE * 7]; + timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks_cj; b < last_part_in_task_blocks_cj; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_soa.x_p[j]; + y_p_tmp[threadIdx.x] = parts_soa.y_p[j]; + z_p_tmp[threadIdx.x] = parts_soa.z_p[j]; + h_tmp[threadIdx.x] = parts_soa.h[j]; + mass_tmp[threadIdx.x] = parts_soa.mass[j]; + ux_tmp[threadIdx.x] = parts_soa.ux[j]; + uy_tmp[threadIdx.x] = parts_soa.uy[j]; + uz_tmp[threadIdx.x] = parts_soa.uz[j]; + timebin[threadIdx.x] = parts_soa.time_bin[j]; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks_cj) { + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp[j_block] - cellx; + const float pjy = y_p_tmp[j_block] - celly; + const float pjz = z_p_tmp[j_block] - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + + if (r2 < hig2 && r2 > (0.01f / dx) * (0.01f / dx)) { + Found_neighbours = 1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
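+           * Further down, faci = mj * wi_dx / r is the common prefactor that
+           * weights both the dv.r contribution to the velocity divergence and
+           * the dv x r contributions to the velocity curl.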
*/ + if (hi < 1.f / dx) printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks_ci) { + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi; + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPU(struct part_soa parts_soa, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, + float d_a, float d_H, float *vars_pair, + double *d_shift_x, double *d_shift_y, + double *d_shift_z, const int task_id_tmp, + int flip_order) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + int count_i = cj_start; + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + // last_part_in_task_blocks_ci); + if (pid < ci_end) { + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - d_shift_x[task_id_tmp]; + piy = parts_soa.y_p[pid] - d_shift_y[task_id_tmp]; + piz = parts_soa.z_p[pid] - d_shift_z[task_id_tmp]; + } + + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars_pair[0]; + float *y_p_tmp = (float *)&x_p_tmp[BLOCK_SIZE]; + float *z_p_tmp = (float *)&y_p_tmp[BLOCK_SIZE]; + float *h_tmp = (float *)&z_p_tmp[BLOCK_SIZE]; + float *mass_tmp = (float *)&h_tmp[BLOCK_SIZE]; + float *ux_tmp = (float *)&mass_tmp[BLOCK_SIZE]; + float *uy_tmp = (float *)&ux_tmp[BLOCK_SIZE]; + float *uz_tmp = (float *)&uy_tmp[BLOCK_SIZE]; + timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[tid_x] = parts_soa.x_p[j]; + y_p_tmp[tid_x] = parts_soa.y_p[j]; + z_p_tmp[tid_x] = parts_soa.z_p[j]; + // h_tmp[tid_x] = parts_soa.h[j]; + mass_tmp[tid_x] = parts_soa.mass[j]; + ux_tmp[tid_x] = parts_soa.ux[j]; + uy_tmp[tid_x] = parts_soa.uy[j]; + uz_tmp[tid_x] = parts_soa.uz[j]; + timebin[tid_x] = parts_soa.time_bin[j]; + + __syncthreads(); + const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float pjx = x_p_tmp[j_block] - shift_x_j; + const float pjy = y_p_tmp[j_block] - shift_y_j; + const float pjz = z_p_tmp[j_block] - shift_z_j; + + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + // const float xij = (pix - pjx) * flip_order, yij = (piy - + // pjy) * flip_order, zij = (piz - pjz) * flip_order; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + const float r = sqrt(r2); + /* Get the kernel for hi. 
*/ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + div_vi -= faci * dvdr; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi; + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPUAOS(struct part_aos *parts_aos, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, + float d_a, float d_H, float *vars_pair_aos, + double *d_shift_x, double *d_shift_y, + double *d_shift_z, const int task_id_tmp, + int flip_order) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + int count_i = cj_start; + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + // last_part_in_task_blocks_ci); + if (pid < ci_end) { + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + pix = parts_aos[pid].x_p; // - d_shift_x[task_id_tmp]; + piy = parts_aos[pid].y_p; // - d_shift_y[task_id_tmp]; + piz = parts_aos[pid].z_p; // - d_shift_z[task_id_tmp]; + } + + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars_pair_aos[0]; + float *y_p_tmp = (float *)&x_p_tmp[BLOCK_SIZE]; + float *z_p_tmp = (float *)&y_p_tmp[BLOCK_SIZE]; + float *h_tmp = (float *)&z_p_tmp[BLOCK_SIZE]; + float *mass_tmp = (float *)&h_tmp[BLOCK_SIZE]; + float *ux_tmp = (float *)&mass_tmp[BLOCK_SIZE]; + float *uy_tmp = (float *)&ux_tmp[BLOCK_SIZE]; + float *uz_tmp = (float *)&uy_tmp[BLOCK_SIZE]; + int *timebin = (int *)&uz_tmp[BLOCK_SIZE]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[tid_x] = parts_aos[j].x_p; + y_p_tmp[tid_x] = parts_aos[j].y_p; + z_p_tmp[tid_x] = parts_aos[j].z_p; + // h_tmp[tid_x] = parts_aos[j].h; + mass_tmp[tid_x] = parts_aos[j].mass; + ux_tmp[tid_x] = parts_aos[j].ux; + uy_tmp[tid_x] = parts_aos[j].uy; + uz_tmp[tid_x] = parts_aos[j].uz; + timebin[tid_x] = parts_aos[j].time_bin; + // const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + // const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + // const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + __syncthreads(); + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float pjx = x_p_tmp[j_block]; // - shift_x_j; + const float pjy = y_p_tmp[j_block]; // - shift_y_j; + const float pjz = z_p_tmp[j_block]; // - shift_z_j; + + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + // const float xij = (pix - pjx) * flip_order, yij = (piy - + // pjy) * flip_order, zij = (piz - pjz) * flip_order; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + const float r = sqrt(r2); + /* Get the kernel for hi. 
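+           * Only particle i's sums are updated here (hence the NONSYM name);
+           * the j side is presumably covered by a second call with the two
+           * cells swapped, as in the pair launcher above.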
*/ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + div_vi -= faci * dvdr; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + // if(timebin[j_block] != 1000 && timebin[j_block] != + // 20)printf("incorrect timebin %i\n", timebin[j_block]); + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + // printf("timebin %i\n", parts_aos[pid].time_bin); + parts_aos[pid].rho = rhoi, parts_aos[pid].rho_dh = rho_dhi; + parts_aos[pid].wcount = wcounti, parts_aos[pid].wcount_dh = wcount_dhi; + parts_aos[pid].div_v = div_vi; + parts_aos[pid].rot_ux = rot_uxi, parts_aos[pid].rot_uy = rot_uyi; + parts_aos[pid].rot_uz = rot_uzi; + parts_aos[pid].time_bin = 20; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPUAOSF4( + struct part_aos_f4_send *__restrict__ parts_send, + struct part_aos_f4_recv *__restrict__ parts_recv, int pid, + const int ci_start, const int ci_end, const int cj_start, const int cj_end, + float d_a, float d_H, float4 *vars_pair_aos_f4) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + int Found_neighbours = 0; + int count_i = cj_start; + + float4 res_rho = {0.0, 0.0, 0.0, 0.0}; + float4 res_rot = {0.0, 0.0, 0.0, 0.0}; + const part_aos_f4_send pi = parts_send[pid]; + const float4 x_pi = pi.x_p_h; + const float4 ux_pi = pi.ux_m; + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + // last_part_in_task_blocks_ci); + // if (pid < ci_end) { + hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; + // } + + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float4 *__restrict__ x_p_h_tmp = (float4 *)&vars_pair_aos_f4[0]; + float4 *__restrict__ ux_m_tmp = (float4 *)&vars_pair_aos_f4[BLOCK_SIZE]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + struct part_aos_f4_send pj = parts_send[j]; + x_p_h_tmp[tid_x] = pj.x_p_h; + ux_m_tmp[tid_x] = pj.ux_m; + __syncthreads(); + /*j_block is the particle's index in the block. 
Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float4 x_p_h_j = x_p_h_tmp[j_block]; + const float4 ux_m_j = ux_m_tmp[j_block]; + + const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, + zij = x_pi.z - x_p_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = ux_m_j.w; + const float r = sqrt(r2); + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ + res_rho.x += mj * wi; + res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); + res_rho.z += wi; + res_rho.w -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, + dvz = ux_pi.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + res_rot.x += faci * curlvrx; + res_rot.y += faci * curlvry; + res_rot.z += faci * curlvrz; + res_rot.w -= faci * dvdr; + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + parts_recv[pid].rho_dh_wcount = res_rho; + parts_recv[pid].rot_ux_div_v = res_rot; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NAIVEGPUAOSF4( + const struct part_aos_f4_send pi, + struct part_aos_f4_send *__restrict__ parts_send, + struct part_aos_f4_recv *__restrict__ parts_recv, int pid, + const int cj_start, const int cj_end, float d_a, float d_H) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + int Found_neighbours = 0; + int count_i = cj_start; + + float4 res_rho = {0.0, 0.0, 0.0, 0.0}; + float4 res_rot = {0.0, 0.0, 0.0, 0.0}; + // const part_aos_f4_send pi = parts_send[pid]; + const float4 x_pi = pi.x_p_h; + const float4 ux_pi = pi.ux_m; + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + // last_part_in_task_blocks_ci); + // if (pid < ci_end) { + hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; + // } + + // printf("js %i je %i\n", cj_start, cj_end); + /*Particles copied in blocks to shared memory*/ + for (int j = cj_start; j < cj_end; j++) { + struct part_aos_f4_send pj = parts_send[j]; + + const float4 x_p_h_j = pj.x_p_h; + const float4 ux_m_j = pj.ux_m; + + const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, + zij = x_pi.z - x_p_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + // printf("r2 %f \n", r2); + if (r2 < hig2) { + /* Recover some data */ + const float mj = ux_m_j.w; + const float r = sqrt(r2); + /* Get the kernel for hi. 
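+       * res_rho accumulates (rho, rho_dh, wcount, wcount_dh) and res_rot
+       * accumulates (rot_ux, rot_uy, rot_uz, div_v); they are written back
+       * below as the packed rho_dh_wcount and rot_ux_div_v fields of the
+       * f4 recv buffer.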
*/ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ + res_rho.x += mj * wi; + res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); + res_rho.z += wi; + res_rho.w -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, + dvz = ux_pi.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + res_rot.x += faci * curlvrx; + res_rot.y += faci * curlvry; + res_rot.z += faci * curlvrz; + res_rot.w -= faci * dvdr; + } + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + // if (pid >= ci_start && pid < ci_end) { + parts_recv[pid].rho_dh_wcount = res_rho; + parts_recv[pid].rot_ux_div_v = res_rot; + // } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPUAOSG(struct part_aos_g *parts_aos, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, + float d_a, float d_H, + float *vars_pair_aosg, double *d_shift_x, + double *d_shift_y, double *d_shift_z, + const int task_id_tmp, int flip_order) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float div_vi = 0.0; + int Found_neighbours = 0; + float v_sig; + float u = 0.f; + float laplace_u = 0.0; + float alpha_visc_max_ngb = 0.0; + float ci = 0.0; + + int count_i = cj_start; + if (pid < ci_end) { + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + ci = parts_aos[pid].soundspeed; + v_sig = parts_aos[pid].v_sig; + u = parts_aos[pid].u; + laplace_u = parts_aos[pid].laplace_u; + alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; + + pix = parts_aos[pid].x_p - d_shift_x[task_id_tmp]; + piy = parts_aos[pid].y_p - d_shift_y[task_id_tmp]; + piz = parts_aos[pid].z_p - d_shift_z[task_id_tmp]; + } + + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars_pair_aosg[0]; + float *y_p_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE]; + float *z_p_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 7]; + float *cj_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 8]; + float *alpha_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 9]; + float *u_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 10]; + float *rho_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 11]; + int *timebin = (int *)&vars_pair_aosg[BLOCK_SIZE * 12]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; + alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + __syncthreads(); + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float pjx = x_p_tmp[j_block] - shift_x_j; + const float pjy = y_p_tmp[j_block] - shift_y_j; + const float pjz = z_p_tmp[j_block] - shift_z_j; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + float wi, wi_dx; + /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = + ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + /* Update if we need to */ + v_sig = max(v_sig, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. 
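+           * Accumulated below as the sum over neighbours of
+           * m_j * (u_i - u_j) / r * wi_dx / rho_j, an SPH estimate of the
+           * second derivative of u.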
*/
+          /* Need to get some kernel values F_ij = wi_dx */
+          const float ui = r * h_inv;
+          d_kernel_deval(ui, &wi, &wi_dx);
+
+          const float delta_u_factor = (u - u_tmp[j_block]) * r_inv;
+          laplace_u += mj * delta_u_factor * wi_dx / rho_tmp[j_block];
+
+          /* Set the maximal alpha from the previous step over the neighbours
+           * (this is used to limit the diffusion in hydro_prepare_force) */
+          const float alpha_j = alpha_tmp[j_block];
+          alpha_visc_max_ngb = max(alpha_visc_max_ngb, alpha_j);
+        }
+      } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/
+    }   /*End of looping through particles in shared memory---Shared arrays
+           zero'ed for next step in outer loop*/
+    __syncthreads();
+  } /*Loop through parts in cell j one BLOCK_SIZE at a time*/
+  if (pid >= ci_start && pid < ci_end) {
+    parts_aos[pid].v_sig = v_sig, parts_aos[pid].laplace_u = laplace_u;
+    parts_aos[pid].alpha_visc_max_ngb = alpha_visc_max_ngb;
+  }
+}
+#ifdef WITH_CUDA
+}
+#endif
+
+#ifdef WITH_CUDA
+extern "C" {
+#endif
+__device__ void DOPAIR2NAIVEGPUAOSF4G(
+    const struct part_aos_f4_g_send pi,
+    struct part_aos_f4_g_send *__restrict__ parts_send,
+    struct part_aos_f4_g_recv *__restrict__ parts_recv, int pid,
+    const int cj_start, const int cj_end, float d_a, float d_H) {
+
+  float dx =
+      1.f / 64.f;  // Value used to avoid interacting parts with themselves
+
+  float hi = 0.0, hig2 = 0.0;
+
+  int Found_neighbours = 0;
+  int count_i = cj_start;
+
+  float4 res_rho = {0.0, 0.0, 0.0, 0.0};
+  float4 res_rot = {0.0, 0.0, 0.0, 0.0};
+  //  const part_aos_f4_send pi = parts_send[pid];
+  const float4 x_h_i = pi.x_h;
+  const float4 ux_m_i = pi.ux_m;
+  const float4 rho_avisc_u_c_i = pi.rho_avisc_u_c;
+  float3 vsig_lapu_aviscmax_i = {0.f, 0.f, 0.f};
+
+  //  printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i
+  //  last_part_in_task_blocks_ci %i\n",
+  //         first_part_in_task_blocks_cj, last_part_in_task_blocks_cj,
+  //         last_part_in_task_blocks_ci);
+  //  if (pid < ci_end) {
+  hi = x_h_i.w, hig2 = hi * hi * kernel_gamma2;
+  //  }
+
+  //  printf("js %i je %i\n", cj_start, cj_end);
+  /* Loop directly over the particles of cell j (no shared-memory staging in
+   * this naive variant) */
+  for (int j = cj_start; j < cj_end; j++) {
+    struct part_aos_f4_g_send pj = parts_send[j];
+
+    const float4 x_h_j = pj.x_h;
+    const float4 ux_m_j = pj.ux_m;
+    const float4 rho_avisc_u_c_j = pj.rho_avisc_u_c;
+    const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y,
+                zij = x_h_i.z - x_h_j.z;
+    const float r2 = xij * xij + yij * yij + zij * zij;
+    //    printf("r2 %f \n", r2);
+    if (r2 < hig2) {
+      const float r = sqrt(r2);
+      const float r_inv = 1.f / r;
+      /* Recover some data */
+      const float mj = ux_m_j.w;
+      /* Get the kernel for hi. */
+      const float h_inv = 1.f / hi;
+      float wi, wi_dx;
+      /* Cosmology terms for the signal velocity */
+      const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a);
+      const float a2_Hubble = d_a * d_a * d_H;
+      /* Compute dv dot r */
+      float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y,
+            dvz = ux_m_i.z - ux_m_j.z;
+      const float dvdr = dvx * xij + dvy * yij + dvz * zij;
+      /* Add Hubble flow */
+      const float dvdr_Hubble = dvdr + a2_Hubble * r2;
+      /* Are the particles moving towards each other?
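+       * (In this packed variant rho_avisc_u_c holds (rho, alpha_visc, u,
+       * soundspeed) and vsig_lapu_aviscmax collects (v_sig, laplace_u,
+       * alpha_visc_max_ngb), mirroring the unpacked gradient kernel above.)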
*/ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = + rho_avisc_u_c_i.w + rho_avisc_u_c_j.w - const_viscosity_beta * mu_ij; + /* Update if we need to */ + vsig_lapu_aviscmax_i.x = fmaxf(vsig_lapu_aviscmax_i.x, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. */ + /* Need to get some kernel values F_ij = wi_dx */ + const float ui = r * h_inv; + d_kernel_deval(ui, &wi, &wi_dx); + + const float delta_u_factor = + (rho_avisc_u_c_i.z - rho_avisc_u_c_j.z) * r_inv; + vsig_lapu_aviscmax_i.y += mj * delta_u_factor * wi_dx / rho_avisc_u_c_j.x; + + /* Set the maximal alpha from the previous step over the neighbours + * (this is used to limit the diffusion in hydro_prepare_force) */ + const float alpha_j = rho_avisc_u_c_j.y; + vsig_lapu_aviscmax_i.z = fmaxf(vsig_lapu_aviscmax_i.z, alpha_j); + } + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + // if (pid >= ci_start && pid < ci_end) { + parts_recv[pid].vsig_lapu_aviscmax = vsig_lapu_aviscmax_i; + // } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPUAOSF(struct part_aos_f *parts_aos, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, + float d_a, float d_H, + float *vars_pair_aosf, double *d_shift_x, + double *d_shift_y, double *d_shift_z, + const int task_id_tmp, int flip_order) { + + float ci = 0.0, cj = 0.0; + float hi = 0.0, hig2 = 0.0; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float div_vi = 0.0; + int Found_neighbours = 0; + float v_sigi; + float ui = 0.f; + float u_dti = 0.f; + float laplace_ui = 0.0; + float alpha_visc_max_ngb = 0.0; + float pressurei = 0.0; + float alphavisci = 0.0; + float alphadiffi = 0.0; + float fi = 0.0; + float balsarai = 0.0; + float ahydroxi = 0.0; + float ahydroyi = 0.0; + float ahydrozi = 0.0; + float h_dti = 0.0; + int min_ngb_time_bin = 0; + if (pid < ci_end) { + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + ci = parts_aos[pid].soundspeed; + fi = parts_aos[pid].f; + v_sigi = parts_aos[pid].v_sig; + ui = parts_aos[pid].u; + rhoi = parts_aos[pid].rho; + pressurei = parts_aos[pid].pressure; + balsarai = parts_aos[pid].balsara; + alphavisci = parts_aos[pid].alpha_visc; + alphadiffi = parts_aos[pid].alpha_diff; + min_ngb_time_bin = parts_aos[pid].min_ngb_time_bin; + pix = parts_aos[pid].x_p - d_shift_x[task_id_tmp]; + piy = parts_aos[pid].y_p - d_shift_y[task_id_tmp]; + piz = parts_aos[pid].z_p - d_shift_z[task_id_tmp]; + } + // if (threadIdx.x == 0) { + // first_part_tid_0 = first_part; + // last_part_tid_0 = last_part; + // } + // __syncthreads(); + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars_pair_aosf[0]; + float *y_p_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE]; + float *z_p_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 7]; + float *cj_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 8]; + float *alphavisc_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 9]; + float *alphadiff_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 10]; + float *u_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 11]; + float *rho_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 12]; + float *pressure_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 13]; + float *f_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 14]; + float *balsara_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 15]; + int *timebin = (int *)&vars_pair_aosf[BLOCK_SIZE * 16]; + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; + // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + alphavisc_tmp[threadIdx.x] = parts_aos[j].alpha_visc; + alphadiff_tmp[threadIdx.x] = parts_aos[j].alpha_diff; + pressure_tmp[threadIdx.x] = parts_aos[j].pressure; + f_tmp[threadIdx.x] = parts_aos[j].f; + balsara_tmp[threadIdx.x] = parts_aos[j].balsara; + const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + __syncthreads(); + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp[j_block] - shift_x_j; + const float pjy = y_p_tmp[j_block] - shift_y_j; + const float pjz = z_p_tmp[j_block] - shift_z_j; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + // /* Recover some data */ + const float mj = mass_tmp[j_block]; + // /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = + d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. 
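+           * Both kernel gradients are needed in the force loop: wi_dr and
+           * wj_dr enter the pressure and viscosity terms below, weighted by
+           * the variable smoothing length factors f_ij and f_ji.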
*/ + const float hj = h_tmp[j_block]; + const float hj_inv = 1.0f / hj; + const float hjd_inv = + d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; + // /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + // /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + // /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + // + // /* Signal velocity */ + const float v_sig = + ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - fi / mj; + const float f_ji = 1.f - f_tmp[j_block] / mi; + + /* Balsara term */ + const float balsaraj = balsara_tmp[j_block]; + /* Construct the full viscosity term */ + const float rhoj = rho_tmp[j_block]; + const float pressurej = pressure_tmp[j_block]; + const float rho_ij = rhoi + rhoj; + const float alpha = alphavisci + alphavisc_tmp[j_block]; + const float visc = + -0.25f * alpha * v_sig * mu_ij * (balsarai + balsaraj) / rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float P_over_rho2_i = pressurei / (rhoi * rhoi) * f_ij; + const float P_over_rho2_j = pressurej / (rhoj * rhoj) * f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydroxi -= mj * acc * xij; + ahydroyi -= mj * acc * yij; + ahydrozi -= mj * acc * zij; + // if(rhoi == 0 || rhoj == 0 || pressurei == 0 || pressurej + // == 0)printf("ri %f rj %f pi %f pj %f\n", rhoi, rhoj, + // pressurei, pressurej); + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + const float press_sum = pressurei + pressurej; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows + * the alpha from the highest pressure particle to dominate, so that + * the diffusion limited particles always take precedence - another + * trick to allow the scheme to work with thermal feedback. */ + float alpha_diff = + (pressurei * alphadiffi + pressurej * alphadiff_tmp[j_block]) / + (press_sum); + if (fabsf(press_sum) < 1e-10) alpha_diff = 0.f; + const float v_diff = + alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = + v_diff * (ui - u_tmp[j_block]) * + (f_ij * wi_dr / rhoi + f_ji * wj_dr / rhoj); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + u_dti += du_dt_i * mj; + if (mj == 0.f) printf("zero mass mj %f\n", mj); + + /* Get the time derivative for h. */ + h_dti -= mj * dvdr * r_inv / rhoj * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient + * loop but due to some possible synchronisation problems this is here + * as a _quick fix_. 
Added: 14th August 2019. To be removed by 1st Jan + * 2020. (JB) */ + v_sigi = max(v_sigi, v_sig); + int time_bin_j = timebin[j_block]; + if (time_bin_j > 0) + min_ngb_time_bin = min(min_ngb_time_bin, time_bin_j); + // printf("Got in\n"); + } + } + } + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + parts_aos[pid].v_sig = v_sigi; + parts_aos[pid].h_dt = h_dti; + parts_aos[pid].u_dt = u_dti; + parts_aos[pid].a_hydrox = ahydroxi; + parts_aos[pid].a_hydroy = ahydroyi; + parts_aos[pid].a_hydroz = ahydrozi; + parts_aos[pid].min_ngb_time_bin = min_ngb_time_bin; + // printf("%f %f %f %f %f %f\n", v_sigi, h_dti, u_dti, ahydroxi, + // ahydroyi, ahydrozi); + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NAIVEGPUAOSF4F( + const struct part_aos_f4_f_send pi, + struct part_aos_f4_f_send *__restrict__ parts_send, + struct part_aos_f4_f_recv *__restrict__ parts_recv, int pid, + const int cj_start, const int cj_end, float d_a, float d_H) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + int Found_neighbours = 0; + + // const part_aos_f4_send pi = parts_send[pid]; + const float4 x_h_i = pi.x_h; + const float4 ux_m_i = pi.ux_m; + + float4 f_b_t_mintbinngb_i = pi.f_bals_timebin_mintimebin_ngb; + const float4 rho_p_c_vsig_i = pi.rho_p_c_vsigi; + const float3 u_avisc_adiff_i = pi.u_alphavisc_alphadiff; + + const float mi = ux_m_i.w; + const float pressurei = rho_p_c_vsig_i.y; + const float ci = rho_p_c_vsig_i.z; + float3 ahydro = {0.0, 0.0, 0.0}; + float4 udt_hdt_vsig_mintbinngb = {0.0, 0.0, 0.0, 0.0}; + udt_hdt_vsig_mintbinngb.z = rho_p_c_vsig_i.w; + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + + const float hi = x_h_i.w; + const float hig2 = hi * hi * kernel_gamma2; + + // printf("js %i je %i\n", cj_start, cj_end); + /*Particles copied in blocks to shared memory*/ + for (int j = cj_start; j < cj_end; j++) { + struct part_aos_f4_f_send pj = parts_send[j]; + const float4 x_h_j = pj.x_h; + const float4 ux_m_j = pj.ux_m; + const float4 f_b_t_mintbinngb_j = pj.f_bals_timebin_mintimebin_ngb; + const float4 rho_p_c_vsig_j = pj.rho_p_c_vsigi; + // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + const float3 u_avisc_adiff_j = pj.u_alphavisc_alphadiff; + const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, + zij = x_h_i.z - x_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + // printf("r2 %f \n", r2); + if (r2 < hig2) { + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + // /* Recover some data */ + const float mj = ux_m_j.w; + // /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. 
*/ + const float hj = x_h_j.w; + const float hj_inv = 1.0f / hj; + const float hjd_inv = d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; + // /* Compute dv dot r */ + float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, + dvz = ux_m_i.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + // /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + // /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + // + // /* Signal velocity */ + const float cj = rho_p_c_vsig_j.z; + const float v_sig = ci + cj - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - f_b_t_mintbinngb_i.x / mj; + const float f_ji = 1.f - f_b_t_mintbinngb_j.x / mi; + + /* Construct the full viscosity term */ + const float pressurej = rho_p_c_vsig_j.y; + const float rho_ij = rho_p_c_vsig_i.x + rho_p_c_vsig_j.x; + const float alpha = u_avisc_adiff_i.y + u_avisc_adiff_j.y; + const float visc = -0.25f * alpha * v_sig * mu_ij * + (f_b_t_mintbinngb_i.y + f_b_t_mintbinngb_j.y) / rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float rhoi2 = rho_p_c_vsig_i.x * rho_p_c_vsig_i.x; + const float rhoj2 = rho_p_c_vsig_j.x * rho_p_c_vsig_j.x; + const float P_over_rho2_i = pressurei / (rhoi2)*f_ij; + const float P_over_rho2_j = pressurej / (rhoj2)*f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydro.x -= mj * acc * xij; + ahydro.y -= mj * acc * yij; + ahydro.z -= mj * acc * zij; + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows the + * alpha from the highest pressure particle to dominate, so that the + * diffusion limited particles always take precedence - another trick to + * allow the scheme to work with thermal feedback. */ + float alpha_diff = + (pressurei * u_avisc_adiff_i.z + pressurej * u_avisc_adiff_j.z) / + (pressurei + pressurej); + if (fabsf(pressurei + pressurej) < 1e-10) alpha_diff = 0.f; + const float v_diff = alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = + v_diff * (u_avisc_adiff_i.x - u_avisc_adiff_j.x) * + (f_ij * wi_dr / rho_p_c_vsig_i.x + f_ji * wj_dr / rho_p_c_vsig_j.x); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + udt_hdt_vsig_mintbinngb.x += du_dt_i * mj; + + /* Get the time derivative for h. */ + udt_hdt_vsig_mintbinngb.y -= mj * dvdr * r_inv / rho_p_c_vsig_j.x * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient loop + * but due to some possible synchronisation problems this is here as a + * _quick fix_. Added: 14th August 2019. To be removed by 1st Jan 2020. 
+ * (JB) */ + udt_hdt_vsig_mintbinngb.z = fmaxf(udt_hdt_vsig_mintbinngb.z, v_sig); + unsigned int time_bin_j = (f_b_t_mintbinngb_j.z + 0.5f); + unsigned int min_tb_i = (f_b_t_mintbinngb_i.w + 0.5f); + if (time_bin_j > 0) f_b_t_mintbinngb_i.w = min(min_tb_i, time_bin_j); + // printf("Got in\n"); + } + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + // if (pid >= ci_start && pid < ci_end) { + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + parts_recv[pid].udt_hdt_vsig_mintimebin_ngb = udt_hdt_vsig_mintbinngb; + parts_recv[pid].a_hydro = ahydro; + // } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2GPU(struct part_soa parts_soa, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, float d_a, + float d_H, int time_bin_inhibited, float *vars_pair, + double *d_shift_x, double *d_shift_y, + double *d_shift_z, const int task_id_tmp) { + + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves + + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float cellxj = 0.0, cellyj = 0.0, cellzj = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + double pix = 0.0; + double piy = 0.0; + double piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + int count_i = cj_start; + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + // last_part_in_task_blocks_ci); + + if (pid < ci_end) { + cellx = parts_soa.locx[pid]; + celly = parts_soa.locy[pid]; + cellz = parts_soa.locz[pid]; + const int j = cj_start; + cellxj = parts_soa.locx[j]; + cellyj = parts_soa.locy[j]; + cellzj = parts_soa.locz[j]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - d_shift_x[task_id_tmp]; + piy = parts_soa.y_p[pid] - d_shift_y[task_id_tmp]; + piz = parts_soa.z_p[pid] - d_shift_z[task_id_tmp]; + } + + int n_neighbours = 0; + float av_dist = 0.f; + float av_distx = 0.f; + float av_disty = 0.f; + float av_distz = 0.f; + float distby2h = 0.f; + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + double *x_p_tmp = (double *)&vars_pair[0]; + double *y_p_tmp = (double *)&x_p_tmp[BLOCK_SIZE]; + double *z_p_tmp = (double *)&y_p_tmp[BLOCK_SIZE]; + float *h_tmp = (float *)&z_p_tmp[BLOCK_SIZE]; + float *mass_tmp = (float *)&h_tmp[BLOCK_SIZE]; + float *ux_tmp = (float *)&mass_tmp[BLOCK_SIZE]; + float *uy_tmp = (float *)&ux_tmp[BLOCK_SIZE]; + float *uz_tmp = (float *)&uy_tmp[BLOCK_SIZE]; + timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE]; + float *rho_tmp = (float *)&timebin[BLOCK_SIZE]; + float *rho_dh_tmp = (float *)&rho_tmp[BLOCK_SIZE]; + float *wcount_tmp = (float *)&rho_dh_tmp[BLOCK_SIZE]; + float *wcount_dh_tmp = (float *)&wcount_tmp[BLOCK_SIZE]; + float *div_v_tmp = (float *)&wcount_dh_tmp[BLOCK_SIZE]; + float *rot_ux_tmp = (float *)&div_v_tmp[BLOCK_SIZE]; + float *rot_uy_tmp = (float *)&rot_ux_tmp[BLOCK_SIZE]; + float *rot_uz_tmp = (float *)&rot_uy_tmp[BLOCK_SIZE]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[tid_x] = parts_soa.x_p[j]; + y_p_tmp[tid_x] = parts_soa.y_p[j]; + z_p_tmp[tid_x] = parts_soa.z_p[j]; + h_tmp[tid_x] = parts_soa.h[j]; + mass_tmp[tid_x] = parts_soa.mass[j]; + ux_tmp[tid_x] = parts_soa.ux[j]; + uy_tmp[tid_x] = parts_soa.uy[j]; + uz_tmp[tid_x] = parts_soa.uz[j]; + timebin[tid_x] = parts_soa.time_bin[j]; + rho_tmp[tid_x] = 0.f; + rho_dh_tmp[tid_x] = 0.f; + wcount_tmp[tid_x] = 0.f; + wcount_dh_tmp[tid_x] = 0.f; + div_v_tmp[tid_x] = 0.f; + rot_ux_tmp[tid_x] = 0.f; + rot_uy_tmp[tid_x] = 0.f; + rot_uz_tmp[tid_x] = 0.f; + __syncthreads(); + const double shift_x_j = d_shift_x[task_id_tmp + 1]; + const double shift_y_j = d_shift_y[task_id_tmp + 1]; + const double shift_z_j = d_shift_z[task_id_tmp + 1]; + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const double pjx = x_p_tmp[j_block] - shift_x_j; + const double pjy = y_p_tmp[j_block] - shift_y_j; + const double pjz = z_p_tmp[j_block] - shift_z_j; + + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + // const float xij = pjx - pix, yij = pjy - piy, zij = pjz + //- piz; + const float r2 = xij * xij + yij * yij + zij * zij; + const float hj = h_tmp[j_block]; + const float hjg2 = hj * hj * kernel_gamma2; + // if(r2 > 32.f * hig2 && hig2 != 0.f) printf("x %f y %f z + //%f r %f hig2 %f\n", xij/dx, yij/dx, zij/dx, sqrt(r2)/dx); + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + const float r = sqrt(r2); + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
*/ + // if(hi<1.f/dx)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + div_vi -= faci * dvdr; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + // + } + if (r2 < hjg2) { + /* Recover some data */ + /* Get the kernel for hi. */ + const float hj_inv = 1.f / hj; + const float uj = r * hj_inv; + float wj, wj_dx; + + d_kernel_deval(uj, &wj, &wj_dx); + + // atomicAdd(&rho_tmp[j_block], mi * wj); + atomicAdd(&parts_soa.rho[j], mi * wj); + // atomicAdd(&rho_dh_tmp[j_block], -mi * (hydro_dimension + //* wj + uj * wj_dx)); + atomicAdd(&parts_soa.rho_dh[j], + -mi * (hydro_dimension * wj + uj * wj_dx)); + + // atomicAdd(&wcount_tmp[j_block], wj); + atomicAdd(&parts_soa.wcount[j], wj); + // atomicAdd(&wcount_dh_tmp[j_block], -(hydro_dimension * + // wj + uj * wj_dx)); + atomicAdd(&parts_soa.wcount_dh[j], + -(hydro_dimension * wj + uj * wj_dx)); + + const float r_inv = 1.f / r; + const float facj = mi * wj_dx * r_inv; + + // atomicAdd(&div_v_tmp[j_block], -facj * dvdr); + atomicAdd(&parts_soa.div_v[j], -facj * dvdr); + + // atomicAdd(&rot_ux_tmp[j_block], facj * curlvrx); + // atomicAdd(&rot_uy_tmp[j_block], facj * curlvry); + // atomicAdd(&rot_uz_tmp[j_block], facj * curlvrz); + atomicAdd(&parts_soa.rot_ux[j], facj * curlvrx); + atomicAdd(&parts_soa.rot_uy[j], facj * curlvry); + atomicAdd(&parts_soa.rot_uz[j], facj * curlvrz); + // printf("rho %f rho_dh %f wcount %f wcount_dh %f div_v + //%f rotux %f rotuy %f rotuz %f\n" ,rhoi, + // rho_dhi, wcounti, wcount_dhi, div_vi, rot_uxi, rot_uyi, rot_uzi); + } /*if r2= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); + // if(j < cj_end){ + // atomicAdd(&parts_soa.rho[j], rho_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.rho_dh[j], rho_dh_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.wcount[j], wcount_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.wcount_dh[j], wcount_dh_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.div_v[j], div_v_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.rot_ux[j], rot_ux_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.rot_uy[j], rot_uy_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.rot_uz[j], rot_uz_tmp[threadIdx.x]); + // } + // __syncthreads(); + // parts_soa.rho[j] += rho_tmp[threadIdx.x]; + // parts_soa.rho_dh[j] += rho_dh_tmp[threadIdx.x]; + // parts_soa.wcount[j] += wcount_tmp[threadIdx.x]; + // parts_soa.wcount_dh[j] =+ wcount_dh_tmp[threadIdx.x]; + // parts_soa.div_v[j] += div_v_tmp[threadIdx.x]; + // parts_soa.rot_ux[j] += rot_ux_tmp[threadIdx.x]; + // parts_soa.rot_uy[j] =+ rot_uy_tmp[threadIdx.x]; + // parts_soa.rot_uz[j] += rot_uz_tmp[threadIdx.x]; + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + // if(n_neighbours > 0){ + // distby2h = distby2h/n_neighbours; + // av_dist = av_dist/(n_neighbours*dx); + // } + // av_distx = av_distx/(n_neighbours*dx); + // av_disty = av_disty/(n_neighbours*dx); + // av_distz = av_distz/(n_neighbours*dx); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, 
parts_soa.rot_uy[pid] = rot_uyi; + parts_soa.rot_uz[pid] = rot_uzi; + // if(rhoi != 0.f)printf("rho i %f, rho_dh i %f\n", rhoi, rho_dhi); + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_density_GPU( + struct part_soa parts_soa, int *d_task_first_part_ci, + int *d_task_first_part_cj, int *d_task_last_part_ci, + int *d_task_last_part_cj, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int time_bin_inhibited) { + + extern __shared__ float vars[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + + first_part_in_task_blocks_ci = d_task_first_part_ci[task_id]; + last_part_in_task_blocks_ci = d_task_last_part_ci[task_id]; + first_part_in_task_blocks_cj = d_task_first_part_cj[task_id]; + last_part_in_task_blocks_cj = d_task_last_part_cj[task_id]; + + // Now we start calculations for particles in cell i + const int pid = threadid + first_part_in_task_blocks_ci; + + /*Don't ever put me in an if statement. I've got __syncthreads inside*/ + DOPAIRGPU(parts_soa, pid, last_part_in_task_blocks_ci, + first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, d_a, d_H, + time_bin_inhibited, vars); + // __syncthreads(); + // Now we start calculations for particles in cell i + const int pjd = threadid + last_part_in_task_blocks_ci; + /*Don't ever put me in an if statement. I've got __syncthreads inside*/ + DOPAIRGPU(parts_soa, pjd, last_part_in_task_blocks_cj, + first_part_in_task_blocks_ci, last_part_in_task_blocks_ci, d_a, d_H, + time_bin_inhibited, vars); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_sym_density_GPU( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int time_bin_inhibited, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + // Now we start calculations for particles in cell i + const int pid = threadid + ci_start; + + /*Don't ever put me in an if statement. 
I've got __syncthreads inside*/ + DOPAIR2GPU(parts_soa, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H, + time_bin_inhibited, vars_pair, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp); + // __syncthreads(); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_nonsym_density_GPU( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int time_bin_inhibited, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPU(parts_soa, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, + flip_i); + + /*Necessary evil to stop parts from j and i co-existing on shared memory for + * sums*/ + __syncthreads(); + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPU(parts_soa, pjd, cj_start, cj_end, ci_start, ci_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, + flip_j); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPU(parts_soa, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, + flip_i); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPU(parts_soa, pjd, cj_start, cj_end, ci_start, ci_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, + flip_j); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair_aos[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPUAOS(parts_aos, pid, ci_start, ci_end, cj_start, cj_end, d_a, + d_H, vars_pair_aos, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp, flip_i); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair_aos[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPUAOS(parts_aos, pjd, cj_start, cj_end, ci_start, ci_end, d_a, + d_H, vars_pair_aos, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp + 1, flip_j); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + int4 *fparti_fpartj_lparti_lpartj_dens, float d_a, float d_H, + int bundle_first_task) { + + extern __shared__ float4 vars_pair_i_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int ci_start = fparti_fpartj_lparti_lpartj_dens[task_id].x; + const int cj_start = fparti_fpartj_lparti_lpartj_dens[task_id].y; + const int ci_end = fparti_fpartj_lparti_lpartj_dens[task_id].z; + const int cj_end = fparti_fpartj_lparti_lpartj_dens[task_id].w; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + + DOPAIR2NONSYMGPUAOSF4(parts_send, parts_recv, pid, ci_start, ci_end, cj_start, + cj_end, d_a, d_H, vars_pair_i_f4); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + int4 *fparti_fpartj_lparti_lpartj_dens, float d_a, float d_H, + int bundle_first_task) { + + extern __shared__ float4 vars_pair_j_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + const int ci_start = fparti_fpartj_lparti_lpartj_dens[task_id].x; + const int cj_start = fparti_fpartj_lparti_lpartj_dens[task_id].y; + const int ci_end = fparti_fpartj_lparti_lpartj_dens[task_id].z; + const int cj_end = fparti_fpartj_lparti_lpartj_dens[task_id].w; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + DOPAIR2NONSYMGPUAOSF4(parts_send, parts_recv, pjd, cj_start, cj_end, ci_start, + ci_end, d_a, d_H, vars_pair_j_f4); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_density_GPU_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, int bundle_first_part, int bundle_n_parts) { + + // extern __shared__ float4 vars_pair_i_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int pid = bundle_first_part + threadid; + // const int task_id = bundle_first_part + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + if (pid < bundle_first_part + bundle_n_parts) { + const struct part_aos_f4_send pi = parts_send[pid]; + const int cj_start = pi.cjs_cje.x; + const int cj_end = pi.cjs_cje.y; + + /* Start calculations for particles in cell i*/ + DOPAIR2NAIVEGPUAOSF4(pi, parts_send, parts_recv, pid, cj_start, cj_end, d_a, + d_H); + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair_aosg[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPUAOSG(parts_aos, pid, ci_start, ci_end, cj_start, cj_end, d_a, + d_H, vars_pair_aosg, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp, flip_i); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair_aosg[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPUAOSG(parts_aos, pjd, cj_start, cj_end, ci_start, ci_end, d_a, + d_H, vars_pair_aosg, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp + 1, flip_j); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_gradient_GPU_aos_f4( + struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, + int bundle_first_part, int bundle_n_parts) { + + // extern __shared__ float4 vars_pair_i_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int pid = bundle_first_part + threadid; + // const int task_id = bundle_first_part + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + if (pid < bundle_first_part + bundle_n_parts) { + const struct part_aos_f4_g_send pi = parts_send[pid]; + const int cj_start = pi.cjs_cje.x; + const int cj_end = pi.cjs_cje.y; + /* Start calculations for particles in cell i*/ + DOPAIR2NAIVEGPUAOSF4G(pi, parts_send, parts_recv, pid, cj_start, cj_end, + d_a, d_H); + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair_aosf[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = 
d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPUAOSF(parts_aos, pid, ci_start, ci_end, cj_start, cj_end, d_a, + d_H, vars_pair_aosf, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp, flip_i); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { + + extern __shared__ float vars_pair_aosf[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPUAOSF(parts_aos, pjd, cj_start, cj_end, ci_start, ci_end, d_a, + d_H, vars_pair_aosf, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp + 1, flip_j); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_force_GPU_aos_f4( + struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H, + int bundle_first_part, int bundle_n_parts) { + + // extern __shared__ float4 vars_pair_i_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int pid = bundle_first_part + threadid; + // const int task_id = bundle_first_part + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + if (pid < bundle_first_part + bundle_n_parts) { + const struct part_aos_f4_f_send pi = parts_send[pid]; + const int cj_start = pi.cjs_cje.x; + const int cj_end = pi.cjs_cje.y; + /* Start calculations for particles in cell i */ + DOPAIR2NAIVEGPUAOSF4F(pi, parts_send, parts_recv, pid, cj_start, cj_end, + d_a, d_H); + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopair1_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, int time_bin_inhibited, + double *d_shift_x, double *d_shift_y, double *d_shift_z) { + + int max_parts = max(max_parts_j, max_parts_i); + int numBlocks_x = (max_parts + 
BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + // fprintf(stderr,"max_parts %i, max_partsi %i, max_partsj %i\n, " + // "numBlocks_x %i, numBlocks_y %i, BLOCK_SIZE %i\n", max_parts, + // max_parts_i, max_parts_j, numBlocks_x, numBlocks_y, + // BLOCK_SIZE); + + /*Do ci & cj*/ + // fprintf(stderr, "BLOCK_SIZE %i max parts %i num idle threads %i\n", + // BLOCK_SIZE, max_parts, numBlocks_x * BLOCK_SIZE - max_parts); + + // runner_do_pair_sym_density_GPU<<>>( + // parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, + // d_a, d_H, bid, tid, count_tasks, tasksperbundle, + // nBlocks_per_task, bundle_first_task, time_bin_inhibited, d_shift_x, + // d_shift_y, d_shift_z); + + runner_do_pair_nonsym_density_GPU<<>>( + parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + time_bin_inhibited, d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_i; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_ci_density_GPU<<>>( + parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_j; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_cj_density_GPU<<>>( + parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_i; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_ci_density_GPU_aos<<>>( + parts_aos, 
d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_j; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_cj_density_GPU_aos<<>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_ci_density_GPU_aos_f4<<< + gridShape, BLOCK_SIZE, 2 * BLOCK_SIZE * sizeof(float4), stream>>>( + parts_send, parts_recv, fparti_fpartj_lparti_lpartj_dens, d_a, d_H, + bundle_first_task); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_cj_density_GPU_aos_f4<<< + gridShape, BLOCK_SIZE, 2 * BLOCK_SIZE * sizeof(float4), stream>>>( + parts_send, parts_recv, fparti_fpartj_lparti_lpartj_dens, d_a, d_H, + bundle_first_task); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopair_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + // fprintf(stderr, "nblocks %i\n", numBlocks_x); + runner_do_pair_density_GPU_aos_f4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_i; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + 
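+  /* Launch geometry: blockIdx.x walks the particles of the larger cell in
+   * BLOCK_SIZE-wide slabs (numBlocks_x is the ceiling of max_parts /
+   * BLOCK_SIZE), while blockIdx.y selects the task within this bundle, so
+   * the kernel reconstructs task_id = bundle_first_task + blockIdx.y and
+   * pid = blockDim.x * blockIdx.x + threadIdx.x + cell start index. */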
dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_ci_density_GPU_aos_g<<< + gridShape, BLOCK_SIZE, + 12 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_j; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_cj_density_GPU_aos_g<<< + gridShape, BLOCK_SIZE, + 12 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopair_branch_gradient_gpu_aos_f4( + struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + // fprintf(stderr, "nblocks %i\n", numBlocks_x); + runner_do_pair_gradient_GPU_aos_f4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int max_parts = max_parts_i; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_ci_density_GPU_aos_f<<< + gridShape, BLOCK_SIZE, + 17 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { + + int 
max_parts = max_parts_j; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_cj_density_GPU_aos_f<<< + gridShape, BLOCK_SIZE, + 17 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopair_branch_force_gpu_aos_f4( + struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + // fprintf(stderr, "nblocks %i\n", numBlocks_x); + runner_do_pair_force_GPU_aos_f4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif + +__global__ void runner_do_self_density_GPU_naive( + struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int bid, int tid, int count_tasks, int tasksperbundle, + int nBlocks_per_task, int bundle_first_task, int max_parts, + int time_bin_inhibited) { + + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id]; + last_part_in_task_blocks = d_task_last_part[task_id]; + + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + + if (pid < last_part_in_task_blocks) { + ttid = parts_soa.tid_p[pid]; + first_part = d_task_first_part[ttid]; + last_part = d_task_last_part[ttid]; + count = last_part - first_part; + cellx = parts_soa.locx[pid], celly = parts_soa.locy[pid], + cellz = parts_soa.locz[pid]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - cellx; + piy = parts_soa.y_p[pid] - celly; + piz = parts_soa.z_p[pid] - cellz; + + int n_neighbours = 0; + + /*Naive loop over neighbours*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int j = j_block + b; + if (j < last_part_in_task_blocks) { + const float x_p_tmp = parts_soa.x_p[j]; + const float y_p_tmp = parts_soa.y_p[j]; + const float z_p_tmp = parts_soa.z_p[j]; + const float h_tmp = parts_soa.h[j]; + const float mass_tmp = parts_soa.mass[j]; + const float ux_tmp = parts_soa.ux[j]; + const float uy_tmp = parts_soa.uy[j]; + const float uz_tmp = 
parts_soa.uz[j]; + const timebin_t timebin = parts_soa.time_bin[j]; + + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp - cellx; + const float pjy = y_p_tmp - celly; + const float pjz = z_p_tmp - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + const float hj = h_tmp, hjg2 = hj * hj * kernel_gamma2; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + Found_neighbours = 1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp; + /* Get the kernel for hi. */ + if (hi < 1.f / 128.f) printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp, dvy = uyi - uy_tmp, dvz = uzi - uz_tmp; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + } + // float wi, wi_dx; + // d_kernel_deval(0.f, &wi, &wi_dx); + if (Found_neighbours == 0) + printf("Not sure what's going on but no neighbours found in GPU loop\n"); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi, + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_tester_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, int offset, + int bundle_first_task, int max_parts, + int time_bin_inhibited) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + tester<<>>(parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, + bid, tid, count_tasks, tasksperbundle, nBlocks_per_task, + bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + runner_do_self_density_GPU<<>>( + parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, count_tasks, + tasksperbundle, nBlocks_per_task, bundle_first_task, max_parts); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_gradient_aos(struct part_aos_g 
*parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts, double *d_cell_x, + double *d_cell_y, double *d_cell_z) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_G<<>>(parts_aos, d_task_first_part, d_task_last_part, + d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, + d_cell_x, d_cell_y, d_cell_z); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_gradient_aos_f4(struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F4_G<<>>(parts_send, parts_recv, d_a, d_H, + bundle_first_task, d_task_first_part_f4); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_force_aos(struct part_aos_f *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int bundle_first_task, + int max_parts, double *d_cell_x, double *d_cell_y, + double *d_cell_z) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F<<>>(parts_aos, d_task_first_part, d_task_last_part, + d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, + d_cell_x, d_cell_y, d_cell_z); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_force_aos_f4(struct part_aos_f4_f_send *d_parts_send, + struct part_aos_f4_f_recv *d_parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F4_F<<< + gridShape, BLOCK_SIZE, + 4 * BLOCK_SIZE * sizeof(float4) + BLOCK_SIZE * sizeof(float3), stream>>>( + d_parts_send, d_parts_recv, d_a, d_H, bundle_first_task, + d_task_first_part_f4); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif diff --git a/src/cuda/GPU_runner_functions.h b/src/cuda/GPU_runner_functions.h new file mode 100644 index 0000000000..27bbecdd92 --- /dev/null +++ 
b/src/cuda/GPU_runner_functions.h @@ -0,0 +1,148 @@ +#ifndef CUDA_HEADERS_H +#define CUDA_HEADERS_H +#define n_streams 1024 + +#ifdef __cplusplus +extern "C" { +#endif +#include "part_gpu.h" +void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts); +void launch_density_aos(struct part_aos *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int bundle_first_task, + int max_parts, double *d_cell_x, double *d_cell_y, + double *d_cell_z); +void launch_density_aos_f4(struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4); +void launch_gradient_aos(struct part_aos_g *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts, double *d_cell_x, + double *d_cell_y, double *d_cell_z); +void launch_gradient_aos_f4(struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4); +void launch_force_aos(struct part_aos_f *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int bundle_first_task, + int max_parts, double *d_cell_x, double *d_cell_y, + double *d_cell_z); +void launch_force_aos_f4(struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4); +void launch_density_pair_two_kernels( + struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, + int *d_task_first_part_ci, int *d_task_first_part_cj, + int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, int bid, int block_size, + int count_tasks, int tasksperbundle, int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, + int max_active_bin); +void runner_dopair1_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, int max_active_bin, + double *d_shift_x, double *d_shift_y, double *d_shift_z); +void runner_dopairci_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int 
bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopaircj_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopairci_branch_density_gpu_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopairci_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens); +void runner_dopaircj_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens); +void runner_dopair_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts); +void runner_dopaircj_branch_density_gpu_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopairci_branch_density_gpu_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopaircj_branch_density_gpu_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopair_branch_gradient_gpu_aos_f4( + struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts); +void runner_dopairci_branch_density_gpu_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int 
max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopaircj_branch_density_gpu_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopair_branch_force_gpu_aos_f4( + struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts); +#ifdef __cplusplus +} +#endif + +#endif // CUDA_HEADER_H diff --git a/src/cuda/Makefile.am b/src/cuda/Makefile.am new file mode 100644 index 0000000000..5fb5bbc34f --- /dev/null +++ b/src/cuda/Makefile.am @@ -0,0 +1,66 @@ +SOURCES_CUDA = GPU_runner_functions.cu tester.cu ../files_for_new_functions/arrays_malloc.cu ../files_for_new_functions/host_device_data_transfer.cu #../runner_main.cu +include_HEADERS = GPU_runner_functions.h device_functions.h BLOCK_SIZE.h tester.h ../files_for_new_functions/arrays_malloc.h ../files_for_new_functions/host_device_data_transfer.h +EXTRA_DIST = $(SOURCES_CUDA) $(include_HEADERS) + +if HAVECUDA + +AM_CFLAGS = -I.. $(HDF5_CPPFLAGS) +CUDA_MYFLAGS = -D_FORCE_INLINES -O4 -lineinfo -src-in-ptx --maxrregcount=64 -ftz=true -DWITH_CUDA --default-stream per-thread --use_fast_math -lcudadevrt #-dlink -ccbin=gcc +CUDA_MYFLAGS += -arch=sm_70 +CUDA_MYFLAGS += --extra-device-vectorization + +#CUDA_MYFLAGS = -D_FORCE_INLINES -O3 -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -ccbin=gcc -m64 --default-stream per-thread #-dlink +#CUDA_MYFLAGS += -arch=sm_80 \ +#-gencode=arch=compute_80,code=sm_80 \ +#-gencode=arch=compute_86,code=sm_86 \ +#-gencode=arch=compute_87,code=sm_87 \ +#-gencode=arch=compute_86,code=compute_86 +#CUDA_MYFLAGS += --extra-device-vectorization + +# Assign a "safe" version number +AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) -version-info 0:0:0 + +#bin_PROGRAMS = test_27_cells test_125_cells + +# Rules to compile CUDA code. +.cu.o: + $(NVCC) -c $(NVCCFLAGS) $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) $< -o $@ +.cu.lo: + PATH=$(top_srcdir):$(PATH) && cudalt.py $@ $(NVCC) -c $(NVCCFLAGS) $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) $< + +# The library. Dummy C library so that we get libtool linking setup. +lib_LTLIBRARIES = libswiftCUDA.la libswiftdummy.la + +# Special link command to avoid including CFLAGS which are not understood. 
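
[Editorial aside, not part of the patch] The wrappers declared in GPU_runner_functions.h above are plain-C entry points, which is why the library built here can be linked into the C-only SWIFT core: runner_main.c never needs to be compiled by nvcc. A minimal, hypothetical call site could look like the sketch below; the buffer allocation, host-to-device copies, and the block-count choices (128-thread blocks in x covering particles, one block row per task in y, matching num_cuda_threads in device_functions.h) are assumptions for illustration only.

    #include <cuda_runtime.h>            /* cudaStream_t, int2 */
    #include "GPU_runner_functions.h"

    /* Hypothetical helper: launch the f4 density kernel for one bundle of
     * tasks and wait for the results. Device buffers d_send/d_recv and
     * d_first_last_parts are assumed to be filled elsewhere. */
    void example_run_density_bundle(struct part_aos_f4_send *d_send,
                                    struct part_aos_f4_recv *d_recv,
                                    int2 *d_first_last_parts,
                                    cudaStream_t stream, float a, float H,
                                    int n_tasks, int bundle_first_task,
                                    int max_parts) {
      const int numBlocks_y = n_tasks;                 /* one row per task */
      const int numBlocks_x = (max_parts + 127) / 128; /* cover largest task */
      launch_density_aos_f4(d_send, d_recv, a, H, stream, numBlocks_x,
                            numBlocks_y, bundle_first_task,
                            d_first_last_parts);
      cudaStreamSynchronize(stream); /* results valid only after this */
    }
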
+libswiftCUDA_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(libswiftCUDA_la_LDFLAGS) \ + $(LDFLAGS) -o $@ + +libswiftCUDA_la_SOURCES = $(SOURCES_CUDA) +libswiftCUDA_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) ../libswiftsim_cuda.la -I../ +libswiftCUDA_la_CXXFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) ../libswiftsim_cuda.la -I../ +libswiftCUDA_la_LIBADD = ../.libs/libswiftsim_cuda.la +libswiftCUDA_la_LDFLAGS = $(AM_LDFLAGS) + +if HAVEMPI +libswiftCUDA_la_CFLAGS += ../libswiftsim_mpicuda.la +libswiftCUDA_la_CXXFLAGS += ../libswiftsim_mpicuda.la +libswiftCUDA_la_LIBADD += ../.libs/libswiftsim_mpicuda.la +endif + +libswiftdummy_la_SOURCES = dummy.c +libswiftdummy_la_CFLAGS = $(AM_CFLAGS) +libswiftdummy_la_LDFLAGS = $(AM_LDFLAGS) + +#test_27_cells_SOURCES=test27cells.c +#test_27_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS) +#test_27_cells_LDADD= ../.libs/libswiftsim_cuda.la ../.libs/libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart +#test_27_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS) + +#test_125_cells_SOURCES=test125cells.c +#test_125_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS) +#test_125_cells_LDADD= ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart +#test_125_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS) + +endif diff --git a/src/cuda/device_functions.h b/src/cuda/device_functions.h new file mode 100644 index 0000000000..afc4a1a5d8 --- /dev/null +++ b/src/cuda/device_functions.h @@ -0,0 +1,149 @@ +#ifndef DEVICE_FUNCTIONS_H +#define DEVICE_FUNCTIONS_H +#include "../../config.h" + +/* Local headers. */ +// #include "../dimension.h" +// #include "../error.h" +// #include "../inline.h" +// #include "../minmax.h" +// #include "../vector.h" + +// Is this even necessary? Probably not as our code will operate differently +#define num_cuda_threads 128 +#define hydro_dimension 3.f + +/// Here we define stuff from kernel_hydro.h when using cubic_spline_kernel. +/// Will worry about sorting 'if statements for different kernels later//// +/* First some powers of gamma = H/h */ +#define kernel_gamma ((float)(1.825742)) +#define kernel_gamma_inv ((float)(1. / kernel_gamma)) +#define kernel_gamma2 ((float)(kernel_gamma * kernel_gamma)) +#define kernel_ivals 2 +#define kernel_degree 3 /*!< Degree of the polynomial */ +#define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma)) +#define kernel_gamma_dim_plus_one \ + ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)) +#define kernel_gamma_inv_dim \ + ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma))) +#define kernel_gamma_inv_dim_plus_one \ + ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))) +#define kernel_ivals_f ((float)kernel_ivals) /*!< Number of branches */ +#define kernel_constant ((float)(16. * M_1_PI)) +/*! Cosmology default beta=3.0. + * Alpha can be set in the parameter file. + * Beta is defined as in e.g. Price (2010) Eqn (103) */ +#define const_viscosity_beta 3.0f +#ifdef WITH_CUDA +extern "C" { +#endif +/** + * @brief Returns the argument to the power given by the dimension plus one + * + * Computes \f$x^{d+1}\f$. 
+ */ +__device__ float d_pow_dimension_plus_one(float x) { + +#if defined(HYDRO_DIMENSION_3D) + + const float x2 = x * x; + return x2 * x2; + +#elif defined(HYDRO_DIMENSION_2D) + + return x * x * x; + +#elif defined(HYDRO_DIMENSION_1D) + + return x * x; + +#else + + error("The dimension is not defined !"); + return 0.f; + +#endif +} + +/** + * @brief Return the argument to the power three adiabatic index minus five over + * two. + * + * Computes \f$x^{(3\gamma - 5)/2}\f$. + * + * @param x Argument + */ +__device__ float d_pow_three_gamma_minus_five_over_two(float x) { +#if defined(HYDRO_GAMMA_5_3) + + return 1.f; /* x^(0) */ + +#elif defined(HYDRO_GAMMA_7_5) + + return powf(x, -0.4f); /* x^(-2/5) */ + +#elif defined(HYDRO_GAMMA_4_3) + + return 1.f / sqrtf(x); /* x^(-1/2) */ + +#elif defined(HYDRO_GAMMA_2_1) + + return sqrtf(x); /* x^(1/2) */ + +#else + + error("The adiabatic index is not defined !"); + return 0.f; + +#endif +} + +/** + * @brief Computes the kernel function and its derivative. + * + * The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient + * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem. + * + * Returns 0 if \f$u > \gamma = H/h\f$. + * + * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$. + * @param W (return) The value of the kernel function \f$W(x,h)\f$. + * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$. + */ +__device__ void d_kernel_deval(float u, float *__restrict__ W, + float *__restrict__ dW_dx) { + + /* Go to the range [0,1[ from [0,H[ */ + const float x = u * kernel_gamma_inv; + + /* Pick the correct branch of the kernel */ + const int temp = (int)(x * kernel_ivals_f); + const int ind = temp > kernel_ivals ? kernel_ivals : temp; + static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] = { + 3.f, -3.f, 0.f, 0.5f, /* 0 < u < 0.5 */ + -1.f, 3.f, -3.f, 1.f, /* 0.5 < u < 1 */ + 0.f, 0.f, 0.f, 0.f}; /* 1 < u */ + const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + /* First two terms of the polynomial ... */ + float w = coeffs[0] * x + coeffs[1]; + float dw_dx = coeffs[0]; + + /* ... and the rest of them */ + for (int k = 2; k <= kernel_degree; k++) { + dw_dx = dw_dx * x + w; + w = x * w + coeffs[k]; + } + + w = fmaxf(w, 0.f); + dw_dx = fminf(dw_dx, 0.f); + + /* Return everything */ + *W = w * kernel_constant * kernel_gamma_inv_dim; + *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one; +} + +#ifdef WITH_CUDA +} +#endif + +#endif // DEVICE_FUNCTIONS_H diff --git a/src/cuda/dummy.c b/src/cuda/dummy.c new file mode 100755 index 0000000000..c75d2d873c --- /dev/null +++ b/src/cuda/dummy.c @@ -0,0 +1,9 @@ +#include + +#ifdef __cplusplus +extern "C" { +#endif +void swiftcudadummy(void) {} +#ifdef __cplusplus +} +#endif diff --git a/src/cuda/kernel_definitions.cu b/src/cuda/kernel_definitions.cu new file mode 100644 index 0000000000..a272b7beee --- /dev/null +++ b/src/cuda/kernel_definitions.cu @@ -0,0 +1,114 @@ +/******************************************************************************* + * This file contains functions used to setup and execute GPU tasks from within + *runner_main.c. 
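
[Editorial aside, not part of the patch] The device helpers defined in device_functions.h above are meant to be called from inside kernels, with the caller applying the h^{-d} (and h^{-(d+1)}) normalisation, as the d_kernel_deval() comment states. The sketch below shows a minimal kernel doing exactly that for a precomputed pair distance; the array names and launch configuration are invented for the example, and device_functions.h is assumed to be in scope.

    /* Illustration only: evaluate the cubic-spline kernel for a stored
     * pair distance r[i] and apply the 3D normalisation h^-3 by hand. */
    __global__ void example_kernel_eval(const float *r, const float *h,
                                        const float *mass,
                                        float *rho_contrib, int n) {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i >= n) return;

      float W, dW_dx;
      const float h_inv = 1.f / h[i];
      d_kernel_deval(r[i] * h_inv, &W, &dW_dx); /* u = r / h */

      /* Caller applies h^-3 to W; a gradient term would need h^-4. */
      rho_contrib[i] = mass[i] * W * h_inv * h_inv * h_inv;
    }
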
Consider this a translator allowing .cu based functions to be + *called from within runner_main.c + ******************************************************************************/ +#ifdef WITH_CUDA +#ifndef static +#define static +#endif +// #ifndef restrict +// #define restrict __restrict__ +// #endif +#endif + +/* Required header files */ +#include +/*ifdef __cplusplus prevents name mangling. C code sees exact names + of functions rather than mangled template names produced by C++*/ +#ifdef __cplusplus +extern "C" { +#endif +#include "cell_gpu.h" +#include "cuda_headers.h" +#ifdef __cplusplus +} +#endif + +/* function to initialise and printout GPU name*/ +#ifdef __cplusplus +extern "C" { +#endif +void Initialise_GPU() { + int devId = 0; + // find and print device name + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, devId); + printf("Device : %s\n", prop.name); + cudaSetDevice(devId); + // cuda +} +#ifdef __cplusplus +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void CPU_runner_doself1_branch_gradient(struct cell_gpu *restrict ci_gpu) { + int id = ci_gpu->hydro.parts[0].id; + printf("id of first part %d\n", id); + // Do stuff here for interactions on CPU but using the temporary GPU arrays + // const int count_i = ci_gpu->hydro.count; + // const int count_j = cj_gpu->hydro.count; + // system("pause"); + /* Anything to do here? */ + // if (!cell_is_active_hydro(ci_gpu, e) && !cell_is_active_hydro(cj_gpu, + // e)) return; +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void GPU_runner_doself1_branch_gradient(struct cell_gpu *restrict ci_gpu) { + int count = ci_gpu->hydro.count; + int numBlocks = (count + BLOCK_SIZE - 1) / BLOCK_SIZE; + + struct cell_gpu *d_ci_gpu; + cudaMalloc((void **)&d_ci_gpu, sizeof(cell_gpu)); + + cudaMemcpy(d_ci_gpu, ci_gpu, sizeof(cell_gpu), cudaMemcpyHostToDevice); + SPH_Sum_Self<<>>(d_ci_gpu); + cudaMemcpy(ci_gpu, d_ci_gpu, sizeof(cell_gpu), cudaMemcpyDeviceToHost); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void SPH_Sum_Self(cell_gpu *d_ci_gpu) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int i = index; + float sumLoc, xi, yi, zi; + struct part_gpu *restrict parts = d_ci_gpu->hydro.parts; + xi = parts[i].x[0]; + yi = parts[i].x[1]; + zi = parts[i].x[2]; + sumLoc = 0.f; + float h = parts[i].h, mass = parts[i].mass, rho = parts[i].rho; + const int count = d_ci_gpu->hydro.count; + //__shared__ float sh_x[BLOCK_SIZE], sh_y[BLOCK_SIZE]; + // copy neighbour particles data to shared memory + // for (unsigned int j1=0; j1 + +#include + +typedef struct part_soa { + /*Task ID*/ + int *tid_p; + /*bundle ID*/ + int *bid_p; + /*! Particle unique ID. */ + long long *id; + /*! Pointer to corresponding gravity part. */ + // struct gpu_gpart* gpart; + /*! Particle position. */ + double *x_p; + double *y_p; + double *z_p; + /*! Particle predicted velocity. */ + float *ux; + float *uy; + float *uz; + /*! Particle acceleration. */ + float *a_hydrox; + float *a_hydroy; + float *a_hydroz; + /*! Particle mass. */ + float *mass; + /*! Particle smoothing length. */ + float *h; + /*! Particle internal energy. */ + float *u; + /*! Time derivative of the internal energy. */ + float *u_dt; + /*! Particle density. */ + float *rho; + /*! Kernel summation (For testing/debugging). */ + float *SPH_sum; + + /* Cell information */ + /*! The cell location on the grid (corner nearest to the origin). */ + float *locx; + float *locy; + float *locz; + /*! The cell dimensions. 
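
[Editorial aside, not part of the patch] The cudaMalloc()/cudaMemcpy()/kernel-launch sequence in GPU_runner_doself1_branch_gradient() above currently ignores the CUDA runtime return codes. A common pattern, shown here only as a sketch, is to wrap those calls in an error-check macro so a failed allocation or copy aborts immediately instead of surfacing later as corrupted particle data.

    #include <cuda_runtime.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Standard CUDA error-check helper (illustration only). */
    #define CUDA_CHECK(call)                                              \
      do {                                                                \
        const cudaError_t err_ = (call);                                  \
        if (err_ != cudaSuccess) {                                        \
          fprintf(stderr, "%s:%d: CUDA error: %s\n", __FILE__, __LINE__,  \
                  cudaGetErrorString(err_));                              \
          abort();                                                        \
        }                                                                 \
      } while (0)

    /* e.g. CUDA_CHECK(cudaMemcpy(d_ci_gpu, ci_gpu, sizeof(struct cell_gpu),
     *                            cudaMemcpyHostToDevice));               */
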
*/ + float *widthx; + float *widthy; + float *widthz; + float *h_max; + int *count_p; + int *count_test; + /* Density information */ + + /*! Neighbour number count. */ + float *wcount; + + /*! Derivative of the neighbour number with respect to h. */ + float *wcount_dh; + + /*! Derivative of density with respect to h */ + float *rho_dh; + + /*! Particle velocity curl. */ + float *rot_ux; + float *rot_uy; + float *rot_uz; + + /* viscosity information */ + + /*! Particle velocity divergence */ + float *div_v; + + /*! Particle velocity divergence from previous step */ + float *div_v_previous_step; + + /*! Artificial viscosity parameter */ + float *alpha_visc; + + /*! Signal velocity */ + float *v_sig; + + /* thermal diffusion information */ + + /*! del^2 u, a smoothed quantity */ + float *laplace_u; + + /*! Thermal diffusion coefficient */ + float *alpha_diff; + + /* force information */ + + /*! "Grad h" term -- only partial in P-U */ + float *f; + + /*! Particle soundspeed. */ + float *soundspeed; + + /*! Time derivative of smoothing length */ + float *h_dt; + + /*! Balsara switch */ + float *balsara; + + /*! Particle pressure. */ + float *pressure; + /*! Maximal alpha (viscosity) over neighbours */ + float *alpha_visc_max_ngb; + + /* timestep stuff */ + + /*! Time-step length */ + timebin_t *time_bin; + + /*all part of struct timestep_limiter_data, we had to destruct it + as GPUs don't like pointer chasing especially when memcpying*/ + /* Need waking-up ? */ + timebin_t *wakeup; + + /*! Minimal time-bin across all neighbours */ + timebin_t *min_ngb_time_bin; + + /* Do we want this particle to be synched back on the time-line? */ + char *to_be_synchronized; +} part_soa; +/*Container for particle data requierd for density calcs*/ +typedef struct part_aos { + + /*! Particle position. */ + double x_p; + double y_p; + double z_p; + + /*! Particle position. */ + double locx; + double locy; + double locz; + + /*! Particle predicted velocity. */ + float ux; + float uy; + float uz; + /*! Particle mass. */ + float mass; + /*! Particle smoothing length. */ + float h; + /*! Particle density. */ + float rho; + + /* Density information */ + /*! Neighbour number count. */ + float wcount; + /*! Derivative of the neighbour number with respect to h. */ + float wcount_dh; + /*! Derivative of density with respect to h */ + float rho_dh; + /*! Particle velocity curl. */ + float rot_ux; + float rot_uy; + float rot_uz; + + /* viscosity information */ + /*! Particle velocity divergence */ + float div_v; + + /* timestep stuff */ + /*! Time-step length */ + int time_bin; +} part_aos; + +/*Container for particle data requierd for density calcs*/ +typedef struct part_aos_f4_send { + /*! Particle position and h -> x, y, z, h */ + float4 x_p_h; + + /*! Particle predicted velocity and mass -> ux, uy, uz, m */ + float4 ux_m; + /*Markers for where neighbour cell j starts and stops in array indices for + * pair tasks*/ + int2 cjs_cje; +} part_aos_f4_send __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT))); + +typedef struct part_aos_f4_recv { + /* Density information; rho */ + /*! Derivative of density with respect to h; rho_dh, + * Neighbour number count; w_count + * * Derivative of the neighbour number with respect to h; w_count_dh */ + float4 rho_dh_wcount; + /*! Particle velocity curl; rot_ux and + * velocity divergence; div_v */ + float4 rot_ux_div_v; +} part_aos_f4_recv; + +/*Container for particle data required for density calcs*/ +typedef struct part_aos_f4 { + /*! 
Particle position and h -> x, y, z, h */ + float4 x_p_h; + + /*! Particle predicted velocity and mass -> ux, uy, uz, m */ + float4 ux_m; + /* Density information; rho */ + /*! Derivative of density with respect to h; rho_dh, + * Neighbour number count; w_count + * * Derivative of the neighbour number with respect to h; w_count_dh */ + float4 rho_dh_wcount; + + /*! Particle velocity curl; rot_ux and + * velocity divergence; div_v */ + float4 rot_ux_div_v; + +} part_aos_f4; + +/*Container for particle data required for force calcs*/ +typedef struct part_aos_f { + + /*! Particle position. */ + double x_p; + double y_p; + double z_p; + + /*! Particle predicted velocity. */ + float ux; + float uy; + float uz; + /*! Particle mass. */ + float mass; + /*! Particle smoothing length. */ + float h; + /*! Particle density. */ + float rho; + /*! Particle pressure. */ + float pressure; + + /* Density information */ + /*! Speed of sound. */ + float soundspeed; + /*! Variable smoothing length term */ + float f; + /*! Derivative of density with respect to h */ + float balsara; + /*! Particle velocity curl. */ + float alpha_visc; + float a_hydrox; + float a_hydroy; + float a_hydroz; + float alpha_diff; + + /* viscosity information */ + /*! Internal energy */ + float u; + float u_dt; + /*! h time derivative */ + float h_dt; + float v_sig; + + /* timestep stuff */ + /*! Time-step length */ + int time_bin; + int min_ngb_time_bin; +} part_aos_f; + +/*Container for particle data requierd for force calcs*/ +typedef struct part_aos_f4_f { + + /*Data required for the calculation: + Values read to local GPU memory*/ + /*! Particle position smoothing length */ + float4 x_h; + /*! Particle predicted velocity and mass */ + float4 ux_m; + /*! Variable smoothing length term f, balsara, timebin + * and initial value of min neighbour timebin */ + float4 f_bals_timebin_mintimebin_ngb; + /*! Particle density, pressure, speed of sound & v_sig to read*/ + float4 rho_p_c_vsigi; + /*! Particle Internal energy u, alpha constants for visc and diff */ + float3 u_alphavisc_alphadiff; + + /*Result: Values output to global GPU memory*/ + /* change of u and h with dt, v_sig and returned value of + * minimum neighbour timebin */ + float4 udt_hdt_vsig_mintimebin_ngb; + /*Particle acceleration vector*/ + float3 a_hydro; + +} part_aos_f4_f; + +/*Container for particle data requierd for force calcs*/ +typedef struct part_aos_f4_f_send { + + /*Data required for the calculation: + Values read to local GPU memory*/ + /*! Particle position smoothing length */ + float4 x_h; + /*! Particle predicted velocity and mass */ + float4 ux_m; + /*! Variable smoothing length term f, balsara, timebin + * and initial value of min neighbour timebin */ + float4 f_bals_timebin_mintimebin_ngb; + /*! Particle density, pressure, speed of sound & v_sig to read*/ + float4 rho_p_c_vsigi; + /*! Particle Internal energy u, alpha constants for visc and diff */ + float3 u_alphavisc_alphadiff; + + int2 cjs_cje; + +} part_aos_f4_f_send; + +/*Container for particle data requierd for force calcs*/ +typedef struct part_aos_f4_f_recv { + + /*Result: Values output to global GPU memory*/ + /* change of u and h with dt, v_sig and returned value of + * minimum neighbour timebin */ + float4 udt_hdt_vsig_mintimebin_ngb; + /*Particle acceleration vector*/ + float3 a_hydro; + +} part_aos_f4_f_recv; + +/*Container for particle data requierd for gradient calcs*/ +typedef struct part_aos_g { + + /*! Particle position. */ + double x_p; + double y_p; + double z_p; + + /*! 
Particle velocity. */ + float ux; + float uy; + float uz; + /*! Particle mass. */ + float mass; + /*! Particle smoothing length. */ + float h; + /*! Particle density. */ + float rho; + + /* viscosity information */ + float visc_alpha; + float laplace_u; + float alpha_visc_max_ngb; + float v_sig; + + float u; + + float soundspeed; + + /* timestep stuff */ + /*! Time-step length */ + int time_bin; +} part_aos_g; + +/*Container for particle data requierd for gradient calcs*/ +typedef struct part_aos_f4_g { + + /*! Particle position & smoothing length */ + float4 x_h; + + /*! Particle velocity and mass */ + float4 ux_m; + + /*! Particle density alpha visc internal energy u and speed of sound c */ + float4 rho_avisc_u_c; + + /* viscosity information results */ + float3 vsig_lapu_aviscmax_empty; + +} part_aos_f4_g; + +/*Container for particle data requierd for gradient calcs*/ +typedef struct part_aos_f4_g_send { + + /*! Particle position & smoothing length */ + float4 x_h; + + /*! Particle velocity and mass */ + float4 ux_m; + + /*! Particle density alpha visc internal energy u and speed of sound c */ + float4 rho_avisc_u_c; + + /* viscosity information results */ + float3 vsig_lapu_aviscmax; + + /*Data for cell start and end*/ + int2 cjs_cje; + +} part_aos_f4_g_send; + +/*Container for particle data requierd for gradient calcs*/ +typedef struct part_aos_f4_g_recv { + + /* viscosity information results */ + float3 vsig_lapu_aviscmax; + +} part_aos_f4_g_recv; + +#ifdef __WITH_CUDA +} +#endif + +#endif // PART_GPU_H diff --git a/src/cuda/tester.cu b/src/cuda/tester.cu new file mode 100644 index 0000000000..3ffaf9e10c --- /dev/null +++ b/src/cuda/tester.cu @@ -0,0 +1,21 @@ +#include "tester.h" + +#include +#include +#ifdef __cplusplus +extern "C" { +#endif +void testing_linkage(int a, float *b, float c) { + std::vector b_value_list; + b_value_list.reserve(a); + for (int i = 0; i < a; i++) { + (*b) = (*b) + c; + b_value_list.push_back((*b)); + std::cout << "Vector value is " << b_value_list[i] << " b value is " << (*b) + << std::endl; + } + std::cout << "Final value of b is " << (*b) << std::endl; +} +#ifdef __cplusplus +} +#endif diff --git a/src/cuda/tester.h b/src/cuda/tester.h new file mode 100755 index 0000000000..5729e66904 --- /dev/null +++ b/src/cuda/tester.h @@ -0,0 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif + +void testing_linkage(int a, float *b, float c); + +#ifdef __cplusplus +} +#endif diff --git a/src/engine.c b/src/engine.c index 6d1fa0e3f7..023885cb0c 100644 --- a/src/engine.c +++ b/src/engine.c @@ -1092,12 +1092,22 @@ int engine_estimate_nr_tasks(const struct engine *e) { */ n1 += 38; n2 += 2; +#ifdef WITH_CUDA // A. Nasar + n1 += 4; // Self force and density packs should be 2 but doubled to prevent + // code crash due to unpack tasks + n1 += 52; // Pair force and density packs should be 26 but doubled to + // prevent code crash due to unpack tasks +#endif #ifdef WITH_MPI n1 += 6; #endif #ifdef EXTRA_HYDRO_LOOP n1 += 15; +#ifdef WITH_CUDA + n1 += 1; // Self gradient packs + n1 += 13; // Pair gradient packs +#endif #ifdef WITH_MPI n1 += 2; #endif @@ -1750,9 +1760,13 @@ void engine_skip_force_and_kick(struct engine *e) { t->type == task_type_rt_ghost2 || t->type == task_type_rt_tchem || t->type == task_type_rt_advance_cell_time || t->type == task_type_neutrino_weight || t->type == task_type_csds || - t->subtype == task_subtype_force || + t->subtype == task_subtype_force || // A. 
Nasar + t->subtype == task_subtype_gpu_pack_f || + t->subtype == task_subtype_gpu_unpack_f || t->subtype == task_subtype_limiter || t->subtype == task_subtype_gradient || + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_unpack_g || t->subtype == task_subtype_stars_prep1 || t->subtype == task_subtype_stars_prep2 || t->subtype == task_subtype_stars_feedback || @@ -2192,7 +2206,25 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, } #endif + // scheduler_write_dependencies(&e->sched, e->verbose, e->step); // A. Nasar + // write deps before running first step /* Now, launch the calculation */ + // message("n tasks %i", e->sched.nr_tasks); + // for (int i = 0; i < e->sched.nr_tasks; i++){ + // struct task *tmp_t = &e->sched.tasks[i]; + // if(tmp_t->subtype == task_subtype_density){ + // if(tmp_t->skip == 1)error("inactive density task"); + // } + //// if(tmp_t->subtype == task_subtype_force){ + //// if(tmp_t->skip == 1)error("inactive force task"); + //// } + // if(tmp_t->subtype == task_subtype_gpu_pack_d){ + // if(tmp_t->skip == 1)error("inactive pack task"); + // } + // if(tmp_t->subtype == task_subtype_gpu_unpack_d){ + // if(tmp_t->skip == 1)error("inactive unpack task"); + // } + // } TIMER_TIC; engine_launch(e, "tasks"); TIMER_TOC(timer_runners); @@ -2280,6 +2312,22 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, scheduler_write_cell_dependencies(&e->sched, e->verbose, e->step); if (e->nodeID == 0) scheduler_write_task_level(&e->sched, e->step); + // for (int i = 0; i < e->sched.nr_tasks; i++){ + // struct task *tmp_t = &e->sched.tasks[i]; + // if(tmp_t->subtype == task_subtype_density){ + // if(tmp_t->skip == 1)error("inactive density task"); + // } + // if(tmp_t->subtype == task_subtype_force){ + // if(tmp_t->skip == 1)error("inactive force task"); + // } + // if(tmp_t->subtype == task_subtype_gpu_pack_d){ + // if(tmp_t->skip == 1)error("inactive pack task"); + // } + // if(tmp_t->subtype == task_subtype_gpu_unpack_d){ + // if(tmp_t->skip == 1)error("inactive unpack task"); + // } + // } + /* Run the 0th time-step */ TIMER_TIC2; engine_launch(e, "tasks"); diff --git a/src/engine_config.c b/src/engine_config.c index 5e6c4eb98c..4c0c4420c4 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -32,6 +32,19 @@ #include #endif +#ifdef WITH_CUDA +#include "runner_main_clean.cu" + +#include /* A. Nasar */ +#endif + +#ifdef WITH_HIP +// #include "/opt/rocm-5.1.0/hip/include/hip/hip_runtime.h" +#include "runner_main_clean.hip" + +#include +#endif + /* This object's header. */ #include "engine.h" @@ -909,9 +922,12 @@ void engine_config(int restart, int fof, struct engine *e, e->links_per_tasks = parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.); - /* Init the scheduler. */ + /* Init the scheduler. Allow stealing*/ scheduler_init(&e->sched, e->s, maxtasks, nr_queues, (e->policy & scheduler_flag_steal), e->nodeID, &e->threadpool); + /* Init the scheduler. NO stealing A. Nasar */ + // scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, + // &e->threadpool); /* Maximum size of MPI task messages, in KB, that should not be buffered, * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. 
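
[Editorial aside, not part of the patch] For scale, taking only the increments shown in engine_estimate_nr_tasks() above and assuming a non-MPI build with EXTRA_HYDRO_LOOP: the per-cell hydro estimate n1 grows from 38 + 15 = 53 to 38 + 4 + 52 + 15 + 1 + 13 = 123 once the CUDA pack/unpack tasks are counted, i.e. the GPU task machinery more than doubles the estimate used to size the scheduler's task array.
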
Can be @@ -981,9 +997,20 @@ void engine_config(int restart, int fof, struct engine *e, for (int k = 0; k < e->nr_threads; k++) { e->runners[k].id = k; e->runners[k].e = e; + +#ifdef WITH_CUDA + if (pthread_create(&e->runners[k].thread, NULL, &runner_main2, + &e->runners[k]) != 0) + error("Failed to create GPU runner thread."); +#elif WITH_HIP + if (pthread_create(&e->runners[k].thread, NULL, &runner_main_hip, + &e->runners[k]) != 0) + error("Failed to create runner thread."); +#else if (pthread_create(&e->runners[k].thread, NULL, &runner_main, &e->runners[k]) != 0) error("Failed to create runner thread."); +#endif /* Try to pin the runner to a given core */ if (with_aff && diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 1c5a65d88f..a0ff23b2be 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -583,8 +583,13 @@ void engine_addtasks_recv_hydro( /* Early abort (are we below the level where tasks are)? */ if (!cell_get_flag(c, cell_flag_has_tasks)) return; - /* Have we reached a level where there are any hydro tasks ? */ - if (t_xv == NULL && c->hydro.density != NULL) { + /* Have we reached a level where there are any hydro tasks ? */ +#ifdef WITH_CUDA // A. Nasar + if (t_xv == NULL && c->hydro.density != NULL && c->hydro.density_pack != NULL) +#else + if (t_xv == NULL && c->hydro.density != NULL) +#endif /*WITH_CUDA*/ + { #ifdef SWIFT_DEBUG_CHECKS /* Make sure this cell has a valid tag. */ @@ -711,6 +716,18 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, t_xv, l->t); scheduler_addunlock(s, l->t, t_rho); } +#ifdef WITH_CUDA /* A. Nasar POSSIBLE BUG HERE (More like PROBABLE) NOT \ + REQUIRED Ghost in for cell j is*/ + for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { + scheduler_addunlock(s, t_xv, l->t); + scheduler_addunlock(s, l->t, t_rho); + } + for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { + scheduler_addunlock(s, l->t, t_rho); + } + +#endif + #ifdef EXTRA_HYDRO_LOOP for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); @@ -720,12 +737,37 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, t_gradient, l->t); scheduler_addunlock(s, l->t, tend); } -#else +#ifdef WITH_CUDA + for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { + scheduler_addunlock(s, t_rho, l->t); + scheduler_addunlock(s, l->t, t_gradient); + } + for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) { + scheduler_addunlock(s, l->t, t_gradient); + } + + for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { + scheduler_addunlock(s, t_gradient, l->t); + scheduler_addunlock(s, l->t, tend); + } + for (struct link *l = c->hydro.force_unpack; l != NULL; l = l->next) { + scheduler_addunlock(s, l->t, tend); + } + +#endif /*WITH_CUDA*/ +#else /*EXTRA_HYDRO_LOOP*/ for (struct link *l = c->hydro.force; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); scheduler_addunlock(s, l->t, tend); } -#endif +#ifdef WITH_CUDA + for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { + scheduler_addunlock(s, t_rho, l->t); + // scheduler_addunlock(s, l->t, t_ti); + } + scheduler_addunlock(s, c->hydro.super->hydro.f_unpack, tend); +#endif /*WITH_CUDA*/ +#endif /*EXTRA_HYDRO_LOOP*/ if (with_limiter) { for (struct link *l = c->hydro.limiter; l != NULL; l = l->next) { @@ -2088,7 +2130,10 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, for (int ind = 0; ind < num_elements; ind++) { struct task *t = 
&((struct task *)map_data)[ind]; - + if (t->ci == NULL) { // Possible fix missing when moving code over. + // Prevents unpack tasks continuing past here + break; + } struct cell *ci = t->ci; struct cell *cj = t->cj; const enum task_types t_type = t->type; @@ -2116,6 +2161,12 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); + } else if (t_subtype == task_subtype_gpu_pack_d) { // A. Nasar + engine_addlink(e, &ci->hydro.density_pack, t); + // } else if (t_subtype == task_subtype_gpu_pack_f) { + // engine_addlink(e, &ci->hydro.force_pack, t); + // } else if (t_subtype == task_subtype_gpu_pack_g) { + // engine_addlink(e, &ci->hydro.gradient_pack, t); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); } else if (t_subtype == task_subtype_external_grav) { @@ -2130,6 +2181,15 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); engine_addlink(e, &cj->hydro.density, t); + } else if (t_subtype == task_subtype_gpu_pack_d) { // A. Nasar + engine_addlink(e, &ci->hydro.density_pack, t); + engine_addlink(e, &cj->hydro.density_pack, t); + // } else if (t_subtype == task_subtype_gpu_pack_f) { + // engine_addlink(e, &ci->hydro.force_pack, t); + // engine_addlink(e, &cj->hydro.force_pack, t); + // } else if (t_subtype == task_subtype_gpu_pack_g) { + // engine_addlink(e, &ci->hydro.gradient_pack, t); + // engine_addlink(e, &cj->hydro.gradient_pack, t); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); engine_addlink(e, &cj->grav.grav, t); @@ -2146,6 +2206,15 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); + } else if (t_subtype == task_subtype_gpu_pack_d) { // A. 
Nasar + engine_addlink(e, &ci->hydro.density_pack, t); + // error("Abouzied: you need to code this up!"); + } else if (t_subtype == task_subtype_gpu_pack_f) { + engine_addlink(e, &ci->hydro.force_pack, t); + // error("Abouzied: you need to code this up!"); + } else if (t_subtype == task_subtype_gpu_pack_g) { + engine_addlink(e, &ci->hydro.gradient_pack, t); + // error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); } else if (t_subtype == task_subtype_external_grav) { @@ -2160,6 +2229,18 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); engine_addlink(e, &cj->hydro.density, t); + } else if (t_subtype == task_subtype_gpu_pack_d) { + engine_addlink(e, &ci->hydro.density_pack, t); + engine_addlink(e, &cj->hydro.density_pack, t); + // error("Abouzied: you need to code this up!"); + } else if (t_subtype == task_subtype_gpu_pack_f) { + engine_addlink(e, &ci->hydro.force_pack, t); + engine_addlink(e, &cj->hydro.force_pack, t); + // error("Abouzied: you need to code this up!"); + } else if (t_subtype == task_subtype_gpu_pack_g) { + engine_addlink(e, &ci->hydro.gradient_pack, t); + engine_addlink(e, &cj->hydro.gradient_pack, t); + // error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); engine_addlink(e, &cj->grav.grav, t); @@ -2197,7 +2278,7 @@ void engine_link_gravity_tasks(struct engine *e) { /* Get a pointer to the task. */ struct task *t = &sched->tasks[k]; - if (t->type == task_type_none) continue; + if (t->type == task_type_none || t->ci == NULL) continue; /* Get the cells we act on */ struct cell *ci = t->ci; @@ -2425,12 +2506,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, const int with_sink = (e->policy & engine_policy_sinks); #ifdef EXTRA_HYDRO_LOOP struct task *t_gradient = NULL; + struct task *t_gradient_gpu = NULL; // A. Nasar #endif #ifdef EXTRA_STAR_LOOPS struct task *t_star_prep1 = NULL; struct task *t_star_prep2 = NULL; #endif struct task *t_force = NULL; + struct task *t_force_gpu = NULL; struct task *t_limiter = NULL; struct task *t_star_density = NULL; struct task *t_star_feedback = NULL; @@ -2466,6 +2549,33 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); } + /*Make packing depend on sorts and drift A. Nasar */ + else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_d) { + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + /* Task for the second GPU hydro loop A. Nasar */ + t_force_gpu = scheduler_addtask(sched, task_type_self, + task_subtype_gpu_pack_f, 0, 0, ci, NULL); + /* Link the tasks to the cells. Do the same for GPU tasks A. Nasar */ + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); +#ifdef EXTRA_HYDRO_LOOP + /* Same work for the additional GPU hydro loop A. Nasar */ + t_gradient_gpu = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_pack_g, 0, 0, ci, NULL); + /* Add the link between the new loops and the cell. Same for GPU task A. + * Nasar */ + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + // A. Nasar add unlocks for pack tasks here. 
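
/* [Editorial summary, not part of the patch] To keep the overall picture in
 * view while reading the unlock calls in this file: for a local self task
 * with EXTRA_HYDRO_LOOP, the intended per-cell chain wired up here and by
 * the unpack-task creation further down in engine_maketasks() is
 *
 *   drift -> pack_d -> unpack_d -> ghost_in ... ghost_out -> pack_g
 *         -> unpack_g -> extra_ghost -> pack_f -> unpack_f -> end_force
 *
 * where each unpack task is shared by a bundle of pack tasks. */
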
Unpacks depend on packs and + // will be used to create downstream deps later + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); +#else + /* Now, build all the dependencies for the hydro */ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); +#endif + } + /* Sort tasks depend on the drift of the cell (stars version). */ else if (t_type == task_type_stars_sort && ci->nodeID == nodeID) { scheduler_addunlock(sched, ci->hydro.super->stars.drift, t); @@ -2549,6 +2659,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Link the tasks to the cells */ engine_addlink(e, &ci->hydro.force, t_force); + if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); } @@ -2582,10 +2693,9 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Same work for the additional hydro loop */ t_gradient = scheduler_addtask(sched, task_type_self, task_subtype_gradient, flags, 0, ci, NULL); - - /* Add the link between the new loops and the cell */ + /* Add the link between the new loops and the cell. Same for GPU task A. + * Nasar */ engine_addlink(e, &ci->hydro.gradient, t_gradient); - /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, ci, with_cooling, @@ -2727,6 +2837,80 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, } } + /*Make packing depend on sorts and drift A. Nasar */ + else if (t_type == task_type_pair && t_subtype == task_subtype_gpu_pack_d) { + /* Make all density tasks depend on the drift */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t); + } + /* Make all density tasks depend on the sorts */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); + } + /* New task for the force A. Nasar */ + t_force_gpu = scheduler_addtask(sched, task_type_pair, + task_subtype_gpu_pack_f, 0, 0, ci, cj); +#ifdef MPI_SYMMETRIC_FORCE_INTERACTION + /* The order of operations for an inactive local cell interacting + * with an active foreign cell is not guaranteed because the density + * (and gradient) iact loops don't exist in that case. So we need + * an explicit dependency here to have sorted cells. */ + + /* Make GPU force tasks depend on the sorts A. Nasar */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); + } +#endif + /* Do teh same for GPU tasks A. Nasar*/ + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); +#ifdef EXTRA_HYDRO_LOOP + /* Start by constructing the task for the second and third GPU hydro loop + * A. 
Nasar */ + t_gradient_gpu = scheduler_addtask(sched, task_type_pair, + task_subtype_gpu_pack_g, 0, 0, ci, cj); + // /* Add the link between the new loop and both cells */ + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); + + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + if (ci->nodeID == nodeID) { + /*Same for GPU tasks*/ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + /*Same for GPU tasks*/ + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, + t_force_gpu); + } +#else + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + if (ci->nodeID == nodeID) { + // GPU tasks A. Nasar + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + // GPU tasks A. Nasar + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_force_gpu); + } +#endif + + } + /* Otherwise, pair interaction? */ else if (t_type == task_type_pair && t_subtype == task_subtype_density) { @@ -2849,6 +3033,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &ci->hydro.force, t_force); engine_addlink(e, &cj->hydro.force, t_force); + if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); engine_addlink(e, &cj->hydro.limiter, t_limiter); @@ -2931,6 +3116,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, with_cooling, with_timestep_limiter); } + #endif if (with_feedback) { @@ -3269,7 +3455,39 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, } } } + /*Make packing depend on sorts and drift A. 
Nasar */ + else if (t_type == task_type_sub_self && + t_subtype == task_subtype_gpu_pack_d) { + + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + /* Start by constructing the task for the second hydro loop */ + t_force_gpu = + scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_f, + flags, 0, ci, NULL); + /* Add the link between the new loop and the cell */ + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); +#ifdef EXTRA_HYDRO_LOOP + + /* Start by constructing the task for the second and third hydro loop */ + t_gradient_gpu = + scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_g, + flags, 0, ci, NULL); + /* Add the link between the new loop and the cell */ + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); +#else + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); +#endif + } /* Otherwise, sub-self interaction? */ else if (t_type == task_type_sub_self && t_subtype == task_subtype_density) { @@ -3355,6 +3573,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Add the link between the new loop and the cell */ engine_addlink(e, &ci->hydro.force, t_force); + if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); } @@ -3388,10 +3607,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_sub_self, task_subtype_gradient, flags, 0, ci, NULL); - /* Add the link between the new loop and the cell */ engine_addlink(e, &ci->hydro.gradient, t_gradient); - /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, @@ -3541,7 +3758,64 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Otherwise, sub-pair interaction? 
*/ else if (t_type == task_type_sub_pair && - t_subtype == task_subtype_density) { + t_subtype == task_subtype_gpu_pack_d) { + /* Make all density pack tasks depend on the drift */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t); + } + /* Make all density tasks depend on the sorts */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); + } + t_force_gpu = scheduler_addtask( + sched, task_type_sub_pair, task_subtype_gpu_pack_f, flags, 0, ci, cj); +#ifdef MPI_SYMMETRIC_FORCE_INTERACTION + /* Make all force tasks depend on the sorts */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); + } +#endif + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); +#ifdef EXTRA_HYDRO_LOOP + t_gradient_gpu = scheduler_addtask( + sched, task_type_sub_pair, task_subtype_gpu_pack_g, flags, 0, ci, cj); + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, + t_force_gpu); + } +#else + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_force_gpu); + } +#endif + + } else if (t_type == task_type_sub_pair && + t_subtype == task_subtype_density) { const int bcount_i = ci->black_holes.count; const int bcount_j = cj->black_holes.count; @@ -3724,11 +3998,9 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_sub_pair, task_subtype_gradient, flags, 0, ci, cj); - /* Add the link between the new loop and both cells */ engine_addlink(e, &ci->hydro.gradient, t_gradient); engine_addlink(e, &cj->hydro.gradient, t_gradient); - /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ if (ci->nodeID == nodeID) { @@ -4142,9 +4414,13 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, continue; /* If the cell is local build a self-interaction */ + // struct task *t_pack_self; // A. Nasar if (ci->nodeID == nodeID) { scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci, NULL); + // A. 
Nasar also add a pack task for GPU + scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_d, 0, 0, + ci, NULL); } /* Now loop over all the neighbours of this cell */ @@ -4178,6 +4454,8 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))]; scheduler_addtask(sched, task_type_pair, task_subtype_density, sid, 0, ci, cj); + scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_d, sid, + 0, ci, cj); // A. Nasar #ifdef SWIFT_DEBUG_CHECKS #ifdef WITH_MPI @@ -4600,7 +4878,6 @@ void engine_maketasks(struct engine *e) { struct cell *cells = s->cells_top; const int nr_cells = s->nr_cells; const ticks tic = getticks(); - /* Re-set the scheduler. */ scheduler_reset(sched, engine_estimate_nr_tasks(e)); @@ -4715,7 +4992,251 @@ void engine_maketasks(struct engine *e) { * sched->tasks, sched->nr_tasks, sizeof(struct task), * threadpool_auto_chunk_size, e); */ } + int unsplit = 0, split = 0; + /*These loops should really be threadmapped A. Nasar*/ + for (int i = 0; i < sched->nr_tasks; i++) { + struct task * t = &sched->tasks[i]; + if(t->type == task_type_sub_self && t->subtype == + task_subtype_gpu_pack_d){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == + task_subtype_gpu_pack_d){ + t->type = task_type_pair; + } + if(t->type == task_type_sub_self && t->subtype == + task_subtype_gpu_pack_g){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == + task_subtype_gpu_pack_g){ + t->type = task_type_pair; + } + if(t->type == task_type_sub_self && t->subtype == + task_subtype_gpu_pack_f){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == + task_subtype_gpu_pack_f){ + t->type = task_type_pair; + } + } + + /* Now, create unpack tasks based on the existing packs and create + * the dependencies pack->unpack->ghost_in A. Nasar */ + const int pack_size = sched->pack_size; + const int pack_size_pair = sched->pack_size_pair; + + int count_current_self = 0; + int count_current_pair = 0; + + struct task *last_created_self_unpack = NULL; + struct task *last_created_pair_unpack = NULL; + /* Loop over all the currently existing pack tasks + * These loops should be thread-mapped too but will be a bit more tricky: A. 
+ * Nasar*/ + for (int i = 0; i < sched->nr_tasks; i++) { + + struct task *t = &sched->tasks[i]; + if (t->subtype != task_subtype_gpu_pack_d) continue; + + if (t->type == task_type_self || t->type == task_type_sub_self) { + + if (count_current_self % pack_size == 0) { + last_created_self_unpack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_unpack_d, 0, 0, NULL, NULL); + last_created_self_unpack->gpu_done = 0; + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_self_unpack); + scheduler_addunlock(sched, last_created_self_unpack, + t->ci->hydro.super->hydro + .ghost_in); // Keep self_unpack dependency here, + // pairs added later using links + /*Creating links between each cell and its unpack task*/ + engine_addlink(e, &t->ci->hydro.density_unpack, last_created_self_unpack); + t->ci->hydro.d_unpack = last_created_self_unpack; + ++count_current_self; + } + + else if (t->type == task_type_pair || t->type == task_type_sub_pair) { + if (count_current_pair % pack_size_pair == 0) { + last_created_pair_unpack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_unpack_d, 0, 0, NULL, NULL); + } + + scheduler_addunlock(sched, t, last_created_pair_unpack); + if (t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.ghost_in); + if ((t->cj->nodeID == e->nodeID) && + (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.ghost_in); + + engine_addlink(e, &t->ci->hydro.density_unpack, last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack); + + ++count_current_pair; + } else { + /* Abouzied: I need to implement the sub-self and sub-pair version */ + error("Something bad happened"); + } + } +#ifdef SWIFT_DEBUG_CHECKS + if (count_current_self != sched->nr_self_pack_tasks_d) + error("We did not find the correct number of self pack tasks!!"); + if (count_current_pair != sched->nr_pair_pack_tasks_d) + error("We did not find the correct number of pair pack tasks!!"); +#endif + + /*Now create unpacks for all gpu_pack_g (gradient) tasks A. 
Nasar */ + count_current_self = 0; + count_current_pair = 0; + + last_created_self_unpack = NULL; + last_created_pair_unpack = NULL; + /* Loop over all the currently existing gradient pack tasks */ + for (int i = 0; i < sched->nr_tasks; i++) { + + struct task *t = &sched->tasks[i]; + if (t->subtype != task_subtype_gpu_pack_g) continue; + + if (t->type == task_type_self || t->type == task_type_sub_self) { + + if (count_current_self % pack_size == 0) { + last_created_self_unpack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); + last_created_self_unpack->gpu_done = 0; + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_self_unpack); + scheduler_addunlock(sched, last_created_self_unpack, + t->ci->hydro.super->hydro.extra_ghost); + /*Creating links between a each cell and its unpack task*/ + engine_addlink(e, &t->ci->hydro.gradient_unpack, + last_created_self_unpack); + t->ci->hydro.g_unpack = last_created_self_unpack; + + ++count_current_self; + } + + else if (t->type == task_type_pair || t->type == task_type_sub_pair) { + if (count_current_pair % pack_size_pair == 0) { + last_created_pair_unpack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_pair_unpack); + if (t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.extra_ghost); + if ((t->cj->nodeID == e->nodeID) && + (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.extra_ghost); + + engine_addlink(e, &t->ci->hydro.gradient_unpack, + last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.gradient_unpack, + last_created_pair_unpack); + + ++count_current_pair; + } else { + /* Abouzied: I need to implement the sub-self and sub-pair version */ + error("Something bad happened"); + } + } +#ifdef SWIFT_DEBUG_CHECKS + if (count_current_self != sched->nr_self_pack_tasks_g) + error( + "We did not find the correct number of G self pack tasks!! count %i " + "what it shoudl be %i", + count_current_self, sched->nr_self_pack_tasks_g); + if (count_current_pair != sched->nr_pair_pack_tasks_g) + error( + "We did not find the correct number of G pair pack tasks!! 
count %i " + "what it shoudl be %i", + count_current_pair, sched->nr_pair_pack_tasks_g); +#endif + + /*Now create unpacks for all gpu_pack_f (force) tasks*/ + count_current_self = 0; + count_current_pair = 0; + + last_created_self_unpack = NULL; + last_created_pair_unpack = NULL; + /* Loop over all the currently existing gradient pack tasks */ + for (int i = 0; i < sched->nr_tasks; i++) { + + struct task *t = &sched->tasks[i]; + if (t->subtype != task_subtype_gpu_pack_f) continue; + + if (t->type == task_type_self || t->type == task_type_sub_self) { + + if (count_current_self % pack_size == 0) { + last_created_self_unpack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_self_unpack); + scheduler_addunlock(sched, last_created_self_unpack, + t->ci->hydro.super->hydro.end_force); + /*Creating links between a each cell and its unpack task*/ + engine_addlink(e, &t->ci->hydro.force_unpack, last_created_self_unpack); + + ++count_current_self; + } + + else if (t->type == task_type_pair || t->type == task_type_sub_pair) { + if (count_current_pair % pack_size_pair == 0) { + last_created_pair_unpack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_pair_unpack); + if (t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.end_force); + if ((t->cj->nodeID == e->nodeID) && + (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.end_force); + + engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack); + + ++count_current_pair; + } else { + /* Abouzied: I need to implement the sub-self and sub-pair version */ + error("Something bad happened"); + } + } +#ifdef SWIFT_DEBUG_CHECKS + if (count_current_self != sched->nr_self_pack_tasks_f) + error("We did not find the correct number of F self pack tasks!!"); + if (count_current_pair != sched->nr_pair_pack_tasks_f) + error("We did not find the correct number of F pair pack tasks!!"); +#endif + /*Debug code to check if some tasks are not split to desired level in tree for + * GPU*/ + // for (int i = 0; i < sched->nr_tasks; i++) { + // struct task *t = &sched->tasks[i]; + // if(t->ci != NULL){ + //// if(t->type == task_type_pair && ((t->ci->split && !t->cj->split) || + ///(!t->ci->split && t->cj->split))) / error("one is split the other + /// isn't"); + // if(t->ci->hydro.count > 80 && t->type == task_type_self) + // error("Count is %i task subtype (%s)", + // t->ci->hydro.count, subtaskID_names[t->subtype]); + // } + // } if (e->verbose) message("Making extra hydroloop tasks took %.3f %s.", clocks_from_ticks(getticks() - tic2), clocks_getunit()); @@ -4866,4 +5387,39 @@ void engine_maketasks(struct engine *e) { if (e->verbose) message("took %.3f %s (including reweight).", clocks_from_ticks(getticks() - tic), clocks_getunit()); + + /* Loop over all the CPU hydro tasks to make implicit (needs threadmapping)*/ + for (int i = 0; i < sched->nr_tasks; i++) { + + struct task *t = &sched->tasks[i]; + if (t->subtype == task_subtype_density || + t->subtype == task_subtype_gradient || + t->subtype == task_subtype_force) { + t->implicit = 1; + } + // if (t->subtype == task_subtype_gpu_pack_d || + // 
+    //        t->subtype == task_subtype_gpu_pack_g ||
+    //        t->subtype == task_subtype_gpu_pack_f ||
+    //        t->subtype == task_subtype_gpu_unpack_d ||
+    //        t->subtype == task_subtype_gpu_unpack_g ||
+    //        t->subtype == task_subtype_gpu_unpack_f){
+    //      t->implicit = 1;
+    //    }
+    //    if (t->subtype == task_subtype_gpu_pack_g ||
+    //        t->subtype == task_subtype_gpu_pack_f ||
+    //        t->subtype == task_subtype_gpu_unpack_g ||
+    //        t->subtype == task_subtype_gpu_unpack_f){// ||
+    ////        (t->type == task_type_pair &&
+    ////         t->subtype == task_subtype_gpu_pack_d)){
+    //      t->implicit = 1;
+    //    }
+    //    if ((t->subtype == task_subtype_gpu_pack_d ||
+    //         t->subtype == task_subtype_gpu_pack_g ||
+    //         t->subtype == task_subtype_gpu_pack_f) &&
+    //        (t->type == task_type_sub_pair ||
+    //         t->type == task_type_sub_self)){
+    //      t->implicit = 1;
+    ////      error("STill have subs");
+    //    }
+  }
 }
diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c
index 27b31c99c4..89f5e41b74 100644
--- a/src/engine_marktasks.c
+++ b/src/engine_marktasks.c
@@ -86,6 +86,25 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
     const enum task_types t_type = t->type;
     const enum task_subtypes t_subtype = t->subtype;

+    // Activate GPU unpack tasks (cell-less dummy tasks, so they need
+    // activating separately)
+    if (t_type == task_type_self &&
+        (t_subtype == task_subtype_gpu_unpack_d ||
+         t_subtype == task_subtype_gpu_unpack_g ||
+         t_subtype == task_subtype_gpu_unpack_f)) {  // A. Nasar
+      scheduler_activate(s, t);
+      continue;
+    }
+
+    if (t_type == task_type_pair &&
+        (t_subtype == task_subtype_gpu_unpack_d ||
+         t_subtype == task_subtype_gpu_unpack_g ||
+         t_subtype == task_subtype_gpu_unpack_f)) {  // A. Nasar
+      scheduler_activate(s, t);
+      continue;
+      // fprintf(stderr,"activated pair unpack in marktasks\n");
+    }
+
     /* Single-cell task? */
     if (t_type == task_type_self || t_type == task_type_sub_self) {
@@ -93,7 +112,17 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
       struct cell *ci = t->ci;

 #ifdef SWIFT_DEBUG_CHECKS
+#ifndef WITH_CUDA  // A. Nasar
       if (ci->nodeID != nodeID) error("Non-local self task found");
+#else
+      if ((ci->nodeID != nodeID) && (t_subtype != task_subtype_gpu_unpack_d) &&
+          (t_subtype != task_subtype_gpu_unpack_f) &&
+          (t_subtype != task_subtype_gpu_unpack_g)) {
+        fprintf(stderr, "task is %s\n", subtaskID_names[t->subtype]);
+        error("Non-local self task found. Task subtype is %s",
+              subtaskID_names[t->subtype]);
+      }
+#endif
 #endif

       const int ci_active_hydro = cell_is_active_hydro(ci, e);
@@ -115,6 +144,39 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
         }
       }

+      /* Activate packing for GPU A. Nasar */
+      else if (t_type == task_type_self &&
+               t_subtype == task_subtype_gpu_pack_d) {
+        if (ci_active_hydro) {
+          scheduler_activate(s, t);
+          ci->pack_done = 0;
+          ci->gpu_done = 0;
+          ci->unpack_done = 0;
+        }
+      }
+
+      /* Activate packing for GPU */
+      else if (t_type == task_type_self &&
+               t_subtype == task_subtype_gpu_pack_g) {
+        if (ci_active_hydro) {
+          scheduler_activate(s, t);
+          ci->pack_done_g = 0;
+          ci->gpu_done_g = 0;
+          ci->unpack_done_g = 0;
+        }
+      }
+
+      /* Activate packing for GPU */
+      else if (t_type == task_type_self &&
+               t_subtype == task_subtype_gpu_pack_f) {
+        if (ci_active_hydro) {
+          scheduler_activate(s, t);
+          ci->pack_done_f = 0;
+          ci->gpu_done_f = 0;
+          ci->unpack_done_f = 0;
+        }
+      }
+
       /* Store current values of dx_max and h_max. */
       else if (t_type == task_type_sub_self &&
                t_subtype == task_subtype_density) {
@@ -125,12 +187,22 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
         }
       }

+      /* Activate GPU density packing for sub-self tasks. A. Nasar: Unsure if
+         we actually need this */
+      else if (t_type == task_type_sub_self &&
+               t_subtype == task_subtype_gpu_pack_d) {
+        if (ci_active_hydro) {
+          scheduler_activate(s, t);
+        }
+      }
+
       else if (t_type == task_type_self && t_subtype == task_subtype_force) {
         if (ci_active_hydro) scheduler_activate(s, t);
       }

       else if (t_type == task_type_sub_self &&
-               t_subtype == task_subtype_force) {
+               (t_subtype == task_subtype_force ||
+                t_subtype == task_subtype_gpu_pack_f)) {
         if (ci_active_hydro) scheduler_activate(s, t);
       }
@@ -149,7 +221,8 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
       }

       else if (t_type == task_type_sub_self &&
-               t_subtype == task_subtype_gradient) {
+               (t_subtype == task_subtype_gradient ||
+                t_subtype == task_subtype_gpu_pack_g)) {
         if (ci_active_hydro) scheduler_activate(s, t);
       }
@@ -409,7 +482,29 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
       const int ci_active_rt = cell_is_rt_active(ci, e);
       const int cj_active_rt = cell_is_rt_active(cj, e);

-      /* Only activate tasks that involve a local active cell. */
+      /* Activate packing for GPU A. Nasar */
+      if (t_subtype == task_subtype_gpu_pack_d &&
+          ((ci_active_hydro && ci_nodeID == nodeID) ||
+           (cj_active_hydro && cj_nodeID == nodeID))) {
+        scheduler_activate(s, t);
+        ci->gpu_done_pair = 0;
+        cj->gpu_done_pair = 0;
+      } else if (t_subtype == task_subtype_gpu_pack_g &&
+                 ((ci_active_hydro && ci_nodeID == nodeID) ||
+                  (cj_active_hydro && cj_nodeID == nodeID))) {
+        scheduler_activate(s, t);
+        ci->gpu_done_pair_g = 0;
+        cj->gpu_done_pair_g = 0;
+      } else if (t_subtype == task_subtype_gpu_pack_f &&
+                 ((ci_active_hydro && ci_nodeID == nodeID) ||
+                  (cj_active_hydro && cj_nodeID == nodeID))) {
+        scheduler_activate(s, t);
+        ci->gpu_done_pair_f = 0;
+        cj->gpu_done_pair_f = 0;
+      }
+
+      /* Only activate tasks that involve a local active cell. A. Nasar THIS
+       * COULD BE SOURCE OF BUG */
       if ((t_subtype == task_subtype_density ||
            t_subtype == task_subtype_gradient ||
            t_subtype == task_subtype_limiter ||
diff --git a/src/error.h b/src/error.h
index a9b7481cf4..806b74f123 100644
--- a/src/error.h
+++ b/src/error.h
@@ -22,7 +22,11 @@
 #define SWIFT_ERROR_H

 /* Config parameters. */
+#ifdef WITH_CUDA
+#include "../config.h"
+#else
+#include <config.h>
+#endif
 /* Some standard headers.
*/ #include diff --git a/src/files_for_new_functions/arrays_malloc.cu b/src/files_for_new_functions/arrays_malloc.cu new file mode 100644 index 0000000000..3bbf998231 --- /dev/null +++ b/src/files_for_new_functions/arrays_malloc.cu @@ -0,0 +1,363 @@ +#include "cuda/part_gpu.h" + +#include +#include +#include +#include + +#ifdef WITH_CUDA +extern "C" { +#endif + +#include "arrays_malloc.h" + +void allocate_host(struct part_soa *parts_soa, int count_max_parts_tmp) { + ///////////Malloc Host arrays + cudaMallocHost((void **)&parts_soa->tid_p, count_max_parts_tmp * sizeof(int)); + cudaMallocHost((void **)&parts_soa->id, + count_max_parts_tmp * sizeof(long long)); + cudaMallocHost((void **)&parts_soa->mass, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->h, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->u, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->u_dt, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rho, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->SPH_sum, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->x_p, + count_max_parts_tmp * sizeof(double)); + cudaMallocHost((void **)&parts_soa->y_p, + count_max_parts_tmp * sizeof(double)); + cudaMallocHost((void **)&parts_soa->z_p, + count_max_parts_tmp * sizeof(double)); + cudaMallocHost((void **)&parts_soa->ux, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->uy, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->uz, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->a_hydrox, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->a_hydroy, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->a_hydroz, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->locx, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->locy, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->locz, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->widthx, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->widthy, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->widthz, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->h_max, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->count_p, + count_max_parts_tmp * sizeof(int)); + cudaMallocHost((void **)&parts_soa->wcount, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->wcount_dh, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rho_dh, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rot_ux, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rot_uy, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rot_uz, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->div_v, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->div_v_previous_step, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->alpha_visc, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->v_sig, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->laplace_u, + count_max_parts_tmp * 
sizeof(float)); + cudaMallocHost((void **)&parts_soa->alpha_diff, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->f, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->soundspeed, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->h_dt, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->balsara, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->pressure, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->alpha_visc_max_ngb, + count_max_parts_tmp * sizeof(float)); + /* timestep stuff */ + cudaMallocHost((void **)&parts_soa->time_bin, + count_max_parts_tmp * sizeof(timebin_t)); + cudaMallocHost((void **)&parts_soa->wakeup, + count_max_parts_tmp * sizeof(timebin_t)); + cudaMallocHost((void **)&parts_soa->min_ngb_time_bin, + count_max_parts_tmp * sizeof(timebin_t)); + cudaMallocHost((void **)&parts_soa->to_be_synchronized, + count_max_parts_tmp * sizeof(char)); +} + +void allocate_device(struct part_soa d_parts_soa, int count_max_parts_tmp) { + ////////now malloc variables for particle data on the GPU. Sheesh + fprintf(stderr, "before malloc\n"); + cudaMalloc((void **)&(d_parts_soa.tid_p), sizeof(int) * count_max_parts_tmp); + fprintf(stderr, "after malloc\n"); + cudaMalloc((void **)&(d_parts_soa.id), + sizeof(long long) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.x_p), sizeof(double) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.y_p), sizeof(double) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.z_p), sizeof(double) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.ux), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.uy), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.uz), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.a_hydrox), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.a_hydroy), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.a_hydroz), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.mass), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.h), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.u), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.u_dt), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rho), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.SPH_sum), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.locx), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.locy), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.locz), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.widthx), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.widthy), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.widthz), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.h_max), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.count_p), + sizeof(int) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.wcount), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.wcount_dh), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rho_dh), + sizeof(float) * 
count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rot_ux), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rot_uy), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rot_uz), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.div_v), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.div_v_previous_step), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.alpha_visc), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.v_sig), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.laplace_u), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.alpha_diff), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.f), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.soundspeed), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.h_dt), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.balsara), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.pressure), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.alpha_visc_max_ngb), + sizeof(float) * count_max_parts_tmp); + /* timestep stuff */ + cudaMalloc((void **)&(d_parts_soa.time_bin), + sizeof(timebin_t) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.wakeup), + sizeof(timebin_t) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.min_ngb_time_bin), + sizeof(timebin_t) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.to_be_synchronized), + sizeof(char) * count_max_parts_tmp); +} + +cudaError_t cudaAllocInt(int **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(int)); +} +cudaError_t cudaAllocFloat(float **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(float)); +} +cudaError_t cudaAllocDouble(double **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(double)); +} +cudaError_t cudaAllocLonglong(long long **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(long long)); +} +cudaError_t cudaAllocChar(char **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(char)); +} +cudaError_t cudaAllocTimebin(timebin_t **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(timebin_t)); +} + +void allocate_device_dirty( + int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, + double **d_z_p, float **d_ux, float **d_uy, float **d_uz, + float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass, + float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx, + float **d_locy, float **d_locz, float **d_widthx, float **d_widthy, + float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount, + float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, + float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step, + float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, + float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb, + timebin_t **d_time_bin, timebin_t **d_wakeup, + timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized, + int count_max_parts_tmp) { + ////////Malloc variables for particle data on the GPU. 
Sheesh, that's a lot + + size_t free_byte; + size_t total_byte; + + cudaError_t cuda_status = cudaMemGetInfo(&free_byte, &total_byte); + double free = (double)free_byte; + double available = (double)total_byte; + double used = (available - free); + // message("free %lf used %lf", free/10.E8, used/10.E8); + + cudaError_t cu_error = cudaAllocInt(d_tid_p, count_max_parts_tmp); + cu_error = cudaAllocLonglong(d_id, count_max_parts_tmp); + cu_error = cudaAllocDouble(d_x_p, count_max_parts_tmp); + cu_error = cudaAllocDouble(d_y_p, count_max_parts_tmp); + cu_error = cudaAllocDouble(d_z_p, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_ux, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_uy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_uz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_a_hydrox, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_a_hydroy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_a_hydroz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_mass, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_h, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_u, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_u_dt, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rho, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_locx, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_locy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_locz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_widthx, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_widthy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_widthz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_h_max, count_max_parts_tmp); + cu_error = cudaAllocInt(d_count_p, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_wcount, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_wcount_dh, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rho_dh, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rot_ux, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rot_uy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rot_uz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_div_v, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_div_v_previous_step, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_alpha_visc, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_v_sig, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_laplace_u, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_alpha_diff, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_f, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_soundspeed, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_h_dt, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_balsara, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_pressure, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_alpha_visc_max_ngb, count_max_parts_tmp); + /* timestep stuff */ + cu_error = cudaAllocTimebin(d_time_bin, count_max_parts_tmp); + cu_error = cudaAllocTimebin(d_wakeup, count_max_parts_tmp); + cu_error = cudaAllocTimebin(d_min_ngb_time_bin, count_max_parts_tmp); + cu_error = cudaAllocChar(d_to_be_synchronized, count_max_parts_tmp); +// cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ; +// double free_end = (double)free_byte; +// available = (double)total_byte; +// double used_end = (available - free_end); +// message("cuda malloc self free %lf GB used %lf GB used to allocate +// self" +// " data %lf MB", free_end/10.E8, used_end/10.E8, +// (used_end - used)/10.E5); +// message("at end of malloc dirty: %s", +// cudaGetErrorString(cu_error)); 
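+  /* Possible refinement (sketch, not part of the original patch): cu_error is
+   * overwritten by every allocation call above, so only the status of the
+   * final cudaMalloc reaches the CUDA_DEBUG check below. Testing each call as
+   * it is made would report an out-of-memory failure immediately, e.g.
+   *
+   *   if (cudaAllocFloat(d_rho, count_max_parts_tmp) != cudaSuccess)
+   *     error("cudaMalloc of d_rho failed (%zu bytes free on device)",
+   *           free_byte);
+   *
+   * This assumes SWIFT's error() macro is visible in this compilation unit;
+   * otherwise an fprintf(stderr, ...) plus a non-zero exit would do. */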
+#ifdef CUDA_DEBUG + if (cu_error != cudaSuccess) { + fprintf(stderr, "CUDA error at end of malloc dirty: %s\n", + cudaGetErrorString(cu_error)); + exit(0); + } +#endif +} + +void allocate_device_test(int **tid_test, int count_max_parts_tmp) { + ////////now malloc variables for particle data on the GPU. Sheesh + + cudaMalloc((void **)tid_test, sizeof(int) * count_max_parts_tmp); + + cudaError_t cu_error = cudaPeekAtLastError(); // Get error code + fprintf(stderr, "malloc tid: %s\n", cudaGetErrorString(cu_error)); + + if (cu_error != cudaSuccess) { + fprintf(stderr, "CUDA error with malloc tid: %s\n", + cudaGetErrorString(cu_error)); + exit(0); + } +} +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_malloc(struct part_soa *parts_soa, int alloc_type, + int count_max_parts_tmp) { + allocate_host(parts_soa, count_max_parts_tmp); +} +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_malloc(struct part_soa d_parts_soa, int alloc_type, + int count_max_parts_tmp) { + allocate_device(d_parts_soa, count_max_parts_tmp); +} + +void device_malloc_dirty( + int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, + double **d_z_p, float **d_ux, float **d_uy, float **d_uz, + float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass, + float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx, + float **d_locy, float **d_locz, float **d_widthx, float **d_widthy, + float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount, + float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, + float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step, + float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, + float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb, + timebin_t **d_time_bin, timebin_t **d_wakeup, + timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized, + int count_max_parts_tmp) { + + allocate_device_dirty( + d_tid_p, d_id, d_x_p, d_y_p, d_z_p, d_ux, d_uy, d_uz, d_a_hydrox, + d_a_hydroy, d_a_hydroz, d_mass, d_h, d_u, d_u_dt, d_rho, d_locx, d_locy, + d_locz, d_widthx, d_widthy, d_widthz, d_h_max, d_count_p, d_wcount, + d_wcount_dh, d_rho_dh, d_rot_ux, d_rot_uy, d_rot_uz, d_div_v, + d_div_v_previous_step, d_alpha_visc, d_v_sig, d_laplace_u, d_alpha_diff, + d_f, d_soundspeed, d_h_dt, d_balsara, d_pressure, d_alpha_visc_max_ngb, + d_time_bin, d_wakeup, d_min_ngb_time_bin, d_to_be_synchronized, + count_max_parts_tmp); +} + +void device_malloc_test(int **tid_test, int count_max_parts_tmp) { + + allocate_device_test(tid_test, count_max_parts_tmp); +} + +#ifdef WITH_CUDA +} +#endif diff --git a/src/files_for_new_functions/arrays_malloc.h b/src/files_for_new_functions/arrays_malloc.h new file mode 100644 index 0000000000..1107b51444 --- /dev/null +++ b/src/files_for_new_functions/arrays_malloc.h @@ -0,0 +1,64 @@ +#include "cuda/part_gpu.h" + +#include +#include +#include +#include + +cudaError_t cudaAllocInt(int **d_var, int elements); +cudaError_t cudaAllocFloat(float **d_var, int elements); +cudaError_t cudaAllocDouble(double **d_var, int elements); +cudaError_t cudaAllocLonglong(long long **d_var, int elements); +cudaError_t cudaAllocChar(char **d_var, int elements); +cudaError_t 
cudaAllocTimebin(timebin_t **d_var, int elements); + +void allocate_host(struct part_soa *parts_soa, int count_max_parts_tmp); + +void allocate_device(struct part_soa d_parts_soa, int count_max_parts_tmp); + +void allocate_device_dirty( + int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, + double **d_z_p, float **d_ux, float **d_uy, float **d_uz, + float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass, + float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx, + float **d_locy, float **d_locz, float **d_widthx, float **d_widthy, + float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount, + float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, + float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step, + float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, + float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb, + timebin_t **d_time_bin, timebin_t **d_wakeup, + timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized, + int count_max_parts_tmp); + +void allocate_device_test(int **tid_test, int count_max_parts_tmp); +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_malloc(struct part_soa *parts_soa, int alloc_type, + int count_max_parts_tmp); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_malloc(struct part_soa d_parts_soa, int alloc_type, + int count_max_parts_tmp); + +void device_malloc_dirty( + int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, + double **d_z_p, float **d_ux, float **d_uy, float **d_uz, + float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass, + float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx, + float **d_locy, float **d_locz, float **d_widthx, float **d_widthy, + float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount, + float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, + float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step, + float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, + float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb, + timebin_t **d_time_bin, timebin_t **d_wakeup, + timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized, + int count_max_parts_tmp); + +void device_malloc_test(int **tid_test, int count_max_parts_tmp); diff --git a/src/files_for_new_functions/host_device_data_transfer.cu b/src/files_for_new_functions/host_device_data_transfer.cu new file mode 100644 index 0000000000..ede719529b --- /dev/null +++ b/src/files_for_new_functions/host_device_data_transfer.cu @@ -0,0 +1,566 @@ +#include "cuda/part_gpu.h" + +#include +#include +#include +#include + +#ifdef WITH_CUDA +extern "C" { +#endif + +void host2device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp) { + // int * tid_h; + // cudaMallocHost((void **)&tid_h, + // count_max_parts_tmp * sizeof(int)); + for (int i = 0; i < count_max_parts_tmp; i++) { + tid_h[i] = 100; + // fprintf(stderr,"tid_h %i\n", tid_h[i]); + } + + cudaMemcpy(d_tid_p, tid_h, count_max_parts_tmp * sizeof(int), + cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + // 
cudaFree(tid_h); +} + +void device2host_test(struct part_soa parts_soa, int *tid_h, + int count_max_parts_tmp) { + int *tid_p = parts_soa.tid_p; + cudaMemcpy(tid_h, tid_p, count_max_parts_tmp * sizeof(int), + cudaMemcpyDeviceToHost); + for (int i = 0; i < count_max_parts_tmp; i++) { + fprintf(stderr, "tid is %i\n", tid_h[i]); + } +} + +void device2device_test(int *tid_p, struct part_soa parts_soa, + int count_max_parts_tmp) { + cudaMemcpy(tid_p, parts_soa.tid_p, sizeof(int *), cudaMemcpyHostToDevice); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp) { + + host2device_test(d_tid_p, tid_h, count_max_parts_tmp); +} + +void device_host_test(struct part_soa parts_soa, int *tid_h, + int count_max_parts_tmp) { + + device2host_test(parts_soa, tid_h, count_max_parts_tmp); +} + +void device_device_test(int *tid_p, struct part_soa parts_soa, + int count_max_parts_tmp) { + + device2device_test(tid_p, parts_soa, count_max_parts_tmp); +} + +void device2host_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp) { + cudaMemcpy(parts_soa_buffer.tid_p, tid_p, count_max_parts_tmp * sizeof(int), + cudaMemcpyDeviceToHost); +} +void device_host_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp) { + + device2host_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, + a_hydrox, a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, + locy, locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, div_v, + div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, count_max_parts_tmp); +} + +void device2device_density( + struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float 
*a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp, cudaStream_t stream) { + + cudaMemcpyAsync(&(parts_soa_buffer->tid_p), &tid_p, sizeof(int *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->locx), &locx, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->locy), &locy, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->locz), &locz, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->h), &h, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->mass), &mass, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->x_p), &x_p, sizeof(double *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->y_p), &y_p, sizeof(double *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->z_p), &z_p, sizeof(double *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->ux), &ux, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->uy), &uy, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->uz), &uz, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->time_bin), &time_bin, sizeof(timebin_t *), + cudaMemcpyHostToDevice, stream); +} + +void host2device_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp) { + cudaError_t cu_error; + cudaMemcpy(&tid_p, &(parts_soa_buffer.tid_p), + count_max_parts_tmp * sizeof(int), cudaMemcpyHostToDevice); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int 
*count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp) { + + host2device_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, + a_hydrox, a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, + locy, locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, div_v, + div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, count_max_parts_tmp); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_device_bind( + struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp, cudaStream_t stream) { + + device2device_density( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, + count_max_parts_tmp, stream); +} + +void host2device_async_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + cudaError_t cu_error; + cudaMemcpyAsync(&tid_p[first_part_tmp], + &(parts_soa_buffer.tid_p[first_part_tmp]), + bundle_n_parts * sizeof(int), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync( + &locx[first_part_tmp], 
&(parts_soa_buffer.locx[first_part_tmp]), + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync( + &locy[first_part_tmp], &(parts_soa_buffer.locy[first_part_tmp]), + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&locz[first_part_tmp], &parts_soa_buffer.locz[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&h[first_part_tmp], &parts_soa_buffer.h[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&mass[first_part_tmp], &parts_soa_buffer.mass[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&x_p[first_part_tmp], &parts_soa_buffer.x_p[first_part_tmp], + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&y_p[first_part_tmp], &parts_soa_buffer.y_p[first_part_tmp], + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&z_p[first_part_tmp], &parts_soa_buffer.z_p[first_part_tmp], + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&ux[first_part_tmp], &parts_soa_buffer.ux[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&uy[first_part_tmp], &parts_soa_buffer.uy[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&uz[first_part_tmp], &parts_soa_buffer.uz[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync( + &time_bin[first_part_tmp], &parts_soa_buffer.time_bin[first_part_tmp], + bundle_n_parts * sizeof(timebin_t), cudaMemcpyHostToDevice, stream); +} + +void host2device_async_density_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + + // int bundle_n_parts = bundle_n_parts_i + bundle_n_parts_j; + cudaError_t cu_error; + // cudaMemcpyAsync(&tid_p[first_part_tmp], + // &(parts_soa_buffer.tid_p[first_part_tmp]), + // bundle_n_parts * sizeof(int), + // cudaMemcpyHostToDevice, stream); + // cudaMemcpyAsync(&locx[first_part_tmp], + // &(parts_soa_buffer.locx[first_part_tmp]), + // bundle_n_parts * sizeof(float), + // cudaMemcpyHostToDevice, stream); + // cudaMemcpyAsync(&locy[first_part_tmp], + // &(parts_soa_buffer.locy[first_part_tmp]), + // bundle_n_parts * sizeof(float), + // cudaMemcpyHostToDevice, stream); + // cudaMemcpyAsync(&locz[first_part_tmp], + // &parts_soa_buffer.locz[first_part_tmp], + // bundle_n_parts * sizeof(float), + // cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&h[first_part_tmp], &parts_soa_buffer.h[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + 
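+  /* Note: the same global offset first_part_tmp indexes both the pinned host
+   * buffers (parts_soa_buffer.*) and the device arrays, so each bundle of
+   * bundle_n_parts particles is copied straight into place on `stream` with
+   * no separate gather step. A caller is therefore assumed (not shown in this
+   * patch) to queue the pair density kernel and the matching
+   * device2host_async_density_pair() copies on the same stream before
+   * synchronising, e.g. with cudaStreamSynchronize(stream). */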
cudaMemcpyAsync(&mass[first_part_tmp], &parts_soa_buffer.mass[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&x_p[first_part_tmp], &parts_soa_buffer.x_p[first_part_tmp], + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&y_p[first_part_tmp], &parts_soa_buffer.y_p[first_part_tmp], + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&z_p[first_part_tmp], &parts_soa_buffer.z_p[first_part_tmp], + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&ux[first_part_tmp], &parts_soa_buffer.ux[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&uy[first_part_tmp], &parts_soa_buffer.uy[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&uz[first_part_tmp], &parts_soa_buffer.uz[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync( + &time_bin[first_part_tmp], &parts_soa_buffer.time_bin[first_part_tmp], + bundle_n_parts * sizeof(timebin_t), cudaMemcpyHostToDevice, stream); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_async_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + + host2device_async_density( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp, + bundle_n_parts, stream); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_async_cpy_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float 
*balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp_i, + int bundle_n_parts, cudaStream_t stream) { + + host2device_async_density_pair( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp_i, + bundle_n_parts, stream); +} + +void device2host_async_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + cudaError_t cu_error; + + cudaMemcpyAsync(&parts_soa_buffer.rho[first_part_tmp], &rho[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyDeviceToHost, + stream); + cudaMemcpyAsync(&parts_soa_buffer.rho_dh[first_part_tmp], + &rho_dh[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.wcount[first_part_tmp], + &wcount[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.wcount_dh[first_part_tmp], + &wcount_dh[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.div_v[first_part_tmp], + &div_v[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_ux[first_part_tmp], + &rot_ux[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_uy[first_part_tmp], + &rot_uy[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_uz[first_part_tmp], + &rot_uz[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); +} + +void device2host_async_density_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + 
float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + cudaError_t cu_error; + // fprintf(stderr, "parts i %i parts j %i\n", bundle_n_parts_i, + // bundle_n_parts_j); int bundle_n_parts = bundle_n_parts_i + + // bundle_n_parts_j; + + cudaMemcpyAsync(&parts_soa_buffer.rho[first_part_tmp], &rho[first_part_tmp], + bundle_n_parts * sizeof(float), cudaMemcpyDeviceToHost, + stream); + cudaMemcpyAsync(&parts_soa_buffer.rho_dh[first_part_tmp], + &rho_dh[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.wcount[first_part_tmp], + &wcount[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.wcount_dh[first_part_tmp], + &wcount_dh[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.div_v[first_part_tmp], + &div_v[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_ux[first_part_tmp], + &rot_ux[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_uy[first_part_tmp], + &rot_uy[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_uz[first_part_tmp], + &rot_uz[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_host_async_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + + device2host_async_density( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp, + bundle_n_parts, stream); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_host_async_cpy_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + 
float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + + device2host_async_density_pair( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp, + bundle_n_parts, stream); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_device_async_bind( + struct part_soa *parts_soa, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized) { + + parts_soa->tid_p = tid_p; + parts_soa->locx = locx; + parts_soa->locy = locy; + parts_soa->locz = locz; + parts_soa->h = h; + parts_soa->mass = mass; + parts_soa->x_p = x_p; + parts_soa->y_p = y_p; + parts_soa->z_p = z_p; + parts_soa->rho = rho; + parts_soa->rho_dh = rho_dh; + parts_soa->wcount = wcount; + parts_soa->wcount_dh = wcount_dh; + parts_soa->ux = ux; + parts_soa->uy = uy; + parts_soa->uz = uz; + parts_soa->div_v = div_v; + parts_soa->rot_ux = rot_ux; + parts_soa->rot_uy = rot_uy; + parts_soa->rot_uz = rot_uz; + parts_soa->time_bin = time_bin; +} + +#ifdef WITH_CUDA +} +#endif diff --git a/src/files_for_new_functions/host_device_data_transfer.h b/src/files_for_new_functions/host_device_data_transfer.h new file mode 100644 index 0000000000..204afd51fa --- /dev/null +++ b/src/files_for_new_functions/host_device_data_transfer.h @@ -0,0 +1,234 @@ +#include "cuda/part_gpu.h" + +#include +#include +#include +#include + +void host2device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp); + +void device2host_test(struct part_soa parts_soa, int *tid_h, + int count_max_parts_tmp); + +void device2device_test(int *tid_p, struct part_soa parts_soa, + int count_max_parts_tmp); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp); + +void 
device_host_test(struct part_soa parts_soa, int *tid_h, + int count_max_parts_tmp); + +void device_device_test(int *tid_p, struct part_soa parts_soa, + int count_max_parts_tmp); + +void device2host_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp); + +void device_host_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp); + +void device2device_density( + struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp, cudaStream_t stream); + +void host2device_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp); + +/*Function to 
be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_device_bind( + struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp, cudaStream_t stream); + +void host2device_async_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_async_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float 
*div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +void device2host_async_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_host_async_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_device_async_bind( + struct part_soa *parts_soa, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized); + +void host_device_async_cpy_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, 
float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +void device_host_async_cpy_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +void device2host_async_density_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts_i, int bundle_n_parts_j, cudaStream_t stream); diff --git a/src/hip/BLOCK_SIZE.h b/src/hip/BLOCK_SIZE.h new file mode 100644 index 0000000000..d36e10b99b --- /dev/null +++ b/src/hip/BLOCK_SIZE.h @@ -0,0 +1,10 @@ +#ifndef BLOCK_SIZE_H +#define BLOCK_SIZE_H +#ifdef WITH_CUDA +// extern "C" { +#endif +#define BLOCK_SIZE 512 +#ifdef WITH_CUDA +//} +#endif +#endif // BLOCK_SIZE_H diff --git a/src/hip/Data_and_GPU_prep_functions.cu b/src/hip/Data_and_GPU_prep_functions.cu new file mode 100644 index 0000000000..57cbe0ad7c --- /dev/null +++ b/src/hip/Data_and_GPU_prep_functions.cu @@ -0,0 +1,229 @@ +/* + * Data_and_GPU_prep_functions.cu + * + * Created on: 17 Apr 2022 + * Author: abouzied + */ + +/*ifdef WITH_CUDA prevents name mangling. 
C code sees exact names + of functions rather than mangled template names produced by C++*/ +// #ifdef WITH_CUDA +// extern "C"{ +// #endif + +// #include "cuda/cuda_headers.h" +// #include "device_functions.h" +// #include "cuda/cell_gpu.h" +#include +#include +// #include "../config.h" + +void populate_parts_list(struct cell *ci, struct part_gpu *parts) { + //////////////////////////////////////////// + ///*****Copy variables for cell i (self interaction)*****/ + int count = ci->hydro.count; + + // fprintf(stderr,"Tester 111\n"); + for (int p = 0; p < count; p++) { + + parts[p].id = ci->hydro.parts[p].id; + + // fprintf(stderr,"Tester 222\n"); + parts[p].count = count; + parts[p].h_max = ci->hydro.h_max; + + for (int d = 0; d < 3; d++) { + parts[p].x[d] = ci->hydro.parts[p].x[d]; + parts[p].v[d] = ci->hydro.parts[p].v[d]; + parts[p].a_hydro[d] = ci->hydro.parts[p].a_hydro[d]; + parts[p].loc[d] = ci->loc[d]; + } + parts[p].mass = ci->hydro.parts[p].mass; + parts[p].h = ci->hydro.parts[p].h; + parts[p].u = ci->hydro.parts[p].u; + parts[p].u_dt = ci->hydro.parts[p].u_dt; + parts[p].rho = ci->hydro.parts[p].rho; + parts[p].div_v = ci->hydro.parts[p].viscosity.div_v; + parts[p].div_v_previous_step = + ci->hydro.parts[p].viscosity.div_v_previous_step; + parts[p].alpha_visc = ci->hydro.parts[p].viscosity.alpha; + parts[p].v_sig = ci->hydro.parts[p].viscosity.v_sig; + parts[p].laplace_u = ci->hydro.parts[p].diffusion.laplace_u; + parts[p].alpha_diff = ci->hydro.parts[p].diffusion.alpha; + parts[p].f = ci->hydro.parts[p].force.f; + parts[p].soundspeed = ci->hydro.parts[p].force.soundspeed; + parts[p].h_dt = ci->hydro.parts[p].force.h_dt; + parts[p].balsara = ci->hydro.parts[p].force.balsara; + parts[p].pressure = ci->hydro.parts[p].force.pressure; + parts[p].time_bin = ci->hydro.parts[p].time_bin; + parts[p].wakeup = ci->hydro.parts[p].limiter_data.wakeup; + parts[p].min_ngb_time_bin = + ci->hydro.parts[p].limiter_data.min_ngb_time_bin; + parts[p].to_be_synchronized = + ci->hydro.parts[p].limiter_data.to_be_synchronized; + parts[p].wcount = ci->hydro.parts[p].density.wcount; + parts[p].wcount_dh = ci->hydro.parts[p].density.wcount_dh; + parts[p].rho_dh = ci->hydro.parts[p].density.rho_dh; + parts[p].div_v = ci->hydro.parts[p].viscosity.div_v; + parts[p].rot_v[0] = ci->hydro.parts[p].density.rot_v[0]; + parts[p].rot_v[1] = ci->hydro.parts[p].density.rot_v[1]; + parts[p].rot_v[2] = ci->hydro.parts[p].density.rot_v[2]; + parts[p].SPH_sum = 0.f; + } +} + +void populate_parts_list_soa( + int count_all_parts, struct cell *ci, int first_part_tmp, int count, + int tid, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, + float *a_hydroz, float *mass, float *h, float *u, float *u_dt, float *rho, + float *SPH_sum, float *locx, float *locy, float *locz, float *widthx, + float *widthy, float *widthz, float *h_max, int *count_p, float *wcount, + float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, float *rot_w, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, float *soundspeed, + float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized) { + //////////////////////////////////////////// + struct part *ptmps; + ptmps = ci->hydro.parts; + // fprintf(stderr,"Tester 111\n"); +#pragma unroll + for (int p = 0; p < count; p++) { + int p_gid 
= p + first_part_tmp; + // if(p_gid>=count_all_parts){ + // fprintf(stderr,"p>all parts"); + // exit(0); + // } + id[p_gid] = ptmps[p].id; + count_p[p_gid] = count; + tid_p[p_gid] = tid; + h_max[p_gid] = ci->hydro.h_max; + x_p[p_gid] = ptmps[p].x[0]; + y_p[p_gid] = ptmps[p].x[1]; + z_p[p_gid] = ptmps[p].x[2]; + ux[p_gid] = ptmps[p].v[0]; + uy[p_gid] = ptmps[p].v[1]; + uz[p_gid] = ptmps[p].v[2]; + a_hydrox[p_gid] = ptmps[p].a_hydro[0]; + a_hydroy[p_gid] = ptmps[p].a_hydro[1]; + a_hydroz[p_gid] = ptmps[p].a_hydro[2]; + locx[p_gid] = ci->loc[0]; + locy[p_gid] = ci->loc[1]; + locz[p_gid] = ci->loc[2]; + + mass[p_gid] = ptmps[p].mass; + h[p_gid] = ptmps[p].h; + u[p_gid] = ptmps[p].u; + u_dt[p_gid] = ptmps[p].u_dt; + rho[p_gid] = ptmps[p].rho; + div_v[p_gid] = ptmps[p].viscosity.div_v; + div_v_previous_step[p_gid] = ptmps[p].viscosity.div_v_previous_step; + alpha_visc[p_gid] = ptmps[p].viscosity.alpha; + v_sig[p_gid] = ptmps[p].viscosity.v_sig; + laplace_u[p_gid] = ptmps[p].diffusion.laplace_u; + alpha_diff[p_gid] = ptmps[p].diffusion.alpha; + f[p_gid] = ptmps[p].force.f; + soundspeed[p_gid] = ptmps[p].force.soundspeed; + h_dt[p_gid] = ptmps[p].force.h_dt; + balsara[p_gid] = ptmps[p].force.balsara; + pressure[p_gid] = ptmps[p].force.pressure; + time_bin[p_gid] = ptmps[p].time_bin; + wakeup[p_gid] = ptmps[p].limiter_data.wakeup; + min_ngb_time_bin[p_gid] = ptmps[p].limiter_data.min_ngb_time_bin; + to_be_synchronized[p_gid] = ptmps[p].limiter_data.to_be_synchronized; + wcount[p_gid] = ptmps[p].density.wcount; + wcount_dh[p_gid] = ptmps[p].density.wcount_dh; + rho_dh[p_gid] = ptmps[p].density.rho_dh; + div_v[p_gid] = ptmps[p].viscosity.div_v; + rot_u[p_gid] = ptmps[p].density.rot_v[0]; + rot_v[p_gid] = ptmps[p].density.rot_v[1]; + rot_w[p_gid] = ptmps[p].density.rot_v[2]; + SPH_sum[p_gid] = 0.f; + // fprintf(stderr,"tid is %i\n",tid_p[p]); + // fprintf(stderr,"Tester 222, count=%i, p=%i\n", count, + // id[p_gid]); + } +} + +void pack_data_soa(int count_all_parts, struct cell *ci, int first_part_tmp, + int count, int tid, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, + float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h, float *u, float *u_dt, float *rho, + float *SPH_sum, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, + int *count_p, float *wcount, float *wcount_dh, float *rho_dh, + float *rot_u, float *rot_v, float *rot_w, float *div_v, + float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, + float *soundspeed, float *h_dt, float *balsara, + float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized) { + //////////////////////////////////////////// + struct part *ptmps; + ptmps = ci->hydro.parts; + // fprintf(stderr,"Tester 111\n"); +#pragma unroll + for (int p = 0; p < count; p++) { + int p_gid = p + first_part_tmp; + // if(p_gid>=count_all_parts){ + // fprintf(stderr,"p>all parts"); + // exit(0); + // } + id[p_gid] = ptmps[p].id; + count_p[p_gid] = count; + tid_p[p_gid] = tid; + h_max[p_gid] = ci->hydro.h_max; + x_p[p_gid] = ptmps[p].x[0]; + y_p[p_gid] = ptmps[p].x[1]; + z_p[p_gid] = ptmps[p].x[2]; + ux[p_gid] = ptmps[p].v[0]; + uy[p_gid] = ptmps[p].v[1]; + uz[p_gid] = ptmps[p].v[2]; + a_hydrox[p_gid] = ptmps[p].a_hydro[0]; + a_hydroy[p_gid] = ptmps[p].a_hydro[1]; + a_hydroz[p_gid] = ptmps[p].a_hydro[2]; + locx[p_gid] = 
ci->loc[0]; + locy[p_gid] = ci->loc[1]; + locz[p_gid] = ci->loc[2]; + + mass[p_gid] = ptmps[p].mass; + h[p_gid] = ptmps[p].h; + u[p_gid] = ptmps[p].u; + u_dt[p_gid] = ptmps[p].u_dt; + rho[p_gid] = ptmps[p].rho; + div_v[p_gid] = ptmps[p].viscosity.div_v; + div_v_previous_step[p_gid] = ptmps[p].viscosity.div_v_previous_step; + alpha_visc[p_gid] = ptmps[p].viscosity.alpha; + v_sig[p_gid] = ptmps[p].viscosity.v_sig; + laplace_u[p_gid] = ptmps[p].diffusion.laplace_u; + alpha_diff[p_gid] = ptmps[p].diffusion.alpha; + f[p_gid] = ptmps[p].force.f; + soundspeed[p_gid] = ptmps[p].force.soundspeed; + h_dt[p_gid] = ptmps[p].force.h_dt; + balsara[p_gid] = ptmps[p].force.balsara; + pressure[p_gid] = ptmps[p].force.pressure; + time_bin[p_gid] = ptmps[p].time_bin; + wakeup[p_gid] = ptmps[p].limiter_data.wakeup; + min_ngb_time_bin[p_gid] = ptmps[p].limiter_data.min_ngb_time_bin; + to_be_synchronized[p_gid] = ptmps[p].limiter_data.to_be_synchronized; + wcount[p_gid] = ptmps[p].density.wcount; + wcount_dh[p_gid] = ptmps[p].density.wcount_dh; + rho_dh[p_gid] = ptmps[p].density.rho_dh; + div_v[p_gid] = ptmps[p].viscosity.div_v; + rot_u[p_gid] = ptmps[p].density.rot_v[0]; + rot_v[p_gid] = ptmps[p].density.rot_v[1]; + rot_w[p_gid] = ptmps[p].density.rot_v[2]; + SPH_sum[p_gid] = 0.f; + // fprintf(stderr,"tid is %i\n",tid_p[p]); + // fprintf(stderr,"Tester 222, count=%i, p=%i\n", count, + // id[p_gid]); + } +} + +// #ifdef WITH_CUDA +// } +// #endif diff --git a/src/hip/HIP_runner_functions.h b/src/hip/HIP_runner_functions.h new file mode 100644 index 0000000000..43a52f96ed --- /dev/null +++ b/src/hip/HIP_runner_functions.h @@ -0,0 +1,22 @@ +#ifndef CUDA_HEADERS_H +#define CUDA_HEADERS_H +#define n_streams 1024 + +#ifdef __cplusplus +extern "C" { +#endif +#include "part_gpu.h" +void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, int *d_bundle_first_part, + int *d_bundle_last_part, float d_a, float d_H, + const char *loop_type, hipStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, + int offset, int bundle_first_task, int max_parts, + int max_active_bin); + +#ifdef __cplusplus +} +#endif + +#endif // CUDA_HEADER_H diff --git a/src/hip/HIP_runner_functions.hip b/src/hip/HIP_runner_functions.hip new file mode 100755 index 0000000000..634c67a9ad --- /dev/null +++ b/src/hip/HIP_runner_functions.hip @@ -0,0 +1,229 @@ +#include "hip/hip_runtime.h" +/******************************************************************************* + * This file contains functions used to setup and execute GPU tasks from within + *runner_main.c. Consider this a translator allowing .cu based functions to be + *called from within runner_main.c + ******************************************************************************/ + +/* Hacky method to make c++ compilers not die. */ +#ifdef WITH_HIP +#ifndef static +#define static +#endif +#ifndef restrict +#define restrict __restrict__ +#endif +#endif + +/* Required header files */ +#include +/*ifdef WITH_HIP prevents name mangling. 
C code sees exact names + of functions rather than mangled template names produced by C++*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../config.h" +#include "BLOCK_SIZE.h" +#include "HIP_runner_functions.h" +#include "hip/device_functions.h" +#include "part_gpu.h" + +void Initialise_GPU() { + int devId = 0; + // find and print device name + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, devId); + printf("Device : %s\n", prop.name); + hipSetDevice(devId); + // cuda +} +#ifdef __cplusplus +} +#endif + +__global__ void runner_do_self_density_GPU( + struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, + int *d_bundle_first_part, int *d_bundle_last_part, float d_a, float d_H, + int bid, int tid, int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, int time_bin_inhibited) { + extern __shared__ float vars[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + __syncthreads(); + const int b_first_part = d_bundle_first_part[bid]; + const int pid = threadid + first_part_in_task_blocks; + const int b_last_part = d_bundle_last_part[bid]; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + // if(pid (0.01f/128.f)*(0.01f/128.f)) { + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
*/ + if(hi<1.f/128.f)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + float wi, wi_dx; + d_kernel_deval(0.f, &wi, &wi_dx); +// printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi, + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef __cplusplus +extern "C" { +#endif +void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, int *d_bundle_first_part, + int *d_bundle_last_part, float d_a, float d_H, + const char *loop_type, hipStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, + int offset, int bundle_first_task, int max_parts, + int time_bin_inhibited) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + runner_do_self_density_GPU<<>>( + parts_soa, d_task_first_part, d_task_last_part, d_bundle_first_part, + d_bundle_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef __cplusplus +} +#endif diff --git a/src/hip/Makefile.am b/src/hip/Makefile.am new file mode 100755 index 0000000000..fc626b8831 --- /dev/null +++ b/src/hip/Makefile.am @@ -0,0 +1,55 @@ +SOURCES_HIP = HIP_runner_functions.hip +include_HEADERS = HIP_runner_functions.h device_functions.h BLOCK_SIZE.h tester.h +EXTRA_DIST = $(SOURCES_HIP) $(include_HEADERS) + +if HAVEHIP + +AM_CFLAGS = -I.. $(HDF5_CPPFLAGS) +HIP_MYFLAGS = -D_FORCE_INLINES -O3 -g -DWITH_HIP --offload-arch=gfx90a +#HIP_MYFLAGS = -D_FORCE_INLINES -O3 -g -v -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_HIP -ccbin=gcc -m64 --default-stream per-thread#-dlink + +# Assign a "safe" version number +AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) -version-info 0:0:0 + +#bin_PROGRAMS = test_27_cells test_125_cells + +# Rules to compile HIP code. +.hip.o: + $(HIPCC) -c $(HIPFLAGS) $(AM_CFLAGS) $(HIP_CFLAGS) $(HIP_MYFLAGS) $< -o $@ +.hip.lo: + PATH=$(top_srcdir):$(PATH) && cudalt.py $@ $(HIPCC) -c $(HIPFLAGS) $(AM_CFLAGS) $(HIP_CFLAGS) $(HIP_MYFLAGS) $< + +# The library. Dummy C library so that we get libtool linking setup. +lib_LTLIBRARIES = libswiftHIP.la libswiftdummy.la + +# Special link command to avoid including CFLAGS which are not understood. 
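+# (Likely rationale, for reference: automake's default link rule would also pass
+# the per-target CFLAGS above, including HIP-only options such as
+# --offload-arch, to the host link step; the explicit $(CCLD) command below
+# forwards only the library LDFLAGS.)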
+libswiftHIP_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(libswiftHIP_la_LDFLAGS) \ + $(LDFLAGS) -o $@ + +libswiftHIP_la_SOURCES = $(SOURCES_HIP) +libswiftHIP_la_CFLAGS = $(AM_CFLAGS) $(HIP_CFLAGS) $(HIP_MYFLAGS) ../libswiftsim_hip.la -I../ +libswiftHIP_la_LIBADD = ../.libs/libswiftsim_hip.la +libswiftHIP_la_LDFLAGS = $(AM_LDFLAGS) + +if HAVEMPI +libswiftHIP_la_CFLAGS += ../libswiftsim_mpihip.la +libswiftHIP_la_LIBADD += ../.libs/libswiftsim_mpihip.la +endif + +libswiftdummy_la_SOURCES = dummy.c +libswiftdummy_la_CFLAGS = $(AM_CFLAGS) +libswiftdummy_la_LDFLAGS = $(AM_LDFLAGS) + +#test_27_cells_SOURCES=test27cells.c +#test_27_cells_CFLAGS=$(AM_CFLAGS) -DWITH_HIP $(HIP_CFLAGS) +#test_27_cells_LDADD= ../.libs/libswiftsim_hip.la ../.libs/libswiftsim_mpihip.la libswiftHIP.la $(MPI_LIBS) $(EXTRA_LIBS) $(HIP_LIBS) +#test_27_cells_LDFLAGS = $(AM_LDFLAGS) $(HIP_CFLAGS) + +#test_125_cells_SOURCES=test125cells.c +#test_125_cells_CFLAGS=$(AM_CFLAGS) -DWITH_HIP $(HIP_CFLAGS) +#test_125_cells_LDADD= ../libswiftsim_hip.la ../libswiftsim_mpihip.la libswiftHIP.la $(MPI_LIBS) $(EXTRA_LIBS) $(HIP_LIBS) +#test_125_cells_LDFLAGS = $(AM_LDFLAGS) $(HIP_CFLAGS) + +endif diff --git a/src/hip/am--include-marker b/src/hip/am--include-marker new file mode 100644 index 0000000000..9ce06a81ea --- /dev/null +++ b/src/hip/am--include-marker @@ -0,0 +1 @@ +# dummy diff --git a/src/hip/cell_gpu.h b/src/hip/cell_gpu.h new file mode 100644 index 0000000000..dc8d9306f2 --- /dev/null +++ b/src/hip/cell_gpu.h @@ -0,0 +1,292 @@ +#ifndef CELL_GPU_H +#define CELL_GPU_H +/* Config parameters. */ +#include "../config.h" +typedef int8_t timebin_t; +struct xpart_gpu { + /*! Offset between current position and position at last tree rebuild. */ + float x_diff[3]; + /*! Offset between the current position and position at the last sort. */ + float x_diff_sort[3]; + /*! Velocity at the last full step. */ + float v_full[3]; + /*! Internal energy at the last full step. */ + float u_full; +}; +struct part_gpu { + /*Task ID*/ + int tid; + /*! Particle unique ID. */ + long long id; + /*! Pointer to corresponding gravity part. */ + // struct gpu_gpart* gpart; + /*! Particle position. */ + float x[3]; + /*! Particle predicted velocity. */ + float v[3]; + /*! Particle acceleration. */ + float a_hydro[3]; + /*! Particle mass. */ + float mass; + /*! Particle smoothing length. */ + float h; + /*! Particle internal energy. */ + float u; + /*! Time derivative of the internal energy. */ + float u_dt; + /*! Particle density. */ + float rho; + /*! Kernel summation (For testing/debugging). */ + float SPH_sum; + + /* Cell information */ + /*! The cell location on the grid (corner nearest to the origin). */ + float loc[3]; + /*! The cell dimensions. */ + float width[3]; + float h_max; + int count; + /* Density information */ + + /*! Neighbour number count. */ + float wcount; + + /*! Derivative of the neighbour number with respect to h. */ + float wcount_dh; + + /*! Derivative of density with respect to h */ + float rho_dh; + + /*! Particle velocity curl. */ + float rot_v[3]; + + /* viscosity information */ + + /*! Particle velocity divergence */ + float div_v; + + /*! Particle velocity divergence from previous step */ + float div_v_previous_step; + + /*! Artificial viscosity parameter */ + float alpha_visc; + + /*! Signal velocity */ + float v_sig; + + /* thermal diffusion information */ + + /*! del^2 u, a smoothed quantity */ + float laplace_u; + + /*! 
Thermal diffusion coefficient */ + float alpha_diff; + + /* force information */ + + /*! "Grad h" term -- only partial in P-U */ + float f; + + /*! Particle soundspeed. */ + float soundspeed; + + /*! Time derivative of smoothing length */ + float h_dt; + + /*! Balsara switch */ + float balsara; + + /*! Particle pressure. */ + float pressure; + /*! Maximal alpha (viscosity) over neighbours */ + float alpha_visc_max_ngb; + + /* timestep stuff */ + + /*! Time-step length */ + timebin_t time_bin; + + /*all part of struct timestep_limiter_data, we had to destruct it + as GPUs don't like pointer chasing especially when memcpying*/ + /* Need waking-up ? */ + timebin_t wakeup; + + /*! Minimal time-bin across all neighbours */ + timebin_t min_ngb_time_bin; + + /* Do we want this particle to be synched back on the time-line? */ + char to_be_synchronized; +}; + +typedef struct part_soa { + /*Task ID*/ + int *tid_p; + /*bundle ID*/ + int *bid_p; + /*! Particle unique ID. */ + long long *id; + /*! Pointer to corresponding gravity part. */ + // struct gpu_gpart* gpart; + /*! Particle position. */ + double *x_p; + double *y_p; + double *z_p; + /*! Particle predicted velocity. */ + float *ux; + float *uy; + float *uz; + /*! Particle acceleration. */ + float *a_hydrox; + float *a_hydroy; + float *a_hydroz; + /*! Particle mass. */ + float *mass; + /*! Particle smoothing length. */ + float *h; + /*! Particle internal energy. */ + float *u; + /*! Time derivative of the internal energy. */ + float *u_dt; + /*! Particle density. */ + float *rho; + /*! Kernel summation (For testing/debugging). */ + float *SPH_sum; + + /* Cell information */ + /*! The cell location on the grid (corner nearest to the origin). */ + float *locx; + float *locy; + float *locz; + /*! The cell dimensions. */ + float *widthx; + float *widthy; + float *widthz; + float *h_max; + int *count_p; + int *count_test; + /* Density information */ + + /*! Neighbour number count. */ + float *wcount; + + /*! Derivative of the neighbour number with respect to h. */ + float *wcount_dh; + + /*! Derivative of density with respect to h */ + float *rho_dh; + + /*! Particle velocity curl. */ + float *rot_ux; + float *rot_uy; + float *rot_uz; + + /* viscosity information */ + + /*! Particle velocity divergence */ + float *div_v; + + /*! Particle velocity divergence from previous step */ + float *div_v_previous_step; + + /*! Artificial viscosity parameter */ + float *alpha_visc; + + /*! Signal velocity */ + float *v_sig; + + /* thermal diffusion information */ + + /*! del^2 u, a smoothed quantity */ + float *laplace_u; + + /*! Thermal diffusion coefficient */ + float *alpha_diff; + + /* force information */ + + /*! "Grad h" term -- only partial in P-U */ + float *f; + + /*! Particle soundspeed. */ + float *soundspeed; + + /*! Time derivative of smoothing length */ + float *h_dt; + + /*! Balsara switch */ + float *balsara; + + /*! Particle pressure. */ + float *pressure; + /*! Maximal alpha (viscosity) over neighbours */ + float *alpha_visc_max_ngb; + + /* timestep stuff */ + + /*! Time-step length */ + timebin_t *time_bin; + + /*all part of struct timestep_limiter_data, we had to destruct it + as GPUs don't like pointer chasing especially when memcpying*/ + /* Need waking-up ? */ + timebin_t *wakeup; + + /*! Minimal time-bin across all neighbours */ + timebin_t *min_ngb_time_bin; + + /* Do we want this particle to be synched back on the time-line? 
*/ + char *to_be_synchronized; + +} part_soa; + +struct task_cell { + struct part_gpu *parts; +}; +// struct parts_gpu_SoA{ +// struct task_cell *tasks; +// }; + +struct cell_hydro_gpu { + // struct part_gpu *parts; + // struct xpart_gpu *xparts; + float h_max; + int count; +}; +struct cell_gpu { + /*! The cell location on the grid (corner nearest to the origin). */ + float loc[3]; + /*! The cell dimensions. */ + float width[3]; + /*Details of contents (particles) and properties*/ + struct cell_hydro_gpu hydro; +}; +struct cell_gpu_flat { + /*! The cell location on the grid (corner nearest to the origin). */ + float loc[3]; + /*! The cell dimensions. */ + float width[3]; + float h_max; + int count; +}; + +struct cells_gpu_flat { + float *locx; + float *locy; + float *locz; + /*! The cell dimensions. */ + float *widthx; + float *widthy; + float *widthz; + /*! The cell location on the grid (corner nearest to the origin). */ + /* float *loc[3];*/ + /*! The cell dimensions. */ + /* float *width[3];*/ + float *h_max; + int *count; +}; + +struct cells_gpu_flat_test { + float *locx; +}; + +#endif // CELL_GPU_H diff --git a/src/hip/cuda_headers.h b/src/hip/cuda_headers.h new file mode 100644 index 0000000000..2df61a53b5 --- /dev/null +++ b/src/hip/cuda_headers.h @@ -0,0 +1,63 @@ +#ifndef CUDA_HEADERS_H +#define CUDA_HEADERS_H +#define n_streams 1024 + +#ifdef WITH_CUDA +extern "C" { +#endif + +void GPU_runner_doself1_branch_gradient(struct cell_gpu *ci_gpu, + struct part_gpu *parts_gpu); +void cuda_tester(struct cell **ci_list_mgd, int numBlocksTest, + int block_size_test, int count_tasks); +void launch_cuda_kernel(struct cell_gpu *ci_gpu, struct part_gpu *parts, + int numBlocks, float d_a, float d_H, + const char *loop_type); +void launch_cuda_kernel_streams(struct part_gpu *d_parts, int numBlocks, + float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int tid, int count, + int max_count, float cellx, float celly, + float cellz, int first_part, int last_part); +void launch_cuda_kernel_bundles(struct cell_gpu *d_all_cells, + struct part_gpu **d_all_parts, int numBlocks, + float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, + int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, + int offset); +void launch_cuda_kernel_bundles_revised( + struct part_gpu *d_all_parts, int *d_task_first_part, int *d_task_last_part, + int *d_bundle_first_part, int *d_bundle_last_part, int numBlocks, float d_a, + float d_H, const char *loop_type, cudaStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, int numBlocks_x, + int numBlocks_y, int tid, int offset); +void launch_cuda_kernel_bundles_revised_soa( + struct part_soa parts_gpu_soa, int *d_task_first_part, + int *d_task_last_part, int *d_bundle_first_part, int *d_bundle_last_part, + int numBlocks, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int numBlocks_x, int numBlocks_y, int tid, int offset, + int bundle_first_task, int max_parts); +void launch_cuda_print_streams(int numBlocks, cudaStream_t stream, int tid); +void launch_cuda_kernel_tester(struct cell_gpu *d_ci_gpu, + struct part_gpu **d_parts, int numBlocks, + float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, + int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid); +void launch_cuda_kernel_bundles_test(struct cell_gpu *d_all_cells, + struct 
part_gpu **d_all_parts, + int numBlocks, float d_a, float d_H, + int count_tasks); +void mgd_mem_cuda_kernel_bundles(struct part_gpu **parts_gpu_list, + int numBlocks, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, + int tasksperbundle, int numBlocks_x, + int numBlocks_y, int tid, int offset); + +#ifdef WITH_CUDA +} +#endif + +#endif // CUDA_HEADER_H diff --git a/src/hip/device_functions.h b/src/hip/device_functions.h new file mode 100644 index 0000000000..237c87dec1 --- /dev/null +++ b/src/hip/device_functions.h @@ -0,0 +1,149 @@ +#ifndef DEVICE_FUNCTIONS_H +#define DEVICE_FUNCTIONS_H +#include "../../config.h" + +/* Local headers. */ +// #include "../dimension.h" +// #include "../error.h" +// #include "../inline.h" +// #include "../minmax.h" +// #include "../vector.h" + +// Is this even necessary? Probably not as our code will operate differently +#define num_cuda_threads 128 +#define hydro_dimension 3.f + +/// Here we define stuff from kernel_hydro.h when using cubic_spline_kernel. +/// Will worry about sorting 'if statements for different kernels later//// +/* First some powers of gamma = H/h */ +#define kernel_gamma ((float)(1.825742)) +#define kernel_gamma_inv ((float)(1. / kernel_gamma)) +#define kernel_gamma2 ((float)(kernel_gamma * kernel_gamma)) +#define kernel_ivals 2 +#define kernel_degree 3 /*!< Degree of the polynomial */ +#define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma)) +#define kernel_gamma_dim_plus_one \ + ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)) +#define kernel_gamma_inv_dim \ + ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma))) +#define kernel_gamma_inv_dim_plus_one \ + ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))) +#define kernel_ivals_f ((float)kernel_ivals) /*!< Number of branches */ +#define kernel_constant ((float)(16. * M_1_PI)) +/*! Cosmology default beta=3.0. + * Alpha can be set in the parameter file. + * Beta is defined as in e.g. Price (2010) Eqn (103) */ +#define const_viscosity_beta 3.0f +#ifdef WITH_CUDA +extern "C" { +#endif +/** + * @brief Returns the argument to the power given by the dimension plus one + * + * Computes \f$x^{d+1}\f$. + */ +__device__ float d_pow_dimension_plus_one(float x) { + +#if defined(HYDRO_DIMENSION_3D) + + const float x2 = x * x; + return x2 * x2; + +#elif defined(HYDRO_DIMENSION_2D) + + return x * x * x; + +#elif defined(HYDRO_DIMENSION_1D) + + return x * x; + +#else + + error("The dimension is not defined !"); + return 0.f; + +#endif +} + +/** + * @brief Return the argument to the power three adiabatic index minus five over + * two. + * + * Computes \f$x^{(3\gamma - 5)/2}\f$. + * + * @param x Argument + */ +__device__ float d_pow_three_gamma_minus_five_over_two(float x) { +#if defined(HYDRO_GAMMA_5_3) + + return 1.f; /* x^(0) */ + +#elif defined(HYDRO_GAMMA_7_5) + + return powf(x, -0.4f); /* x^(-2/5) */ + +#elif defined(HYDRO_GAMMA_4_3) + + return 1.f / sqrtf(x); /* x^(-1/2) */ + +#elif defined(HYDRO_GAMMA_2_1) + + return sqrtf(x); /* x^(1/2) */ + +#else + + error("The adiabatic index is not defined !"); + return 0.f; + +#endif +} + +/** + * @brief Computes the kernel function and its derivative. + * + * The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient + * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem. + * + * Returns 0 if \f$u > \gamma = H/h\f$. 
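+ *
+ * As a quick check of the Horner evaluation below: at \f$u = 0\f$ the first
+ * cubic-spline branch {3, -3, 0, 0.5} reduces to w = 0.5 and dw_dx = 0, so the
+ * returned W is 0.5 * kernel_constant * kernel_gamma_inv_dim (the caller still
+ * applies the \f$h^{-d}\f$ factor).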
+ * + * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$. + * @param W (return) The value of the kernel function \f$W(x,h)\f$. + * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$. + */ +__device__ void d_kernel_deval(float u, float *restrict W, + float *restrict dW_dx) { + + /* Go to the range [0,1[ from [0,H[ */ + const float x = u * kernel_gamma_inv; + + /* Pick the correct branch of the kernel */ + const int temp = (int)(x * kernel_ivals_f); + const int ind = temp > kernel_ivals ? kernel_ivals : temp; + static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] = { + 3.f, -3.f, 0.f, 0.5f, /* 0 < u < 0.5 */ + -1.f, 3.f, -3.f, 1.f, /* 0.5 < u < 1 */ + 0.f, 0.f, 0.f, 0.f}; /* 1 < u */ + const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + /* First two terms of the polynomial ... */ + float w = coeffs[0] * x + coeffs[1]; + float dw_dx = coeffs[0]; + + /* ... and the rest of them */ + for (int k = 2; k <= kernel_degree; k++) { + dw_dx = dw_dx * x + w; + w = x * w + coeffs[k]; + } + + w = max(w, 0.f); + dw_dx = min(dw_dx, 0.f); + + /* Return everything */ + *W = w * kernel_constant * kernel_gamma_inv_dim; + *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one; +} + +#ifdef WITH_CUDA +} +#endif + +#endif // DEVICE_FUNCTIONS_H diff --git a/src/hip/dummy.c b/src/hip/dummy.c new file mode 100755 index 0000000000..66ab4665f9 --- /dev/null +++ b/src/hip/dummy.c @@ -0,0 +1,2 @@ +#include +void swiftcudadummy() {} diff --git a/src/hip/dummy.cpp b/src/hip/dummy.cpp new file mode 100755 index 0000000000..66ab4665f9 --- /dev/null +++ b/src/hip/dummy.cpp @@ -0,0 +1,2 @@ +#include +void swiftcudadummy() {} diff --git a/src/hip/part_gpu.h b/src/hip/part_gpu.h new file mode 100644 index 0000000000..5d7e32c611 --- /dev/null +++ b/src/hip/part_gpu.h @@ -0,0 +1,137 @@ +#ifndef PART_GPU_H +#define PART_GPU_H +/* Config parameters. */ +#include "../../config.h" +typedef int8_t timebin_t; + +#ifdef __cplusplus +extern "C" { +#endif + +// extern "C" { + +typedef struct part_soa { + /*Task ID*/ + int *tid_p; + /*bundle ID*/ + int *bid_p; + /*! Particle unique ID. */ + long long *id; + /*! Pointer to corresponding gravity part. */ + // struct gpu_gpart* gpart; + /*! Particle position. */ + double *x_p; + double *y_p; + double *z_p; + /*! Particle predicted velocity. */ + float *ux; + float *uy; + float *uz; + /*! Particle acceleration. */ + float *a_hydrox; + float *a_hydroy; + float *a_hydroz; + /*! Particle mass. */ + float *mass; + /*! Particle smoothing length. */ + float *h; + /*! Particle internal energy. */ + float *u; + /*! Time derivative of the internal energy. */ + float *u_dt; + /*! Particle density. */ + float *rho; + /*! Kernel summation (For testing/debugging). */ + float *SPH_sum; + + /* Cell information */ + /*! The cell location on the grid (corner nearest to the origin). */ + float *locx; + float *locy; + float *locz; + /*! The cell dimensions. */ + float *widthx; + float *widthy; + float *widthz; + float *h_max; + int *count_p; + int *count_test; + /* Density information */ + + /*! Neighbour number count. */ + float *wcount; + + /*! Derivative of the neighbour number with respect to h. */ + float *wcount_dh; + + /*! Derivative of density with respect to h */ + float *rho_dh; + + /*! Particle velocity curl. */ + float *rot_ux; + float *rot_uy; + float *rot_uz; + + /* viscosity information */ + + /*! Particle velocity divergence */ + float *div_v; + + /*! 
Particle velocity divergence from previous step */ + float *div_v_previous_step; + + /*! Artificial viscosity parameter */ + float *alpha_visc; + + /*! Signal velocity */ + float *v_sig; + + /* thermal diffusion information */ + + /*! del^2 u, a smoothed quantity */ + float *laplace_u; + + /*! Thermal diffusion coefficient */ + float *alpha_diff; + + /* force information */ + + /*! "Grad h" term -- only partial in P-U */ + float *f; + + /*! Particle soundspeed. */ + float *soundspeed; + + /*! Time derivative of smoothing length */ + float *h_dt; + + /*! Balsara switch */ + float *balsara; + + /*! Particle pressure. */ + float *pressure; + /*! Maximal alpha (viscosity) over neighbours */ + float *alpha_visc_max_ngb; + + /* timestep stuff */ + + /*! Time-step length */ + timebin_t *time_bin; + + /*all part of struct timestep_limiter_data, we had to destruct it + as GPUs don't like pointer chasing especially when memcpying*/ + /* Need waking-up ? */ + timebin_t *wakeup; + + /*! Minimal time-bin across all neighbours */ + timebin_t *min_ngb_time_bin; + + /* Do we want this particle to be synched back on the time-line? */ + char *to_be_synchronized; +}; + +#ifdef __cplusplus +}; +#endif + +#endif // PART_GPU_H diff --git a/src/hip/print_something.cu b/src/hip/print_something.cu new file mode 100755 index 0000000000..b69ad05dd4 --- /dev/null +++ b/src/hip/print_something.cu @@ -0,0 +1,37 @@ +#ifdef WITH_CUDA +#ifndef static +#define static +#endif +#ifndef restrict +#define restrict __restrict__ +#endif +#endif + +#include + +#ifdef __cplusplus +extern "C" { +#endif +#include "cuda_headers.h" +#ifdef __cplusplus +} +#endif + +extern "C" { +void print_something_cu() { printf("In Here\n"); } +} + +#ifdef __cplusplus +extern "C" { +#endif +void Initialise_GPU() { + int devId = 0; + // find and print device name + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, devId); + printf("Device : %s\n", prop.name); + cudaSetDevice(devId); +} +#ifdef __cplusplus +} +#endif diff --git a/src/hip/tasks_gpu.h b/src/hip/tasks_gpu.h new file mode 100755 index 0000000000..a3912aee2c --- /dev/null +++ b/src/hip/tasks_gpu.h @@ -0,0 +1,74 @@ +/* Config parameters. */ +#include "../config.h" + +struct tasks_self_gpu { + struct task_gpu *tgpu; +}; + +/** + * @brief A task to be run by the #scheduler. + */ +struct task_gpu { + + /*! Pointers to the cells this task acts upon */ + struct cell *ci, *cj; + + /*! List of tasks unlocked by this one */ + struct task_gpu **unlock_tasks; + + /*! Flags used to carry additional information (e.g. sort directions) */ + long long flags; + +#ifdef WITH_MPI + + /*! Buffer for this task's communications */ + void *buff; + + /*! MPI request corresponding to this task */ + MPI_Request req; + +#endif + + /*! Rank of a task in the order */ + int rank; + + /*! Weight of the task */ + float weight; + + /*! Number of tasks unlocked by this one */ + int nr_unlock_tasks; + + /*! Number of unsatisfied dependencies */ + int wait; + + /*! Type of the task */ + enum task_types type; + + /*! Sub-type of the task (for the tasks that have one */ + enum task_subtypes subtype; + + /*! Should the scheduler skip this task ? */ + char skip; + + /*! Is this task implicit (i.e. does not do anything) ? */ + char implicit; + +#ifdef SWIFT_DEBUG_TASKS + /*! ID of the queue or runner owning this task */ + short int rid; + + /*! Information about the direction of the pair task */ + short int sid; +#endif + + /*! 
Start and end time of this task */ + ticks tic, toc; + + /* Total time spent running this task */ + ticks total_ticks; + +#ifdef SWIFT_DEBUG_CHECKS + /* When was this task last run? */ + integertime_t ti_run; +#endif /* SWIFT_DEBUG_CHECKS */ +}; diff --git a/src/hip/tester.cu b/src/hip/tester.cu new file mode 100644 index 0000000000..3ffaf9e10c --- /dev/null +++ b/src/hip/tester.cu @@ -0,0 +1,21 @@ +#include "tester.h" + +#include +#include +#ifdef __cplusplus +extern "C" { +#endif +void testing_linkage(int a, float *b, float c) { + std::vector b_value_list; + b_value_list.reserve(a); + for (int i = 0; i < a; i++) { + (*b) = (*b) + c; + b_value_list.push_back((*b)); + std::cout << "Vector value is " << b_value_list[i] << " b value is " << (*b) + << std::endl; + } + std::cout << "Final value of b is " << (*b) << std::endl; +} +#ifdef __cplusplus +} +#endif diff --git a/src/hip/tester.h b/src/hip/tester.h new file mode 100755 index 0000000000..5729e66904 --- /dev/null +++ b/src/hip/tester.h @@ -0,0 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif + +void testing_linkage(int a, float *b, float c); + +#ifdef __cplusplus +} +#endif diff --git a/src/memuse.h b/src/memuse.h index 5883e68684..d51ab4282d 100644 --- a/src/memuse.h +++ b/src/memuse.h @@ -20,8 +20,11 @@ #define SWIFT_MEMUSE_H /* Config parameters. */ +#ifdef WITH_CUDA +#include "../config.h" +#else #include - +#endif /* Includes. */ #include diff --git a/src/queue.c b/src/queue.c index 30601667cd..790b6b1335 100644 --- a/src/queue.c +++ b/src/queue.c @@ -178,7 +178,6 @@ void queue_insert(struct queue *q, struct task *t) { } } } - /* Increase the incoming count. */ atomic_inc(&q->count_incoming); } diff --git a/src/queue.h b/src/queue.h index 0576403bef..b90ca90b46 100644 --- a/src/queue.h +++ b/src/queue.h @@ -75,6 +75,28 @@ struct queue { int *tid_incoming; volatile unsigned int first_incoming, last_incoming, count_incoming; + /*Number of pack tasks left in queue A. Nasar */ + volatile int + n_packs_self_left_d; /*Number of density pack tasks left in queue*/ + volatile int n_packs_self_left_f; /*Number of force pack tasks left in queue*/ + volatile int + n_packs_self_left_g; /*Number of gradient pack tasks left in queue*/ + + volatile int n_packs_pair_left_d; + volatile int n_packs_pair_left_f; + volatile int n_packs_pair_left_g; + + volatile int + n_packs_self_stolen_d; /*Number of density pack tasks left in queue*/ + volatile int + n_packs_self_stolen_f; /*Number of force pack tasks left in queue*/ + volatile int + n_packs_self_stolen_g; /*Number of gradient pack tasks left in queue*/ + + volatile int n_packs_pair_stolen_d; + volatile int n_packs_pair_stolen_f; + volatile int n_packs_pair_stolen_g; + } __attribute__((aligned(queue_struct_align))); /* Function prototypes. 
*/ diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h new file mode 100644 index 0000000000..a78ec6409c --- /dev/null +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -0,0 +1,2116 @@ +#include "scheduler.h" +#include "runner_doiact_hydro.h" +#include "active.h" +#include +struct pack_vars_self { + /*List of tasks and respective cells to be packed*/ + struct task **task_list; + struct task **top_task_list; + struct cell **cell_list; + /*List of cell positions*/ + double *cellx; + double *celly; + double *cellz; + /*List of cell positions*/ + double *d_cellx; + double *d_celly; + double *d_cellz; + int bundle_size; + /*How many particles in a bundle*/ + int count_parts; + /**/ + int tasks_packed; + int top_tasks_packed; + int *task_first_part; + int *task_last_part; + int *d_task_first_part; + int *d_task_last_part; + int *bundle_first_part; + int *bundle_last_part; + int *bundle_first_task_list; + int count_max_parts; + int launch; + int launch_leftovers; + int target_n_tasks; + int nBundles; + int tasksperbundle; + +} pack_vars_self; +struct leaf_cell_list{ + struct cell **ci; + struct cell **cj; + int n_leaves; + int n_start; + int n_end; + int n_packed; +}; +struct pack_vars_pair { + /*List of tasks and respective cells to be packed*/ + struct task **task_list; + struct task **top_task_list; + struct leaf_cell_list * leaf_list; + struct cell **ci_list; + struct cell **cj_list; + /*List of cell shifts*/ + double *shiftx; + double *shifty; + double *shiftz; + /*List of cell shifts*/ + double *d_shiftx; + double *d_shifty; + double *d_shiftz; + int bundle_size; + /*How many particles in a bundle*/ + int count_parts; + /**/ + int tasks_packed; + int top_tasks_packed; + int *task_first_part; + int *task_last_part; + int *d_task_first_part; + int *d_task_last_part; + int *bundle_first_part; + int *bundle_last_part; + int *bundle_first_task_list; + int count_max_parts; + int launch; + int launch_leftovers; + int target_n_tasks; + int nBundles; + int tasksperbundle; + int task_locked; + +} pack_vars_pair; + +struct pack_vars_pair_f4 { + /*List of tasks and respective cells to be packed*/ + struct task **task_list; + struct cell **ci_list; + struct cell **cj_list; + /*List of cell shifts*/ + float3 *shift; + /*List of cell shifts*/ + float3 *d_shift; + int bundle_size; + /*How many particles in a bundle*/ + int count_parts; + /**/ + int tasks_packed; + int4 *fparti_fpartj_lparti_lpartj; + int4 *d_fparti_fpartj_lparti_lpartj; + int *bundle_first_part; + int *bundle_last_part; + int *bundle_first_task_list; + int count_max_parts; + int launch; + int launch_leftovers; + int target_n_tasks; + int nBundles; + int tasksperbundle; + +} pack_vars_pair_f4; + +#include "cuda/BLOCK_SIZE.h" +#include "cuda/GPU_runner_functions.h" +#include "runner_gpu_pack_functions.h" +#include "task.h" +#define CUDA_DEBUG + +double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, + struct part_aos_f4_send *parts_send, + int2 *task_first_part_f4) { + /* Timers for how long this all takes. 
+ * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + /* Find my queue for use later*/ + int qid = r->qid; + /*Place pointers to the task and cells packed in an array for use later + * when unpacking after the GPU offload*/ + int tasks_packed = pack_vars->tasks_packed; + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + /* Identify row in particle arrays where this task starts*/ + task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f4( + r, ci, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + /* Identify the row in the array where this task ends (row id of its + last particle)*/ + task_first_part_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done++; + /* Record that we have now done a packing (self) */ + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + + /*Get a lock to the queue so we can safely decrement counter and check for launch leftover condition*/ + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_self_left_d--; + if (s->queues[qid].n_packs_self_left_d < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); + /*Have we packed enough tasks to offload to GPU?*/ + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + + /*Record the end of packing time*/ + clock_gettime(CLOCK_REALTIME, &t1); + /* Release the lock on the cell */ + cell_unlocktree(ci); + t->gpu_done = 1; + /*Calculate time spent packing and return to runner_main*/ + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, + struct part_aos_f4_g_send *parts_send, + int2 *task_first_part_f4) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + /* Find my queue for use later*/ + int qid = r->qid; + /*Place pointers to the task and cells packed in an array for use later + * when unpacking after the GPU offload*/ + int tasks_packed = pack_vars->tasks_packed; + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + /* Identify row in particle arrays where this task starts*/ + task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f4_g( + r, ci, parts_send, 0 /*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + /* identify the row in the array where this task ends (row id of its + last particle)*/ + task_first_part_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done_g++; + /* Record that we have now done a packing (self) */ + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + /*Get a lock to the queue so we can safely decrement counter and check for launch leftover condition*/ + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_self_left_g--; + if (s->queues[qid].n_packs_self_left_g < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); + + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + /* Release the lock on the cell */ + cell_unlocktree(ci); + /*Calculate time spent packing and return to runner_main*/ + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, + struct part_aos_f4_f_send *parts_send, + int2 *task_first_part_f4) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + /* Find my queue for use later*/ + int qid = r->qid; + /*Place pointers to the task and cells packed in an array for use later + * when unpacking after the GPU offload*/ + int tasks_packed = pack_vars->tasks_packed; + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + /* Identify row in particle arrays where this task starts*/ + task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f4_f( + r, ci, parts_send, 0 /*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + /* Identify the row in the array where this task ends (row id of its + last particle) */ + task_first_part_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done_f++; + /* Record that we have now done a packing (self) */ + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + /*Get a lock to the queue so we can safely decrement counter and check for launch leftover condition*/ + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_self_left_f--; + if (s->queues[qid].n_packs_self_left_f < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); + /*Have we packed enough tasks to offload to GPU?*/ + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + + /*Record the end of packing time*/ + clock_gettime(CLOCK_REALTIME, &t1); + /* Release the lock on the cell */ + cell_unlocktree(ci); + /*Calculate time spent packing and return to runner_main*/ + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +void runner_recurse_gpu(struct runner *r, struct scheduler *s, + struct pack_vars_pair *restrict pack_vars, + struct cell *ci, struct cell *cj, struct task *t, + struct part_aos_f4_send *parts_send, + struct engine *e, + int4 *fparti_fpartj_lparti_lpartj, int *n_leafs_found, + int depth, int n_expected_tasks) { + + /* Should we even bother? A. Nasar: For GPU code we need to be clever about this */ + if (!CELL_IS_ACTIVE(ci, e) && !CELL_IS_ACTIVE(cj, e)) return; + if (ci->hydro.count == 0 || cj->hydro.count == 0) return; + + /* Get the type of pair and flip ci/cj if needed. */ + double shift[3]; + const int sid = space_getsid_and_swap_cells(s, &ci, &cj, shift); + + /* Recurse? */ + if (cell_can_recurse_in_pair_hydro_task(ci) && + cell_can_recurse_in_pair_hydro_task(cj)) { + struct cell_split_pair *csp = &cell_split_pairs[sid]; + for (int k = 0; k < csp->count; k++) { + const int pid = csp->pairs[k].pid; + const int pjd = csp->pairs[k].pjd; + /*Do we want to do anything before we recurse?*/ + + /*We probably want to record */ + if (ci->progeny[pid] != NULL && cj->progeny[pjd] != NULL){ + runner_recurse_gpu(r, s, pack_vars, ci->progeny[pid], cj->progeny[pjd], t, parts_send, e, fparti_fpartj_lparti_lpartj, + n_leafs_found, depth + 1, n_expected_tasks); +// message("recursing to depth %i", depth + 1); + } + } + } + else if (CELL_IS_ACTIVE(ci, e) || CELL_IS_ACTIVE(cj, e)) { + /* if any cell empty: skip */ + if(ci->hydro.count == 0 || cj->hydro.count == 0) return; + int leafs_found = *n_leafs_found; + /*for all leafs to be sent add to cell list */ +// cells_left[leafs_found] = ci; +// cells_right[leafs_found] = cj; + /*Add leaf cells to list for each top_level task*/ + pack_vars->leaf_list[pack_vars->top_tasks_packed].ci[leafs_found] = ci; + pack_vars->leaf_list[pack_vars->top_tasks_packed].cj[leafs_found] = cj; + pack_vars->leaf_list[pack_vars->top_tasks_packed].n_leaves++; +// error("stop"); + *n_leafs_found = leafs_found + 1; + if(*n_leafs_found >= n_expected_tasks) + error("Created %i more than expected leaf cells. 
depth %i", *n_leafs_found, depth); + } + +}; + +double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, + struct pack_vars_pair *restrict pack_vars, + struct cell *ci, struct cell *cj, struct task *t, + struct part_aos_f4_send *parts_send, + struct engine *e, + int4 *fparti_fpartj_lparti_lpartj) { + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + int qid = r->qid; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + struct cell *citmp, *cjtmp; + citmp=ci; + cjtmp=cj; + /* Get the type of pair and flip ci/cj if needed. */ + double shift[3]; + const int sid = space_getsid_and_swap_cells(s, &citmp, &cjtmp, shift); + if(citmp != ci) error("I'm flipped"); + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ +// pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; + fparti_fpartj_lparti_lpartj[tasks_packed].y = + pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f4( + r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, + shift_tmp); + /* Find last parts in task for ci and cj*/ + fparti_fpartj_lparti_lpartj[tasks_packed].z = + pack_vars->count_parts - count_cj; + fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = + fparti_fpartj_lparti_lpartj[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Record that we have now done a packing (self) */ + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + pack_vars->leaf_list[pack_vars->top_tasks_packed - 1].n_packed++; + + //A. Nasar: Need to come back to this at some point! + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_pair_left_d--; + if (s->queues[qid].n_packs_pair_left_d < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); + if (pack_vars->tasks_packed == pack_vars->target_n_tasks){ + pack_vars->launch = 1; + } + /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +}; + +double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, + struct pack_vars_pair *restrict pack_vars, + struct cell *ci, struct cell *cj, + struct task *t, + struct part_aos_f4_g_send *parts_send, + struct engine *e, + int4 *fparti_fpartj_lparti_lpartj) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + + int qid = r->qid; + // pthread_mutex_lock(&s->sleep_mutex); + // atomic_dec(&(s->p_g_left[qid])); + // pthread_cond_broadcast(&s->sleep_cond); + // pthread_mutex_unlock(&s->sleep_mutex); + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; + // pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + + // count_ci; + + fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; + fparti_fpartj_lparti_lpartj[tasks_packed].y = + pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + // if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, + // pack_vars->count_parts); + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f4_g( + r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, + shift_tmp); + // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no + // timing, 1 for timing*/, count_parts, tasks_packed, + // pack_vars->count_max_parts); //This may cause an issue. Be sure to test + // that + // pack_vars->count_parts is actually increment here + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. 
+ * packed_tmp+1 is index for cell j */ + + // if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count + // %i\n", r->cpuid, *count_parts, pack_vars->count_parts); + fparti_fpartj_lparti_lpartj[tasks_packed].z = + pack_vars->count_parts - count_cj; + fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; + // pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - + // count_cj; pack_vars->task_last_part[packed_tmp + 1] = + // pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done_g++; + cj->pack_done_g++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = + fparti_fpartj_lparti_lpartj[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + t->done = 1; + /* Copies done. Release the lock ! */ + cell_unlocktree(ci); + cell_unlocktree(cj); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + /* Record that we have now done a packing (self) */ + // int qid = r->qid; + // atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); + + lock_lock(&s->queues[qid].lock); + + s->queues[qid].n_packs_pair_left_g--; + + if (s->queues[qid].n_packs_pair_left_g < 1) pack_vars->launch_leftovers = 1; + + lock_unlock(&s->queues[qid].lock); + + // if ((s->p_g_left[qid] < 1)) + // pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, + struct pack_vars_pair *restrict pack_vars, + struct cell *ci, struct cell *cj, + struct task *t, + struct part_aos_f4_f_send *parts_send, + struct engine *e, + int4 *fparti_fpartj_lparti_lpartj) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + // pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&(s->p_f_left[qid])); + // pthread_cond_broadcast(&s->sleep_cond); + // pthread_mutex_unlock(&s->sleep_mutex); + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. 
+ * packed_tmp+1 is index for cell j */ + // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; + // pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + + // count_ci; + + fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; + fparti_fpartj_lparti_lpartj[tasks_packed].y = + pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + // if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, + // pack_vars->count_parts); + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f4_f( + r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, + shift_tmp); + // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no + // timing, 1 for timing*/, count_parts, tasks_packed, + // pack_vars->count_max_parts); //This may cause an issue. Be sure to test + // that + // pack_vars->count_parts is actually increment here + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + + // if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count + // %i\n", r->cpuid, *count_parts, pack_vars->count_parts); + fparti_fpartj_lparti_lpartj[tasks_packed].z = + pack_vars->count_parts - count_cj; + fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; + // pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - + // count_cj; pack_vars->task_last_part[packed_tmp + 1] = + // pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done_f++; + cj->pack_done_f++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = + fparti_fpartj_lparti_lpartj[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + t->done = 1; + /* Copies done. Release the lock ! */ + cell_unlocktree(ci); + cell_unlocktree(cj); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + + lock_lock(&s->queues[qid].lock); + + s->queues[qid].n_packs_pair_left_f--; + + if (s->queues[qid].n_packs_pair_left_f < 1) pack_vars->launch_leftovers = 1; + + lock_unlock(&s->queues[qid].lock); + + // if ((s->p_f_left[qid] < 1)) + // pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +void runner_doself1_launch_f4( + struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int devId, + int2 *task_first_part_f4, int2 *d_task_first_part_f4, + cudaEvent_t *self_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + // pack_vars->task_first_part[tasks_packed - 1]; + pack_vars->bundle_first_part[nBundles_temp] = + task_first_part_f4[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + // clock_gettime(CLOCK_REALTIME, &t0hmemcpy); + /*Copy arrays containing first and last part for each task to GPU*/ + // cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + // tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + // tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + // cudaMemPrefetchAsync(d_task_first_part_self_dens_f4, tasks_packed * + // sizeof(int2), devId, NULL); + /*Copy cell shifts to device*/ + // cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_celly, pack_vars->celly, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + // clock_gettime(CLOCK_REALTIME, &t1hmemcpy); + // *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + + // (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0; + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of 
parts per cell in the bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count = task_first_part_f4[tid].y - task_first_part_f4[tid].x; + parts_in_bundle += count; + max_parts = max(max_parts, count); + last_task = tid; + } + } + // const int n_tasks = last_task - first_task; + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp; + // clock_gettime(CLOCK_REALTIME, &t0hmemcpy); + // cudaMemPrefetchAsync(&d_task_first_part_self_dens_f4[first_task], + // (last_task - first_task) * sizeof(int2), + // devId, stream[bid]); + cudaMemcpyAsync(&d_task_first_part_f4[first_task], + &task_first_part_f4[first_task], + (last_task + 1 - first_task) * sizeof(int2), + cudaMemcpyHostToDevice, stream[bid]); + // cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); + //// if (cu_error != cudaSuccess) { fprintf( + /// stderr, "CUDA error in density + // self host 2 device memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); + // exit(0); + // } + // clock_gettime(CLOCK_REALTIME, &t1hmemcpy); + // *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + + // (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / + // 1000000000.0; + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_send), + cudaMemcpyHostToDevice, stream[bid]); + + // #ifdef CUDA_DEBUG + // cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); + //// + // // + // Get error code if (cu_error != cudaSuccess) { fprintf( + // stderr, "CUDA error in density self host 2 device + // memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); + // exit(0); + // } + // #endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // const char *loop_type = "density"; + // struct first_part first_parts; + // for(int i = 0; i < numBlocks_y; i++) first_parts.list[i] = + // pack_vars->task_first_part[i]; fprintf(stderr, "Launching kernel with + // %i tasks leftovers %i\n", tasks_packed, + // pack_vars->launch_leftovers); + // Launch the kernel + launch_density_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + numBlocks_x, numBlocks_y, bundle_first_task, + d_task_first_part_f4); + // #ifdef CUDA_DEBUG + // cu_error = cudaPeekAtLastError(); // Get error code + // if (cu_error != cudaSuccess) { + // fprintf(stderr, + // "CUDA error with self density kernel launch: %s + // cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); exit(0); + // } + // #endif + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); + // #ifdef CUDA_DEBUG + // cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // // + // Get error code if (cu_error != cudaSuccess) { + // fprintf(stderr, "CUDA error with self density + // D2H memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), + // r->cpuid); error("Something's up with 
your cuda code"); + // } + // #endif + } /*End of looping over bundles to launch in streams*/ + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; + for (int bid = 0; bid < nBundles_temp; bid++) { + + clock_gettime(CLOCK_REALTIME, &t0); + + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(self_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; + + clock_gettime(CLOCK_REALTIME, &tp0); + + // clock_gettime(CLOCK_REALTIME, &t0hmemcpy); + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + // clock_gettime(CLOCK_REALTIME, &t1hmemcpy); + // *hmemcpy_time += (t1hmemcpy.tv_sec - + // t0hmemcpy.tv_sec) + (t1hmemcpy.tv_nsec - + // t0hmemcpy.tv_nsec) / 1000000000.0; + const ticks tic = getticks(); + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f4(r, cii, parts_recv, 0, + &pack_length_unpack, tid, + pack_vars->count_max_parts, e); + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + /* Record things for debugging */ + cii->gpu_done++; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + // MATTHIEU signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp1); + // *hmemcpy_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + // *packing_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + +} /*End of GPU work Self*/ + +void runner_doself1_launch_f4_g( + struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, + struct part_aos_f4_g_send *d_parts_send, + struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + int2 *task_first_part_f4, int2 *d_task_first_part_f4, cudaEvent_t *self_end, + double *unpack_time) { + + struct timespec t0, t1, tp0, tp1; + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = 
pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + // if(tasks_packed == 0) error("zero tasks packed but somehow got into + // GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = + task_first_part_f4[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count = task_first_part_f4[tid].y - task_first_part_f4[tid].x; + parts_in_bundle += count; + max_parts = max(max_parts, count); + last_task = tid; + } + } + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp; + + cudaMemcpyAsync(&d_task_first_part_f4[first_task], + &task_first_part_f4[first_task], + (last_task + 1 - first_task) * sizeof(int2), + cudaMemcpyHostToDevice, stream[bid]); + + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_g_send), + cudaMemcpyHostToDevice, stream[bid]); + // fprintf(stderr, "bid %i first_part %i nparts %i\n", bid, + // first_part_tmp, bundle_n_parts); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error in gradient self host 2 device memcpy: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // const char *loop_type = "density"; + // Launch the kernel + launch_gradient_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + numBlocks_x, numBlocks_y, bundle_first_task, + d_task_first_part_f4); +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with self gradient kernel 
launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_g_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self gradient D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } /*End of looping over bundles to launch in streams*/ + // exit(0); + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; + for (int bid = 0; bid < nBundles_temp; bid++) { + + clock_gettime(CLOCK_REALTIME, &t0); + + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(self_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; + + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp0); + const ticks tic = getticks(); + + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f4_g(r, cii, parts_recv, 0, + &pack_length_unpack, tid, + pack_vars->count_max_parts, e); + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /* Record things for debugging */ + cii->gpu_done_g++; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + // MATTHIEU signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp1); + // *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + // *packing_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + +} /*End of GPU work Self Gradient*/ + +void runner_doself1_launch_f4_f( + struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, + struct 
part_aos_f4_f_send *d_parts_send, + struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + int2 *task_first_part_f4_f, int2 *d_task_first_part_f4_f, + cudaEvent_t *self_end, double *unpack_time) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = + task_first_part_f4_f[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ + // cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + // tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + // tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + // cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_celly, pack_vars->celly, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count = task_first_part_f4_f[tid].y - task_first_part_f4_f[tid].x; + parts_in_bundle += count; + max_parts = max(max_parts, count); + last_task = tid; + } + } + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp; + cudaMemcpyAsync(&d_task_first_part_f4_f[first_task], + &task_first_part_f4_f[first_task], + (last_task + 1 - first_task) * sizeof(int2), + cudaMemcpyHostToDevice, stream[bid]); + + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_f_send), + cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error in force self host 2 device memcpy: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // Launch the kernel + launch_force_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + numBlocks_x, numBlocks_y, bundle_first_task, + d_task_first_part_f4_f); +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self force kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_f_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self force D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; + for (int bid = 0; bid < nBundles_temp; bid++) { + + clock_gettime(CLOCK_REALTIME, &t0); + + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(self_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid];
+ + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; + + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + clock_gettime(CLOCK_REALTIME, &tp0); + const ticks tic = getticks(); + + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f4_f(r, cii, parts_recv, 0, + &pack_length_unpack, tid, + pack_vars->count_max_parts, e); + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + /* Record things for debugging */ + cii->gpu_done_f++; + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + // MATTHIEU signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp1); + // *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; +} /*End of GPU work Self Gradient*/ + +void runner_dopair1_launch_f4_one_memcpy( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, + cudaEvent_t *pair_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + // pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = + fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate 
for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - + fparti_fpartj_lparti_lpartj_dens[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - + fparti_fpartj_lparti_lpartj_dens[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + // if(count_i > 100 || count_j > 100) + // error("Sending data for excessive n parts %i %i", + // count_i, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_send), + cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code first_part %i bundle size %i", + first_part_tmp_i, bundle_n_parts); + } +#endif + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_density_gpu_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, + numBlocks_y, bundle_part_0, bundle_n_parts); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + error("Something's up with kernel launch."); + } +#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0; + + for (int bid = 0; bid < nBundles_temp; bid++) { + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + + // cudaStreamSynchronize(stream[bid]); + 
cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + //////////// + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + +// for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { +// +// if (tid < tasks_packed) { +// clock_gettime(CLOCK_REALTIME, &tp0); +// /*grab cell and task pointers*/ +// struct cell *cii = pack_vars->ci_list[tid]; +// struct cell *cjj = pack_vars->cj_list[tid]; +// struct task *tii = pack_vars->task_list[tid]; +// +//// if(!pack_vars->task_locked){ +//// /*Let's lock ci*/ +//// while (cell_locktree(cii)) { +//// ; /* spin until we acquire the lock */ +//// } +//// /*Let's lock cj*/ +//// while (cell_locktree(cjj)) { +//// ; /* spin until we acquire the lock */ +//// } +//// pack_vars->task_locked = 1; +//// } +// +// const ticks tic = getticks(); +// +// /* Do the copy */ +// runner_do_ci_cj_gpu_unpack_neat_aos_f4( +// r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, +// 2 * pack_vars->count_max_parts, e); +// +// const ticks toc = getticks(); +// +// total_cpu_unpack_ticks += toc - tic; +// +// /* Record things for debugging */ +// cii->gpu_done_pair++; +// cjj->gpu_done_pair++; +// +//// if(pack_vars->task_locked){ +//// /* Release the locks */ +//// cell_unlocktree(cii); +//// /* Release the locks */ +//// cell_unlocktree(cjj); +// pack_vars->task_locked = 0; +//// } +// +// /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp1); +// *unpack_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; +// /*Signal sleeping runners*/ +// // MATTHIEU signal_sleeping_runners(s, tii); +// +// tii->gpu_done = 1; +// } +// } + } + + /* Zero counters for the next pack operations */ +// pack_vars->count_parts = 0; +// pack_vars->tasks_packed = 0; + + // /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Write the timers back to the task */ + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + +} /*End of GPU work*/ + +void runner_dopair1_unpack_f4( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, + cudaEvent_t *pair_end, int cstart, int n_leaves_found){ + + int topid; + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0; + /*Loop over top level tasks*/ + for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { + const ticks tic = getticks(); + /* Loop through each daughter task */ + int n_leaves_in_task = pack_vars->leaf_list[topid].n_packed; + int nstart = pack_vars->leaf_list[topid].n_start; + for(int tid = nstart; tid < n_leaves_in_task + nstart; tid++){ + /*Get pointers to the leaf cells. 
SEEMS I'm NOT GETTING A CORRECT POINTER + *but likely due to incorrect book keeping*/ + struct cell * cii_l = pack_vars->leaf_list[topid].ci[tid]; + struct cell * cjj_l = pack_vars->leaf_list[topid].cj[tid]; + message("loc %f %f %f topid %i tid %i nleaves %i", pack_vars->leaf_list[topid].ci[tid]->loc[0] + , pack_vars->leaf_list[topid].ci[tid]->loc[1] + , pack_vars->leaf_list[topid].ci[tid]->loc[2] + , topid, tid, n_leaves_in_task); +// if(*cii_l == NULL || *cjj_l == NULL)error("stop"); + runner_do_ci_cj_gpu_unpack_neat_aos_f4( + r, cii_l, cjj_l, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + } + + const ticks toc = getticks(); + total_cpu_unpack_ticks += toc - tic; + pack_vars->count_parts = 0; + /*For some reason the code fails if we get a leaf pair task + *this if statement stops the code from trying to unlock same cells twice*/ + if(topid == pack_vars->top_tasks_packed -1 && cstart != n_leaves_found) + continue; + enqueue_dependencies(s, pack_vars->top_task_list[topid]); + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + } +} +void runner_dopair1_launch_f4_g_one_memcpy( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, + struct part_aos_f4_g_send *d_parts_send, + struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, + cudaEvent_t *pair_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was + * incremented in runner_dopair1_pack*/ + // const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + // pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = + fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + // int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; 
tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z - + fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w - + fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_g_send), + cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + + // const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; + // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", + // bundle_part_0, bundle_first_task); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_gradient_gpu_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, + numBlocks_y, bundle_part_0, bundle_n_parts); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + exit(0); + } +#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_g_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + + ticks total_cpu_unpack_ticks = 0.; + + for (int bid = 0; bid < nBundles_temp; bid++) { + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + 
+ clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + clock_gettime(CLOCK_REALTIME, &tp0); + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + + const ticks tic = getticks(); + + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( + r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + + /* Record things for debugging */ + cii->gpu_done_pair_g++; + cjj->gpu_done_pair_g++; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + // MATTHIEU signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + + /* Write the timers back to the task */ + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + // /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ + +void runner_dopair1_launch_f4_f_one_memcpy( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, + struct part_aos_f4_f_send *d_parts_send, + struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, + cudaEvent_t *pair_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was + * incremented in runner_dopair1_pack*/ + // const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + // 
pack_vars->task_first_part[packed_tmp - 2];
+    pack_vars->bundle_first_part[nBundles_temp] =
+        fparti_fpartj_lparti_lpartj[tasks_packed - 1].x;
+  }
+  /* Identify the last particle for each bundle of tasks */
+  for (int bid = 0; bid < nBundles_temp - 1; bid++) {
+    pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1];
+  }
+  /* Special treatment for the last bundle */
+  if (nBundles_temp > 1)
+    pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts;
+  else
+    pack_vars->bundle_last_part[0] = pack_vars->count_parts;
+
+  /* Launch the copies for each bundle and run the GPU kernel */
+  /* We do not enter this loop if tasks_left_self == 1, as nBundles_temp
+   * will be zero in that case. */
+  // int max_parts = 0;
+  for (int bid = 0; bid < nBundles_temp; bid++) {
+
+    int max_parts_i = 0;
+    int max_parts_j = 0;
+    int parts_in_bundle_ci = 0;
+    int parts_in_bundle_cj = 0;
+    // const int first_task = bid * pack_vars->bundle_size;
+    // int last_task = (bid + 1) * bundle_size;
+    for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+      if (tid < tasks_packed) {
+        /* Get an estimate for the max number of parts per cell in each
+         * bundle. Used for determining the number of GPU CUDA blocks. */
+        int count_i = fparti_fpartj_lparti_lpartj[tid].z -
+                      fparti_fpartj_lparti_lpartj[tid].x;
+        parts_in_bundle_ci += count_i;
+        max_parts_i = max(max_parts_i, count_i);
+        int count_j = fparti_fpartj_lparti_lpartj[tid].w -
+                      fparti_fpartj_lparti_lpartj[tid].y;
+        parts_in_bundle_cj += count_j;
+        max_parts_j = max(max_parts_j, count_j);
+
+        // last_task = tid;
+      }
+    }
+    const int first_part_tmp_i = pack_vars->bundle_first_part[bid];
+    const int bundle_n_parts =
+        pack_vars->bundle_last_part[bid] - first_part_tmp_i;
+
+    cudaMemcpyAsync(&d_parts_send[first_part_tmp_i],
+                    &parts_send[first_part_tmp_i],
+                    bundle_n_parts * sizeof(struct part_aos_f4_f_send),
+                    cudaMemcpyHostToDevice, stream[bid]);
+
+#ifdef CUDA_DEBUG
+    cudaError_t cu_error = cudaPeekAtLastError(); /* Get error code */
+    if (cu_error != cudaSuccess) {
+      fprintf(stderr,
+              "CUDA error with pair force H2D async memcpy ci: %s cpuid id "
+              "is: %i\n ",
+              cudaGetErrorString(cu_error), r->cpuid);
+      error("CUDA error in pair force H2D async memcpy");
+    }
+#endif
+
+    // const int tasksperbundle = pack_vars->tasksperbundle;
+    /* LAUNCH THE GPU KERNELS for ci & cj */
+    // int tid = 0;
+    // int offset = bid * tasksperbundle;
+    // int tasks_left = tasksperbundle;
+    // if (bid == nBundles_temp - 1) {
+    //   tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle;
+    // }
+
+    // Setup a 2d grid of GPU thread blocks for ci (the number of tasks is
+    // the y dimension and max_parts is the x dimension)
+    int numBlocks_y = 0;  // tasks_left;
+    int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    int bundle_part_0 = pack_vars->bundle_first_part[bid];
+    // int bundle_first_task = pack_vars->bundle_first_task_list[bid];
+    // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n",
+    //         bundle_part_0, bundle_first_task);
+
+    /* Launch the kernel for ci using data for ci and cj */
+    runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H,
+                                          stream[bid], numBlocks_x,
+                                          numBlocks_y, bundle_part_0,
+                                          bundle_n_parts);
+
+#ifdef CUDA_DEBUG
+    cu_error = cudaPeekAtLastError(); /* Get error code */
+    if (cu_error != cudaSuccess) {
+      fprintf(
+          stderr,
+          "CUDA error with pair force kernel launch: %s cpuid id is: %i\n "
+          "nbx %i nby %i max_parts_i %i max_parts_j %i\n",
+
cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + exit(0); + } +#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_f_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; + for (int bid = 0; bid < nBundles_temp; bid++) { + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + clock_gettime(CLOCK_REALTIME, &tp0); + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + + const ticks tic = getticks(); + + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( + r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + + /* Record things for debugging */ + cii->gpu_done_pair_f++; + cjj->gpu_done_pair_f++; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + // /* Release the locks */ + cell_unlocktree(cii); + // /* Release the locks */ + cell_unlocktree(cjj); + + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + // MATTHIEU signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + + /* Write the timers back to the task */ + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + // /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ diff --git 
a/src/runner_gpu_pack_functions.c b/src/runner_gpu_pack_functions.c new file mode 100644 index 0000000000..af743e6172 --- /dev/null +++ b/src/runner_gpu_pack_functions.c @@ -0,0 +1,813 @@ +// #include "active.h" +// #include +// #include +// #include "cuda/cell_gpu.h" +// #include "runner_gpu_functions.cuh" +/* This object's header. */ +#include "runner.h" +/* Local headers. */ +#include "active.h" +#include "engine.h" +#include "runner_gpu_pack_functions.h" +#include "scheduler.h" +#include "space_getsid.h" +#include "timers.h" +#include "runner_doiact_hydro.h" + +void runner_doself1_gpu_pack_neat_aos_f4( + struct runner *r, struct cell *__restrict__ c, + struct part_aos_f4_send *__restrict__ parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! count_max %i " + "count %i\n", + count_max_parts_tmp, local_pack_position + count); + error("0"); + } +#endif + int2 frst_lst_prts = {local_pack_position, local_pack_position + count}; + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_f4(c, parts_aos_buffer, tid, local_pack_position, count, + frst_lst_prts); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_doself1_gpu_pack_neat_aos_f4_g( + struct runner *r, struct cell *c, + struct part_aos_f4_g_send *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); + exit(0); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_f4_g(c, parts_aos_buffer, tid, local_pack_position, count); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_doself1_gpu_pack_neat_aos_f4_f( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. 
Make arrays bigger!\n"); + exit(0); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_f4_f(c, parts_aos_buffer, tid, local_pack_position, count); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) TIMER_TOC(timer_doself_gpu_pack); +} + +extern inline void pack_neat_pair_aos_f4( + struct cell *__restrict c, + struct part_aos_f4_send *__restrict parts_aos_buffer, int tid, + const int local_pack_position, const int count, const float3 shift, + const int2 cstarts) { + /*Data to be copied to GPU*/ + for (int i = 0; i < count; i++) { + const int id_in_pack = i + local_pack_position; + parts_aos_buffer[id_in_pack].x_p_h.x = c->hydro.parts[i].x[0] - shift.x; + parts_aos_buffer[id_in_pack].x_p_h.y = c->hydro.parts[i].x[1] - shift.y; + parts_aos_buffer[id_in_pack].x_p_h.z = c->hydro.parts[i].x[2] - shift.z; + parts_aos_buffer[id_in_pack].x_p_h.w = c->hydro.parts[i].h; + parts_aos_buffer[id_in_pack].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos_buffer[id_in_pack].ux_m.y = c->hydro.parts[i].v[1]; + parts_aos_buffer[id_in_pack].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos_buffer[id_in_pack].ux_m.w = c->hydro.parts[i].mass; + parts_aos_buffer[id_in_pack].cjs_cje.x = cstarts.x; + parts_aos_buffer[id_in_pack].cjs_cje.y = cstarts.y; + } +} + +void pack_neat_aos_f4(struct cell *__restrict__ c, + struct part_aos_f4_send *__restrict__ parts_aos_buffer, + int tid, int local_pack_position, int count, + int2 frst_lst_prts) { + + struct part ptmps[count]; + memcpy(ptmps, (c->hydro.parts), count * sizeof(struct part)); + // ptmps = c->hydro.parts; + const float cellx = c->loc[0], celly = c->loc[1], cellz = c->loc[2]; + for (int i = 0; i < count; i++) { + const int id_in_pack = i + local_pack_position; + // const struct part p = ptmps[i]; + /*Data to be copied to GPU*/ + parts_aos_buffer[id_in_pack].x_p_h.x = ptmps[i].x[0] - cellx; + parts_aos_buffer[id_in_pack].x_p_h.y = ptmps[i].x[1] - celly; + parts_aos_buffer[id_in_pack].x_p_h.z = ptmps[i].x[2] - cellz; + parts_aos_buffer[id_in_pack].x_p_h.w = ptmps[i].h; + parts_aos_buffer[id_in_pack].ux_m.x = ptmps[i].v[0]; + parts_aos_buffer[id_in_pack].ux_m.y = ptmps[i].v[1]; + parts_aos_buffer[id_in_pack].ux_m.z = ptmps[i].v[2]; + parts_aos_buffer[id_in_pack].ux_m.w = ptmps[i].mass; + // /*Initialise sums to zero before CPU/GPU copy*/ + // const float4 zeroes = {0.0, 0.0, 0.0, 0.0}; + // parts_aos_buffer[id_in_pack].rho_dh_wcount = zeroes; + // parts_aos_buffer[id_in_pack].rot_ux_div_v = zeroes; + } +} + +void pack_neat_aos_f4_g(struct cell *c, + struct part_aos_f4_g_send *parts_aos_buffer, int tid, + int local_pack_position, int count) { + + const struct part *ptmps; + ptmps = c->hydro.parts; + const float cellx = c->loc[0], celly = c->loc[1], cellz = c->loc[2]; + for (int i = 0; i < count; i++) { + int id_in_pack = i + local_pack_position; + const struct part p = ptmps[i]; + /*Data to be copied to GPU*/ + parts_aos_buffer[id_in_pack].x_h.x = p.x[0] - cellx; + parts_aos_buffer[id_in_pack].x_h.y = p.x[1] - celly; + parts_aos_buffer[id_in_pack].x_h.z = p.x[2] - cellz; + parts_aos_buffer[id_in_pack].x_h.w = p.h; + parts_aos_buffer[id_in_pack].ux_m.x = p.v[0]; + parts_aos_buffer[id_in_pack].ux_m.y = p.v[1]; + parts_aos_buffer[id_in_pack].ux_m.z = p.v[2]; + parts_aos_buffer[id_in_pack].ux_m.w = p.mass; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.x = p.rho; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.y = p.viscosity.alpha; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.z = p.u; // p.density.rot_v[0]; + 
parts_aos_buffer[id_in_pack].rho_avisc_u_c.w = + p.force.soundspeed; // p.density.rot_v[0]; + } +} + +extern inline void pack_neat_pair_aos_f4_g( + struct cell *__restrict c, + struct part_aos_f4_g_send *__restrict parts_aos_buffer, int tid, + const int local_pack_position, const int count, const float3 shift, + const int2 cstarts) { + /*Data to be copied to GPU*/ + for (int i = 0; i < count; i++) { + const int id_in_pack = i + local_pack_position; + parts_aos_buffer[id_in_pack].x_h.x = c->hydro.parts[i].x[0] - shift.x; + parts_aos_buffer[id_in_pack].x_h.y = c->hydro.parts[i].x[1] - shift.y; + parts_aos_buffer[id_in_pack].x_h.z = c->hydro.parts[i].x[2] - shift.z; + parts_aos_buffer[id_in_pack].x_h.w = c->hydro.parts[i].h; + parts_aos_buffer[id_in_pack].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos_buffer[id_in_pack].ux_m.y = c->hydro.parts[i].v[1]; + parts_aos_buffer[id_in_pack].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos_buffer[id_in_pack].ux_m.w = c->hydro.parts[i].mass; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.x = c->hydro.parts[i].rho; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.y = + c->hydro.parts[i].viscosity.alpha; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.z = + c->hydro.parts[i].u; // p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.w = + c->hydro.parts[i].force.soundspeed; // p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].cjs_cje.x = cstarts.x; + parts_aos_buffer[id_in_pack].cjs_cje.y = cstarts.y; + } +} + +void pack_neat_aos_f4_f(const struct cell *restrict c, + struct part_aos_f4_f_send *restrict parts_aos, int tid, + int local_pack_position, int count) { + + // const struct part *restrict ptmps; + // ptmps = c->hydro.parts; + const int pp = local_pack_position; + const float cellx = c->loc[0]; + const float celly = c->loc[1]; + const float cellz = c->loc[2]; + /*Data to be copied to GPU local memory*/ + for (int i = 0; i < count; i++) { + parts_aos[i + pp].x_h.x = c->hydro.parts[i].x[0] - cellx; + parts_aos[i + pp].x_h.y = c->hydro.parts[i].x[1] - celly; + parts_aos[i + pp].x_h.z = c->hydro.parts[i].x[2] - cellz; + parts_aos[i + pp].x_h.w = c->hydro.parts[i].h; + } + for (int i = 0; i < count; i++) { + parts_aos[i + pp].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos[i + pp].ux_m.y = c->hydro.parts[i].v[1]; + parts_aos[i + pp].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos[i + pp].ux_m.w = c->hydro.parts[i].mass; + } + for (int i = 0; i < count; i++) { + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.x = + c->hydro.parts[i].force.f; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.y = + c->hydro.parts[i].force.balsara; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.z = + c->hydro.parts[i].time_bin; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.w = + c->hydro.parts[i].limiter_data.min_ngb_time_bin; + } + for (int i = 0; i < count; i++) { + parts_aos[i + pp].rho_p_c_vsigi.x = c->hydro.parts[i].rho; + parts_aos[i + pp].rho_p_c_vsigi.y = c->hydro.parts[i].force.pressure; + parts_aos[i + pp].rho_p_c_vsigi.z = c->hydro.parts[i].force.soundspeed; + parts_aos[i + pp].rho_p_c_vsigi.w = c->hydro.parts[i].viscosity.v_sig; + } + for (int i = 0; i < count; i++) { + parts_aos[i + pp].u_alphavisc_alphadiff.x = c->hydro.parts[i].u; + parts_aos[i + pp].u_alphavisc_alphadiff.y = + c->hydro.parts[i].viscosity.alpha; + parts_aos[i + pp].u_alphavisc_alphadiff.z = + c->hydro.parts[i].diffusion.alpha; + } +} + +extern inline void pack_neat_pair_aos_f4_f( + struct cell *__restrict c, struct part_aos_f4_f_send *__restrict parts_aos, + int tid, const int 
local_pack_position, const int count, const float3 shift, + const int2 cstarts) { + // const struct part *restrict ptmps; + // ptmps = c->hydro.parts; + const int pp = local_pack_position; + /*Data to be copied to GPU local memory*/ + for (int i = 0; i < count; i++) { + const int id = i + pp; + parts_aos[id].x_h.x = c->hydro.parts[i].x[0] - shift.x; + parts_aos[id].x_h.y = c->hydro.parts[i].x[1] - shift.y; + parts_aos[id].x_h.z = c->hydro.parts[i].x[2] - shift.z; + parts_aos[id].x_h.w = c->hydro.parts[i].h; + parts_aos[id].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos[id].ux_m.y = c->hydro.parts[i].v[1]; + parts_aos[id].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos[id].ux_m.w = c->hydro.parts[i].mass; + parts_aos[id].f_bals_timebin_mintimebin_ngb.x = c->hydro.parts[i].force.f; + parts_aos[id].f_bals_timebin_mintimebin_ngb.y = + c->hydro.parts[i].force.balsara; + parts_aos[id].f_bals_timebin_mintimebin_ngb.z = c->hydro.parts[i].time_bin; + parts_aos[id].f_bals_timebin_mintimebin_ngb.w = + c->hydro.parts[i].limiter_data.min_ngb_time_bin; + parts_aos[id].rho_p_c_vsigi.x = c->hydro.parts[i].rho; + parts_aos[id].rho_p_c_vsigi.y = c->hydro.parts[i].force.pressure; + parts_aos[id].rho_p_c_vsigi.z = c->hydro.parts[i].force.soundspeed; + parts_aos[id].rho_p_c_vsigi.w = c->hydro.parts[i].viscosity.v_sig; + parts_aos[id].u_alphavisc_alphadiff.x = c->hydro.parts[i].u; + parts_aos[id].u_alphavisc_alphadiff.y = c->hydro.parts[i].viscosity.alpha; + parts_aos[id].u_alphavisc_alphadiff.z = c->hydro.parts[i].diffusion.alpha; + parts_aos[id].cjs_cje.x = cstarts.x; + parts_aos[id].cjs_cje.y = cstarts.y; + } +} + +void runner_doself1_gpu_unpack_neat_aos_f4( + struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, + struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat_aos_f4(c, parts_aos_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + +void runner_doself1_gpu_unpack_neat_aos_f4_g( + struct runner *r, struct cell *c, + struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat_aos_f4_g(c, parts_aos_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + +void runner_doself1_gpu_unpack_neat_aos_f4_f( + struct runner *r, struct cell *c, + struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat_aos_f4_f(c, parts_aos_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + +#include +void unpack_neat_aos_f4(struct cell *c, + struct part_aos_f4_recv *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e) { + + struct part_aos_f4_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + + struct part_aos_f4_recv p_tmp = parts_tmp[i]; + float4 rho_dh_wcount = p_tmp.rho_dh_wcount; + float4 rot_ux_div_v = p_tmp.rot_ux_div_v; + struct part *p = &c->hydro.parts[i]; + if(!PART_IS_ACTIVE(p, e))continue; + p->rho += rho_dh_wcount.x; + p->density.rho_dh += rho_dh_wcount.y; + p->density.wcount += rho_dh_wcount.z; + p->density.wcount_dh += rho_dh_wcount.w; + p->density.rot_v[0] += rot_ux_div_v.x; + p->density.rot_v[1] += rot_ux_div_v.y; + p->density.rot_v[2] += rot_ux_div_v.z; + p->viscosity.div_v += rot_ux_div_v.w; + } +} + +void unpack_neat_aos_f4_g(struct cell *c, + struct part_aos_f4_g_recv *parts_aos_buffer, int tid, + int local_pack_position, int count, + struct engine *e) { + + struct part_aos_f4_g_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_f4_g_recv p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + if(!PART_IS_ACTIVE(p, e))continue; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = fmaxf(p_tmp.vsig_lapu_aviscmax.x, v_sig); + p->diffusion.laplace_u += p_tmp.vsig_lapu_aviscmax.y; + const float max_ngb = p->force.alpha_visc_max_ngb; + p->force.alpha_visc_max_ngb = fmaxf(p_tmp.vsig_lapu_aviscmax.z, max_ngb); + } +} + +void unpack_neat_aos_f4_f(struct cell *restrict c, + struct part_aos_f4_f_recv *restrict parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e) { + int pp = local_pack_position; + for (int i = 0; i < count; i++) { + if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue; + c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[i + pp].a_hydro.x; + c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[i + pp].a_hydro.y; + c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[i + pp].a_hydro.z; + } + for (int i = 0; i < count; i++) { + if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue; + c->hydro.parts[i].viscosity.v_sig = + 
fmaxf(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.z, + c->hydro.parts[i].viscosity.v_sig); + c->hydro.parts[i].limiter_data.min_ngb_time_bin = + (int)(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.w + 0.5f); + } + for (int i = 0; i < count; i++) { + if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue; + c->hydro.parts[i].u_dt += + parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.x; + c->hydro.parts[i].force.h_dt += + parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.y; + } +} + +void unpack_neat_pair_aos_f4(struct runner *r, struct cell *restrict c, + struct part_aos_f4_recv *restrict parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e) { + + // struct part_aos_f4_recv * restrict parts_tmp = + // &parts_aos_buffer[local_pack_position]; + if (cell_is_active_hydro(c, e)) { + int pp = local_pack_position; + for (int i = 0; i < count; i++) { + int j = i + pp; + c->hydro.parts[i].rho += parts_aos_buffer[j].rho_dh_wcount.x; + c->hydro.parts[i].density.rho_dh += parts_aos_buffer[j].rho_dh_wcount.y; + c->hydro.parts[i].density.wcount += parts_aos_buffer[j].rho_dh_wcount.z; + c->hydro.parts[i].density.wcount_dh += + parts_aos_buffer[j].rho_dh_wcount.w; + c->hydro.parts[i].density.rot_v[0] += parts_aos_buffer[j].rot_ux_div_v.x; + c->hydro.parts[i].density.rot_v[1] += parts_aos_buffer[j].rot_ux_div_v.y; + c->hydro.parts[i].density.rot_v[2] += parts_aos_buffer[j].rot_ux_div_v.z; + c->hydro.parts[i].viscosity.div_v += parts_aos_buffer[j].rot_ux_div_v.w; + } + } +} + +void unpack_neat_pair_aos_f4_g( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_g_recv *restrict parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e) { + // struct part_aos_f4_recv * restrict parts_tmp = + // &parts_aos_buffer[local_pack_position]; int pp = local_pack_position; for + // (int i = 0; i < count; i++) { + // int j = i + pp; + // c->hydro.parts[i].viscosity.v_sig = + // parts_aos_buffer[j].vsig_lapu_aviscmax.x; + // c->hydro.parts[i].diffusion.laplace_u += + // parts_aos_buffer[j].vsig_lapu_aviscmax.y; + // c->hydro.parts[i].force.alpha_visc_max_ngb = + // parts_aos_buffer[j].vsig_lapu_aviscmax.z; + // } + if (cell_is_active_hydro(c, e)) { + + struct part_aos_f4_g_recv *parts_tmp = + &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_f4_g_recv p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = fmaxf(p_tmp.vsig_lapu_aviscmax.x, v_sig); + p->diffusion.laplace_u += p_tmp.vsig_lapu_aviscmax.y; + const float max_ngb = p->force.alpha_visc_max_ngb; + p->force.alpha_visc_max_ngb = fmaxf(p_tmp.vsig_lapu_aviscmax.z, max_ngb); + } + } +} + +void unpack_neat_pair_aos_f4_f( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_f_recv *restrict parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e) { + // struct part_aos_f4_f_recv *restrict parts_tmp = + //&parts_aos_buffer[local_pack_position]; + if (cell_is_active_hydro(c, e)) { + int pp = local_pack_position; + for (int i = 0; i < count; i++) { + // struct part_aos_f4_f_recv p_tmp = parts_tmp[i]; + // struct part *restrict p = &c->hydro.parts[i]; + int j = i + pp; + c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[j].a_hydro.x; + c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[j].a_hydro.y; + c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[j].a_hydro.z; + c->hydro.parts[i].viscosity.v_sig = + 
fmaxf(parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.z, + c->hydro.parts[i].viscosity.v_sig); + c->hydro.parts[i].limiter_data.min_ngb_time_bin = + (int)(parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.w + 0.5f); + c->hydro.parts[i].u_dt += + parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.x; + c->hydro.parts[i].force.h_dt += + parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.y; + } + } +} + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { + + /* Anything to do here? */ +// if (ci->hydro.count == 0 || cj->hydro.count == 0) +// return; + if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) { + message("Inactive cell\n"); + return; + } + int count_ci = ci->hydro.count; + int count_cj = cj->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count_ci, e); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4(r, ci, parts_aos_buffer, tid, local_pack_position, + count_ci, e); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4(r, cj, parts_aos_buffer, tid, local_pack_position, + count_cj, e); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + // if(r->cpuid == 0)exit(0); +} + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { + + /* Anything to do here? */ + // if (c->hydro.count == 0) + // return; + if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) { + message("Inactive cell\n"); + return; + } + int count_ci = ci->hydro.count; + int count_cj = cj->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count_ci, e); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_g(r, ci, parts_aos_buffer, tid, local_pack_position, + count_ci, e); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_g(r, cj, parts_aos_buffer, tid, local_pack_position, + count_cj, e); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + // if(r->cpuid == 0)exit(0); +} + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { + + /* Anything to do here? */ + // if (c->hydro.count == 0) + // return; + if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) { + message("Inactive cell\n"); + return; + } + int count_ci = ci->hydro.count; + int count_cj = cj->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count_ci, e); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_f(r, ci, parts_aos_buffer, tid, local_pack_position, + count_ci, e); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_f(r, cj, parts_aos_buffer, tid, local_pack_position, + count_cj, e); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + // if(r->cpuid == 0)exit(0); +} + +void runner_do_ci_cj_gpu_pack_neat_aos_f4( + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) return; + + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
Pack pos %i" + "ci %i cj %i count_max %i\n", + local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], + shift_tmp.z + cj->loc[2]}; + const int lpp1 = local_pack_position; + + const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; + + const int2 cjs_cje = {local_pack_position + count_ci, + local_pack_position + count_ci + count_cj}; + + pack_neat_pair_aos_f4(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, + cjs_cje); + + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const int lpp2 = local_pack_position; + + pack_neat_pair_aos_f4(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, + cis_cie); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_do_ci_cj_gpu_pack_neat_aos_f4_g( + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_g_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) return; + + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" + "ci %i cj %i count_max %i\n", + local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], + shift_tmp.z + cj->loc[2]}; + const int lpp1 = local_pack_position; + + const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; + + const int2 cjs_cje = {local_pack_position + count_ci, + local_pack_position + count_ci + count_cj}; + + pack_neat_pair_aos_f4_g(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, + cjs_cje); + + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const int lpp2 = local_pack_position; + + pack_neat_pair_aos_f4_g(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, + cis_cie); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_do_ci_cj_gpu_pack_neat_aos_f4_f( + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) return; + + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
Pack pos %i" + "ci %i cj %i count_max %i\n", + local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], + shift_tmp.z + cj->loc[2]}; + const int lpp1 = local_pack_position; + + const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; + + const int2 cjs_cje = {local_pack_position + count_ci, + local_pack_position + count_ci + count_cj}; + + pack_neat_pair_aos_f4_f(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, + cjs_cje); + + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const int lpp2 = local_pack_position; + + pack_neat_pair_aos_f4_f(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, + cis_cie); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) TIMER_TOC(timer_doself_gpu_pack); +} +// #ifdef WITHCUDA +// } +// #endif diff --git a/src/runner_gpu_pack_functions.h b/src/runner_gpu_pack_functions.h new file mode 100644 index 0000000000..8730219711 --- /dev/null +++ b/src/runner_gpu_pack_functions.h @@ -0,0 +1,246 @@ +#include "cuda/part_gpu.h" +void runner_doself1_gpu_pack( + struct runner *r, struct cell *c, int timer, int *pack_length, double *x_p, + double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux, + float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, float *wcount_dh, + float *rho_dh, float *rot_u, float *rot_v, float *rot_w, float *div_v, + float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, float *soundspeed, + float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat(struct runner *r, struct cell *c, + struct part_soa parts_soa, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f4( + struct runner *r, struct cell *__restrict__ c, + struct part_aos_f4_send *__restrict__ parts_aos, int timer, + int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_g(struct runner *r, struct cell *c, + struct part_aos_g *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f4_g(struct runner *r, struct cell *c, + struct part_aos_f4_g_send *parts_aos, + int timer, int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f(struct runner *r, struct cell *c, + struct part_aos_f *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f4_f( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_f_send *restrict parts_aos, int timer, int *pack_length, + int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_forc_aos(struct runner *r, struct cell *c, + struct 
part_aos *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_grad_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_unpack_neat(struct runner *r, struct cell *c, + struct part_soa parts_soa, int timer, + int *pack_length, int tid, + int count_max_parts_tmp, struct engine *e); +void runner_doself1_gpu_unpack_neat_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f4( + struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, + struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_g(struct runner *r, struct cell *c, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f4_g( + struct runner *r, struct cell *c, + struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f(struct runner *r, struct cell *c, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f4_f( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_f_recv *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); +void pack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, + int *tid_p, long long *id, float *ux, float *uy, float *uz, + float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass, + float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, + float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, + float *rot_w, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, + float *pressure, float *alpha_visc_max_ngb, timebin_t *time_bin, + timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int local_pack_position, int count); +void pack_neat(struct cell *c, struct part_soa parts_soa, int tid, + int local_pack_position, int count); +void pack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, + int local_pack_position, int count); +void pack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, + int tid, int local_pack_position, int count); +void pack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos, int tid, + int local_pack_position, int count); +void pack_neat_aos_f4(struct cell *c, struct part_aos_f4_send *parts_aos_buffer, + int tid, int local_pack_position, int count, + int2 frst_lst_prts); +void pack_neat_aos_f4_g(struct cell *c, + struct part_aos_f4_g_send *parts_aos_buffer, int tid, + int local_pack_position, int count); +void pack_neat_aos_f4_f(const struct cell *restrict c, + struct part_aos_f4_f_send *restrict parts_aos, int tid, + int local_pack_position, int count); +void unpack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, + int local_pack_position, 
int count, struct engine *e); +void unpack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_f4(struct cell *c, + struct part_aos_f4_recv *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e); +void unpack_neat_aos_f4_g(struct cell *c, + struct part_aos_f4_g_recv *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e); +void unpack_neat_aos_f4_f(struct cell *restrict c, + struct part_aos_f4_f_recv *restrict parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e); +void unpack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, + int *tid_p, long long *id, float *ux, float *uy, float *uz, + float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass, + float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, + float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, + float *rot_w, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, + float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int local_pack_position, int count, + struct engine *e); +void runner_doself1_gpu_unpack( + struct runner *r, struct cell *c, int timer, int *pack_length, double *x_p, + double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux, + float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, float *wcount_dh, + float *rho_dh, float *rot_u, float *rot_v, float *rot_w, float *div_v, + float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, float *soundspeed, + float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_pack_neat(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_soa parts_soa_buffer, int timer, + int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj); + +void runner_do_ci_cj_gpu_pack_neat_aos(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_pack_neat_aos_f4( + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_pack_neat_aos_g(struct runner *r, struct cell 
*ci, + struct cell *cj, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj); + +void runner_do_ci_cj_gpu_pack_neat_aos_f4_g( + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_g_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_pack_neat_aos_f(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj); + +void runner_do_ci_cj_gpu_pack_neat_aos_f4_f( + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_unpack_neat(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_soa parts_soa_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_g(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e); diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu new file mode 100644 index 0000000000..2376aafba7 --- /dev/null +++ b/src/runner_main_clean.cu @@ -0,0 +1,1864 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * Matthieu Schaller (matthieu.schaller@durham.ac.uk) + * 2015 Peter W. Draper (p.w.draper@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + * + ******************************************************************************/ +/* Config parameters. */ +#define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU +#define GPUOFFLOAD_GRADIENT 1 // off-load hydro gradient to GPU +#define GPUOFFLOAD_FORCE 1 // off-load hydro force to GPU + +// #define DUMP_TIMINGS 1 +#include "../config.h" + +/* MPI headers. */ +#ifdef WITH_MPI +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Config parameters. */ +#include + +/* MPI headers. */ +#ifdef WITH_MPI +#include +#endif + +/* This object's header. */ +#include "runner.h" + +/* Local headers. */ +#include "engine.h" +#include "feedback.h" +#include "runner_doiact_sinks.h" +#include "scheduler.h" +#include "space_getsid.h" +#include "timers.h" + +/* Import the gravity loop functions. */ +#include "runner_doiact_grav.h" + +/* Import the density loop functions. */ +#define FUNCTION density +#define FUNCTION_TASK_LOOP TASK_LOOP_DENSITY +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" + +/* Import the gradient loop functions (if required). */ +#ifdef EXTRA_HYDRO_LOOP +#define FUNCTION gradient +#define FUNCTION_TASK_LOOP TASK_LOOP_GRADIENT +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" +#endif + +/* Import the force loop functions. */ +#define FUNCTION force +#define FUNCTION_TASK_LOOP TASK_LOOP_FORCE +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" + +/* Import the limiter loop functions. */ +#define FUNCTION limiter +#define FUNCTION_TASK_LOOP TASK_LOOP_LIMITER +#include "runner_doiact_limiter.h" +#include "runner_doiact_undef.h" + +/* Import the stars density loop functions. */ +#define FUNCTION density +#define FUNCTION_TASK_LOOP TASK_LOOP_DENSITY +#include "runner_doiact_stars.h" +#include "runner_doiact_undef.h" + +#ifdef EXTRA_STAR_LOOPS + +/* Import the stars prepare1 loop functions. */ +#define FUNCTION prep1 +#define FUNCTION_TASK_LOOP TASK_LOOP_STARS_PREP1 +#include "runner_doiact_stars.h" +#include "runner_doiact_undef.h" + +/* Import the stars prepare2 loop functions. */ +#define FUNCTION prep2 +#define FUNCTION_TASK_LOOP TASK_LOOP_STARS_PREP2 +#include "runner_doiact_stars.h" +#include "runner_doiact_undef.h" + +#endif /* EXTRA_STAR_LOOPS */ + +/* Import the stars feedback loop functions. */ +#define FUNCTION feedback +#define FUNCTION_TASK_LOOP TASK_LOOP_FEEDBACK +#include "runner_doiact_stars.h" +#include "runner_doiact_undef.h" + +/* Import the black hole density loop functions. */ +#define FUNCTION density +#define FUNCTION_TASK_LOOP TASK_LOOP_DENSITY +#include "runner_doiact_black_holes.h" +#include "runner_doiact_undef.h" + +/* Import the black hole feedback loop functions. */ +#define FUNCTION swallow +#define FUNCTION_TASK_LOOP TASK_LOOP_SWALLOW +#include "runner_doiact_black_holes.h" +#include "runner_doiact_undef.h" + +/* Import the black hole feedback loop functions. */ +#define FUNCTION feedback +#define FUNCTION_TASK_LOOP TASK_LOOP_FEEDBACK +#include "runner_doiact_black_holes.h" +#include "runner_doiact_undef.h" + +/* Import the RT gradient loop functions */ +#define FUNCTION rt_gradient +#define FUNCTION_TASK_LOOP TASK_LOOP_RT_GRADIENT +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" + +/* Import the RT transport (force) loop functions. 
*/ +#define FUNCTION rt_transport +#define FUNCTION_TASK_LOOP TASK_LOOP_RT_TRANSPORT +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" + +#ifdef __cplusplus +} +#endif +/** + * @brief The #runner main thread routine. + * + * @param data A pointer to this thread's data. + **/ + +/* CUDA Header. Wrap in extern "C" to prevent C++ function name mangling */ +#ifdef WITH_CUDA +#ifdef __cplusplus +extern "C" { +#endif + +#include "cuda/part_gpu.h" +#include +#include +#include +#include "runner_doiact_functions_hydro_gpu.h" +#include "runner_gpu_pack_functions.h" +#include "cuda/GPU_runner_functions.h" + +#ifdef __cplusplus +} +#endif + +void *runner_main2(void *data) { + struct runner *r = (struct runner *)data; + struct engine *e = r->e; + struct scheduler *sched = &e->sched; + struct space *space = e->s; + + //////////Declare and allocate GPU launch control data structures///////// + /*pack_vars contain data required for self and pair packing tasks destined + * for the GPU*/ + //A. N: Needed + struct pack_vars_self *pack_vars_self_dens; + struct pack_vars_self *pack_vars_self_forc; + struct pack_vars_self *pack_vars_self_grad; + struct pack_vars_pair *pack_vars_pair_dens; + struct pack_vars_pair *pack_vars_pair_forc; + struct pack_vars_pair *pack_vars_pair_grad; + + cudaMallocHost((void **)&pack_vars_self_dens, + sizeof(struct pack_vars_self *)); + cudaMallocHost((void **)&pack_vars_self_forc, + sizeof(struct pack_vars_self *)); + cudaMallocHost((void **)&pack_vars_self_grad, + sizeof(struct pack_vars_self *)); + + cudaMallocHost((void **)&pack_vars_pair_dens, + sizeof(struct pack_vars_pair *)); + cudaMallocHost((void **)&pack_vars_pair_forc, + sizeof(struct pack_vars_pair *)); + cudaMallocHost((void **)&pack_vars_pair_grad, + sizeof(struct pack_vars_pair *)); + /////////////////////////////////////////////////////////////////////////// + /*Find and print GPU name(s)*/ + int devId = 0; //gpu device name + struct cudaDeviceProp prop; + int nDevices; + int maxBlocksSM; + int nSMs; + /*Get my rank*/ + int mpi_rank = 0; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); +#endif + cudaGetDeviceCount(&nDevices); + //If running on MPI we set code to use one MPI rank per GPU + //This was found to work very well and simplifies writing slurm scipts + if (nDevices == 1) cudaSetDevice(devId); +#ifdef WITH_MPI + else { + cudaSetDevice(mpi_rank); + devId = mpi_rank; + } +#endif + //Now tell me some info about my device + cudaGetDeviceProperties(&prop, devId); + cudaDeviceGetAttribute(&maxBlocksSM, cudaDevAttrMaxBlocksPerMultiprocessor, + devId); + cudaDeviceGetAttribute(&nSMs, cudaDevAttrMultiProcessorCount, devId); + int nPartsPerCell = space->nr_parts / space->tot_cells; + + if (r->cpuid == 0 && mpi_rank == 0) { + message("%i devices available device id is %i\n", nDevices, devId); + message("Device : %s\n", prop.name); + message("nSMs %i max blocks per SM %i maxnBlocks per stream %i\n", + nSMs, maxBlocksSM, nSMs * maxBlocksSM); + message("Target nBlocks per kernel is %i\n", + N_TASKS_BUNDLE_SELF * nPartsPerCell / BLOCK_SIZE); + message("Target nBlocks per stream is %i\n", + N_TASKS_PER_PACK_SELF * nPartsPerCell / BLOCK_SIZE); + } + + cudaError_t cu_error; + size_t free_mem, total_mem; + cudaMemGetInfo(&free_mem, &total_mem); + + message("free mem %lu, total mem %lu", free_mem, total_mem); + // how many tasks do we want for each launch of GPU kernel + const int target_n_tasks = sched->pack_size; + const int target_n_tasks_pair = sched->pack_size_pair; + 
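/* Editor's note (illustrative sketch, not part of the patch): the
 * cudaMallocHost calls for the pack_vars structures above pass
 * sizeof(struct pack_vars_self *) / sizeof(struct pack_vars_pair *), i.e. the
 * size of a pointer; allocating sizeof(struct pack_vars_self) and
 * sizeof(struct pack_vars_pair) is presumably what is intended, since the
 * struct members are written through these pointers immediately below. A
 * minimal checked pinned-allocation helper of the kind that could be used
 * here is sketched below; the helper name and error handling are the
 * editor's, not SWIFT's. */

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Allocate `bytes` of pinned (page-locked) host memory and abort on failure,
 * printing which allocation failed and why. */
static void *sketch_pinned_alloc(size_t bytes, const char *what) {
  void *ptr = NULL;
  const cudaError_t err = cudaMallocHost(&ptr, bytes);
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaMallocHost(%s, %zu bytes) failed: %s\n", what, bytes,
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
  return ptr;
}

/* Example use, mirroring the allocations above:
 *   struct pack_vars_self *pv =
 *       sketch_pinned_alloc(sizeof(struct pack_vars_self), "pack_vars_self");
 * Pinned memory is what allows the later cudaMemcpyAsync calls to overlap
 * host-device transfers with kernel execution in different streams. */
/* End of editor's sketch. */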
pack_vars_self_dens->target_n_tasks = target_n_tasks; + pack_vars_pair_dens->target_n_tasks = target_n_tasks_pair; + pack_vars_self_forc->target_n_tasks = target_n_tasks; + pack_vars_pair_forc->target_n_tasks = target_n_tasks_pair; + pack_vars_self_grad->target_n_tasks = target_n_tasks; + pack_vars_pair_grad->target_n_tasks = target_n_tasks_pair; + // how many tasks we want in each bundle (used for launching kernels in + // different streams) + const int bundle_size = N_TASKS_BUNDLE_SELF; + const int bundle_size_pair = N_TASKS_BUNDLE_PAIR; + pack_vars_self_dens->bundle_size = bundle_size; + pack_vars_pair_dens->bundle_size = bundle_size_pair; + pack_vars_self_forc->bundle_size = bundle_size; + pack_vars_pair_forc->bundle_size = bundle_size_pair; + pack_vars_self_grad->bundle_size = bundle_size; + pack_vars_pair_grad->bundle_size = bundle_size_pair; + // Keep track of first and last particles for each task (particle data is + // arranged in long arrays containing particles from all the tasks we will + // work with) + /* A. N.: Needed for offloading self tasks as we use these to sort through + * which parts need to interact with which */ + int2 *task_first_part_f4; + int2 *task_first_part_f4_f; + int2 *task_first_part_f4_g; + int2 *d_task_first_part_f4; + int2 *d_task_first_part_f4_f; + int2 *d_task_first_part_f4_g; + cudaMallocHost((void **)&task_first_part_f4, target_n_tasks * sizeof(int2)); + cudaMalloc((void **)&d_task_first_part_f4, target_n_tasks * sizeof(int2)); + cudaMallocHost((void **)&task_first_part_f4_f, target_n_tasks * sizeof(int2)); + cudaMalloc((void **)&d_task_first_part_f4_f, target_n_tasks * sizeof(int2)); + cudaMallocHost((void **)&task_first_part_f4_g, target_n_tasks * sizeof(int2)); + cudaMalloc((void **)&d_task_first_part_f4_g, target_n_tasks * sizeof(int2)); + + /*A. N.: Needed but only for small part in launch functions. Might + be useful for recursion on the GPU so keep for now */ + int4 *fparti_fpartj_lparti_lpartj_dens; + int4 *fparti_fpartj_lparti_lpartj_forc; + int4 *fparti_fpartj_lparti_lpartj_grad; + cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_dens, + target_n_tasks * sizeof(int4)); + cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_forc, + target_n_tasks * sizeof(int4)); + cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_grad, + target_n_tasks * sizeof(int4)); + + /* nBundles is the number of task bundles each + thread has ==> Used to loop through bundles */ + int nBundles = (target_n_tasks + bundle_size - 1) / bundle_size; + int nBundles_pair = + (target_n_tasks_pair + bundle_size_pair - 1) / bundle_size_pair; + + if (r->cpuid == 0) { + fprintf(stderr, "engine_rank %i cpuid %i nBundles/nStreams %i\n", + engine_rank, r->cpuid, nBundles); + fprintf(stderr, "nBundles/nStreams Pair %i\n", nBundles_pair); + } + + pack_vars_self_dens->nBundles = nBundles; + pack_vars_pair_dens->nBundles = nBundles_pair; + pack_vars_self_forc->nBundles = nBundles; + pack_vars_pair_forc->nBundles = nBundles_pair; + pack_vars_self_grad->nBundles = nBundles; + pack_vars_pair_grad->nBundles = nBundles_pair; + + // first part and last part are the first and last particle ids (locally + // within this thread). A. 
Nasar: All these are used in GPU offload setup + + cudaMallocHost((void **)&pack_vars_self_dens->bundle_first_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_dens->bundle_last_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_dens->bundle_first_task_list, + nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_pair_dens->bundle_first_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_dens->bundle_last_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_dens->bundle_first_task_list, + 2 * nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_self_forc->bundle_first_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_forc->bundle_last_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_forc->bundle_first_task_list, + nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_pair_forc->bundle_first_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_forc->bundle_last_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_forc->bundle_first_task_list, + 2 * nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_self_grad->bundle_first_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_grad->bundle_last_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_grad->bundle_first_task_list, + nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_grad->bundle_last_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_task_list, + 2 * nBundles * sizeof(int)); + + /*Create streams so that we can off-load different batches of work in + * different streams and get some con-CURRENCY! 
Events used to maximise + * asynchrony further*/ + + cudaStream_t stream[nBundles]; + cudaStream_t stream_pairs[nBundles_pair]; + + cudaEvent_t self_end[nBundles]; + for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end[i]); + cudaEvent_t self_end_g[nBundles]; + for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end_g[i]); + cudaEvent_t self_end_f[nBundles]; + for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end_f[i]); + + cudaEvent_t pair_end[nBundles]; + for (int i = 0; i < nBundles; i++) cudaEventCreate(&pair_end[i]); + cudaEvent_t pair_end_g[nBundles]; + for (int i = 0; i < nBundles; i++) cudaEventCreate(&pair_end_g[i]); + cudaEvent_t pair_end_f[nBundles]; + for (int i = 0; i < nBundles; i++) cudaEventCreate(&pair_end_f[i]); + + int tasksperbundle = (target_n_tasks + nBundles - 1) / nBundles; + int tasksperbundle_pair = + (target_n_tasks_pair + nBundles_pair - 1) / nBundles_pair; + + pack_vars_self_dens->tasksperbundle = tasksperbundle; + pack_vars_pair_dens->tasksperbundle = tasksperbundle_pair; + pack_vars_self_forc->tasksperbundle = tasksperbundle; + pack_vars_pair_forc->tasksperbundle = tasksperbundle_pair; + pack_vars_self_grad->tasksperbundle = tasksperbundle; + pack_vars_pair_grad->tasksperbundle = tasksperbundle_pair; + + for (int i = 0; i < nBundles; ++i) + cudaStreamCreateWithFlags(&stream[i], cudaStreamNonBlocking); + for (int i = 0; i < nBundles_pair; ++i) + cudaStreamCreateWithFlags(&stream_pairs[i], cudaStreamNonBlocking); + + pack_vars_self_dens->count_parts = 0; + pack_vars_pair_dens->count_parts = 0; + pack_vars_self_forc->count_parts = 0; + pack_vars_pair_forc->count_parts = 0; + pack_vars_self_grad->count_parts = 0; + pack_vars_pair_grad->count_parts = 0; + + /*Estimate how many particles to pack for GPU for each GPU launch + * instruction*/ + int nr_nodes = 1, res = 0; +#ifdef WITH_MPI + if ((res = MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes)) != MPI_SUCCESS) + error("MPI_Comm_size failed with error %i.", res); +#endif + int parts_per_top_level_cell = + space->nr_local_cells_with_particles / + space->nr_parts; /*A. Nasar: What I think is a good approximation for + average N particles in each top level cell*/ + float eta_neighbours = e->s->eta_neighbours; + int np_per_cell = ceil(2.0 * eta_neighbours); + np_per_cell *= np_per_cell * np_per_cell; + /*A. Nasar: Increase parts per recursed task-level cell by buffer to + ensure we allocate enough memory*/ + int buff = ceil(0.5 * np_per_cell); + /*A. Nasar: Multiplication by 2 is also to ensure we do not over-run + * the allocated memory on buffers and GPU. 
This can happen if calculated h + * is larger than cell width and splitting makes bigger than target cells*/ + int count_max_parts_tmp = 64 * 8 * target_n_tasks * (np_per_cell + buff); + + pack_vars_self_dens->count_max_parts = count_max_parts_tmp; + pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; + pack_vars_self_forc->count_max_parts = count_max_parts_tmp; + pack_vars_pair_forc->count_max_parts = count_max_parts_tmp; + pack_vars_self_grad->count_max_parts = count_max_parts_tmp; + pack_vars_pair_grad->count_max_parts = count_max_parts_tmp; + + /*Declare Buffer and GPU particle arrays*/ + struct part_aos_f4_send *parts_aos_f4_send; + struct part_aos_f4_recv *parts_aos_f4_recv; + + struct part_aos_f4_f_send *parts_aos_forc_f4_send; + struct part_aos_f4_f_recv *parts_aos_forc_f4_recv; + + struct part_aos_f4_g_send *parts_aos_grad_f4_send; + struct part_aos_f4_g_recv *parts_aos_grad_f4_recv; + + struct part_aos_f4_send *d_parts_aos_f4_send; + struct part_aos_f4_recv *d_parts_aos_f4_recv; + + struct part_aos_f4_f_send *d_parts_aos_forc_f4_send; + struct part_aos_f4_f_recv *d_parts_aos_forc_f4_recv; + + struct part_aos_f4_g_send *d_parts_aos_grad_f4_send; + struct part_aos_f4_g_recv *d_parts_aos_grad_f4_recv; + + struct part_aos_f4_send *parts_aos_pair_f4_send; + struct part_aos_f4_recv *parts_aos_pair_f4_recv; + + struct part_aos_f4_send *d_parts_aos_pair_f4_send; + struct part_aos_f4_recv *d_parts_aos_pair_f4_recv; + + struct part_aos_f4_f_send *parts_aos_pair_f4_f_send; + struct part_aos_f4_f_recv *parts_aos_pair_f4_f_recv; + + struct part_aos_f4_f_send *d_parts_aos_pair_f4_f_send; + struct part_aos_f4_f_recv *d_parts_aos_pair_f4_f_recv; + + struct part_aos_f4_g_send *parts_aos_pair_f4_g_send; + struct part_aos_f4_g_recv *parts_aos_pair_f4_g_recv; + + struct part_aos_f4_g_send *d_parts_aos_pair_f4_g_send; + struct part_aos_f4_g_recv *d_parts_aos_pair_f4_g_recv; + + /*Now allocate memory for Buffer and GPU particle arrays*/ + cudaMalloc((void **)&d_parts_aos_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMalloc((void **)&d_parts_aos_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + + cudaMalloc((void **)&d_parts_aos_forc_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMalloc((void **)&d_parts_aos_forc_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + + cudaMalloc((void **)&d_parts_aos_grad_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMalloc((void **)&d_parts_aos_grad_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + cudaMallocHost((void **)&parts_aos_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMallocHost((void **)&parts_aos_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + + cudaMallocHost((void **)&parts_aos_forc_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMallocHost((void **)&parts_aos_forc_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + + cudaMallocHost((void **)&parts_aos_grad_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMallocHost((void **)&parts_aos_grad_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + cudaMalloc((void **)&d_parts_aos_pair_f4_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMalloc((void **)&d_parts_aos_pair_f4_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + + cudaMalloc((void **)&d_parts_aos_pair_f4_f_send, + 2 * 
count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMalloc((void **)&d_parts_aos_pair_f4_f_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + + cudaMalloc((void **)&d_parts_aos_pair_f4_g_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMalloc((void **)&d_parts_aos_pair_f4_g_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + cudaMallocHost((void **)&parts_aos_pair_f4_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMallocHost((void **)&parts_aos_pair_f4_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + + cudaMallocHost((void **)&parts_aos_pair_f4_g_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMallocHost((void **)&parts_aos_pair_f4_g_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + cudaMallocHost((void **)&parts_aos_pair_f4_f_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMallocHost((void **)&parts_aos_pair_f4_f_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + + /*Declare some global variables*/ + float d_a = e->cosmology->a; + float d_H = e->cosmology->H; + int step = 0; + + // a list of the cells and tasks the GPU will work on + pack_vars_self_dens->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_self_dens->cell_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_pair_dens->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_pair_dens->top_task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + int n_leaves_max = 4096; + /*Allocate target_n_tasks for top level tasks. This is a 2D array with length target_n_tasks and width n_leaves_max*/ + struct leaf_cell_list l_list[target_n_tasks]; + pack_vars_pair_dens->leaf_list = (struct leaf_cell_list *)calloc(target_n_tasks, sizeof(struct leaf_cell_list)); + for (int i = 0; i < target_n_tasks; i++){ +// l_list[i].ci = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); +// l_list[i].cj = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); +// l_list[i].n_leaves = 0; + pack_vars_pair_dens->leaf_list[i].ci = malloc(n_leaves_max * sizeof(struct cell *)); + pack_vars_pair_dens->leaf_list[i].cj = malloc(n_leaves_max * sizeof(struct cell *)); + pack_vars_pair_dens->leaf_list[i].n_leaves = 0; + pack_vars_pair_dens->leaf_list[i].n_packed = 0; +// for (int j = 0; j < n_leaves_max; j++){ +// pack_vars_pair_dens->leaf_list[i].ci[j] = l_list[i].ci[j]; +// pack_vars_pair_dens->leaf_list[i].cj[j] = l_list[i].cj[j]; +// } + } +// pack_vars_pair_dens->leaf_list = l_list; +// pack_vars_pair_dens->leaf_list->ci = +// (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); +// pack_vars_pair_dens->leaf_list->cj = +// (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); + /*Allocate memory for n_leaves_max task pointers per top level task*/ + + pack_vars_pair_dens->ci_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + pack_vars_pair_dens->cj_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_self_forc->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_self_forc->cell_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_pair_forc->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_pair_forc->ci_list = + (struct 
cell **)calloc(target_n_tasks, sizeof(struct cell *)); + pack_vars_pair_forc->cj_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_self_grad->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_self_grad->cell_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_pair_grad->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_pair_grad->ci_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + pack_vars_pair_grad->cj_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + // number of density self tasks executed + int tasks_done_cpu = 0; + int tasks_done_gpu = 0; + int tasks_done_gpu_inc = 0; + + /* Main loop. */ + while (1) { + /*Stuff for debugging*/ + int n_full_d_bundles = 0, n_full_g_bundles = 0, n_full_f_bundles = 0; + int n_full_p_d_bundles = 0, n_full_p_g_bundles = 0, n_full_p_f_bundles = 0; + int n_partial_d_bundles = 0, n_partial_g_bundles = 0, + n_partial_f_bundles = 0; + int n_partial_p_d_bundles = 0, n_partial_p_g_bundles = 0, + n_partial_p_f_bundles = 0; + int output = 0; + int packed_self = 0; + int packed_pair = 0; + int packed_self_f = 0; + int packed_pair_f = 0; + int packed_self_g = 0; + int packed_pair_g = 0; + int density = 0; + int density_sub = 0; + int unpacked = 0; + int unpacked_f = 0; + int unpacked_g = 0; + int unpacked_pair = 0; + int unpacked_pair_f = 0; + int unpacked_pair_g = 0; + int ghost_in = 0; + int cpu_self = 0; + int cpu_self_f = 0; + int cpu_self_g = 0; + int cpu_pair = 0; + int cpu_pair_f = 0; + int cpu_pair_g = 0; + int n_leafs_total = 0; + // Initialise timers to zero + double time_for_density_cpu = 0.0; + double time_for_density_cpu_pair = 0.0; + double time_for_cpu_g = 0.0; + double time_for_cpu_pair_g = 0.0; + double time_for_cpu_f = 0.0; + double time_for_cpu_pair_f = 0.0; + double time_for_density_cpu_sub = 0.0; + double time_for_density_gpu = 0.0; + double time_for_density_gpu_pair = 0.0; + double time_for_gpu_f = 0.0; + double time_for_gpu_pair_f = 0.0; + double time_for_gpu_g = 0.0; + double time_for_gpu_pair_g = 0.0; + double unpack_time_self_g = 0.0; + double unpack_time_self_f = 0.0; + double unpack_time_self = 0.0; + double time_for_gpu_pair = 0.0; + int nr_cells = space->nr_cells; + /* Wait at the barrier. 
*/ + engine_barrier(e); + // Initialise packing counters + pack_vars_self_dens->tasks_packed = 0; + pack_vars_pair_dens->tasks_packed = 0; + pack_vars_self_dens->count_parts = 0; + pack_vars_pair_dens->count_parts = 0; + pack_vars_pair_dens->task_locked = 0; + pack_vars_pair_dens->top_tasks_packed = 0; + // Initialise packing counters + pack_vars_self_forc->tasks_packed = 0; + pack_vars_pair_forc->tasks_packed = 0; + pack_vars_self_forc->count_parts = 0; + pack_vars_pair_forc->count_parts = 0; + // Initialise packing counters + pack_vars_self_grad->tasks_packed = 0; + pack_vars_pair_grad->tasks_packed = 0; + pack_vars_self_grad->count_parts = 0; + pack_vars_pair_grad->count_parts = 0; + for(int i = 0; i < target_n_tasks; i++) + pack_vars_pair_dens->leaf_list[i].n_leaves = 0; + + int total_tasks_packed_this_time_pair = 0; + double packing_time = 0.0; + double packing_time_f = 0.0; + double packing_time_g = 0.0; + double unpacking_time = 0.0; + double unpacking_time_f = 0.0; + double unpacking_time_g = 0.0; + double packing_time_pair = 0.0; + double packing_time_pair_f = 0.0; + double packing_time_pair_g = 0.0; + double unpacking_time_pair = 0.0; + double unpacking_time_pair_f = 0.0; + double unpacking_time_pair_g = 0.0; + double time_for_copy_to_struct = 0.0; + double tot_time_for_hard_memcpys = 0.0; + /* Can we go home yet? */ + if (e->step_props & engine_step_prop_done) break; + /* Re-set the pointer to the previous task, as there is none. */ + struct task *t = NULL; + struct task *prev = NULL; + /*Some bits for output in case of debug*/ + char buf5[20]; + snprintf(buf5, sizeof(buf5), "t%dr%dstep%d", r->cpuid, engine_rank, step); +#ifdef DUMP_TIMINGS + FILE *fgpu_steps; + fgpu_steps = fopen(buf5, "w"); +#endif + // if (step == 0) cudaProfilerStart(); + step++; + + sched->nr_packs_self_dens_done = 0; + sched->nr_packs_pair_dens_done = 0; + sched->nr_packs_self_forc_done = 0; + sched->nr_packs_pair_forc_done = 0; + sched->nr_packs_self_grad_done = 0; + sched->nr_packs_pair_grad_done = 0; + int n_cells_d = 0; + int n_cells_g = 0; + int n_cells_f = 0; + int n_cells_p_d = 0; + int n_cells_p_g = 0; + int n_cells_p_f = 0; + int n_w_prts_gtr_target_d = 0; + int n_w_prts_gtr_target_g = 0; + int n_w_prts_gtr_target_f = 0; + int n_w_prts_gtr_target_p_d = 0; + int n_w_prts_gtr_target_p_g = 0; + int n_w_prts_gtr_target_p_f = 0; + int g100 = 0; + int l100 = 0; + int maxcount = 0; + /* Loop while there are tasks... */ + tasks_done_gpu_inc = 0; + ticks hang_time = getticks(); + struct task * ttop_prev; + while (1) { + // A. Nasar: Get qid for re-use later + int qid = r->qid; + /* If there's no old task, try to get a new one. */ + if (t == NULL) { + /* Get the task. */ + TIMER_TIC + t = scheduler_gettask(sched, qid, prev); + TIMER_TOC(timer_gettask); + /* Did I get anything? */ + if (t == NULL) break; + } + /* Get the cells. 
*/ + struct cell *ci = t->ci; + struct cell *cj = t->cj; + + struct task * ttop = t; + + if (ci == NULL && (t->subtype != task_subtype_gpu_unpack_d + && t->subtype != task_subtype_gpu_unpack_g + && t->subtype != task_subtype_gpu_unpack_f)) error("This cannot be"); + +#ifdef SWIFT_DEBUG_TASKS + /* Mark the thread we run on */ + t->rid = r->cpuid; + + /* And recover the pair direction */ + if (t->type == task_type_pair || t->type == task_type_sub_pair) { + struct cell *ci_temp = ci; + struct cell *cj_temp = cj; + double shift[3]; + if (t->subtype != task_subtype_gpu_unpack_d && + t->subtype != task_subtype_gpu_unpack_g && + t->subtype != task_subtype_gpu_unpack_f) + t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift); + } else { + t->sid = -1; + } +#endif + +#ifdef SWIFT_DEBUG_CHECKS + /* Check that we haven't scheduled an inactive task */ + t->ti_run = e->ti_current; + /* Store the task that will be running (for debugging only) */ + r->t = t; +#endif + + const ticks task_beg = getticks(); + /* Different types of tasks... */ + switch (t->type) { + case task_type_self: + if (t->subtype == task_subtype_gpu_unpack_d) { + unpacked++; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + unpacked_g++; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + unpacked_f++; + } else if (t->subtype == task_subtype_density) { + cpu_self++; +#ifndef GPUOFFLOAD_DENSITY + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself1_branch_density(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + density++; +#endif + /* GPU WORK */ + } else if (t->subtype == task_subtype_gpu_pack_d) { + packed_self++; +#ifdef GPUOFFLOAD_DENSITY + ticks tic_cpu_pack = getticks(); + packing_time += + runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, + parts_aos_f4_send, task_first_part_f4); + //Record times for task analysis + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_dens->launch_leftovers; + /*Packed enough tasks. Let's go*/ + int launch = pack_vars_self_dens->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_self_dens->tasks_packed; + runner_doself1_launch_f4( + r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, + parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, + stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, + &unpack_time_self, devId, + task_first_part_f4, d_task_first_part_f4, self_end); + } /*End of GPU work Self*/ +#endif + } /* self / pack */ + else if (t->subtype == task_subtype_gpu_pack_g) { + packed_self_g++; +#ifdef GPUOFFLOAD_GRADIENT + ticks tic_cpu_pack = getticks(); + packing_time_g += runner_doself1_pack_f4_g( + r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, + task_first_part_f4_g); + //Record times for task analysis + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_grad->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_grad->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_self_grad->tasks_packed; + runner_doself1_launch_f4_g( + r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, + parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, + d_parts_aos_grad_f4_recv, stream, d_a, d_H, e, + &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, + d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); + } /*End of GPU work Self*/ +#endif // GPUGRADSELF + } else if (t->subtype == task_subtype_gpu_pack_f) { + packed_self_f++; +#ifdef GPUOFFLOAD_FORCE + ticks tic_cpu_pack = getticks(); + packing_time_f += runner_doself1_pack_f4_f( + r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, + task_first_part_f4_f); + //Record times for task analysis + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_forc->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_self_forc->tasks_packed; + runner_doself1_launch_f4_f( + r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, + parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, + d_parts_aos_forc_f4_recv, stream, d_a, d_H, e, + &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, + d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); + } /*End of GPU work Self*/ +#endif + } +#ifdef EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_gradient) { + cpu_self_g++; +#ifndef GPUOFFLOAD_GRADIENT + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself1_branch_gradient(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_g += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif + } +#endif + else if (t->subtype == task_subtype_force) { + cpu_self_f++; +#ifndef GPUOFFLOAD_FORCE + struct timespec t0, t1; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself2_branch_force(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_f += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif + } else if (t->subtype == task_subtype_limiter) + runner_doself1_branch_limiter(r, ci); + else if (t->subtype == task_subtype_grav) + runner_doself_recursive_grav(r, ci, 1); + else if (t->subtype == task_subtype_external_grav) + runner_do_grav_external(r, ci, 1); + else if (t->subtype == task_subtype_stars_density) + runner_doself_branch_stars_density(r, ci); +#ifdef EXTRA_STAR_LOOPS + else if (t->subtype == task_subtype_stars_prep1) + runner_doself_branch_stars_prep1(r, ci); + else if (t->subtype == task_subtype_stars_prep2) + runner_doself_branch_stars_prep2(r, ci); +#endif + else if (t->subtype == task_subtype_stars_feedback) + runner_doself_branch_stars_feedback(r, ci); + else if (t->subtype == task_subtype_bh_density) + runner_doself_branch_bh_density(r, ci); + else if (t->subtype == task_subtype_bh_swallow) + runner_doself_branch_bh_swallow(r, ci); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_doself_branch_bh_feedback(r, ci); + else if (t->subtype == task_subtype_rt_gradient) + runner_doself1_branch_rt_gradient(r, ci); + else if 
(t->subtype == task_subtype_rt_transport) + runner_doself2_branch_rt_transport(r, ci); + else if (t->subtype == task_subtype_sink_swallow) + runner_doself_branch_sinks_swallow(r, ci); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_self(r, ci, 1); + else + error("Unknown/invalid task subtype (%s).", + subtaskID_names[t->subtype]); + break; + + case task_type_pair: + if (t->subtype == task_subtype_density) { + cpu_pair++; +#ifndef GPUOFFLOAD_DENSITY + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_density(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif + } + /* GPU WORK */ + else if (t->subtype == task_subtype_gpu_pack_d) { + packed_pair++; +#ifdef GPUOFFLOAD_DENSITY + + ticks tic_cpu_pack = getticks(); + + /////////////////////W.I.P!!!//////////////////////////////////////////////////////// + /*Call recursion here. This will be a function in runner_doiact_functions_hydro_gpu.h. + * We are recursing separately to find out how much work we have before offloading*/ + //We need to allocate a list to put cell pointers into for each new task + int n_expected_tasks = 4096; //A. Nasar: Need to come up with a good estimate for this + int n_leaves_found = 0; + int top_tasks_packed = pack_vars_pair_dens->top_tasks_packed; + int depth = 0; + + pack_vars_pair_dens->leaf_list[top_tasks_packed].n_leaves = 0; + pack_vars_pair_dens->leaf_list[top_tasks_packed].n_start = 0; + pack_vars_pair_dens->leaf_list[top_tasks_packed].n_packed = 0; + + runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leaves_found, depth, n_expected_tasks); + + n_leafs_total += n_leaves_found; + int cstart = 0, cid = 0; + + pack_vars_pair_dens->top_task_list[top_tasks_packed] = t; + + pack_vars_pair_dens->top_tasks_packed++; + pack_vars_pair_dens->task_locked = 1; + int t_s, t_e; + t_s = 0; + int n_t_tasks = pack_vars_pair_dens->target_n_tasks; + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + int ntop_packed = pack_vars_pair_dens->top_tasks_packed; + + while(cstart < n_leaves_found){ + tic_cpu_pack = getticks(); + +// if(pack_vars_pair_dens->top_task_list[0] == ttop_prev) +// error("Working on prev top level task"); + pack_vars_pair_dens->launch_leftovers = 0; + pack_vars_pair_dens->launch = 0; + /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ + while(cstart < n_leaves_found && pack_vars_pair_dens->tasks_packed < n_t_tasks){ + // n_start is incremented in pack. However, for cases where we have launched + // but there are still some daughters left unpacked, we need to restart the + // count from zero for the packed arrays as the daughters we previously worked on are no longer necessary. + // Thus, the counter for cii and cjj should remain cstart but counter for packing/unpacking arrays + // should be n_start which is set to zero after launch. 
count_parts should also be zero ater launch + struct cell * cii = pack_vars_pair_dens->leaf_list[ntop_packed - 1].ci[cstart]; + struct cell * cjj = pack_vars_pair_dens->leaf_list[ntop_packed - 1].cj[cstart]; + packing_time_pair += runner_dopair1_pack_f4( + /////////////////////////////Are we sure we should use + /////////////////////////////cells_left/cells right and not + /////////////////////////////pack_vars_pair_dens->leaf_list[top_tasks_packed].ci & cj? + r, sched, pack_vars_pair_dens, cii, cjj, t, + ///////////////////////////// HERE ////////////////////////////////////////// + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); + if(pack_vars_pair_dens->count_parts > count_max_parts_tmp) + error("Packed more parts than possible"); + cstart++; + } + /* Copies done. Release the lock ! */ + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* Packed enough tasks or no pack tasks left in queue, flag that + * we want to run */ + int launch = pack_vars_pair_dens->launch; + int launch_leftovers = pack_vars_pair_dens->launch_leftovers; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_pair_dens->tasks_packed; + runner_dopair1_launch_f4_one_memcpy( + r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, + parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, + d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, + pair_end); + //A. Nasar: Unpack data and zero count_parts counter + runner_dopair1_unpack_f4( + r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, + parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, + d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, + pair_end, cstart, n_leaves_found); + /*This ensure that if we still have leaves left we start at index 1. + Otherwise, reset the index since we will be grabbing a new task*/ + int n_packed = pack_vars_pair_dens->tasks_packed; + //A. Nasar: We've packed all daughters and have launched --> one way or the other + if(cstart == n_leaves_found){ + pack_vars_pair_dens->top_tasks_packed = 0; +// for(int i = 0; i < ntop_packed; i++){ +// pack_vars_pair_dens->leaf_list[i].n_leaves = 0; +// pack_vars_pair_dens->leaf_list[i].n_packed = 0; +// pack_vars_pair_dens->leaf_list[i].n_start = 0; +// } + } + // A. Nasar: We've launched but we have not packed all daughters. + // Need to set counters so we start from the last top-task packed + // and it's last packed daughter-task and start packing to the beginning of GPU arrays + // which is reset to zero (count_parts) in "....unpack_f4()" + else{ + for(int i = 1; i < pack_vars_pair_dens->top_tasks_packed; i++) + pack_vars_pair_dens->leaf_list[i].n_start = 0; + pack_vars_pair_dens->top_tasks_packed = 1; + pack_vars_pair_dens->top_task_list[0]= t; + // A. Nasar: We've launched so need to restart counting tasks + // from zero and need to reset tasks_packed to zero. + // However, the counter for + pack_vars_pair_dens->leaf_list[0].n_start = cstart; + + pack_vars_pair_dens->leaf_list[0].n_packed = 0; + //A. Nasar: We have packed all daughter tasks in this parent task + /*This makes it such that the remaining leaf tasks are packed starting from a + fresh list since we are still in the while cstart < n_leaves_found loop**/ + } + // A. 
Nasar: These need to be reset to zero either way as our GPU array counters + // need to re-start from zero + pack_vars_pair_dens->tasks_packed = 0; + pack_vars_pair_dens->launch_leftovers = 0; + pack_vars_pair_dens->launch = 0; + } + /////////////////////////////////////////////////////////////////////// + } + ttop_prev = t; + cell_unlocktree(ci); + cell_unlocktree(cj); +// pack_vars_pair_dens->launch_leftovers = 0; +// pack_vars_pair_dens->launch = 0; + /////////////////////W.I.P!!!//////////////////////////////////////////////////////// + +#endif // GPUOFFLOAD_DENSITY + } /* pair / pack */ + else if (t->subtype == task_subtype_gpu_pack_g) { + packed_pair_g++; +#ifdef GPUOFFLOAD_GRADIENT + ticks tic_cpu_pack = getticks(); + packing_time_pair_g += + runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, + cj, t, parts_aos_pair_f4_g_send, e, + fparti_fpartj_lparti_lpartj_grad); + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_pair_grad->launch_leftovers; + /*Packed enough tasks, let's go*/ + int launch = pack_vars_pair_grad->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_pair_grad->tasks_packed; + // signal_sleeping_runners(sched, t, t_packed); + runner_dopair1_launch_f4_g_one_memcpy( + r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, + parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, + d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair_g, &time_for_gpu_pair_g, + &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, + pair_end_g); + } + pack_vars_pair_grad->launch_leftovers = 0; +#endif // GPUOFFLOAD_GRADIENT + } else if (t->subtype == task_subtype_gpu_pack_f) { + packed_pair_f++; +#ifdef GPUOFFLOAD_FORCE + ticks tic_cpu_pack = getticks(); + /*Pack data and increment counters checking if we should run on the GPU after packing this task*/ + packing_time_pair_f += + runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, + cj, t, parts_aos_pair_f4_f_send, e, + fparti_fpartj_lparti_lpartj_forc); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_pair_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_pair_forc->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_pair_forc->tasks_packed; + // signal_sleeping_runners(sched, t, t_packed); + runner_dopair1_launch_f4_f_one_memcpy( + r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, + parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, + d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair_f, &time_for_gpu_pair_f, + &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, + pair_end_f); + + pack_vars_pair_forc->launch_leftovers = 0; + } /* End of GPU work Pairs */ +#endif // GPUOFFLOAD_FORCE + } else if (t->subtype == task_subtype_gpu_unpack_d) { + unpacked_pair++; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + unpacked_pair_g++; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + unpacked_pair_f++; + } +#ifdef EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_gradient) { + int Do_nothing = 0; +#ifndef GPUOFFLOAD_GRADIENT + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_gradient(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_pair_g += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif + } +#endif // EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_force) { + int Do_nothing = 0; +#ifndef GPUOFFLOAD_FORCE + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair2_branch_force(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_pair_f += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif // GPUOFFLOAD_FORCE + } else if (t->subtype == task_subtype_limiter) + runner_dopair1_branch_limiter(r, ci, cj); + else if (t->subtype == task_subtype_grav) + runner_dopair_recursive_grav(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_density) + runner_dopair_branch_stars_density(r, ci, cj); +#ifdef EXTRA_STAR_LOOPS + else if (t->subtype == task_subtype_stars_prep1) + runner_dopair_branch_stars_prep1(r, ci, cj); + else if (t->subtype == task_subtype_stars_prep2) + runner_dopair_branch_stars_prep2(r, ci, cj); +#endif + else if (t->subtype == task_subtype_stars_feedback) + runner_dopair_branch_stars_feedback(r, ci, cj); + else if (t->subtype == task_subtype_bh_density) + runner_dopair_branch_bh_density(r, ci, cj); + else if (t->subtype == task_subtype_bh_swallow) + runner_dopair_branch_bh_swallow(r, ci, cj); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_dopair_branch_bh_feedback(r, ci, cj); + else if (t->subtype == task_subtype_rt_gradient) + runner_dopair1_branch_rt_gradient(r, ci, cj); + else if (t->subtype == task_subtype_rt_transport) + runner_dopair2_branch_rt_transport(r, ci, cj); + else if (t->subtype == task_subtype_sink_swallow) + runner_dopair_branch_sinks_swallow(r, ci, cj); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_pair(r, ci, cj, 1); + else + error("Unknown/invalid task subtype (%s/%s).", + taskID_names[t->type], subtaskID_names[t->subtype]); + break; + + case task_type_sub_self: + if (t->subtype == task_subtype_density) { + struct timespec t0, t1, dt; + const int count = ci->hydro.count; + 
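+            /* The CPU fallback branches in this loop all repeat the same
+             * CLOCK_REALTIME arithmetic; for reference, a small helper of the
+             * form (sketch only, not used in this patch)
+             *
+             *   static double timespec_diff(const struct timespec t0,
+             *                               const struct timespec t1) {
+             *     return (t1.tv_sec - t0.tv_sec) +
+             *            (t1.tv_nsec - t0.tv_nsec) / 1e9;
+             *   }
+             *
+             * computes the elapsed seconds that are accumulated into
+             * time_for_density_cpu_sub just below. */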
density_sub++; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dosub_self1_density(r, ci, 1); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu_sub += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } +#ifdef EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_gradient) { + runner_dosub_self1_gradient(r, ci, 1); + } +#endif + else if (t->subtype == task_subtype_force) { + runner_dosub_self2_force(r, ci, 1); + } else if (t->subtype == task_subtype_limiter) + runner_dosub_self1_limiter(r, ci, 1); + else if (t->subtype == task_subtype_stars_density) + runner_dosub_self_stars_density(r, ci, 1); +#ifdef EXTRA_STAR_LOOPS + else if (t->subtype == task_subtype_stars_prep1) + runner_dosub_self_stars_prep1(r, ci, 1); + else if (t->subtype == task_subtype_stars_prep2) + runner_dosub_self_stars_prep2(r, ci, 1); +#endif + else if (t->subtype == task_subtype_stars_feedback) + runner_dosub_self_stars_feedback(r, ci, 1); + else if (t->subtype == task_subtype_bh_density) + runner_dosub_self_bh_density(r, ci, 1); + else if (t->subtype == task_subtype_bh_swallow) + runner_dosub_self_bh_swallow(r, ci, 1); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_dosub_self_bh_feedback(r, ci, 1); + else if (t->subtype == task_subtype_rt_gradient) + runner_dosub_self1_rt_gradient(r, ci, 1); + else if (t->subtype == task_subtype_rt_transport) + runner_dosub_self2_rt_transport(r, ci, 1); + else if (t->subtype == task_subtype_sink_swallow) + runner_dosub_self_sinks_swallow(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_self(r, ci, 1); + else + error("Unknown/invalid task subtype (%s/%s).", + taskID_names[t->type], subtaskID_names[t->subtype]); + break; + + case task_type_sub_pair: + if (t->subtype == task_subtype_density) { + int nothing = 0; + runner_dosub_pair1_density(r, ci, cj, 1); + } +#ifdef EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_gradient) { + runner_dosub_pair1_gradient(r, ci, cj, 1); + } +#endif + else if (t->subtype == task_subtype_force) { + runner_dosub_pair2_force(r, ci, cj, 1); + } else if (t->subtype == task_subtype_limiter) + runner_dosub_pair1_limiter(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_density) + runner_dosub_pair_stars_density(r, ci, cj, 1); +#ifdef EXTRA_STAR_LOOPS + else if (t->subtype == task_subtype_stars_prep1) + runner_dosub_pair_stars_prep1(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_prep2) + runner_dosub_pair_stars_prep2(r, ci, cj, 1); +#endif + else if (t->subtype == task_subtype_stars_feedback) + runner_dosub_pair_stars_feedback(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_density) + runner_dosub_pair_bh_density(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_swallow) + runner_dosub_pair_bh_swallow(r, ci, cj, 1); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_dosub_pair_bh_feedback(r, ci, cj, 1); + else if (t->subtype == task_subtype_rt_gradient) + runner_dosub_pair1_rt_gradient(r, ci, cj, 1); + else if 
(t->subtype == task_subtype_rt_transport) + runner_dosub_pair2_rt_transport(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_swallow) + runner_dosub_pair_sinks_swallow(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_pair(r, ci, cj, 1); + else + error("Unknown/invalid task subtype (%s/%s).", + taskID_names[t->type], subtaskID_names[t->subtype]); + break; + + case task_type_sort: + /* Cleanup only if any of the indices went stale. */ + runner_do_hydro_sort( + r, ci, t->flags, + ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, + cell_get_flag(ci, cell_flag_rt_requests_sort), 1); + /* Reset the sort flags as our work here is done. */ + t->flags = 0; + break; + case task_type_rt_sort: + /* Cleanup only if any of the indices went stale. + * NOTE: we check whether we reset the sort flags when the + * recv tasks are running. Cells without an RT recv task + * don't have rt_sort tasks. */ + runner_do_hydro_sort( + r, ci, t->flags, + ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, 1, 1); + /* Reset the sort flags as our work here is done. */ + t->flags = 0; + break; + case task_type_stars_sort: + /* Cleanup only if any of the indices went stale. */ + runner_do_stars_sort( + r, ci, t->flags, + ci->stars.dx_max_sort_old > space_maxreldx * ci->dmin, 1); + /* Reset the sort flags as our work here is done. */ + t->flags = 0; + break; + case task_type_init_grav: + runner_do_init_grav(r, ci, 1); + break; + case task_type_ghost: + runner_do_ghost(r, ci, 1); + break; +#ifdef EXTRA_HYDRO_LOOP + case task_type_extra_ghost: + runner_do_extra_ghost(r, ci, 1); + break; +#endif + case task_type_stars_ghost: + runner_do_stars_ghost(r, ci, 1); + break; + case task_type_bh_density_ghost: + runner_do_black_holes_density_ghost(r, ci, 1); + break; + case task_type_bh_swallow_ghost3: + runner_do_black_holes_swallow_ghost(r, ci, 1); + break; + case task_type_drift_part: + runner_do_drift_part(r, ci, 1); + break; + case task_type_drift_spart: + runner_do_drift_spart(r, ci, 1); + break; + case task_type_drift_sink: + runner_do_drift_sink(r, ci, 1); + break; + case task_type_drift_bpart: + runner_do_drift_bpart(r, ci, 1); + break; + case task_type_drift_gpart: + runner_do_drift_gpart(r, ci, 1); + break; + case task_type_kick1: + runner_do_kick1(r, ci, 1); + break; + case task_type_kick2: + runner_do_kick2(r, ci, 1); + break; + case task_type_end_hydro_force: + runner_do_end_hydro_force(r, ci, 1); + break; + case task_type_end_grav_force: + runner_do_end_grav_force(r, ci, 1); + break; + case task_type_csds: + runner_do_csds(r, ci, 1); + break; + case task_type_timestep: + runner_do_timestep(r, ci, 1); + break; + case task_type_timestep_limiter: + runner_do_limiter(r, ci, 0, 1); + break; + case task_type_timestep_sync: + runner_do_sync(r, ci, 0, 1); + break; + case task_type_collect: + runner_do_timestep_collect(r, ci, 1); + break; + case task_type_rt_collect_times: + runner_do_collect_rt_times(r, ci, 1); + break; +#ifdef WITH_MPI + case task_type_send: + if (t->subtype == task_subtype_tend) { + free(t->buff); + } else if (t->subtype == task_subtype_sf_counts) { + free(t->buff); + } else if (t->subtype == task_subtype_part_swallow) { + free(t->buff); + } else if (t->subtype == task_subtype_bpart_merger) { + free(t->buff); + } else if (t->subtype == task_subtype_limiter) { + free(t->buff); + } + break; + case task_type_recv: + if (t->subtype 
== task_subtype_tend) { + cell_unpack_end_step(ci, (struct pcell_step *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_sf_counts) { + cell_unpack_sf_counts(ci, (struct pcell_sf *)t->buff); + cell_clear_stars_sort_flags(ci, /*clear_unused_flags=*/0); + free(t->buff); + } else if (t->subtype == task_subtype_xv) { + runner_do_recv_part(r, ci, 1, 1); + } else if (t->subtype == task_subtype_rho) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_gradient) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_rt_gradient) { + runner_do_recv_part(r, ci, 2, 1); + } else if (t->subtype == task_subtype_rt_transport) { + runner_do_recv_part(r, ci, -1, 1); + } else if (t->subtype == task_subtype_part_swallow) { + cell_unpack_part_swallow(ci, + (struct black_holes_part_data *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_bpart_merger) { + cell_unpack_bpart_swallow(ci, + (struct black_holes_bpart_data *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_limiter) { + /* Nothing to do here. Unpacking done in a separate task */ + } else if (t->subtype == task_subtype_gpart) { + runner_do_recv_gpart(r, ci, 1); + } else if (t->subtype == task_subtype_spart_density) { + runner_do_recv_spart(r, ci, 1, 1); + } else if (t->subtype == task_subtype_part_prep1) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_spart_prep2) { + runner_do_recv_spart(r, ci, 0, 1); + } else if (t->subtype == task_subtype_bpart_rho) { + runner_do_recv_bpart(r, ci, 1, 1); + } else if (t->subtype == task_subtype_bpart_feedback) { + runner_do_recv_bpart(r, ci, 0, 1); + } else { + error("Unknown/invalid task subtype (%d).", t->subtype); + } + break; + + case task_type_pack: + runner_do_pack_limiter(r, ci, &t->buff, 1); + task_get_unique_dependent(t)->buff = t->buff; + break; + case task_type_unpack: + runner_do_unpack_limiter(r, ci, t->buff, 1); + break; +#endif + case task_type_grav_down: + runner_do_grav_down(r, t->ci, 1); + break; + case task_type_grav_long_range: + runner_do_grav_long_range(r, t->ci, 1); + break; + case task_type_grav_mm: + runner_dopair_grav_mm_progenies(r, t->flags, t->ci, t->cj); + break; + case task_type_cooling: + runner_do_cooling(r, t->ci, 1); + break; + case task_type_star_formation: + runner_do_star_formation(r, t->ci, 1); + break; + case task_type_star_formation_sink: + runner_do_star_formation_sink(r, t->ci, 1); + break; + case task_type_stars_resort: + runner_do_stars_resort(r, t->ci, 1); + break; + case task_type_sink_formation: + runner_do_sink_formation(r, t->ci); + break; + case task_type_fof_self: + runner_do_fof_search_self(r, t->ci, 1); + break; + case task_type_fof_pair: + runner_do_fof_search_pair(r, t->ci, t->cj, 1); + break; + case task_type_fof_attach_self: + runner_do_fof_attach_self(r, t->ci, 1); + break; + case task_type_fof_attach_pair: + runner_do_fof_attach_pair(r, t->ci, t->cj, 1); + break; + case task_type_neutrino_weight: + runner_do_neutrino_weighting(r, ci, 1); + break; + case task_type_rt_ghost1: + runner_do_rt_ghost1(r, t->ci, 1); + break; + case task_type_rt_ghost2: + runner_do_rt_ghost2(r, t->ci, 1); + break; + case task_type_rt_tchem: + runner_do_rt_tchem(r, t->ci, 1); + break; + case task_type_rt_advance_cell_time: + runner_do_rt_advance_cell_time(r, t->ci, 1); + break; + default: + error("Unknown/invalid task type (%d).", t->type); + } + r->active_time += (getticks() - task_beg); + +/* Mark that we have run this task on these cells */ +#ifdef 
SWIFT_DEBUG_CHECKS + if (ci != NULL) { + ci->tasks_executed[t->type]++; + ci->subtasks_executed[t->subtype]++; + } + if (cj != NULL) { + cj->tasks_executed[t->type]++; + cj->subtasks_executed[t->subtype]++; + } + /* This runner is not doing a task anymore */ + r->t = NULL; +#endif + + /* We're done with this task, see if we get a next one. */ + prev = t; + + if (t->subtype == task_subtype_gpu_pack_d) { +#ifdef GPUOFFLOAD_DENSITY + /* Don't enqueue unpacks yet. Just signal the runners */ + t->skip = 1; + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; + t = NULL; +#else + t = scheduler_done(sched, t); +#endif + } + + else if (t->subtype == task_subtype_gpu_pack_g) { +#ifdef GPUOFFLOAD_GRADIENT + /* Don't enqueue unpacks yet. Just signal the runners */ + t->skip = 1; + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; + t = NULL; +#else + t = scheduler_done(sched, t); +#endif + } + + else if (t->subtype == task_subtype_gpu_pack_f) { +#ifdef GPUOFFLOAD_FORCE + /* Don't enqueue unpacks yet. Just signal the runners */ + t->skip = 1; + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; + t = NULL; +#else + t = scheduler_done(sched, t); +#endif + } + + else if (t->subtype != task_subtype_gpu_pack_d && + t->subtype != task_subtype_gpu_pack_g && + t->subtype != task_subtype_gpu_pack_f) { + t = scheduler_done(sched, t); + } + } /* main loop. */ + + message("n_leafs found %i", n_leafs_total); +// message("cpu %i packed %i cells with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_d, n_w_prts_gtr_target_d, np_per_cell, maxcount); +// message("cpu %i packed %i cells_G with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_g, n_w_prts_gtr_target_g, np_per_cell, maxcount); +// message("cpu %i packed %i cells_F with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_f, n_w_prts_gtr_target_f, np_per_cell, maxcount); +// message("cpu %i packed %i pairs_D with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_p_d, n_w_prts_gtr_target_p_d, np_per_cell, maxcount); +// message("cpu %i packed %i pairs_G with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_p_g, n_w_prts_gtr_target_p_g, np_per_cell, maxcount); +// message("cpu %i packed %i pairs_F with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_p_f, n_w_prts_gtr_target_p_f, np_per_cell, maxcount); + + // message("Worked on %i supers w more than 100 parts", g100); + // Stuff for writing debug data to file for validation + //// if (step % 10 == 0 || step == 1) { + // if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z, + // rho, rhodh, v_sig, lap_u, a_visc_max, ax, ay, az\n"); for (int tid + // = 0; tid < space->nr_local_cells; + // tid++) { /* This should indeed be tasks_done_gpu as they are + // the only + //// tasks which have been done*/ + // struct cell *ctemp = &(space->cells_top[tid]); + // for (int i = 0; i < ctemp->hydro.count; i++) { + // fprintf(fgpu_steps, "%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, + // %f, %f\n", + // ctemp->hydro.parts[i].x[0], + // ctemp->hydro.parts[i].x[1], + // ctemp->hydro.parts[i].x[2], ctemp->hydro.parts[i].rho, + // ctemp->hydro.parts[i].density.rho_dh, + // ctemp->hydro.parts[i].viscosity.v_sig, + // ctemp->hydro.parts[i].diffusion.laplace_u, + // ctemp->hydro.parts[i].force.alpha_visc_max_ngb, + // ctemp->hydro.parts[i].a_hydro[0], + // ctemp->hydro.parts[i].a_hydro[1], + // 
ctemp->hydro.parts[i].a_hydro[2]); + //// message("wcount %f density %f", + /// ctemp->hydro.parts[i].density.wcount, ctemp->hydro.parts[i].rho); / + /// message("wcount is %f\n", ctemp->hydro.parts[i].density.wcount); + // } + // } + //// } + /*Output compute times to separate files. cat later into one file*/ +// if (step % 11 == 0 || step == 1) { +#ifdef DUMP_TIMINGS +#if defined(GPUOFFLOAD_DENSITY) || defined(GPUOFFLOAD_GRADIENT) || \ + defined(GPUOFFLOAD_FORCE) + // char buffer[30]; + // snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", + // r->cpuid, step); FILE *fullbundles = fopen(buffer, "w"); + // if(r->cpuid == 0)fprintf(fullbundles, "nfull, npartial, + // nfullpair, npartialpair\n"); else fprintf(fullbundles, "%i, %i, + // %i, %i\n", n_full_d_bundles, n_partial_d_bundles, + // n_full_p_d_bundles, n_partial_p_d_bundles); fflush(fullbundles); + + /////////////////////////////////////////////////////////////// + /// to ooutput timings uncomment this + /////////////////////////////////////////////////////////////// + if (r->cpuid == 0 && engine_rank == 0) + fprintf(fgpu_steps, + "GPU_SD, P_SD, U_SD, GPU_PD, P_PD, U_PD, " + "GPU_SF, P_SF, U_SF, GPU_PF, P_PF, U_PF, GPU_SG, P_SG, U_SG, " + "GPU_PG, P_PG, U_PG\n " + "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " + "%e, %e\n", + time_for_density_gpu, packing_time, unpack_time_self, + time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, + time_for_gpu_f, packing_time_f, unpack_time_self_f, + time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, + time_for_gpu_g, packing_time_g, unpack_time_self_g, + time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); + + else + fprintf(fgpu_steps, + "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " + "%e, %e\n", + time_for_density_gpu, packing_time, unpack_time_self, + time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, + time_for_gpu_f, packing_time_f, unpack_time_self_f, + time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, + time_for_gpu_g, packing_time_g, unpack_time_self_g, + time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); + ////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////// + +#else // No GPU offload + if (r->cpuid == 0 && engine_rank == 0) + fprintf(fgpu_steps, + "CPU TIME SELF, CPU TIME PAIR, " + "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME " + "PAIR G\n " + "%e, %e, %e, %e, %e, %e\n", + time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f, + time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); + + else + fprintf(fgpu_steps, "%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, + time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, + time_for_cpu_g, time_for_cpu_pair_g); +#endif + // } + fflush(fgpu_steps); + fclose(fgpu_steps); +#endif // DUMPTIMINGS + time_for_density_cpu = 0.0; + time_for_density_gpu = 0.0; + time_for_density_cpu_pair = 0.0; + time_for_density_gpu_pair = 0.0; + time_for_density_cpu_sub = 0.0; + tot_time_for_hard_memcpys = 0.0; + tasks_done_gpu = 0; + tasks_done_cpu = 0; + tasks_done_gpu_inc = 0; + if (ghost_in > 0) + fprintf(stderr, "total tasks not done on GPU %i is %i\n", r->cpuid, + ghost_in); + packed_self = 0; + packed_pair = 0; + packed_self_f = 0; + packed_pair_f = 0; + packed_self_g = 0; + packed_pair_g = 0; + density = 0; + density_sub = 0; + unpacked = 0; + 
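+    /* When DUMP_TIMINGS is enabled, each runner writes one timing file per
+     * step, named by the snprintf() format "t%dr%dstep%d" above (e.g.
+     * "t3r0step12" for thread 3 of rank 0 at step 12).  As the comment above
+     * notes, these can then simply be concatenated into a single table, e.g.
+     * with "cat t*r*step* > gpu_timings.csv" (shell example, not part of the
+     * code). */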
// if(step == 2)cudaProfilerStop(); + // if(step == 2)exit(0); + // size_t free_byte ; + // size_t total_byte ; + // cudaError_t cuda_status = cudaMemGetInfo( &free_byte, + //&total_byte ) ; double free = (double)free_byte; double + // available = (double)total_byte; double used = (available - free); + // fprintf(stderr, "Used %f GB GPU memory\n", used/1e9); + /* Wait at the wait barrier. */ + // swift_barrier_wait(&e->wait_barrier); + } + // Free all data + // cudaFree(d_tid_p); + // cudaFree(d_id); + // cudaFree(d_x_p); + // cudaFree(d_y_p); + // cudaFree(d_z_p); + // cudaFree(d_ux); + // cudaFree(d_uy); + // cudaFree(d_uz); + // cudaFree(d_a_hydrox); + // cudaFree(d_a_hydroy); + // cudaFree(d_a_hydroz); + // cudaFree(d_mass); + // cudaFree(d_h); + // cudaFree(d_u); + // cudaFree(d_u_dt); + // cudaFree(d_rho); + // cudaFree(d_SPH_sum); + // cudaFree(d_locx); + // cudaFree(d_locy); + // cudaFree(d_locz); + // cudaFree(d_widthx); + // cudaFree(d_widthy); + // cudaFree(d_widthz); + // cudaFree(d_h_max); + // cudaFree(d_count_p); + // cudaFree(d_wcount); + // cudaFree(d_wcount_dh); + // cudaFree(d_rho_dh); + // cudaFree(d_rot_ux); + // cudaFree(d_rot_uy); + // cudaFree(d_rot_uz); + // cudaFree(d_div_v); + // cudaFree(d_div_v_previous_step); + // cudaFree(d_alpha_visc); + // cudaFree(d_v_sig); + // cudaFree(d_laplace_u); + // cudaFree(d_alpha_diff); + // cudaFree(d_f); + // cudaFree(d_soundspeed); + // cudaFree(d_h_dt); + // cudaFree(d_balsara); + // cudaFree(d_pressure); + // cudaFree(d_alpha_visc_max_ngb); + // cudaFree(d_time_bin); + // cudaFree(d_wakeup); + // cudaFree(d_min_ngb_time_bin); + // cudaFree(d_to_be_synchronized); + // cudaFree(tid_p); + // cudaFree(id); + // cudaFree(mass); + // cudaFree(h); + // cudaFree(u); + // cudaFree(u_dt); + // cudaFree(rho); + // cudaFree(SPH_sum); + // cudaFree(x_p); + // cudaFree(y_p); + // cudaFree(z_p); + // cudaFree(ux); + // cudaFree(uy); + // cudaFree(uz); + // cudaFree(a_hydrox); + // cudaFree(a_hydroy); + // cudaFree(a_hydroz); + // cudaFree(locx); + // cudaFree(locy); + // cudaFree(locz); + // cudaFree(widthx); + // cudaFree(widthy); + // cudaFree(widthz); + // cudaFree(h_max); + // cudaFree(count_p); + // cudaFree(wcount); + // cudaFree(wcount_dh); + // cudaFree(rho_dh); + // cudaFree(rot_ux); + // cudaFree(rot_uy); + // cudaFree(rot_uz); + // cudaFree(div_v); + // cudaFree(div_v_previous_step); + // cudaFree(alpha_visc); + // cudaFree(v_sig); + // cudaFree(laplace_u); + // cudaFree(alpha_diff); + // cudaFree(f); + // cudaFree(soundspeed); + // cudaFree(h_dt); + // cudaFree(balsara); + // cudaFree(pressure); + // cudaFree(alpha_visc_max_ngb); + // cudaFree(time_bin); + // cudaFree(wakeup); + // cudaFree(min_ngb_time_bin); + // cudaFree(to_be_synchronized); + // cudaFree(partid_p); + // cudaFree(d_task_first_part); + // cudaFree(d_task_last_part); + // cudaFree(task_first_part_self_dens); + // cudaFree(task_last_part_self_dens); + // cudaFree(task_first_part_pair_ci); + // cudaFree(task_last_part_pair_ci); + // cudaFree(task_first_part_pair_cj); + // cudaFree(task_last_part_pair_cj); + // cudaFree(d_bundle_first_part_self_dens); + // cudaFree(d_bundle_last_part_self_dens); + // cudaFree(bundle_first_part_self_dens); + // cudaFree(bundle_last_part_self_dens); + // cudaFree(bundle_first_part_pair_ci); + // cudaFree(bundle_last_part_pair_ci); + // cudaFree(bundle_first_part_pair_cj); + // cudaFree(bundle_last_part_pair_cj); + // free(ci_list_self_dens); + // free(ci_list_pair); + // free(cj_list_pair); + + /* Be kind, rewind. 
*/ + return NULL; +} + +#endif // WITH_CUDA + diff --git a/src/runner_others.c b/src/runner_others.c index cbace92a63..914b1f47a3 100644 --- a/src/runner_others.c +++ b/src/runner_others.c @@ -381,7 +381,7 @@ void runner_do_star_formation(struct runner *r, struct cell *c, int timer) { /* Loop over the gas particles in this cell. */ for (int k = 0; k < count; k++) { - + continue; //A. Nasar: Commented out to try without inhibited particles /* Get a handle on the part. */ struct part *restrict p = &parts[k]; struct xpart *restrict xp = &xparts[k]; diff --git a/src/scheduler.c b/src/scheduler.c index 2b156f8250..69203e37b6 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -61,6 +61,7 @@ int activate_by_unskip = 1; #endif +#include "cuda/BLOCK_SIZE.h" /** * @brief Re-set the list of active tasks. */ @@ -900,7 +901,9 @@ void scheduler_write_cell_dependencies(struct scheduler *s, int verbose, int local_count = 0; for (int i = 0; i < s->nr_tasks; i++) { const struct task *ta = &s->tasks[i]; - + // if(ta->subtype == task_subtype_gpu_unpack_d + // || ta->subtype == task_subtype_gpu_unpack_f + // || ta->subtype == task_subtype_gpu_unpack_g)continue; /* Are we using this task? * For the 0-step, we wish to show all the tasks (even the inactives). */ if (step != 0 && ta->skip) continue; @@ -952,7 +955,10 @@ void scheduler_write_cell_dependencies(struct scheduler *s, int verbose, /* and their dependencies */ for (int j = 0; j < ta->nr_unlock_tasks; j++) { const struct task *tb = ta->unlock_tasks[j]; - + if (tb->subtype == task_subtype_gpu_unpack_d || + tb->subtype == task_subtype_gpu_unpack_f || + tb->subtype == task_subtype_gpu_unpack_g) + continue; /* Are we using this task? * For the 0-step, we wish to show all the tasks (even the inactive). */ if (step != 0 && tb->skip) continue; @@ -1167,6 +1173,237 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { const int with_black_holes = (s->space->e->policy & engine_policy_black_holes); + /* Iterate on this task until we're done with it. */ + int redo = 1; + while (redo) { + /* Reset the redo flag. */ + redo = 0; + + /* Is this a non-empty self-task? */ + const int is_self = + (t->type == task_type_self) && (t->ci != NULL) && + ((t->ci->hydro.count > 0) || (with_stars && t->ci->stars.count > 0) || + (with_sinks && t->ci->sinks.count > 0) || + (with_black_holes && t->ci->black_holes.count > 0)); + + /* Is this a non-empty pair-task? */ + const int is_pair = (t->type == task_type_pair) && (t->ci != NULL) && + (t->cj != NULL) && + ((t->ci->hydro.count > 0) || + (with_feedback && t->ci->stars.count > 0) || + (with_sinks && t->ci->sinks.count > 0) || + (with_black_holes && t->ci->black_holes.count > 0)) && + ((t->cj->hydro.count > 0) || + (with_feedback && t->cj->stars.count > 0) || + (with_sinks && t->cj->sinks.count > 0) || + (with_black_holes && t->cj->black_holes.count > 0)); + + /* Empty task? */ + if (!is_self && !is_pair) { + t->type = task_type_none; + t->subtype = task_subtype_none; + t->ci = NULL; + t->cj = NULL; + t->skip = 1; + break; + } + + /* Self-interaction? */ + if (t->type == task_type_self) { + /* Get a handle on the cell involved. */ + struct cell *ci = t->ci; + + /* Foreign task? */ + if (ci->nodeID != s->nodeID) { + t->skip = 1; + break; + } + + /* Is this cell even split and the task does not violate h ? */ + if (cell_can_split_self_hydro_task(ci)) { + /* Make a sub? 
*/ + if (scheduler_dosub && (ci->hydro.count < space_subsize_self_hydro_default) && + (ci->stars.count < space_subsize_self_stars)) { + /* convert to a self-subtask. */ + t->type = task_type_sub_self; + + /* Otherwise, make tasks explicitly. */ + } else { + /* Take a step back (we're going to recycle the current task)... */ + redo = 1; + + /* Add the self tasks. */ + int first_child = 0; + while (ci->progeny[first_child] == NULL) first_child++; + + t->ci = ci->progeny[first_child]; + cell_set_flag(t->ci, cell_flag_has_tasks); + + for (int k = first_child + 1; k < 8; k++) { + /* Do we have a non-empty progenitor? */ + if (ci->progeny[k] != NULL && + (ci->progeny[k]->hydro.count || + (with_stars && ci->progeny[k]->stars.count))) { + scheduler_splittask_hydro( + scheduler_addtask(s, task_type_self, t->subtype, 0, 0, + ci->progeny[k], NULL), + s); + } + } + + /* Make a task for each pair of progeny */ + for (int j = 0; j < 8; j++) { + /* Do we have a non-empty progenitor? */ + if (ci->progeny[j] != NULL && + (ci->progeny[j]->hydro.count || + (with_feedback && ci->progeny[j]->stars.count))) { + for (int k = j + 1; k < 8; k++) { + /* Do we have a second non-empty progenitor? */ + if (ci->progeny[k] != NULL && + (ci->progeny[k]->hydro.count || + (with_feedback && ci->progeny[k]->stars.count))) { + scheduler_splittask_hydro( + scheduler_addtask(s, task_type_pair, t->subtype, + sub_sid_flag[j][k], 0, ci->progeny[j], + ci->progeny[k]), + s); + } + } + } + } + } + + } /* Cell is split */ + + } /* Self interaction */ + + /* Pair interaction? */ + else if (t->type == task_type_pair) { + /* Get a handle on the cells involved. */ + struct cell *ci = t->ci; + struct cell *cj = t->cj; + + /* Foreign task? */ + if (ci->nodeID != s->nodeID && cj->nodeID != s->nodeID) { + t->skip = 1; + break; + } + + /* Get the sort ID, use space_getsid_and_swap_cells and not t->flags + to make sure we get ci and cj swapped if needed. */ + double shift[3]; + const int sid = space_getsid_and_swap_cells(s->space, &ci, &cj, shift); + +#ifdef SWIFT_DEBUG_CHECKS + if (sid != t->flags) + error("Got pair task with incorrect flags: sid=%d flags=%lld", sid, + t->flags); +#endif + + /* Should this task be split-up? */ + if (cell_can_split_pair_hydro_task(ci) && + cell_can_split_pair_hydro_task(cj)) { + + const int h_count_i = ci->hydro.count; + const int h_count_j = cj->hydro.count; + + const int s_count_i = ci->stars.count; + const int s_count_j = cj->stars.count; + + int do_sub_hydro = 1; + int do_sub_stars_i = 1; + int do_sub_stars_j = 1; + if (h_count_i > 0 && h_count_j > 0) { + + /* Note: Use division to avoid integer overflow. */ + do_sub_hydro = + h_count_i * sid_scale[sid] < space_subsize_pair_hydro_default / h_count_j; + } + if (s_count_i > 0 && h_count_j > 0) { + + /* Note: Use division to avoid integer overflow. */ + do_sub_stars_i = + s_count_i * sid_scale[sid] < space_subsize_pair_stars / h_count_j; + } + if (s_count_j > 0 && h_count_i > 0) { + + /* Note: Use division to avoid integer overflow. */ + do_sub_stars_j = + s_count_j * sid_scale[sid] < space_subsize_pair_stars / h_count_i; + } + + /* Replace by a single sub-task? */ + if (scheduler_dosub && + (do_sub_hydro && do_sub_stars_i && do_sub_stars_j) && + !sort_is_corner(sid)) { + + /* Make this task a sub task. */ + t->type = task_type_sub_pair; + + /* Otherwise, split it. */ + } else { + /* Take a step back (we're going to recycle the current task)... */ + redo = 1; + + /* Loop over the sub-cell pairs for the current sid and add new tasks + * for them. 
*/ + struct cell_split_pair *csp = &cell_split_pairs[sid]; + + t->ci = ci->progeny[csp->pairs[0].pid]; + t->cj = cj->progeny[csp->pairs[0].pjd]; + if (t->ci != NULL) cell_set_flag(t->ci, cell_flag_has_tasks); + if (t->cj != NULL) cell_set_flag(t->cj, cell_flag_has_tasks); + + t->flags = csp->pairs[0].sid; + for (int k = 1; k < csp->count; k++) { + scheduler_splittask_hydro( + scheduler_addtask(s, task_type_pair, t->subtype, + csp->pairs[k].sid, 0, + ci->progeny[csp->pairs[k].pid], + cj->progeny[csp->pairs[k].pjd]), + s); + } + } + + /* Otherwise, break it up if it is too large? */ + } else if (scheduler_doforcesplit && ci->split && cj->split && + (ci->hydro.count > space_maxsize / cj->hydro.count)) { + + /* Replace the current task. */ + t->type = task_type_none; + + for (int j = 0; j < 8; j++) + if (ci->progeny[j] != NULL && ci->progeny[j]->hydro.count) + for (int k = 0; k < 8; k++) + if (cj->progeny[k] != NULL && cj->progeny[k]->hydro.count) { + struct task *tl = + scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, + ci->progeny[j], cj->progeny[k]); + scheduler_splittask_hydro(tl, s); + tl->flags = space_getsid_and_swap_cells(s->space, &t->ci, + &t->cj, shift); + } + } + } /* pair interaction? */ + } /* iterate over the current task. */ +} + +/** + * @brief Split a hydrodynamic task if too large. + * + * @param t The #task + * @param s The #scheduler we are working in. + */ +static void scheduler_splittask_hydro_GPU(struct task *t, struct scheduler *s) { + /* Are we considering both stars and hydro when splitting? */ + /* Note this is not very clean as the scheduler should not really + access the engine... */ + const int with_feedback = (s->space->e->policy & engine_policy_feedback); + const int with_stars = (s->space->e->policy & engine_policy_stars); + const int with_sinks = (s->space->e->policy & engine_policy_sinks); + const int with_black_holes = + (s->space->e->policy & engine_policy_black_holes); + /* Iterate on this task until we're done with it. */ int redo = 1; while (redo) { @@ -1362,8 +1599,6 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { /* Otherwise, break it up if it is too large? */ } else if (scheduler_doforcesplit && ci->split && cj->split && (ci->hydro.count > space_maxsize / cj->hydro.count)) { - // message( "force splitting pair with %i and %i parts." , - // ci->hydro.count , cj->hydro.count ); /* Replace the current task. */ t->type = task_type_none; @@ -1651,6 +1886,19 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements, scheduler_splittask_gravity(t, s); } else if (t->subtype == task_subtype_grav) { scheduler_splittask_gravity(t, s); + // if task is gpu task do not split A. Nasar + } else if (t->subtype == task_subtype_gpu_pack_d || + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_pack_f) { + scheduler_splittask_hydro_GPU(t, s); + } else if (t->subtype == task_subtype_gpu_unpack_d || + t->subtype == task_subtype_gpu_unpack_g || + t->subtype == task_subtype_gpu_unpack_f) { + /*Do nothing and grab next task to split. + *These tasks are cell-less so cannot split. 
+ *Will remove this if statement if set on splitting + *b4 creating unpack tasks*/ + continue; } else { #ifdef SWIFT_DEBUG_CHECKS error("Unexpected task sub-type %s/%s", taskID_names[t->type], @@ -1740,6 +1988,8 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, t->tic = 0; t->toc = 0; t->total_ticks = 0; + t->total_cpu_pack_ticks = 0; + t->total_cpu_unpack_ticks = 0; #ifdef SWIFT_DEBUG_CHECKS t->activated_by_unskip = 0; t->activated_by_marktask = 0; @@ -1748,6 +1998,26 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, if (ci != NULL) cell_set_flag(ci, cell_flag_has_tasks); if (cj != NULL) cell_set_flag(cj, cell_flag_has_tasks); + // #ifdef WITH_CUDA A. Nasar + if (t->subtype == task_subtype_gpu_pack_d) { + if (t->type == task_type_self || t->type == task_type_sub_self) + atomic_inc(&s->nr_self_pack_tasks_d); + if (t->type == task_type_pair || t->type == task_type_sub_pair) + atomic_inc(&s->nr_pair_pack_tasks_d); + } + if (t->subtype == task_subtype_gpu_pack_f) { + if (t->type == task_type_self || t->type == task_type_sub_self) + atomic_inc(&s->nr_self_pack_tasks_f); + if (t->type == task_type_pair || t->type == task_type_sub_pair) + atomic_inc(&s->nr_pair_pack_tasks_f); + } + if (t->subtype == task_subtype_gpu_pack_g) { + if (t->type == task_type_self || t->type == task_type_sub_self) + atomic_inc(&s->nr_self_pack_tasks_g); + if (t->type == task_type_pair || t->type == task_type_sub_pair) + atomic_inc(&s->nr_pair_pack_tasks_g); + } + // #endif /* Add an index for it. */ // lock_lock( &s->lock ); s->tasks_ind[atomic_inc(&s->nr_tasks)] = ind; @@ -1833,6 +2103,13 @@ void scheduler_set_unlocks(struct scheduler *s) { struct task *t = &s->tasks[k]; for (int i = 0; i < t->nr_unlock_tasks; i++) { for (int j = i + 1; j < t->nr_unlock_tasks; j++) { + /*Fix for the case when one unpack task works over the same cell + * connected to two pair pack tasks*/ + if (t->subtype == task_subtype_gpu_unpack_d || + t->subtype == task_subtype_gpu_unpack_g || + t->subtype == task_subtype_gpu_unpack_f) { + continue; + } if (t->unlock_tasks[i] == t->unlock_tasks[j]) error("duplicate unlock! t->type=%s/%s unlocking type=%s/%s", taskID_names[t->type], subtaskID_names[t->subtype], @@ -1940,13 +2217,20 @@ void scheduler_reset(struct scheduler *s, int size) { /* Reset the counters. */ s->size = size; s->nr_tasks = 0; + s->nr_self_pack_tasks_d = 0; // A. Nasar + s->nr_pair_pack_tasks_d = 0; + s->nr_self_pack_tasks_f = 0; + s->nr_pair_pack_tasks_f = 0; + s->nr_self_pack_tasks_g = 0; + s->nr_pair_pack_tasks_g = 0; s->tasks_next = 0; s->waiting = 0; s->nr_unlocks = 0; s->completed_unlock_writes = 0; s->active_count = 0; s->total_ticks = 0; - + s->pack_size = N_TASKS_PER_PACK_SELF; + s->pack_size_pair = N_TASKS_PER_PACK_PAIR; /* Set the task pointers in the queues. */ for (int k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks; } @@ -2007,6 +2291,24 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 1.f * (wscale * gcount_i) * gcount_i; } else if (t->subtype == task_subtype_external_grav) cost = 1.f * wscale * gcount_i; + else if (t->subtype == task_subtype_gpu_pack_d) // A. 
Nasar + cost = 1.f * (wscale * count_i * count_i); // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_f) + cost = 1.f * (wscale * count_i * count_i); // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_g) + cost = 1.f * (wscale * count_i * count_i); // * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_d) + //cost = wscale * s->pack_size; + cost = (wscale * count_i) * count_i * s->pack_size; + // cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_f) + cost = (wscale * count_i) * count_i * s->pack_size; +// cost = wscale * s->pack_size; +// cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_g) + cost = (wscale * count_i) * count_i * s->pack_size; +// cost = wscale * s->pack_size; +// cost = 1.f * wscale * s->pack_size; else if (t->subtype == task_subtype_stars_density || t->subtype == task_subtype_stars_prep1 || t->subtype == task_subtype_stars_prep2 || @@ -2045,7 +2347,36 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 3.f * (wscale * gcount_i) * gcount_j; else cost = 2.f * (wscale * gcount_i) * gcount_j; - + // Abouzied: Think about good cost (for rainy days) A. Nasar + } else if (t->subtype == task_subtype_gpu_pack_d) { + // cost = 2.f * (wscale * count_i) * count_i; + if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) + cost = 3.f * (wscale * count_i * count_i); + else + cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags]; + } else if (t->subtype == task_subtype_gpu_pack_f) { +// cost = 2.f * (wscale * count_i) * count_i; + if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) + cost = 3.f * (wscale * count_i * count_i) * sid_scale[t->flags]; + else + cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags]; + + } else if (t->subtype == task_subtype_gpu_pack_g) { + if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) + cost = 3.f * (wscale * count_i * count_i) * sid_scale[t->flags]; + else + cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags]; + +// cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_unpack_d) { + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; } else if (t->subtype == task_subtype_stars_density || t->subtype == task_subtype_stars_prep1 || t->subtype == task_subtype_stars_prep2 || @@ -2177,7 +2508,21 @@ void scheduler_reweight(struct scheduler *s, int verbose) { } else if (t->subtype == task_subtype_do_bh_swallow) { cost = 1.f * wscale * (bcount_i + bcount_j); - + } else if (t->subtype == task_subtype_gpu_pack_d) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_pack_f) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_pack_g) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_unpack_d) { + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; } else if (t->subtype 
== task_subtype_density || t->subtype == task_subtype_gradient || t->subtype == task_subtype_force || @@ -2216,10 +2561,25 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 1.f * wscale * count_i; } else if (t->subtype == task_subtype_do_bh_swallow) { cost = 1.f * wscale * bcount_i; - } else if (t->subtype == task_subtype_density || - t->subtype == task_subtype_gradient || - t->subtype == task_subtype_force || - t->subtype == task_subtype_limiter) { + } else if (t->subtype == task_subtype_gpu_pack_d) // A. Nasar + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_f) + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_g) + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_d) + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_f) + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_g) + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_density || + t->subtype == task_subtype_gradient || + t->subtype == task_subtype_force || + t->subtype == task_subtype_limiter) { cost = 1.f * (wscale * count_i) * count_i; } else if (t->subtype == task_subtype_rt_gradient) { cost = 1.f * wscale * scount_i * count_i; @@ -2231,10 +2591,10 @@ void scheduler_reweight(struct scheduler *s, int verbose) { } break; case task_type_ghost: - if (t->ci == t->ci->hydro.super) cost = wscale * count_i; + if (t->ci == t->ci->hydro.super) cost = wscale * count_i * count_i; break; case task_type_extra_ghost: - if (t->ci == t->ci->hydro.super) cost = wscale * count_i; + if (t->ci == t->ci->hydro.super) cost = wscale * count_i * count_i; break; case task_type_stars_ghost: if (t->ci == t->ci->hydro.super) cost = wscale * scount_i; @@ -2246,7 +2606,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) { if (t->ci == t->ci->hydro.super) cost = wscale * bcount_i; break; case task_type_drift_part: - cost = wscale * count_i; + cost = wscale * count_i * count_i; break; case task_type_drift_gpart: cost = wscale * gcount_i; @@ -2273,7 +2633,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = wscale * (gcount_i + gcount_j); break; case task_type_end_hydro_force: - cost = wscale * count_i; + cost = wscale * count_i * count_i; break; case task_type_end_grav_force: cost = wscale * gcount_i; @@ -2309,15 +2669,15 @@ void scheduler_reweight(struct scheduler *s, int verbose) { break; case task_type_kick1: cost = - wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i); + wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i) * count_i; break; case task_type_kick2: cost = - wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i); + wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i) * count_i; break; case task_type_timestep: cost = - wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i); + wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i) * count_i; break; case task_type_timestep_limiter: cost = wscale * count_i; @@ -2374,6 +2734,27 @@ void scheduler_rewait_mapper(void *map_data, int num_elements, /* Increment the task's own wait counter for the enqueueing. 
*/ atomic_inc(&t->wait); + t->done = 0; + t->gpu_done = 0; + + // if (t->type == task_type_self){ // A. Nasar increment number of + // waiting tasks + // if(t->subtype == task_subtype_gpu_pack_d) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left); + // if (t->subtype == task_subtype_gpu_pack_f) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_f); + // if (t->subtype == task_subtype_gpu_pack_g) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_g); + // } + // + // if (t->type == task_type_pair){ + // if(t->subtype == task_subtype_gpu_pack_d) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left); + // if (t->subtype == task_subtype_gpu_pack_f) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_f); + // if (t->subtype == task_subtype_gpu_pack_g) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_g); + // } #ifdef SWIFT_DEBUG_CHECKS /* Check that we don't have more waits that what can be stored. */ @@ -2411,7 +2792,26 @@ void scheduler_enqueue_mapper(void *map_data, int num_elements, * @param s The #scheduler. */ void scheduler_start(struct scheduler *s) { - + for (int i = 0; i < s->nr_queues; i++) { // A. Nasar + s->queues[i].n_packs_self_left_d = 0; + s->queues[i].n_packs_pair_left_d = 0; + s->queues[i].n_packs_self_left_f = 0; + s->queues[i].n_packs_pair_left_f = 0; + s->queues[i].n_packs_self_left_g = 0; + s->queues[i].n_packs_pair_left_g = 0; + s->queues[i].n_packs_self_stolen_d = 0; + s->queues[i].n_packs_pair_stolen_d = 0; + s->queues[i].n_packs_self_stolen_f = 0; + s->queues[i].n_packs_pair_stolen_f = 0; + s->queues[i].n_packs_self_stolen_g = 0; + s->queues[i].n_packs_pair_stolen_g = 0; + s->s_d_left[i] = 0; + s->s_g_left[i] = 0; + s->s_f_left[i] = 0; + s->p_d_left[i] = 0; + s->p_g_left[i] = 0; + s->p_f_left[i] = 0; + } /* Re-wait the tasks. */ if (s->active_count > 1000) { threadpool_map(s->threadpool, scheduler_rewait_mapper, s->tid_active, @@ -2487,6 +2887,21 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { t->subtype == task_subtype_external_grav) { qid = t->ci->grav.super->owner; owner = &t->ci->grav.super->owner; + } else if (t->subtype == task_subtype_gpu_pack_d) { // A. 
Nasar + qid = t->ci->hydro.super->owner; + owner = &t->ci->hydro.super->owner; + } else if (t->subtype == task_subtype_gpu_pack_f) { + qid = t->ci->hydro.super->owner; + owner = &t->ci->hydro.super->owner; + } else if (t->subtype == task_subtype_gpu_pack_g) { + qid = t->ci->hydro.super->owner; + owner = &t->ci->hydro.super->owner; + } else if (t->subtype == task_subtype_gpu_unpack_d) { + qid = -1; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + qid = -1; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + qid = -1; } else { qid = t->ci->hydro.super->owner; owner = &t->ci->hydro.super->owner; @@ -2513,13 +2928,19 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { break; case task_type_pair: case task_type_sub_pair: - qid = t->ci->super->owner; - owner = &t->ci->super->owner; - if ((qid < 0) || - ((t->cj->super->owner > -1) && - (s->queues[qid].count > s->queues[t->cj->super->owner].count))) { - qid = t->cj->super->owner; - owner = &t->cj->super->owner; + if (t->subtype == task_subtype_gpu_unpack_d || + t->subtype == task_subtype_gpu_unpack_f || + t->subtype == task_subtype_gpu_unpack_g) { + qid = -1; + } else { + qid = t->ci->super->owner; + owner = &t->ci->super->owner; + if ((qid < 0) || + ((t->cj->super->owner > -1) && + (s->queues[qid].count > s->queues[t->cj->super->owner].count))) { + qid = t->cj->super->owner; + owner = &t->cj->super->owner; + } } break; case task_type_recv: @@ -2729,12 +3150,83 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { /* Save qid as owner for next time a task accesses this cell. */ if (owner != NULL) *owner = qid; - +// if (t->type == task_type_self || t->type == task_type_sub_self) { +// if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0) { +// return; +// } +// if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0) { +// return; +// } +// if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0) { +// return; +// } +// } +// /* A. Nasar NEED to think about how to do this with +// MPI where ci may not be on this node/rank */ +// if (t->type == task_type_pair || t->type == task_type_sub_pair) { +// if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { +// return; +// } +// if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { +// return; +// } +// if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { +// return; +// } +// } /* Increase the waiting counter. */ atomic_inc(&s->waiting); - /* Insert the task into that queue. */ queue_insert(&s->queues[qid], t); + /* A. 
Nasar: Increment counters required for the pack tasks */ + if (t->type == task_type_self || t->type == task_type_sub_self) { + if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0) { + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_self_left_d++; + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); + atomic_inc(&s->s_d_left[qid]); + } + if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0) { + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_self_left_f++; + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); + atomic_inc(&s->s_f_left[qid]); + } + if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0) { + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_self_left_g++; + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); + atomic_inc(&s->s_g_left[qid]); + } + } + /* A. Nasar NEED to think about how to do this with + MPI where ci may not be on this node/rank */ + if (t->type == task_type_pair || t->type == task_type_sub_pair) { + if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_pair_left_d++; + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); + atomic_inc(&s->p_d_left[qid]); + } + if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_pair_left_f++; + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); + atomic_inc(&s->p_f_left[qid]); + } + if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { + lock_lock(&s->queues[qid].lock); + s->queues[qid].n_packs_pair_left_g++; + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); + atomic_inc(&s->p_g_left[qid]); + } + } } } @@ -2778,12 +3270,48 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) { /* Mark the task as skip. */ t->skip = 1; + t->done = 1; + /* Return the next best task. Note that we currently do not implement anything that does this, as getting it to respect priorities is too tricky and currently unnecessary. */ return NULL; } +struct task *signal_sleeping_runners(struct scheduler *s, struct task *t, + int tasks_packed) { + /* Mark the task as skip. */ + // t->skip = 1; + + /* Task definitely done, signal any sleeping runners. */ + if (!t->implicit) { + pthread_mutex_lock(&s->sleep_mutex); + atomic_sub(&s->waiting, tasks_packed); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + } + return NULL; +} + +struct task *enqueue_dependencies(struct scheduler *s, struct task *t) { + + /* Loop through the dependencies and add them to a queue if + they are ready. */ + for (int k = 0; k < t->nr_unlock_tasks; k++) { + struct task *t2 = t->unlock_tasks[k]; + if (t2->skip) continue; + + const int res = atomic_dec(&t2->wait); + if (res < 1) { + error("Negative wait!"); + } else if (res == 1) { + scheduler_enqueue(s, t2); + } + } + + return NULL; +} + /** * @brief Resolve a single dependency by hand. * @@ -2911,10 +3439,12 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Check qid. */ if (qid >= nr_queues || qid < 0) error("Bad queue ID."); + /*Get a pointer to our queue for re-use*/ + struct queue *q = &s->queues[qid]; /* Loop as long as there are tasks... 
*/
  while (s->waiting > 0 && res == NULL) {
    /* Try more than once before sleeping. */
-    for (int tries = 0; res == NULL && s->waiting && tries < scheduler_maxtries;
+    for (int tries = 0; res == NULL && s->waiting && tries < scheduler_maxtries * 100;
         tries++) {
      /* Try to get a task from the suggested queue. */
      if (s->queues[qid].count > 0 || s->queues[qid].count_incoming > 0) {
@@ -2926,21 +3456,109 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
      /* If unsuccessful, try stealing from the other queues. */
      if (s->flags & scheduler_flag_steal) {
+
        int count = 0, qids[nr_queues];
-        for (int k = 0; k < nr_queues; k++)
+
+        /* Make a list of queues that have 1 or more tasks in them */
+        for (int k = 0; k < nr_queues; k++) {
+          if (k == qid) continue;
          if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) {
            qids[count++] = k;
          }
+        }
+
        for (int k = 0; k < scheduler_maxsteal && count > 0; k++) {
+
+          /* Pick a queue at random among the non-empty ones */
          const int ind = rand_r(&seed) % count;
-          TIMER_TIC
-          res = queue_gettask(&s->queues[qids[ind]], prev, 0);
+          /* Get a pointer to the queue we're stealing from */
+          int qstl_id = qids[ind];
+
+          /* If we got the queue we already have, abort */
+          if (qid == qstl_id) {
+            /* Reduce the size of the list of non-empty queues */
+            qids[ind] = qids[--count];
+            continue;
+          }
+
+          /* The queue we are stealing from */
+          struct queue *q_stl = &s->queues[qstl_id];
+
+          /* Can we lock our own queue? */
+          if (lock_trylock(&q->lock) != 0) {
+
+            /* No --> continue and try a different queue */
+            continue;
+
+          } else {
+
+            /* Yes --> Try locking the queue we steal from */
+            if (lock_trylock(&q_stl->lock) != 0) {
+
+              /* Failed? --> Unlock the 1st queue and
+                 try again */
+              if (lock_unlock(&q->lock) != 0)
+                error("Unlocking our queue failed");
+              continue;
+            }
+          }
+
+          /* We now have locked q and q_stl */
+
+          /* Try to get a task from that random queue */
+          TIMER_TIC;
+          res = queue_gettask(q_stl, prev, 0);
          TIMER_TOC(timer_qsteal);
+
+          /* Lucky? i.e. did we actually get a task? */
          if (res != NULL) {
+
+            /* A. Nasar: Get the task type */
+            enum task_types type = res->type;
+            enum task_subtypes subtype = res->subtype;
+
+            /* Move the counter from the robbed to the robber */
+            if ((type == task_type_self || type == task_type_sub_self) &&
+                subtype == task_subtype_gpu_pack_d) {
+              q->n_packs_self_left_d--;
+              q_stl->n_packs_self_left_d--;
+            }
+            if ((type == task_type_self || type == task_type_sub_self) &&
+                subtype == task_subtype_gpu_pack_g) {
+              q->n_packs_self_left_g--;
+              q_stl->n_packs_self_left_g--;
+            }
+            if ((type == task_type_self || type == task_type_sub_self) &&
+                subtype == task_subtype_gpu_pack_f) {
+              q->n_packs_self_left_f--;
+              q_stl->n_packs_self_left_f--;
+            }
+            if ((type == task_type_pair || type == task_type_sub_pair) &&
+                subtype == task_subtype_gpu_pack_d) {
+              q->n_packs_pair_left_d--;
+              q_stl->n_packs_pair_left_d--;
+            }
+            if ((type == task_type_pair || type == task_type_sub_pair) &&
+                subtype == task_subtype_gpu_pack_g) {
+              q->n_packs_pair_left_g--;
+              q_stl->n_packs_pair_left_g--;
+            }
+            if ((type == task_type_pair || type == task_type_sub_pair) &&
+                subtype == task_subtype_gpu_pack_f) {
+              q->n_packs_pair_left_f--;
+              q_stl->n_packs_pair_left_f--;
+            }
+            /* Run with the task */
            break;
          } else {
+
+            /* Reduce the size of the list of non-empty queues */
            qids[ind] = qids[--count];
          }
+
+          if (lock_unlock(&q->lock) != 0) error("Unlocking our queue failed");
+          if (lock_unlock(&q_stl->lock) != 0)
+            error("Unlocking the stealing queue failed");
        }
        if (res != NULL) break;
      }
@@ -2956,6 +3574,11 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
        pthread_mutex_lock(&s->sleep_mutex);
        res = queue_gettask(&s->queues[qid], prev, 1);
        if (res == NULL && s->waiting > 0) {
+          // struct queue qq = s->queues[qid];
+          // message("s->waiting %i self_stolen %i, self_left %i, pair_stolen
+          // %i, pair_left %i", s->waiting,
+          // qq.n_packs_self_stolen_f, qq.n_packs_self_left_f,
+          // qq.n_packs_pair_stolen_f, qq.n_packs_pair_left_f);
          pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
        }
        pthread_mutex_unlock(&s->sleep_mutex);
@@ -3002,6 +3625,14 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
  /* Initialize each queue. */
  for (int k = 0; k < nr_queues; k++) queue_init(&s->queues[k], NULL);
+  /* Allocate the per-queue GPU pack-task counters; scheduler_start() resets them. */
+  s->s_d_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->s_g_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->s_f_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->p_d_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->p_g_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+  s->p_f_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues);
+
  /* Init the sleep mutex and cond.
*/ if (pthread_cond_init(&s->sleep_cond, NULL) != 0 || pthread_mutex_init(&s->sleep_mutex, NULL) != 0) @@ -3090,6 +3723,13 @@ void scheduler_free_tasks(struct scheduler *s) { } s->size = 0; s->nr_tasks = 0; + // reset GPU task counters too + s->nr_self_pack_tasks_d = 0; + s->nr_self_pack_tasks_f = 0; + s->nr_self_pack_tasks_g = 0; + s->nr_pair_pack_tasks_d = 0; + s->nr_pair_pack_tasks_f = 0; + s->nr_pair_pack_tasks_g = 0; } /** @@ -3207,6 +3847,19 @@ void scheduler_report_task_times_mapper(void *map_data, int num_elements, const float total_time = clocks_from_ticks(t->total_ticks); const enum task_categories cat = task_get_category(t); time_local[cat] += total_time; + + if (t->subtype == task_subtype_gpu_pack_d || + t->subtype == task_subtype_gpu_pack_f || + t->subtype == task_subtype_gpu_pack_g) { + time_local[task_category_gpu_pack] += + clocks_from_ticks(t->total_cpu_pack_ticks); + time_local[task_category_gpu] -= + clocks_from_ticks(t->total_cpu_pack_ticks); + time_local[task_category_gpu] -= + clocks_from_ticks(t->total_cpu_unpack_ticks); + time_local[task_category_gpu_unpack] += + clocks_from_ticks(t->total_cpu_unpack_ticks); + } } /* Update the global counters */ diff --git a/src/scheduler.h b/src/scheduler.h index 6ea7b41d58..b7f8b9f2ad 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -60,6 +60,35 @@ extern int activate_by_unskip; /* Data of a scheduler. */ struct scheduler { + + int nr_packs_self_dens_done; // A. Nasar + int nr_packs_pair_dens_done; + int nr_packs_self_forc_done; + int nr_packs_pair_forc_done; + int nr_packs_self_grad_done; + int nr_packs_pair_grad_done; + + volatile int *s_d_left; + volatile int *s_g_left; + volatile int *s_f_left; + volatile int *p_d_left; + volatile int *p_g_left; + volatile int *p_f_left; + /* Actual number of GPU tasks. */ + int nr_gpu_tasks; + /* Number of tasks we want*/ + int target_gpu_tasks; + /* Actual number of density pack tasks. */ + int nr_self_pack_tasks_d, nr_pair_pack_tasks_d; + /* Actual number of force pack tasks. */ + int nr_self_pack_tasks_f, nr_pair_pack_tasks_f; + /* Actual number of gradient pack tasks. */ + int nr_self_pack_tasks_g, nr_pair_pack_tasks_g; + + /*how many tasks we want to try and work on at once on the GPU*/ + int pack_size; + int pack_size_pair; + /* Scheduler flags. */ unsigned int flags; @@ -323,5 +352,8 @@ void scheduler_write_task_level(const struct scheduler *s, int step); void scheduler_dump_queues(struct engine *e); void scheduler_report_task_times(const struct scheduler *s, const int nr_threads); +struct task *enqueue_dependencies(struct scheduler *s, struct task *t); +struct task *signal_sleeping_runners(struct scheduler *s, struct task *t, + int tasks_packed); #endif /* SWIFT_SCHEDULER_H */ diff --git a/src/space.h b/src/space.h index 4e0e849d64..a5358c913c 100644 --- a/src/space.h +++ b/src/space.h @@ -48,7 +48,7 @@ struct hydro_props; #define space_cellallocchunk 1000 #define space_splitsize_default 400 #define space_maxsize_default 8000000 -#define space_grid_split_threshold_default 400 +#define space_grid_split_threshold_default 100 #define space_extra_parts_default 0 #define space_extra_gparts_default 0 #define space_extra_sparts_default 100 @@ -94,6 +94,9 @@ extern double engine_foreign_alloc_margin; */ struct space { + /*Used to define GPU task memory allocation*/ + float eta_neighbours; + /*! Spatial extent. 
*/ double dim[3]; diff --git a/src/space_getsid.h b/src/space_getsid.h index df81615d3c..f5e0101d30 100644 --- a/src/space_getsid.h +++ b/src/space_getsid.h @@ -46,7 +46,6 @@ __attribute__((always_inline, nonnull)) INLINE static int space_getsid_and_swap_cells(const struct space *s, struct cell **ci, struct cell **cj, double shift[3]) { - /* Get the relative distance between the pairs, wrapping. */ const int periodic = s->periodic; double dx[3]; @@ -79,4 +78,89 @@ space_getsid_and_swap_cells(const struct space *s, struct cell **ci, return sid; } +__attribute__((always_inline, nonnull)) +INLINE static int // A. Nasar Same as usual but only used to pack GPU cells +space_getsid_GPU(const struct space *s, struct cell **ci, struct cell **cj, + double *shift_x, double *shift_y, double *shift_z) { + /* Get the relative distance between the pairs, wrapping. */ + const int periodic = s->periodic; + double dx[3]; + for (int k = 0; k < 3; k++) dx[k] = (*cj)->loc[k] - (*ci)->loc[k]; + + if (periodic && dx[0] < -s->dim[0] / 2) + *(shift_x) = s->dim[0]; + else if (periodic && dx[0] > s->dim[0] / 2) + *(shift_x) = -s->dim[0]; + else + *(shift_x) = 0.0; + + dx[0] += *(shift_x); + + if (periodic && dx[1] < -s->dim[1] / 2) + *(shift_y) = s->dim[1]; + else if (periodic && dx[1] > s->dim[1] / 2) + *(shift_y) = -s->dim[1]; + else + *(shift_y) = 0.0; + + dx[1] += *(shift_y); + + if (periodic && dx[2] < -s->dim[2] / 2) + *(shift_z) = s->dim[2]; + else if (periodic && dx[2] > s->dim[2] / 2) + *(shift_z) = -s->dim[2]; + else + *(shift_z) = 0.0; + + dx[2] += *(shift_z); + + /* Get the sorting index. */ + int sid = 0; + for (int k = 0; k < 3; k++) + sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1)); + + /* Switch the cells around? */ + if (runner_flip[sid]) { + struct cell *temp = *ci; + *ci = *cj; + *cj = temp; + *(shift_x) = -*(shift_x); + *(shift_y) = -*(shift_y); + *(shift_z) = -*(shift_z); + } + sid = sortlistID[sid]; + + /* Return the sort ID. */ + return sid; +} + +__attribute__((always_inline, nonnull)) INLINE static int space_getsid_filter( + const struct space *s, struct cell **ci, struct cell **cj, + double shift[3]) { + + /* Get the relative distance between the pairs, wrapping. */ + const int periodic = s->periodic; + double dx[3]; + for (int k = 0; k < 3; k++) { + dx[k] = (*cj)->loc[k] - (*ci)->loc[k]; + if (periodic && dx[k] < -s->dim[k] / 2) + shift[k] = s->dim[k]; + else if (periodic && dx[k] > s->dim[k] / 2) + shift[k] = -s->dim[k]; + else + shift[k] = 0.0; + dx[k] += shift[k]; + } + + /* Get the sorting index. */ + int sid = 0; + for (int k = 0; k < 3; k++) + sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1)); + + sid = sortlistID[sid]; + + /* Return the sort ID. */ + return sid; +} + #endif /* SWIFT_SPACE_GETSID_H */ diff --git a/src/space_recycle.c b/src/space_recycle.c index cf84227302..0b915ac7a2 100644 --- a/src/space_recycle.c +++ b/src/space_recycle.c @@ -232,6 +232,12 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements, c->mpi.recv = NULL; c->mpi.send = NULL; #endif + c->hydro.density_pack = NULL; // A. Nasar + c->hydro.density_unpack = NULL; + c->hydro.gradient_pack = NULL; + c->hydro.gradient_unpack = NULL; + c->hydro.force_pack = NULL; + c->hydro.force_unpack = NULL; } } diff --git a/src/task.c b/src/task.c index 3b504a79e6..cbe9547e9d 100644 --- a/src/task.c +++ b/src/task.c @@ -164,15 +164,22 @@ const char *subtaskID_names[task_subtype_count] = { "sink_do_gas_swallow", "rt_gradient", "rt_transport", + "gpu_pack", // A. 
Nasar + "gpu_pack_g", + "gpu_pack_f", + "gpu_unpack", + "gpu_unpack_g", + "gpu_unpack_f", }; const char *task_category_names[task_category_count] = { - "drift", "sorts", "resort", - "hydro", "gravity", "feedback", - "black holes", "cooling", "star formation", - "limiter", "sync", "time integration", - "mpi", "pack", "fof", - "others", "neutrino", "sink", + "drift", "sorts", "resort", + "hydro", "gravity", "feedback", + "black holes", "cooling", "star formation", + "limiter", "sync", "time integration", + "mpi", "pack", "gpu", + "gpu_pack", "gpu_unpack", "fof", + "others", "neutrino", "sink", "RT", "CSDS"}; #ifdef WITH_MPI @@ -598,6 +605,22 @@ void task_unlock(struct task *t) { #ifdef SWIFT_TASKS_WITHOUT_ATOMICS cell_unlocktree(ci); #endif + } else if (subtype == task_subtype_gpu_unpack_d) { + // for(int pp = 0; pp < 128 /*should be sched->pack_size*/; + // pp++){ + // cell_unlocktree(t->ci_unpack[pp]); + // } + /*Do nothing and be on your way*/ + } else if (subtype == task_subtype_gpu_unpack_f) { + /*Do nothing and be on your way*/ + } else if (subtype == task_subtype_gpu_unpack_g) { + /*Do nothing and be on your way*/ + } else if (subtype == task_subtype_gpu_pack_d) { + cell_unlocktree(ci); + } else if (subtype == task_subtype_gpu_pack_f) { + cell_unlocktree(ci); + } else if (subtype == task_subtype_gpu_pack_g) { + cell_unlocktree(ci); } else { /* hydro */ cell_unlocktree(ci); } @@ -645,6 +668,21 @@ void task_unlock(struct task *t) { cell_unlocktree(ci); cell_unlocktree(cj); #endif + } else if (subtype == task_subtype_gpu_pack_d) { + cell_unlocktree(ci); + cell_unlocktree(cj); + } else if (subtype == task_subtype_gpu_pack_f) { + cell_unlocktree(ci); + cell_unlocktree(cj); + } else if (subtype == task_subtype_gpu_pack_g) { + cell_unlocktree(ci); + cell_unlocktree(cj); + } else if (subtype == task_subtype_gpu_unpack_d) { + /* Nothing to do */ + } else if (subtype == task_subtype_gpu_unpack_f) { + /* Nothing to do */ + } else if (subtype == task_subtype_gpu_unpack_g) { + /* Nothing to do */ } else { /* hydro */ cell_unlocktree(ci); cell_unlocktree(cj); @@ -848,6 +886,38 @@ int task_lock(struct task *t) { if (ci->hydro.hold) return 0; if (cell_locktree(ci) != 0) return 0; #endif + } else if (subtype == task_subtype_gpu_pack_d) { + /* Attempt to lock the cell */ + if (ci->hydro.hold) return 0; + if (cell_locktree(ci) != 0) return 0; + } else if (subtype == task_subtype_gpu_pack_f) { + /* Attempt to lock the cell */ + if (ci->hydro.hold) return 0; + if (cell_locktree(ci) != 0) return 0; + } else if (subtype == task_subtype_gpu_pack_g) { + /* Attempt to lock the cell */ + if (ci->hydro.hold) return 0; + if (cell_locktree(ci) != 0) return 0; + } else if (subtype == task_subtype_gpu_unpack_d) { + // for(int pp = 0; pp < 128 /*should be sched->pack_size*/; + // pp++){ + // if (t->ci_unpack[pp]->gpu_done == 0){ + // message("trying to queue an unpack before all packs + // done on GPU"); return 0; + // } + //// if (t->ci_unpack[pp]->hydro.hold) + //// return 0; + //// if (cell_locktree(t->ci_unpack[pp]) != 0) + //// return 0; + // } + /* Nothing to do here */ + return 1; + } else if (subtype == task_subtype_gpu_unpack_f) { + /* Nothing to do here */ + return 1; + } else if (subtype == task_subtype_gpu_unpack_g) { + /* Nothing to do here */ + return 1; } else { /* subtype == hydro */ if (ci->hydro.hold) return 0; if (cell_locktree(ci) != 0) return 0; @@ -964,6 +1034,39 @@ int task_lock(struct task *t) { return 0; } #endif + } else if (subtype == task_subtype_gpu_pack_d) { + /* Lock the parts in both 
cells */
+      if (ci->hydro.hold || cj->hydro.hold) return 0;
+      if (cell_locktree(ci) != 0) return 0;
+      if (cell_locktree(cj) != 0) {
+        cell_unlocktree(ci);
+        return 0;
+      }
+    } else if (subtype == task_subtype_gpu_pack_f) {
+      /* Lock the parts in both cells */
+      if (ci->hydro.hold || cj->hydro.hold) return 0;
+      if (cell_locktree(ci) != 0) return 0;
+      if (cell_locktree(cj) != 0) {
+        cell_unlocktree(ci);
+        return 0;
+      }
+    } else if (subtype == task_subtype_gpu_pack_g) {
+      /* Lock the parts in both cells */
+      if (ci->hydro.hold || cj->hydro.hold) return 0;
+      if (cell_locktree(ci) != 0) return 0;
+      if (cell_locktree(cj) != 0) {
+        cell_unlocktree(ci);
+        return 0;
+      }
+    } else if (subtype == task_subtype_gpu_unpack_d) {
+      /* Nothing to do here. */
+      return 1;
+    } else if (subtype == task_subtype_gpu_unpack_f) {
+      /* Nothing to do here. */
+      return 1;
+    } else if (subtype == task_subtype_gpu_unpack_g) {
+      /* Nothing to do here. */
+      return 1;
    } else { /* subtype == hydro */
      /* Lock the parts in both cells */
      if (ci->hydro.hold || cj->hydro.hold) return 0;
@@ -1127,6 +1230,19 @@ void task_get_group_name(int type, int subtype, char *cluster) {
  }

  switch (subtype) {
+    /* A. Nasar */
+    case task_subtype_gpu_pack_d:
+    case task_subtype_gpu_unpack_d:
+      strcpy(cluster, "Density");
+      break;
+    case task_subtype_gpu_pack_f:
+    case task_subtype_gpu_unpack_f:
+      strcpy(cluster, "Force");
+      break;
+    case task_subtype_gpu_pack_g:
+    case task_subtype_gpu_unpack_g:
+      strcpy(cluster, "Gradient");
+      break;
    case task_subtype_density:
      strcpy(cluster, "Density");
      break;
@@ -1629,8 +1745,16 @@ void task_dump_active(struct engine *e) {
      /* Get destination rank of MPI requests. */
      int paired = (t->cj != NULL);
-      int otherrank = t->ci->nodeID;
-      if (paired) otherrank = t->cj->nodeID;
+      int otherrank = 0;
+      // A. N.: Mods required to stop the code crashing when debugging GPU tasks
+      if (t->subtype != task_subtype_gpu_unpack_d &&
+          t->subtype != task_subtype_gpu_unpack_f &&
+          t->subtype != task_subtype_gpu_unpack_g)
+        otherrank = t->ci->nodeID;
+      if (paired && t->subtype != task_subtype_gpu_unpack_d &&
+          t->subtype != task_subtype_gpu_unpack_f &&
+          t->subtype != task_subtype_gpu_unpack_g)
+        otherrank = t->cj->nodeID;

      fprintf(file_thread, "%i %i %s %s %i %i %lli %lli %i %i %i %i %lli\n",
              engine_rank, otherrank, taskID_names[t->type],
@@ -1757,6 +1881,14 @@ enum task_categories task_get_category(const struct task *t) {
    case task_subtype_force:
      return task_category_hydro;

+    case task_subtype_gpu_pack_d:  // A. Nasar
+    case task_subtype_gpu_unpack_d:
+    case task_subtype_gpu_pack_f:
+    case task_subtype_gpu_unpack_f:
+    case task_subtype_gpu_pack_g:
+    case task_subtype_gpu_unpack_g:
+      return task_category_gpu;
+
    case task_subtype_limiter:
      return task_category_limiter;

diff --git a/src/task.h b/src/task.h
index b405a0795f..c6991751b5 100644
--- a/src/task.h
+++ b/src/task.h
@@ -160,6 +160,12 @@ enum task_subtypes {
  task_subtype_sink_do_gas_swallow,
  task_subtype_rt_gradient,
  task_subtype_rt_transport,
+  task_subtype_gpu_pack_d,  // A. Nasar
+  task_subtype_gpu_pack_g,
+  task_subtype_gpu_pack_f,
+  task_subtype_gpu_unpack_d,
+  task_subtype_gpu_unpack_g,
+  task_subtype_gpu_unpack_f,
  task_subtype_count
} __attribute__((packed));

@@ -196,6 +202,9 @@ enum task_categories {
  task_category_time_integration,
  task_category_mpi,
  task_category_pack,
+  task_category_gpu,
+  task_category_gpu_pack,
+  task_category_gpu_unpack,
  task_category_fof,
  task_category_others,
  task_category_neutrino,
@@ -235,6 +244,15 @@ struct task {

  /*!
Pointers to the cells this task acts upon */ struct cell *ci, *cj; + int done; // A. Nasar + + int gpu_done; + + int corner_pair; + + /*! Pointers to the cells this task acts upon */ + struct cell **ci_unpack; //, **cj; + /*! List of tasks unlocked by this one */ struct task **unlock_tasks; @@ -286,6 +304,9 @@ struct task { /*! Start and end time of this task */ ticks tic, toc; + ticks total_cpu_pack_ticks; + ticks total_cpu_unpack_ticks; + /* Total time spent running this task */ ticks total_ticks; diff --git a/swift.c b/swift.c index b63941cd63..7a9277ae5c 100644 --- a/swift.c +++ b/swift.c @@ -1108,7 +1108,7 @@ int main(int argc, char *argv[]) { hydro_props_init(&hydro_properties, &prog_const, &us, params); else bzero(&hydro_properties, sizeof(struct hydro_props)); - + float eta_neighbours = hydro_properties.eta_neighbours; /* Initialise the equation of state */ if (with_hydro) eos_init(&eos, &prog_const, &us, params); @@ -1388,7 +1388,7 @@ int main(int argc, char *argv[]) { with_self_gravity, with_star_formation, with_sinks, with_DM_particles, with_DM_background_particles, with_neutrinos, talking, dry_run, nr_nodes); - + s.eta_neighbours = eta_neighbours; /* Initialise the line of sight properties. */ if (with_line_of_sight) los_init(s.dim, &los_properties, params);
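/* Hypothetical helpers (not in the patch): the three-way comparisons against
 * the task_subtype_gpu_pack_{d,g,f} and task_subtype_gpu_unpack_{d,g,f}
 * subtypes added in task.h are repeated in scheduler_write_cell_dependencies(),
 * scheduler_set_unlocks(), scheduler_enqueue(), task_lock()/task_unlock() and
 * task_dump_active() above; a pair of predicates like these would collapse
 * each check to a single call. Assumes only the enum values introduced above. */
#include "task.h"

static inline int task_subtype_is_gpu_pack(enum task_subtypes subtype) {
  return (subtype == task_subtype_gpu_pack_d ||
          subtype == task_subtype_gpu_pack_g ||
          subtype == task_subtype_gpu_pack_f);
}

static inline int task_subtype_is_gpu_unpack(enum task_subtypes subtype) {
  return (subtype == task_subtype_gpu_unpack_d ||
          subtype == task_subtype_gpu_unpack_g ||
          subtype == task_subtype_gpu_unpack_f);
}

/* Usage example: the dependency-plot filter above would then read
 *   if (task_subtype_is_gpu_unpack(tb->subtype)) continue;                  */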
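/* Condensed sketch (not part of the patch) of the two-queue locking
 * discipline used in the stealing path of scheduler_gettask() above: try-lock
 * the runner's own queue, then the victim queue, and back out releasing
 * whatever is held if either attempt fails. The per-queue n_packs_*_left
 * bookkeeping done in the patch between queue_gettask() and the unlocks is
 * omitted; unlike the patch's success branch, every exit path here releases
 * both locks before returning. Uses only the queue/scheduler API already
 * referenced above. */
#include "queue.h"
#include "scheduler.h"
#include "task.h"

static struct task *steal_from(struct scheduler *s, int qid, int victim,
                               const struct task *prev) {
  struct queue *q = &s->queues[qid];        /* our own queue */
  struct queue *q_stl = &s->queues[victim]; /* the queue we steal from */

  /* Never block on either lock: give up and let the caller pick another
   * victim instead. */
  if (lock_trylock(&q->lock) != 0) return NULL;
  if (lock_trylock(&q_stl->lock) != 0) {
    lock_unlock(&q->lock);
    return NULL;
  }

  /* Both queues are locked: safe to pull a task (and, in the patch, to move
   * the pack counters from the robbed queue to the robber). */
  struct task *res = queue_gettask(q_stl, prev, 0);

  lock_unlock(&q_stl->lock);
  lock_unlock(&q->lock);
  return res;
}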
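/* Self-contained illustration (made-up numbers, not part of the patch) of the
 * pair-splitting test used in scheduler_splittask_hydro() above: the code
 * compares count_i * sid_scale[sid] against threshold / count_j instead of
 * forming count_i * count_j, because the raw product can overflow a 32-bit
 * int for well-populated cells. */
#include <stdio.h>

int main(void) {
  /* Hypothetical cell populations and an illustrative sid_scale-like factor. */
  const int count_i = 60000;
  const int count_j = 60000;
  const float sid_scale = 0.4025f;
  const int threshold = 256000000; /* a space_subsize_pair_hydro-style limit */

  /* count_i * count_j = 3.6e9 would overflow a 32-bit int, so the comparison
   * is rearranged to divide by count_j, as in the patch. */
  const int do_sub = (count_i * sid_scale < threshold / count_j);

  printf("collapse into a single sub-pair task? %s\n", do_sub ? "yes" : "no");
  return 0;
}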
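/* Minimal sketch (not part of the patch) of folding the long, commented-out
 * run of individual cudaFree() calls in the CUDA runner above into one loop
 * over a caller-supplied list of device allocations. Buffers obtained with
 * cudaMallocHost() would need cudaFreeHost() instead; which buffers go in the
 * list is whatever the runner actually allocated. */
#include <cuda_runtime.h>
#include <stddef.h>

static void free_device_buffers(void **bufs, size_t n) {
  for (size_t i = 0; i < n; i++) {
    if (bufs[i] != NULL) {
      cudaFree(bufs[i]);
      bufs[i] = NULL; /* guard against double frees on a second call */
    }
  }
}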