diff --git a/.gitignore b/.gitignore
index 0c4d8bddadb17afcc29aa3dc1d4dba6cb6fd2f5b..e3e17bb82a01d9af0ace6ed72d196cf2dba242f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,7 +34,7 @@ examples/*/*/*.txt
 examples/*/*/used_parameters.yml
 examples/*/gravity_checks_*.dat
 
-tests/testPair
+tests/testActivePair
 tests/brute_force_periodic_BC_standard.dat
 tests/swift_periodic_BC_standard.dat
 tests/brute_force_periodic_BC_pertrubed.dat
@@ -54,6 +54,11 @@ tests/brute_force_125_standard.dat
 tests/swift_dopair_125_standard.dat
 tests/brute_force_125_perturbed.dat
 tests/swift_dopair_125_perturbed.dat
+tests/brute_force_active.dat
+tests/brute_force_periodic_BC_perturbed.dat
+tests/swift_dopair_active.dat
+tests/test_nonsym_density_serial.dat
+tests/test_nonsym_density_vec.dat
 tests/testGreetings
 tests/testReading
 tests/input.hdf5
@@ -75,8 +80,6 @@ tests/test27cells.sh
 tests/test27cellsPerturbed.sh
 tests/test125cells.sh
 tests/test125cellsPerturbed.sh
-tests/testPair.sh
-tests/testPairPerturbed.sh
 tests/testParser.sh
 tests/testReading.sh
 tests/testAdiabaticIndex
diff --git a/configure.ac b/configure.ac
index 95519863f6e303cde51bbef7e2046faa314f46ee..74fede99f4fbf578af4e703cedaa42f2c278b037 100644
--- a/configure.ac
+++ b/configure.ac
@@ -861,14 +861,14 @@ AM_CONDITIONAL([HAVE_DOXYGEN], [test "$ac_cv_path_ac_pt_DX_DOXYGEN" != ""])
 # Handle .in files.
 AC_CONFIG_FILES([Makefile src/Makefile examples/Makefile doc/Makefile doc/Doxyfile tests/Makefile])
 AC_CONFIG_FILES([tests/testReading.sh], [chmod +x tests/testReading.sh])
-AC_CONFIG_FILES([tests/testPair.sh], [chmod +x tests/testPair.sh])
-AC_CONFIG_FILES([tests/testPairPerturbed.sh], [chmod +x tests/testPairPerturbed.sh])
+AC_CONFIG_FILES([tests/testActivePair.sh], [chmod +x tests/testActivePair.sh])
 AC_CONFIG_FILES([tests/test27cells.sh], [chmod +x tests/test27cells.sh])
 AC_CONFIG_FILES([tests/test27cellsPerturbed.sh], [chmod +x tests/test27cellsPerturbed.sh])
 AC_CONFIG_FILES([tests/test125cells.sh], [chmod +x tests/test125cells.sh])
 AC_CONFIG_FILES([tests/test125cellsPerturbed.sh], [chmod +x tests/test125cellsPerturbed.sh])
 AC_CONFIG_FILES([tests/testPeriodicBC.sh], [chmod +x tests/testPeriodicBC.sh])
 AC_CONFIG_FILES([tests/testPeriodicBCPerturbed.sh], [chmod +x tests/testPeriodicBCPerturbed.sh])
+AC_CONFIG_FILES([tests/testInteractions.sh], [chmod +x tests/testInteractions.sh])
 AC_CONFIG_FILES([tests/testParser.sh], [chmod +x tests/testParser.sh])
 
 # Save the compilation options
diff --git a/examples/EAGLE_12/eagle_12.yml b/examples/EAGLE_12/eagle_12.yml
index fd9569c50f4ad0d87eccd54e34636180c0337596..f56c330590ac25cc5b3fe8f68ed68aa1e94d6490 100644
--- a/examples/EAGLE_12/eagle_12.yml
+++ b/examples/EAGLE_12/eagle_12.yml
@@ -26,7 +26,7 @@ Statistics:
 # Parameters for the self-gravity scheme
 Gravity:
   eta:                   0.025    # Constant dimensionless multiplier for time integration.
-  epsilon:               0.0001   # Softening length (in internal units).
+  epsilon:               0.001    # Softening length (in internal units).
   theta:                 0.7      # Opening angle (Multipole acceptance criterion)
   
 # Parameters for the hydrodynamics scheme
diff --git a/examples/EAGLE_6/eagle_6.yml b/examples/EAGLE_6/eagle_6.yml
index a8ba818a8bb278e783454e7c3cd2890f4e1faf89..f55ecc856953d4cb60a86e3461625318a1757693 100644
--- a/examples/EAGLE_6/eagle_6.yml
+++ b/examples/EAGLE_6/eagle_6.yml
@@ -12,6 +12,9 @@ TimeIntegration:
   time_end:   1e-2  # The end time of the simulation (in internal units).
   dt_min:     1e-10 # The minimal time-step size of the simulation (in internal units).
   dt_max:     1e-4  # The maximal time-step size of the simulation (in internal units).
+
+Scheduler:
+  cell_split_size: 64
   
 # Parameters governing the snapshots
 Snapshots:
diff --git a/examples/PerturbedBox_2D/perturbedPlane.yml b/examples/PerturbedBox_2D/perturbedPlane.yml
index b92e29f620edc6f72399111fbe73ba6bd1485e92..a0c6b6d9dbc7a677002dbce5abc6e5d268b56e97 100644
--- a/examples/PerturbedBox_2D/perturbedPlane.yml
+++ b/examples/PerturbedBox_2D/perturbedPlane.yml
@@ -9,7 +9,7 @@ InternalUnitSystem:
 # Parameters governing the time integration
 TimeIntegration:
   time_begin: 0.    # The starting time of the simulation (in internal units).
-  time_end:   10.   # The end time of the simulation (in internal units).
+  time_end:   1000. # The end time of the simulation (in internal units).
   dt_min:     1e-6  # The minimal time-step size of the simulation (in internal units).
   dt_max:     1e-2  # The maximal time-step size of the simulation (in internal units).
 
@@ -21,12 +21,11 @@ Snapshots:
 
 # Parameters governing the conserved quantities statistics
 Statistics:
-  delta_time:          1e-3 # Time between statistics output
+  delta_time:          1.   # Time between statistics output
 
 # Parameters for the hydrodynamics scheme
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
-  delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
   
 # Parameters related to the initial conditions
diff --git a/examples/PerturbedBox_3D/perturbedBox.yml b/examples/PerturbedBox_3D/perturbedBox.yml
index 71c8dece4df5505eb44511ee92291feedd7ffab1..3148510979d0e349c0d6242bf11e1a0db94f9e1f 100644
--- a/examples/PerturbedBox_3D/perturbedBox.yml
+++ b/examples/PerturbedBox_3D/perturbedBox.yml
@@ -9,9 +9,9 @@ InternalUnitSystem:
 # Parameters governing the time integration
 TimeIntegration:
   time_begin: 0.    # The starting time of the simulation (in internal units).
-  time_end:   1.    # The end time of the simulation (in internal units).
+  time_end:   1000  # The end time of the simulation (in internal units).
   dt_min:     1e-6  # The minimal time-step size of the simulation (in internal units).
-  dt_max:     1e-3  # The maximal time-step size of the simulation (in internal units).
+  dt_max:     1e-2  # The maximal time-step size of the simulation (in internal units).
 
 # Parameters governing the snapshots
 Snapshots:
@@ -21,12 +21,11 @@ Snapshots:
 
 # Parameters governing the conserved quantities statistics
 Statistics:
-  delta_time:          1e-3 # Time between statistics output
+  delta_time:          1. # Time between statistics output
 
 # Parameters for the hydrodynamics scheme
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
-  delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
   
 # Parameters related to the initial conditions
diff --git a/examples/main.c b/examples/main.c
index c8a1780c5ba5432ff9e713853c578666d1cc3a6f..ee1253062409ec2e787e064a5fb50da2c830d35d 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -190,7 +190,11 @@ int main(int argc, char *argv[]) {
   while ((c = getopt(argc, argv, "acCdDef:FgGhMn:P:sSt:Tv:y:Y:")) != -1)
     switch (c) {
       case 'a':
+#if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA)
         with_aff = 1;
+#else
+        error("Need NUMA support for thread affinity");
+#endif
         break;
       case 'c':
         with_cosmology = 1;
@@ -392,8 +396,12 @@ int main(int argc, char *argv[]) {
     parser_read_file(paramFileName, params);
 
     /* Handle any command-line overrides. */
-    if (nparams > 0)
+    if (nparams > 0) {
+      message(
+          "Overwriting values read from the YAML file with command-line "
+          "values.");
       for (int k = 0; k < nparams; k++) parser_set_param(params, cmdparams[k]);
+    }
 
     /* And dump the parameters as used. */
     // parser_print_params(&params);
@@ -565,6 +573,11 @@ int main(int argc, char *argv[]) {
     message("nr of cells at depth %i is %i.", data[0], data[1]);
   }
 
+/* Initialise the table of Ewald corrections for the gravity checks */
+#ifdef SWIFT_GRAVITY_FORCE_CHECKS
+  if (periodic) gravity_exact_force_ewald_init(dim[0]);
+#endif
+
   /* Initialise the external potential properties */
   struct external_potential potential;
   if (with_external_gravity)
diff --git a/src/Makefile.am b/src/Makefile.am
index 3784886278544a1c6b606689bfbae0d6fa52cf19..ec01184928faf3d58b2d0890965a745d05718354 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -64,7 +64,7 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h
 		 kernel_long_gravity.h vector.h cache.h runner_doiact.h runner_doiact_vec.h runner_doiact_grav.h runner_doiact_fft.h \
                  runner_doiact_nosort.h units.h intrinsics.h minmax.h kick.h timestep.h drift.h adiabatic_index.h io_properties.h \
 		 dimension.h equation_of_state.h part_type.h periodic.h \
-		 gravity.h gravity_io.h \
+		 gravity.h gravity_io.h gravity_cache.h \
 		 gravity/Default/gravity.h gravity/Default/gravity_iact.h gravity/Default/gravity_io.h \
 		 gravity/Default/gravity_debug.h gravity/Default/gravity_part.h  \
 		 sourceterms.h \
diff --git a/src/align.h b/src/align.h
index 915af33e6e2ba59be1a0849c4de0e2f1bd5b0d96..54435c4c9baa1ce9dc511e2903b7e2be2d6655de 100644
--- a/src/align.h
+++ b/src/align.h
@@ -23,9 +23,71 @@
  * @brief The default struct alignment in SWIFT.
  */
 #define SWIFT_STRUCT_ALIGNMENT 32
+
 /**
  * @brief Defines alignment of structures
  */
 #define SWIFT_STRUCT_ALIGN __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT)))
 
+/**
+ * @brief The default cache alignment in SWIFT.
+ */
+#define SWIFT_CACHE_ALIGNMENT 64
+
+/**
+ * @brief Defines alignment of caches
+ */
+#define SWIFT_CACHE_ALIGN __attribute__((aligned(SWIFT_CACHE_ALIGNMENT)))
+
+/**
+ * @brief Macro to tell the compiler that a given array has the specified
+ * alignment.
+ *
+ * Note that this turns into a no-op but gives information to the compiler.
+ *
+ * @param array The array.
+ * @param alignment The alignment in bytes of the array.
+ */
+#if defined(__ICC)
+#define swift_align_information(array, alignment) \
+  __assume_aligned(array, alignment);
+#elif defined(__GNUC__)
+#define swift_align_information(array, alignment) \
+  array = __builtin_assume_aligned(array, alignment);
+#else
+#define swift_align_information(array, alignment) ;
+#endif
+
+/**
+ * @brief Macro to create a restrict pointer to an array and tell the compiler
+ * that the given array has the specified
+ * alignment.
+ *
+ * Note that this turns into a no-op but gives information to the compiler.
+ *
+ * @param array The array.
+ * @param ptr Pointer to array
+ * @param type Type of array
+ * @param alignment The alignment in bytes of the array.
+ */
+#define swift_declare_aligned_ptr(type, array, ptr, alignment) \
+  type *restrict array = ptr;                                  \
+  swift_align_information(array, alignment);
+
+/**
+ * @brief Macro to tell the compiler that a given number is 0 modulo a given
+ * size.
+ *
+ * Note that this turns into a no-op but gives information to the compiler.
+ * GCC does not have the equivalent built-in so defaults to nothing.
+ *
+ * @param var The variable
+ * @param size The modulo of interest.
+ */
+#if defined(__ICC)
+#define swift_assume_size(var, size) __assume(var % size == 0);
+#else
+#define swift_assume_size(var, size) ;
+#endif
+
 #endif /* SWIFT_ALIGN_H */
diff --git a/src/cache.h b/src/cache.h
index 1b6d89f03fafa1595cb9bdf857c9f2bc1cba316e..170eb8f5cb0939e85647bbb4622d66cc78ef3dbe 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -23,6 +23,7 @@
 #include "../config.h"
 
 /* Local headers */
+#include "align.h"
 #include "cell.h"
 #include "error.h"
 #include "part.h"
@@ -30,9 +31,7 @@
 #include "vector.h"
 
 #define NUM_VEC_PROC 2
-#define CACHE_ALIGN 64
 #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)
-#define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE
 
 #ifdef WITH_VECTORIZATION
 /* Cache struct to hold a local copy of a cells' particle
@@ -40,31 +39,31 @@
 struct cache {
 
   /* Particle x position. */
-  float *restrict x __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict x SWIFT_CACHE_ALIGN;
 
   /* Particle y position. */
-  float *restrict y __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict y SWIFT_CACHE_ALIGN;
 
   /* Particle z position. */
-  float *restrict z __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict z SWIFT_CACHE_ALIGN;
 
   /* Particle smoothing length. */
-  float *restrict h __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict h SWIFT_CACHE_ALIGN;
 
   /* Particle mass. */
-  float *restrict m __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict m SWIFT_CACHE_ALIGN;
 
   /* Particle x velocity. */
-  float *restrict vx __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vx SWIFT_CACHE_ALIGN;
 
   /* Particle y velocity. */
-  float *restrict vy __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vy SWIFT_CACHE_ALIGN;
 
   /* Particle z velocity. */
-  float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vz SWIFT_CACHE_ALIGN;
 
-  /* Maximum distance of particles into neighbouring cell. */
-  float *restrict max_d __attribute__((aligned(CACHE_ALIGN)));
+  /* Maximum index into neighbouring cell for particles that are in range. */
+  int *restrict max_index SWIFT_CACHE_ALIGN;
 
   /* Cache size. */
   int count;
@@ -75,28 +74,28 @@ struct cache {
 struct c2_cache {
 
   /* Separation between two particles squared. */
-  float r2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* x separation between two particles. */
-  float dxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* y separation between two particles. */
-  float dyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* z separation between two particles. */
-  float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* Mass of particle pj. */
-  float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* x velocity of particle pj. */
-  float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* y velocity of particle pj. */
-  float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* z velocity of particle pj. */
-  float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 };
 
 /**
@@ -111,9 +110,10 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
   /* Align cache on correct byte boundary and pad cache size to be a multiple of
    * the vector size
    * and include 2 vector lengths for remainder operations. */
-  unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
+  size_t pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
   if (rem > 0) pad += VEC_SIZE - rem;
-  unsigned int sizeBytes = (count + pad) * sizeof(float);
+  size_t sizeBytes = (count + pad) * sizeof(float);
+  size_t sizeIntBytes = (count + pad) * sizeof(int);
   int error = 0;
 
   /* Free memory if cache has already been allocated. */
@@ -126,18 +126,19 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
     free(c->vy);
     free(c->vz);
     free(c->h);
-    free(c->max_d);
+    free(c->max_index);
   }
 
-  error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes);
+  error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT,
+                          sizeIntBytes);
 
   if (error != 0)
     error("Couldn't allocate cache, no. of particles: %d", (int)count);
@@ -151,156 +152,43 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
  * @param ci_cache The cache.
  */
 __attribute__((always_inline)) INLINE void cache_read_particles(
-    const struct cell *const ci, struct cache *const ci_cache) {
+    const struct cell *restrict const ci,
+    struct cache *restrict const ci_cache) {
 
 #if defined(GADGET2_SPH)
 
-/* Shift the particles positions to a local frame so single precision can be
- * used instead of double precision. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
-  for (int i = 0; i < ci->count; i++) {
-    ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
-    ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1];
-    ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2];
-    ci_cache->h[i] = ci->parts[i].h;
-
-    ci_cache->m[i] = ci->parts[i].mass;
-    ci_cache->vx[i] = ci->parts[i].v[0];
-    ci_cache->vy[i] = ci->parts[i].v[1];
-    ci_cache->vz[i] = ci->parts[i].v[2];
-  }
-
-#endif
-}
-
-/**
- * @brief Populate cache by reading in the particles from two cells in unsorted
- * order.
- *
- * @param ci The i #cell.
- * @param cj The j #cell.
- * @param ci_cache The cache for cell ci.
- * @param cj_cache The cache for cell cj.
- * @param shift The amount to shift the particle positions to account for BCs
- */
-__attribute__((always_inline)) INLINE void cache_read_two_cells(
-    const struct cell *const ci, const struct cell *const cj,
-    struct cache *const ci_cache, struct cache *const cj_cache,
-    const double *const shift) {
-
-  /* Shift the particles positions to a local frame (ci frame) so single
-   * precision can be
-   * used instead of double precision. Also shift the cell ci, particles
-   * positions due to BCs but leave cell cj. */
-  for (int i = 0; i < ci->count; i++) {
-    ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[i] = ci->parts[i].h;
-
-    ci_cache->m[i] = ci->parts[i].mass;
-    ci_cache->vx[i] = ci->parts[i].v[0];
-    ci_cache->vy[i] = ci->parts[i].v[1];
-    ci_cache->vz[i] = ci->parts[i].v[2];
-  }
-
-  for (int i = 0; i < cj->count; i++) {
-    cj_cache->x[i] = cj->parts[i].x[0] - ci->loc[0];
-    cj_cache->y[i] = cj->parts[i].x[1] - ci->loc[1];
-    cj_cache->z[i] = cj->parts[i].x[2] - ci->loc[2];
-    cj_cache->h[i] = cj->parts[i].h;
-
-    cj_cache->m[i] = cj->parts[i].mass;
-    cj_cache->vx[i] = cj->parts[i].v[0];
-    cj_cache->vy[i] = cj->parts[i].v[1];
-    cj_cache->vz[i] = cj->parts[i].v[2];
-  }
-}
-
-__attribute__((always_inline)) INLINE void cache_read_cell_sorted(
-    const struct cell *const ci, struct cache *const ci_cache,
-    const struct entry *restrict sort_i, double *const loc,
-    double *const shift) {
-
-  int idx;
-/* Shift the particles positions to a local frame (ci frame) so single precision
- * can be
- * used instead of double precision. Also shift the cell ci, particles positions
- * due to BCs but leave cell cj. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma simd
-#endif
-  for (int i = 0; i < ci->count; i++) {
-    idx = sort_i[i].i;
-
-    ci_cache->x[i] = ci->parts[idx].x[0] - loc[0] - shift[0];
-    ci_cache->y[i] = ci->parts[idx].x[1] - loc[1] - shift[1];
-    ci_cache->z[i] = ci->parts[idx].x[2] - loc[2] - shift[2];
-    ci_cache->h[i] = ci->parts[idx].h;
-
-    ci_cache->m[i] = ci->parts[idx].mass;
-    ci_cache->vx[i] = ci->parts[idx].v[0];
-    ci_cache->vy[i] = ci->parts[idx].v[1];
-    ci_cache->vz[i] = ci->parts[idx].v[2];
-  }
-}
-
-/**
- * @brief Populate cache by reading in the particles from two cells in sorted
- * order.
- *
- * @param ci The i #cell.
- * @param cj The j #cell.
- * @param ci_cache The #cache for cell ci.
- * @param cj_cache The #cache for cell cj.
- * @param sort_i The array of sorted particle indices for cell ci.
- * @param sort_j The array of sorted particle indices for cell ci.
- * @param shift The amount to shift the particle positions to account for BCs
- */
-__attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
-    const struct cell *const ci, const struct cell *const cj,
-    struct cache *const ci_cache, struct cache *const cj_cache,
-    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
-    const double *const shift) {
-
-  int idx;
-/* Shift the particles positions to a local frame (ci frame) so single precision
- * can be
- * used instead of double precision. Also shift the cell ci, particles positions
- * due to BCs but leave cell cj. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma simd
-#endif
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
+
+  const struct part *restrict parts = ci->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
+  /* Shift the particles positions to a local frame so single precision can be
+   * used instead of double precision. */
   for (int i = 0; i < ci->count; i++) {
-    idx = sort_i[i].i;
-    ci_cache->x[i] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[i] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[i] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[i] = ci->parts[idx].h;
-
-    ci_cache->m[i] = ci->parts[idx].mass;
-    ci_cache->vx[i] = ci->parts[idx].v[0];
-    ci_cache->vy[i] = ci->parts[idx].v[1];
-    ci_cache->vz[i] = ci->parts[idx].v[2];
+    x[i] = (float)(parts[i].x[0] - loc[0]);
+    y[i] = (float)(parts[i].x[1] - loc[1]);
+    z[i] = (float)(parts[i].x[2] - loc[2]);
+    h[i] = parts[i].h;
+
+    m[i] = parts[i].mass;
+    vx[i] = parts[i].v[0];
+    vy[i] = parts[i].v[1];
+    vz[i] = parts[i].v[2];
   }
 
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma simd
 #endif
-  for (int i = 0; i < cj->count; i++) {
-    idx = sort_j[i].i;
-    cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
-    cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
-    cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
-    cj_cache->h[i] = cj->parts[idx].h;
-
-    cj_cache->m[i] = cj->parts[idx].mass;
-    cj_cache->vx[i] = cj->parts[idx].v[0];
-    cj_cache->vy[i] = cj->parts[idx].v[1];
-    cj_cache->vz[i] = cj->parts[idx].v[2];
-  }
 }
 
 /**
@@ -321,13 +209,13 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
  * interaction.
  */
 __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
-    const struct cell *const ci, const struct cell *const cj,
-    struct cache *const ci_cache, struct cache *const cj_cache,
-    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
-    const double *const shift, int *first_pi, int *last_pj,
-    const int num_vec_proc) {
+    const struct cell *restrict const ci, const struct cell *restrict const cj,
+    struct cache *restrict const ci_cache,
+    struct cache *restrict const cj_cache, const struct entry *restrict sort_i,
+    const struct entry *restrict sort_j, const double *restrict const shift,
+    int *first_pi, int *last_pj, const int num_vec_proc) {
 
-  int idx, ci_cache_idx;
+  int idx;
   /* Pad number of particles read to the vector size. */
   int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
   if (rem != 0) {
@@ -345,74 +233,97 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
 
   int first_pi_align = *first_pi;
   int last_pj_align = *last_pj;
-
-/* Shift the particles positions to a local frame (ci frame) so single precision
- * can be
- * used instead of double precision. Also shift the cell ci, particles positions
- * due to BCs but leave cell cj. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
-  for (int i = first_pi_align; i < ci->count; i++) {
-    /* Make sure ci_cache is filled from the first element. */
-    ci_cache_idx = i - first_pi_align;
-    idx = sort_i[i].i;
-    ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[ci_cache_idx] = ci->parts[idx].h;
-
-    ci_cache->m[ci_cache_idx] = ci->parts[idx].mass;
-    ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0];
-    ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
-    ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
+  const struct part *restrict parts_i = ci->parts;
+  const struct part *restrict parts_j = cj->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
+
+  int ci_cache_count = ci->count - first_pi_align;
+  /* Shift the particles positions to a local frame (ci frame) so single
+   * precision
+   * can be
+   * used instead of double precision. Also shift the cell ci, particles
+   * positions
+   * due to BCs but leave cell cj. */
+  for (int i = 0; i < ci_cache_count; i++) {
+    idx = sort_i[i + first_pi_align].i;
+    x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]);
+    y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]);
+    z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]);
+    h[i] = parts_i[idx].h;
+
+    m[i] = parts_i[idx].mass;
+    vx[i] = parts_i[idx].v[0];
+    vy[i] = parts_i[idx].v[1];
+    vz[i] = parts_i[idx].v[2];
   }
 
   /* Pad cache with fake particles that exist outside the cell so will not
    * interact.*/
-  float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0];
+  float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0];
   for (int i = ci->count - first_pi_align;
        i < ci->count - first_pi_align + VEC_SIZE; i++) {
-    ci_cache->x[i] = fake_pix;
-    ci_cache->y[i] = 1.f;
-    ci_cache->z[i] = 1.f;
-    ci_cache->h[i] = 1.f;
-
-    ci_cache->m[i] = 1.f;
-    ci_cache->vx[i] = 1.f;
-    ci_cache->vy[i] = 1.f;
-    ci_cache->vz[i] = 1.f;
+    x[i] = fake_pix;
+    y[i] = 1.f;
+    z[i] = 1.f;
+    h[i] = 1.f;
+
+    m[i] = 1.f;
+    vx[i] = 1.f;
+    vy[i] = 1.f;
+    vz[i] = 1.f;
   }
 
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
+
   for (int i = 0; i <= last_pj_align; i++) {
     idx = sort_j[i].i;
-    cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
-    cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
-    cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
-    cj_cache->h[i] = cj->parts[idx].h;
-
-    cj_cache->m[i] = cj->parts[idx].mass;
-    cj_cache->vx[i] = cj->parts[idx].v[0];
-    cj_cache->vy[i] = cj->parts[idx].v[1];
-    cj_cache->vz[i] = cj->parts[idx].v[2];
+    xj[i] = (float)(parts_j[idx].x[0] - loc[0]);
+    yj[i] = (float)(parts_j[idx].x[1] - loc[1]);
+    zj[i] = (float)(parts_j[idx].x[2] - loc[2]);
+    hj[i] = parts_j[idx].h;
+
+    mj[i] = parts_j[idx].mass;
+    vxj[i] = parts_j[idx].v[0];
+    vyj[i] = parts_j[idx].v[1];
+    vzj[i] = parts_j[idx].v[2];
   }
 
   /* Pad cache with fake particles that exist outside the cell so will not
    * interact.*/
   float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
   for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
-    cj_cache->x[i] = fake_pjx;
-    cj_cache->y[i] = 1.f;
-    cj_cache->z[i] = 1.f;
-    cj_cache->h[i] = 1.f;
-
-    cj_cache->m[i] = 1.f;
-    cj_cache->vx[i] = 1.f;
-    cj_cache->vy[i] = 1.f;
-    cj_cache->vz[i] = 1.f;
+    xj[i] = fake_pjx;
+    yj[i] = 1.f;
+    zj[i] = 1.f;
+    hj[i] = 1.f;
+
+    mj[i] = 1.f;
+    vxj[i] = 1.f;
+    vyj[i] = 1.f;
+    vzj[i] = 1.f;
   }
 }
 
@@ -430,7 +341,7 @@ static INLINE void cache_clean(struct cache *c) {
     free(c->vy);
     free(c->vz);
     free(c->h);
-    free(c->max_d);
+    free(c->max_index);
   }
 }
 
diff --git a/src/debug.c b/src/debug.c
index a66aa93b85d586ba62c9ef78d7d7d38b94112836..dca66ee45bed4530bcc6e4765ecd5f96848f8caa 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -26,6 +26,7 @@
 /* Some standard headers. */
 #include <float.h>
 #include <stdio.h>
+#include <unistd.h>
 
 /* This object's header. */
 #include "debug.h"
@@ -524,3 +525,69 @@ void dumpCellRanks(const char *prefix, struct cell *cells_top, int nr_cells) {
 }
 
 #endif /* HAVE_MPI */
+
+/**
+ * @brief parse the process /proc/self/statm file to get the process
+ *        memory use (in KB). Top field in ().
+ *
+ * @param size     total virtual memory (VIRT)
+ * @param resident resident non-swapped memory (RES)
+ * @param share    shared (mmap'd) memory  (SHR)
+ * @param trs      text (exe) resident set (CODE)
+ * @param lrs      library resident set
+ * @param drs      data+stack resident set (DATA)
+ * @param dt       dirty pages (nDRT)
+ */
+void getProcMemUse(long *size, long *resident, long *share, long *trs,
+                   long *lrs, long *drs, long *dt) {
+
+  /* Open the file. */
+  FILE *file = fopen("/proc/self/statm", "r");
+  if (file != NULL) {
+    int nscan = fscanf(file, "%ld %ld %ld %ld %ld %ld %ld", size, resident,
+                       share, trs, lrs, drs, dt);
+
+    if (nscan == 7) {
+      /* Convert pages into bytes. Usually 4096, but could be 512 on some
+       * systems so take care in conversion to KB. */
+      long sz = sysconf(_SC_PAGESIZE);
+      *size *= sz;
+      *resident *= sz;
+      *share *= sz;
+      *trs *= sz;
+      *lrs *= sz;
+      *drs *= sz;
+      *dt *= sz;
+
+      *size /= 1024;
+      *resident /= 1024;
+      *share /= 1024;
+      *trs /= 1024;
+      *lrs /= 1024;
+      *drs /= 1024;
+      *dt /= 1024;
+    } else {
+      error("Failed to read sufficient fields from /proc/self/statm");
+    }
+    fclose(file);
+  } else {
+    error("Failed to open /proc/self/statm");
+  }
+}
+
+/**
+ * @brief Print the current memory use of the process. A la "top".
+ */
+void printProcMemUse() {
+  long size;
+  long resident;
+  long share;
+  long trs;
+  long lrs;
+  long drs;
+  long dt;
+  getProcMemUse(&size, &resident, &share, &trs, &lrs, &drs, &dt);
+  printf("## VIRT = %ld , RES = %ld , SHR = %ld , CODE = %ld, DATA = %ld\n",
+         size, resident, share, trs, drs);
+  fflush(stdout);
+}
diff --git a/src/debug.h b/src/debug.h
index e07c73c7a68b0ef244b4a30cf3aecebc6ff38f3d..7bea11de6ac3fe82ae9b3f1be84249f75742ecaf 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -45,4 +45,7 @@ void dumpMETISGraph(const char *prefix, idx_t nvtxs, idx_t ncon, idx_t *xadj,
 void dumpCellRanks(const char *prefix, struct cell *cells_top, int nr_cells);
 #endif
 
+void getProcMemUse(long *size, long *resident, long *share, long *trs,
+                   long *lrs, long *drs, long *dt);
+void printProcMemUse();
 #endif /* SWIFT_DEBUG_H */
diff --git a/src/dimension.h b/src/dimension.h
index 4ee9377b4f9608b2549f3dfee2a683b61a71c76a..0b2093d718a61c6ce850db1970412af3e2e462b9 100644
--- a/src/dimension.h
+++ b/src/dimension.h
@@ -275,11 +275,11 @@ __attribute__((always_inline)) INLINE static vector pow_dimension_vec(
 
 #if defined(HYDRO_DIMENSION_3D)
 
-  return (vector)(x.v * x.v * x.v);
+  return (vector)(vec_mul(vec_mul(x.v, x.v), x.v));
 
 #elif defined(HYDRO_DIMENSION_2D)
 
-  return (vector)(x.v * x.v);
+  return (vector)(vec_mul(x.v, x.v));
 
 #elif defined(HYDRO_DIMENSION_1D)
 
@@ -304,16 +304,16 @@ __attribute__((always_inline)) INLINE static vector pow_dimension_plus_one_vec(
 
 #if defined(HYDRO_DIMENSION_3D)
 
-  const vector x2 = (vector)(x.v * x.v);
-  return (vector)(x2.v * x2.v);
+  const vector x2 = (vector)(vec_mul(x.v, x.v));
+  return (vector)(vec_mul(x2.v, x2.v));
 
 #elif defined(HYDRO_DIMENSION_2D)
 
-  return (vector)(x.v * x.v * x.v);
+  return (vector)(vec_mul(x.v, vec_mul(x.v, x.v)));
 
 #elif defined(HYDRO_DIMENSION_1D)
 
-  return (vector)(x.v * x.v);
+  return (vector)(vec_mul(x.v, x.v));
 
 #else
 
diff --git a/src/engine.c b/src/engine.c
index a45a2fb4375ef63138dd228c18f90bcacac37494..ad9f5bf57d3f1161e07bf83dbc082029e60818c4 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -57,6 +57,7 @@
 #include "error.h"
 #include "gravity.h"
 #include "hydro.h"
+#include "map.h"
 #include "minmax.h"
 #include "parallel_io.h"
 #include "part.h"
@@ -153,7 +154,7 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
 
   struct scheduler *s = &e->sched;
   const int periodic = e->s->periodic;
-  const int is_hydro = (e->policy & engine_policy_hydro);
+  const int is_with_hydro = (e->policy & engine_policy_hydro);
   const int is_self_gravity = (e->policy & engine_policy_self_gravity);
   const int is_with_cooling = (e->policy & engine_policy_cooling);
   const int is_with_sourceterms = (e->policy & engine_policy_sourceterms);
@@ -162,15 +163,19 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
   if (c->super == c) {
 
     /* Add the sort task. */
-    c->sorts =
-        scheduler_addtask(s, task_type_sort, task_subtype_none, 0, 0, c, NULL);
+    if (is_with_hydro) {
+      c->sorts = scheduler_addtask(s, task_type_sort, task_subtype_none, 0, 0,
+                                   c, NULL);
+    }
 
     /* Local tasks only... */
     if (c->nodeID == e->nodeID) {
 
       /* Add the drift task. */
-      c->drift_part = scheduler_addtask(s, task_type_drift_part,
-                                        task_subtype_none, 0, 0, c, NULL);
+      if (is_with_hydro) {
+        c->drift_part = scheduler_addtask(s, task_type_drift_part,
+                                          task_subtype_none, 0, 0, c, NULL);
+      }
 
       /* Add the two half kicks */
       c->kick1 = scheduler_addtask(s, task_type_kick1, task_subtype_none, 0, 0,
@@ -207,7 +212,7 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
       }
 
       /* Generate the ghost tasks. */
-      if (is_hydro) {
+      if (is_with_hydro) {
         c->ghost_in =
             scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
                               /* implicit = */ 1, c, NULL);
@@ -215,14 +220,13 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
             scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
                               /* implicit = */ 1, c, NULL);
         engine_add_ghosts(e, c, c->ghost_in, c->ghost_out);
-      }
 
 #ifdef EXTRA_HYDRO_LOOP
-      /* Generate the extra ghost task. */
-      if (is_hydro)
+        /* Generate the extra ghost task. */
         c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost,
                                            task_subtype_none, 0, 0, c, NULL);
 #endif
+      }
 
       /* Cooling task */
       if (is_with_cooling) {
@@ -898,6 +902,16 @@ void engine_repartition(struct engine *e) {
   partition_repartition(e->reparttype, e->nodeID, e->nr_nodes, e->s,
                         e->sched.tasks, e->sched.nr_tasks);
 
+  /* Partitioning requires copies of the particles, so we need to reduce the
+   * memory in use to the minimum, we can free the sorting indices and the
+   * tasks as these will be regenerated at the next rebuild. */
+
+  /* Sorting indices. */
+  if (e->s->cells_top != NULL) space_free_cells(e->s);
+
+  /* Task arrays. */
+  scheduler_free_tasks(&e->sched);
+
   /* Now comes the tricky part: Exchange particles between all nodes.
      This is done in two steps, first allreducing a matrix of
      how many particles go from where to where, then re-allocating
@@ -1731,7 +1745,7 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
     /* If the cells is local build a self-interaction */
     scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, NULL);
 
-    /* Deal with periodicity dependencies */
+    /* Deal with periodicity FFT task dependencies */
     const int ghost_id = cell_getid(cdim_ghost, i / 4, j / 4, k / 4);
     if (ghost_id > n_ghosts) error("Invalid ghost_id");
     if (periodic) {
@@ -1739,6 +1753,10 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
       ci->grav_ghost[1] = ghosts[2 * ghost_id + 1];
     }
 
+    /* Recover the multipole information */
+    struct gravity_tensors *const multi_i = ci->multipole;
+    const double CoM_i[3] = {multi_i->CoM[0], multi_i->CoM[1], multi_i->CoM[2]};
+
     /* Loop over every other cell */
     for (int ii = 0; ii < cdim[0]; ii++) {
       for (int jj = 0; jj < cdim[1]; jj++) {
@@ -1757,11 +1775,27 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
           /* Is that neighbour local ? */
           if (cj->nodeID != nodeID) continue;  // MATTHIEU
 
-          /* Are the cells to close for a MM interaction ? */
-          if (!gravity_multipole_accept_rebuild(ci->multipole, cj->multipole,
-                                                theta_crit_inv, periodic,
-                                                dim)) {
+          /* Recover the multipole information */
+          struct gravity_tensors *const multi_j = cj->multipole;
+
+          /* Get the distance between the CoMs */
+          double dx = CoM_i[0] - multi_j->CoM[0];
+          double dy = CoM_i[1] - multi_j->CoM[1];
+          double dz = CoM_i[2] - multi_j->CoM[2];
+
+          /* Apply BC */
+          if (periodic) {
+            dx = nearest(dx, dim[0]);
+            dy = nearest(dy, dim[1]);
+            dz = nearest(dz, dim[2]);
+          }
+          const double r2 = dx * dx + dy * dy + dz * dz;
+
+          /* Are the cells too close for a MM interaction ? */
+          if (!gravity_multipole_accept_rebuild(multi_i, multi_j,
+                                                theta_crit_inv, r2)) {
 
+            /* Ok, we need to add a direct pair calculation */
             scheduler_addtask(sched, task_type_pair, task_subtype_grav, 0, 0,
                               ci, cj);
           }
@@ -2796,8 +2830,7 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
 
     /* Kick/Drift/init ? */
     if (t->type == task_type_kick1 || t->type == task_type_kick2 ||
-        t->type == task_type_drift_part || t->type == task_type_drift_gpart ||
-        t->type == task_type_init_grav) {
+        t->type == task_type_drift_gpart || t->type == task_type_init_grav) {
       if (cell_is_active(t->ci, e)) scheduler_activate(s, t);
     }
 
@@ -4583,13 +4616,13 @@ void engine_init(struct engine *e, struct space *s,
     error("Failed to initialize barrier.");
 
   /* Expected average for tasks per cell. If set to zero we use a heuristic
-   * guess that hopefully is large enough, but not too large. */
+   * guess based on the numbers of cells and how many tasks per cell we expect. */
   e->tasks_per_cell =
       parser_get_opt_param_int(params, "Scheduler:tasks_per_cell", 0);
 
   /* Init the scheduler. */
   scheduler_init(&e->sched, e->s, engine_estimate_nr_tasks(e), nr_queues,
-                 scheduler_flag_steal, e->nodeID, &e->threadpool);
+                 (policy & scheduler_flag_steal), e->nodeID, &e->threadpool);
 
   /* Allocate and init the threads. */
   if ((e->runners = (struct runner *)malloc(sizeof(struct runner) *
@@ -4633,8 +4666,12 @@ void engine_init(struct engine *e, struct space *s,
       e->runners[k].qid = k * nr_queues / e->nr_threads;
     }
 
-#ifdef WITH_VECTORIZATION
     /* Allocate particle caches. */
+    e->runners[k].ci_gravity_cache.count = 0;
+    e->runners[k].cj_gravity_cache.count = 0;
+    gravity_cache_init(&e->runners[k].ci_gravity_cache, space_splitsize);
+    gravity_cache_init(&e->runners[k].cj_gravity_cache, space_splitsize);
+#ifdef WITH_VECTORIZATION
     e->runners[k].ci_cache.count = 0;
     e->runners[k].cj_cache.count = 0;
     cache_init(&e->runners[k].ci_cache, CACHE_SIZE);
@@ -4725,8 +4762,12 @@ void engine_compute_next_snapshot_time(struct engine *e) {
 void engine_clean(struct engine *e) {
 
 #ifdef WITH_VECTORIZATION
-  for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].ci_cache);
-  for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].cj_cache);
+  for (int i = 0; i < e->nr_threads; ++i) {
+    cache_clean(&e->runners[i].ci_cache);
+    cache_clean(&e->runners[i].cj_cache);
+    gravity_cache_clean(&e->runners[i].ci_gravity_cache);
+    gravity_cache_clean(&e->runners[i].cj_gravity_cache);
+  }
 #endif
   free(e->runners);
   free(e->snapshotUnits);
diff --git a/src/gravity.c b/src/gravity.c
index 579546182445ce3fe58c2bbd43c328c0df2d377b..f58bc1b7456bc5dfc95b4c976ebda8e1999ff3e0 100644
--- a/src/gravity.c
+++ b/src/gravity.c
@@ -21,9 +21,15 @@
 #include "../config.h"
 
 /* Some standard headers. */
+#include <float.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <unistd.h>
 
+#ifdef HAVE_HDF5
+#include <hdf5.h>
+#endif
+
 /* This object's header. */
 #include "gravity.h"
 
@@ -39,6 +45,256 @@ struct exact_force_data {
   double const_G;
 };
 
+#ifdef SWIFT_GRAVITY_FORCE_CHECKS
+
+/* Size of the Ewald table */
+#define Newald 64
+
+/* Components of the Ewald correction */
+static float fewald_x[Newald + 1][Newald + 1][Newald + 1];
+static float fewald_y[Newald + 1][Newald + 1][Newald + 1];
+static float fewald_z[Newald + 1][Newald + 1][Newald + 1];
+
+/* Factor used to normalize the access to the Ewald table */
+float ewald_fac;
+#endif
+
+/**
+ * @brief Allocates the memory and computes one octant of the
+ * Ewald correction table.
+ *
+ * We follow Hernquist, Bouchet & Suto, 1991, ApJS, Volume 75, p.231-240,
+ * equations (2.14a) and (2.14b) with alpha = 2. We consider all terms with
+ * |x - nL| < 4L and |h|^2 < 16.
+ *
+ * @param boxSize The side-length (L) of the volume.
+ */
+void gravity_exact_force_ewald_init(double boxSize) {
+
+#ifdef SWIFT_GRAVITY_FORCE_CHECKS
+  const ticks tic = getticks();
+  message("Computing Ewald correction table...");
+
+  /* Level of correction  (Hernquist et al. 1991)*/
+  const float alpha = 2.f;
+
+  /* some useful constants */
+  const float alpha2 = alpha * alpha;
+  const float factor_exp1 = 2.f * alpha / sqrt(M_PI);
+  const float factor_exp2 = -M_PI * M_PI / alpha2;
+  const float factor_sin = 2.f * M_PI;
+  const float boxSize_inv2 = 1.f / (boxSize * boxSize);
+
+  /* Ewald factor to access the table */
+  ewald_fac = (double)(2 * Newald) / boxSize;
+
+  /* Zero everything */
+  bzero(fewald_x, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float));
+  bzero(fewald_y, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float));
+  bzero(fewald_z, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float));
+
+  /* Compute the values in one of the octants */
+  for (int i = 0; i <= Newald; ++i) {
+    for (int j = 0; j <= Newald; ++j) {
+      for (int k = 0; k <= Newald; ++k) {
+
+        if (i == 0 && j == 0 && k == 0) continue;
+
+        /* Distance vector */
+        const float r_x = 0.5f * ((float)i) / Newald;
+        const float r_y = 0.5f * ((float)j) / Newald;
+        const float r_z = 0.5f * ((float)k) / Newald;
+
+        /* Norm of distance vector */
+        const float r2 = r_x * r_x + r_y * r_y + r_z * r_z;
+        const float r_inv = 1.f / sqrtf(r2);
+        const float r_inv3 = r_inv * r_inv * r_inv;
+
+        /* Normal gravity potential term */
+        float f_x = r_x * r_inv3;
+        float f_y = r_y * r_inv3;
+        float f_z = r_z * r_inv3;
+
+        for (int n_i = -4; n_i <= 4; ++n_i) {
+          for (int n_j = -4; n_j <= 4; ++n_j) {
+            for (int n_k = -4; n_k <= 4; ++n_k) {
+
+              const float d_x = r_x - n_i;
+              const float d_y = r_y - n_j;
+              const float d_z = r_z - n_k;
+
+              /* Discretised distance */
+              const float r_tilde2 = d_x * d_x + d_y * d_y + d_z * d_z;
+              const float r_tilde_inv = 1.f / sqrtf(r_tilde2);
+              const float r_tilde = r_tilde_inv * r_tilde2;
+              const float r_tilde_inv3 =
+                  r_tilde_inv * r_tilde_inv * r_tilde_inv;
+
+              const float val =
+                  erfcf(alpha * r_tilde) +
+                  factor_exp1 * r_tilde * expf(-alpha2 * r_tilde2);
+
+              /* First correction term */
+              const float f = val * r_tilde_inv3;
+              f_x -= f * d_x;
+              f_y -= f * d_y;
+              f_z -= f * d_z;
+            }
+          }
+        }
+
+        for (int h_i = -4; h_i <= 4; ++h_i) {
+          for (int h_j = -4; h_j <= 4; ++h_j) {
+            for (int h_k = -4; h_k <= 4; ++h_k) {
+
+              const float h2 = h_i * h_i + h_j * h_j + h_k * h_k;
+
+              const float h2_inv = 1.f / (h2 + FLT_MIN);
+              const float h_dot_x = h_i * r_x + h_j * r_y + h_k * r_z;
+
+              const float val = 2.f * h2_inv * expf(h2 * factor_exp2) *
+                                sinf(factor_sin * h_dot_x);
+
+              /* Second correction term */
+              f_x -= val * h_i;
+              f_y -= val * h_j;
+              f_z -= val * h_k;
+            }
+          }
+        }
+
+        /* Save back to memory */
+        fewald_x[i][j][k] = f_x;
+        fewald_y[i][j][k] = f_y;
+        fewald_z[i][j][k] = f_z;
+      }
+    }
+  }
+
+/* Dump the Ewald table to a file */
+#ifdef HAVE_HDF5
+  hid_t h_file =
+      H5Fcreate("Ewald.hdf5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
+  if (h_file < 0) error("Error while opening file for Ewald dump.");
+
+  /* Create dataspace */
+  hsize_t dim[3] = {Newald + 1, Newald + 1, Newald + 1};
+  hid_t h_space = H5Screate_simple(3, dim, NULL);
+  hid_t h_data;
+  h_data = H5Dcreate(h_file, "Ewald_x", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT,
+                     H5P_DEFAULT, H5P_DEFAULT);
+  H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT,
+           &(fewald_x[0][0][0]));
+  H5Dclose(h_data);
+  h_data = H5Dcreate(h_file, "Ewald_y", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT,
+                     H5P_DEFAULT, H5P_DEFAULT);
+  H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT,
+           &(fewald_y[0][0][0]));
+  H5Dclose(h_data);
+  h_data = H5Dcreate(h_file, "Ewald_z", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT,
+                     H5P_DEFAULT, H5P_DEFAULT);
+  H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT,
+           &(fewald_z[0][0][0]));
+  H5Dclose(h_data);
+  H5Sclose(h_space);
+  H5Fclose(h_file);
+#endif
+
+  /* Apply the box-size correction */
+  for (int i = 0; i <= Newald; ++i) {
+    for (int j = 0; j <= Newald; ++j) {
+      for (int k = 0; k <= Newald; ++k) {
+        fewald_x[i][j][k] *= boxSize_inv2;
+        fewald_y[i][j][k] *= boxSize_inv2;
+        fewald_z[i][j][k] *= boxSize_inv2;
+      }
+    }
+  }
+
+  /* Say goodbye */
+  message("Ewald correction table computed (took %.3f %s). ",
+          clocks_from_ticks(getticks() - tic), clocks_getunit());
+#else
+  error("Gravity checking function called without the corresponding flag.");
+#endif
+}
+
+#ifdef SWIFT_GRAVITY_FORCE_CHECKS
+/**
+ * @brief Compute the Ewald correction for a given distance vector r.
+ *
+ * We interpolate the Ewald correction tables using a tri-linear interpolation
+ * similar to a CIC.
+ *
+ * @param rx x-coordinate of distance vector.
+ * @param ry y-coordinate of distance vector.
+ * @param rz z-coordinate of distance vector.
+ * @param corr (return) The Ewald correction.
+ */
+__attribute__((always_inline)) INLINE static void
+gravity_exact_force_ewald_evaluate(double rx, double ry, double rz,
+                                   double corr[3]) {
+
+  const double s_x = (rx < 0.) ? 1. : -1.;
+  const double s_y = (ry < 0.) ? 1. : -1.;
+  const double s_z = (rz < 0.) ? 1. : -1.;
+  rx = fabs(rx);
+  ry = fabs(ry);
+  rz = fabs(rz);
+
+  int i = (int)(rx * ewald_fac);
+  if (i >= Newald) i = Newald - 1;
+  const double dx = rx * ewald_fac - i;
+  const double tx = 1. - dx;
+
+  int j = (int)(ry * ewald_fac);
+  if (j >= Newald) j = Newald - 1;
+  const double dy = ry * ewald_fac - j;
+  const double ty = 1. - dy;
+
+  int k = (int)(rz * ewald_fac);
+  if (k >= Newald) k = Newald - 1;
+  const double dz = rz * ewald_fac - k;
+  const double tz = 1. - dz;
+
+  /* Interpolation in X */
+  corr[0] = 0.;
+  corr[0] += fewald_x[i + 0][j + 0][k + 0] * tx * ty * tz;
+  corr[0] += fewald_x[i + 0][j + 0][k + 1] * tx * ty * dz;
+  corr[0] += fewald_x[i + 0][j + 1][k + 0] * tx * dy * tz;
+  corr[0] += fewald_x[i + 0][j + 1][k + 1] * tx * dy * dz;
+  corr[0] += fewald_x[i + 1][j + 0][k + 0] * dx * ty * tz;
+  corr[0] += fewald_x[i + 1][j + 0][k + 1] * dx * ty * dz;
+  corr[0] += fewald_x[i + 1][j + 1][k + 0] * dx * dy * tz;
+  corr[0] += fewald_x[i + 1][j + 1][k + 1] * dx * dy * dz;
+  corr[0] *= s_x;
+
+  /* Interpolation in Y */
+  corr[1] = 0.;
+  corr[1] += fewald_y[i + 0][j + 0][k + 0] * tx * ty * tz;
+  corr[1] += fewald_y[i + 0][j + 0][k + 1] * tx * ty * dz;
+  corr[1] += fewald_y[i + 0][j + 1][k + 0] * tx * dy * tz;
+  corr[1] += fewald_y[i + 0][j + 1][k + 1] * tx * dy * dz;
+  corr[1] += fewald_y[i + 1][j + 0][k + 0] * dx * ty * tz;
+  corr[1] += fewald_y[i + 1][j + 0][k + 1] * dx * ty * dz;
+  corr[1] += fewald_y[i + 1][j + 1][k + 0] * dx * dy * tz;
+  corr[1] += fewald_y[i + 1][j + 1][k + 1] * dx * dy * dz;
+  corr[1] *= s_y;
+
+  /* Interpolation in Z */
+  corr[2] = 0.;
+  corr[2] += fewald_z[i + 0][j + 0][k + 0] * tx * ty * tz;
+  corr[2] += fewald_z[i + 0][j + 0][k + 1] * tx * ty * dz;
+  corr[2] += fewald_z[i + 0][j + 1][k + 0] * tx * dy * tz;
+  corr[2] += fewald_z[i + 0][j + 1][k + 1] * tx * dy * dz;
+  corr[2] += fewald_z[i + 1][j + 0][k + 0] * dx * ty * tz;
+  corr[2] += fewald_z[i + 1][j + 0][k + 1] * dx * ty * dz;
+  corr[2] += fewald_z[i + 1][j + 1][k + 0] * dx * dy * tz;
+  corr[2] += fewald_z[i + 1][j + 1][k + 1] * dx * dy * dz;
+  corr[2] *= s_z;
+}
+#endif
+
 /**
  * @brief Checks whether the file containing the exact accelerations for
  * the current choice of parameters already exists.
@@ -116,6 +372,12 @@ void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts,
     if (gpi->id_or_neg_offset % SWIFT_GRAVITY_FORCE_CHECKS == 0 &&
         gpart_is_active(gpi, e)) {
 
+      /* Get some information about the particle */
+      const double pix[3] = {gpi->x[0], gpi->x[1], gpi->x[2]};
+      const double hi = gpi->epsilon;
+      const double hi_inv = 1. / hi;
+      const double hi_inv3 = hi_inv * hi_inv * hi_inv;
+
       /* Be ready for the calculation */
       double a_grav[3] = {0., 0., 0.};
 
@@ -128,9 +390,9 @@ void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts,
         if (gpi == gpj) continue;
 
         /* Compute the pairwise distance. */
-        double dx = gpi->x[0] - gpj->x[0];
-        double dy = gpi->x[1] - gpj->x[1];
-        double dz = gpi->x[2] - gpj->x[2];
+        double dx = gpj->x[0] - pix[0];
+        double dy = gpj->x[1] - pix[1];
+        double dz = gpj->x[2] - pix[2];
 
         /* Now apply periodic BC */
         if (periodic) {
@@ -140,34 +402,41 @@ void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts,
         }
 
         const double r2 = dx * dx + dy * dy + dz * dz;
-        const double r = sqrt(r2);
-        const double ir = 1. / r;
+        const double r_inv = 1. / sqrt(r2);
+        const double r = r2 * r_inv;
         const double mj = gpj->mass;
-        const double hi = gpi->epsilon;
         double f;
-        const double f_lr = 1.;
 
         if (r >= hi) {
 
           /* Get Newtonian gravity */
-          f = mj * ir * ir * ir * f_lr;
+          f = mj * r_inv * r_inv * r_inv;
 
         } else {
 
-          const double hi_inv = 1. / hi;
-          const double hi_inv3 = hi_inv * hi_inv * hi_inv;
           const double ui = r * hi_inv;
           double W;
 
           kernel_grav_eval_double(ui, &W);
 
           /* Get softened gravity */
-          f = mj * hi_inv3 * W * f_lr;
+          f = mj * hi_inv3 * W;
         }
 
-        a_grav[0] -= f * dx;
-        a_grav[1] -= f * dy;
-        a_grav[2] -= f * dz;
+        a_grav[0] += f * dx;
+        a_grav[1] += f * dy;
+        a_grav[2] += f * dz;
+
+        /* Apply Ewald correction for periodic BC */
+        if (periodic && r > 1e-5 * hi) {
+
+          double corr[3];
+          gravity_exact_force_ewald_evaluate(dx, dy, dz, corr);
+
+          a_grav[0] += mj * corr[0];
+          a_grav[1] += mj * corr[1];
+          a_grav[2] += mj * corr[2];
+        }
       }
 
       /* Store the exact answer */
diff --git a/src/gravity.h b/src/gravity.h
index 00b930c00fb2558f274feb2991b78e96dc8b990b..85e42370bc456dceb577c42ee609e3f0724e14ea 100644
--- a/src/gravity.h
+++ b/src/gravity.h
@@ -34,6 +34,8 @@
 #include "./gravity/Default/gravity.h"
 #include "./gravity/Default/gravity_iact.h"
 
+void gravity_exact_force_ewald_init(double boxSize);
+void gravity_exact_force_ewald_free();
 void gravity_exact_force_compute(struct space *s, const struct engine *e);
 void gravity_exact_force_check(struct space *s, const struct engine *e,
                                float rel_tol);
diff --git a/src/gravity_cache.h b/src/gravity_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..14b672233aa9958ec39af32a87baead98c0bae04
--- /dev/null
+++ b/src/gravity_cache.h
@@ -0,0 +1,247 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_GRAVITY_CACHE_H
+#define SWIFT_GRAVITY_CACHE_H
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Local headers */
+#include "align.h"
+#include "error.h"
+#include "gravity.h"
+#include "vector.h"
+
+/**
+ * @brief A SoA object for the #gpart of a cell.
+ *
+ * This is used to help vectorize the leaf-leaf gravity interactions.
+ */
+struct gravity_cache {
+
+  /*! #gpart x position. */
+  float *restrict x SWIFT_CACHE_ALIGN;
+
+  /*! #gpart y position. */
+  float *restrict y SWIFT_CACHE_ALIGN;
+
+  /*! #gpart z position. */
+  float *restrict z SWIFT_CACHE_ALIGN;
+
+  /*! #gpart softening length. */
+  float *restrict epsilon SWIFT_CACHE_ALIGN;
+
+  /*! #gpart mass. */
+  float *restrict m SWIFT_CACHE_ALIGN;
+
+  /*! #gpart x acceleration. */
+  float *restrict a_x SWIFT_CACHE_ALIGN;
+
+  /*! #gpart y acceleration. */
+  float *restrict a_y SWIFT_CACHE_ALIGN;
+
+  /*! #gpart z acceleration. */
+  float *restrict a_z SWIFT_CACHE_ALIGN;
+
+  /*! Cache size */
+  int count;
+};
+
+/**
+ * @brief Frees the memory allocated in a #gravity_cache
+ *
+ * @param c The #gravity_cache to free.
+ */
+static INLINE void gravity_cache_clean(struct gravity_cache *c) {
+
+  if (c->count > 0) {
+    free(c->x);
+    free(c->y);
+    free(c->z);
+    free(c->epsilon);
+    free(c->m);
+    free(c->a_x);
+    free(c->a_y);
+    free(c->a_z);
+  }
+  c->count = 0;
+}
+
+/**
+ * @brief Allocates memory for the #gpart caches used in the leaf-leaf
+ * interactions.
+ *
+ * The cache is padded for the vector size and aligned properly
+ *
+ * @param c The #gravity_cache to allocate.
+ * @param count The number of #gpart to allocated for (space_splitsize is a good
+ * choice).
+ */
+static INLINE void gravity_cache_init(struct gravity_cache *c, int count) {
+
+  /* Size of the gravity cache */
+  const int padded_count = count - (count % VEC_SIZE) + VEC_SIZE;
+  const size_t sizeBytes = padded_count * sizeof(float);
+
+  /* Delete old stuff if any */
+  gravity_cache_clean(c);
+
+  int error = 0;
+  error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error +=
+      posix_memalign((void **)&c->epsilon, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->a_x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->a_y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->a_z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+
+  if (error != 0)
+    error("Couldn't allocate gravity cache, size: %d", padded_count);
+
+  c->count = padded_count;
+}
+
+/**
+ * @brief Fills a #gravity_cache structure with some #gpart and shift them.
+ *
+ * @param c The #gravity_cache to fill.
+ * @param gparts The #gpart array to read from.
+ * @param gcount The number of particles to read.
+ * @param gcount_padded The number of particle to read padded to the next
+ * multiple of the vector length.
+ * @param shift A shift to apply to all the particles.
+ */
+__attribute__((always_inline)) INLINE void gravity_cache_populate(
+    struct gravity_cache *c, const struct gpart *restrict gparts, int gcount,
+    int gcount_padded, const double shift[3]) {
+
+  /* Make the compiler understand we are in happy vectorization land */
+  float *restrict x = c->x;
+  float *restrict y = c->y;
+  float *restrict z = c->z;
+  float *restrict m = c->m;
+  float *restrict epsilon = c->epsilon;
+  swift_align_information(x, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(y, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(z, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(epsilon, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(m, SWIFT_CACHE_ALIGNMENT);
+  swift_assume_size(gcount_padded, VEC_SIZE);
+
+  /* Fill the input caches */
+  for (int i = 0; i < gcount; ++i) {
+    x[i] = (float)(gparts[i].x[0] - shift[0]);
+    y[i] = (float)(gparts[i].x[1] - shift[1]);
+    z[i] = (float)(gparts[i].x[2] - shift[2]);
+    epsilon[i] = gparts[i].epsilon;
+    m[i] = gparts[i].mass;
+  }
+
+#ifdef SWIFT_DEBUG_CHECKS
+  if (gcount_padded < gcount) error("Padded counter smaller than counter");
+#endif
+
+  /* Pad the caches */
+  for (int i = gcount; i < gcount_padded; ++i) {
+    x[i] = 0.f;
+    y[i] = 0.f;
+    z[i] = 0.f;
+    epsilon[i] = 0.f;
+    m[i] = 0.f;
+  }
+}
+
+/**
+ * @brief Fills a #gravity_cache structure with some #gpart.
+ *
+ * @param c The #gravity_cache to fill.
+ * @param gparts The #gpart array to read from.
+ * @param gcount The number of particles to read.
+ * @param gcount_padded The number of particle to read padded to the next
+ * multiple of the vector length.
+ */
+__attribute__((always_inline)) INLINE void gravity_cache_populate_no_shift(
+    struct gravity_cache *c, const struct gpart *restrict gparts, int gcount,
+    int gcount_padded) {
+
+  /* Make the compiler understand we are in happy vectorization land */
+  float *restrict x = c->x;
+  float *restrict y = c->y;
+  float *restrict z = c->z;
+  float *restrict m = c->m;
+  float *restrict epsilon = c->epsilon;
+  swift_align_information(x, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(y, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(z, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(epsilon, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(m, SWIFT_CACHE_ALIGNMENT);
+  swift_assume_size(gcount_padded, VEC_SIZE);
+
+  /* Fill the input caches */
+  for (int i = 0; i < gcount; ++i) {
+    x[i] = (float)(gparts[i].x[0]);
+    y[i] = (float)(gparts[i].x[1]);
+    z[i] = (float)(gparts[i].x[2]);
+    epsilon[i] = gparts[i].epsilon;
+    m[i] = gparts[i].mass;
+  }
+
+#ifdef SWIFT_DEBUG_CHECKS
+  if (gcount_padded < gcount) error("Padded counter smaller than counter");
+#endif
+
+  /* Pad the caches */
+  for (int i = gcount; i < gcount_padded; ++i) {
+    x[i] = 0.f;
+    y[i] = 0.f;
+    z[i] = 0.f;
+    epsilon[i] = 0.f;
+    m[i] = 0.f;
+  }
+}
+
+/**
+ * @brief Write the output cache values back to the #gpart.
+ *
+ * @param c The #gravity_cache to read from.
+ * @param gparts The #gpart array to write to.
+ * @param gcount The number of particles to write.
+ */
+__attribute__((always_inline)) INLINE void gravity_cache_write_back(
+    const struct gravity_cache *c, struct gpart *restrict gparts, int gcount) {
+
+  /* Make the compiler understand we are in happy vectorization land */
+  float *restrict a_x = c->a_x;
+  float *restrict a_y = c->a_y;
+  float *restrict a_z = c->a_z;
+  swift_align_information(a_x, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(a_y, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(a_z, SWIFT_CACHE_ALIGNMENT);
+
+  /* Write stuff back to the particles */
+  for (int i = 0; i < gcount; ++i) {
+    gparts[i].a_grav[0] += a_x[i];
+    gparts[i].a_grav[1] += a_y[i];
+    gparts[i].a_grav[2] += a_z[i];
+  }
+}
+
+#endif /* SWIFT_GRAVITY_CACHE_H */
diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index 25eb1e4b76d088d7dcd3d4ba57fe5901188be000..9e06fec92f9667d21ce7dc196ba26f9870cb24f4 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -388,7 +388,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
                                  vector *wcountSum, vector *wcount_dhSum,
                                  vector *div_vSum, vector *curlvxSum,
                                  vector *curlvySum, vector *curlvzSum,
-                                 vector mask, int knlMask) {
+                                 mask_t mask) {
 
   vector r, ri, ui, wi, wi_dx;
   vector mj;
@@ -432,49 +432,24 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
   curlvry.v = vec_mul(curlvry.v, ri.v);
   curlvrz.v = vec_mul(curlvrz.v, ri.v);
 
-/* Mask updates to intermediate vector sums for particle pi. */
-#ifdef HAVE_AVX512_F
-  rhoSum->v =
-      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
+  vector wcount_dh_update;
+  wcount_dh_update.v =
+      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v));
 
+  /* Mask updates to intermediate vector sums for particle pi. */
+  rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
   rho_dhSum->v =
-      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
-                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                               vec_mul(ui.v, wi_dx.v))));
-
-  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
-
-  wcount_dhSum->v = _mm512_mask_sub_ps(
-      rho_dhSum->v, knlMask, rho_dhSum->v,
-      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)));
-
-  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
-                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
-
-  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
-                                    curlvxSum->v);
-
-  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
-                                    curlvySum->v);
-
-  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
-                                    curlvzSum->v);
-#else
-  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
-  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                                vec_mul(ui.v, wi_dx.v))),
-                          mask.v);
-  wcountSum->v += vec_and(wi.v, mask.v);
-  wcount_dhSum->v -= vec_and(
-      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v);
-  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
-  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
-  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
-  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
-#endif
+      vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask);
+  wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
+  wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask);
+  div_vSum->v =
+      vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
+  curlvxSum->v = vec_mask_add(curlvxSum->v,
+                              vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
+  curlvySum->v = vec_mask_add(curlvySum->v,
+                              vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
+  curlvzSum->v = vec_mask_add(curlvzSum->v,
+                              vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
 }
 
 /**
@@ -482,12 +457,14 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
  * (non-symmetric vectorized version).
  */
 __attribute__((always_inline)) INLINE static void
-runner_iact_nonsym_2_vec_density(
-    float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix,
-    vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
-    vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
-    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
-    vector mask, vector mask2, int knlMask, int knlMask2) {
+runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz,
+                                 vector hi_inv, vector vix, vector viy,
+                                 vector viz, float *Vjx, float *Vjy, float *Vjz,
+                                 float *Mj, vector *rhoSum, vector *rho_dhSum,
+                                 vector *wcountSum, vector *wcount_dhSum,
+                                 vector *div_vSum, vector *curlvxSum,
+                                 vector *curlvySum, vector *curlvzSum,
+                                 mask_t mask, mask_t mask2, short mask_cond) {
 
   vector r, ri, r2, ui, wi, wi_dx;
   vector mj;
@@ -567,88 +544,66 @@ runner_iact_nonsym_2_vec_density(
   curlvrz.v = vec_mul(curlvrz.v, ri.v);
   curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);
 
-/* Mask updates to intermediate vector sums for particle pi. */
-#ifdef HAVE_AVX512_F
-  rhoSum->v =
-      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
-  rhoSum->v =
-      _mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v);
-
-  rho_dhSum->v =
-      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
-                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                               vec_mul(ui.v, wi_dx.v))));
-  rho_dhSum->v =
-      _mm512_mask_sub_ps(rho_dhSum->v, knlMask2, rho_dhSum->v,
-                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
-                                               vec_mul(ui2.v, wi_dx2.v))));
-
-  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
-  wcountSum->v =
-      _mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v);
-
-  wcount_dhSum->v = _mm512_mask_sub_ps(
-      rho_dhSum->v, knlMask, rho_dhSum->v,
-      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)));
-
-  wcount_dhSum->v = _mm512_mask_sub_ps(
-      rho_dhSum->v, knlMask2, rho_dhSum->v,
-      vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)));
-
-  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
-                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
-  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v,
-                                   vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
-
-  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
-                                    curlvxSum->v);
-  curlvxSum->v = _mm512_mask_add_ps(
-      curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)),
-      curlvxSum->v);
-
-  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
-                                    curlvySum->v);
-  curlvySum->v = _mm512_mask_add_ps(
-      curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)),
-      curlvySum->v);
-
-  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
-                                    curlvzSum->v);
-  curlvzSum->v = _mm512_mask_add_ps(
-      curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)),
-      curlvzSum->v);
-#else
-  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
-  rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v), mask2.v);
-  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                                vec_mul(ui.v, wi_dx.v))),
-                          mask.v);
-  rho_dhSum->v -=
-      vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
-                                     vec_mul(ui2.v, wi_dx2.v))),
-              mask2.v);
-  wcountSum->v += vec_and(wi.v, mask.v);
-  wcountSum->v += vec_and(wi2.v, mask2.v);
-  wcount_dhSum->v -= vec_and(
-      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v);
-  wcount_dhSum->v -= vec_and(
-      vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)),
-      mask2.v);
-  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
-  div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2.v);
-  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
-  curlvxSum->v +=
-      vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2.v);
-  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
-  curlvySum->v +=
-      vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2.v);
-  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
-  curlvzSum->v +=
-      vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2.v);
-#endif
+  vector wcount_dh_update, wcount_dh_update2;
+  wcount_dh_update.v =
+      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v));
+  wcount_dh_update2.v =
+      vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v));
+
+  /* Mask updates to intermediate vector sums for particle pi. */
+  /* Mask only when needed. */
+  if (mask_cond) {
+    rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
+    rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
+    rho_dhSum->v =
+        vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask);
+    rho_dhSum->v =
+        vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v), mask2);
+    wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
+    wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
+    wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask);
+    wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update2.v, mask2);
+    div_vSum->v = vec_mask_sub(div_vSum->v,
+                               vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
+    div_vSum->v = vec_mask_sub(
+        div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
+    curlvxSum->v = vec_mask_add(
+        curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
+    curlvxSum->v = vec_mask_add(
+        curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
+    curlvySum->v = vec_mask_add(
+        curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
+    curlvySum->v = vec_mask_add(
+        curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
+    curlvzSum->v = vec_mask_add(
+        curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
+    curlvzSum->v = vec_mask_add(
+        curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
+  } else {
+    rhoSum->v = vec_add(rhoSum->v, vec_mul(mj.v, wi.v));
+    rhoSum->v = vec_add(rhoSum->v, vec_mul(mj2.v, wi2.v));
+    rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v));
+    rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v));
+    wcountSum->v = vec_add(wcountSum->v, wi.v);
+    wcountSum->v = vec_add(wcountSum->v, wi2.v);
+    wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update.v);
+    wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update2.v);
+    div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
+    div_vSum->v =
+        vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
+    curlvxSum->v =
+        vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)));
+    curlvxSum->v =
+        vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)));
+    curlvySum->v =
+        vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)));
+    curlvySum->v =
+        vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)));
+    curlvzSum->v =
+        vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)));
+    curlvzSum->v =
+        vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)));
+  }
 }
 #endif
 
@@ -755,7 +710,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
 
-#ifdef WITH_VECTORIZATION
+#ifdef WITH_OLD_VECTORIZATION
 
   vector r, r2, ri;
   vector xi, xj;
@@ -1037,7 +992,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
 
-#ifdef WITH_VECTORIZATION
+#ifdef WITH_OLD_VECTORIZATION
 
   vector r, r2, ri;
   vector xi, xj;
diff --git a/src/hydro/PressureEntropy/hydro.h b/src/hydro/PressureEntropy/hydro.h
index 6d8d06abb126a574fd82c71a809235eb833494c6..080b796b21d7f3b48191cd375574ae1de6d11d1a 100644
--- a/src/hydro/PressureEntropy/hydro.h
+++ b/src/hydro/PressureEntropy/hydro.h
@@ -212,14 +212,15 @@ __attribute__((always_inline)) INLINE static void hydro_end_density(
   p->density.pressure_dh -=
       hydro_dimension * p->mass * p->entropy_one_over_gamma * kernel_root;
   p->density.wcount += kernel_root;
+  p->density.wcount_dh -= hydro_dimension * kernel_root;
 
   /* Finish the calculation by inserting the missing h-factors */
   p->rho *= h_inv_dim;
   p->rho_bar *= h_inv_dim;
   p->density.rho_dh *= h_inv_dim_plus_one;
   p->density.pressure_dh *= h_inv_dim_plus_one;
-  p->density.wcount *= kernel_norm;
-  p->density.wcount_dh *= h_inv * kernel_gamma * kernel_norm;
+  p->density.wcount *= h_inv_dim;
+  p->density.wcount_dh *= h_inv_dim_plus_one;
 
   const float rho_inv = 1.f / p->rho;
   const float entropy_minus_one_over_gamma = 1.f / p->entropy_one_over_gamma;
diff --git a/src/hydro/PressureEntropy/hydro_iact.h b/src/hydro/PressureEntropy/hydro_iact.h
index ce1c38ca69954252dc804af9181b9060a14afcb9..37a9f2b01af16fe598b414a9f67123849bee1442 100644
--- a/src/hydro/PressureEntropy/hydro_iact.h
+++ b/src/hydro/PressureEntropy/hydro_iact.h
@@ -59,7 +59,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 
   /* Compute contribution to the number of neighbours */
   pi->density.wcount += wi;
-  pi->density.wcount_dh -= ui * wi_dx;
+  pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx);
 
   /* Compute contribution to the weighted density */
   pi->rho_bar += mj * pj->entropy_one_over_gamma * wi;
@@ -77,7 +77,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 
   /* Compute contribution to the number of neighbours */
   pj->density.wcount += wj;
-  pj->density.wcount_dh -= uj * wj_dx;
+  pj->density.wcount_dh -= (hydro_dimension * wj + uj * wj_dx);
 
   /* Compute contribution to the weighted density */
   pj->rho_bar += mi * pi->entropy_one_over_gamma * wj;
@@ -147,7 +147,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 
   /* Compute contribution to the number of neighbours */
   pi->density.wcount += wi;
-  pi->density.wcount_dh -= ui * wi_dx;
+  pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx);
 
   /* Compute contribution to the weighted density */
   pi->rho_bar += mj * pj->entropy_one_over_gamma * wi;
diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h
index f634a59d7ee769951e6560d46a92053c144cc766..5355c15f8f1a0d0c3d811c7039da04caf0522cc9 100644
--- a/src/kernel_hydro.h
+++ b/src/kernel_hydro.h
@@ -341,20 +341,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx(
 
 /* ------------------------------------------------------------------------- */
 
-#ifdef WITH_VECTORIZATION
-
-static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv);
-
-static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);
-
-static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);
-
-static const vector kernel_gamma_inv_dim_vec =
-    FILL_VEC((float)kernel_gamma_inv_dim);
-
-static const vector kernel_gamma_inv_dim_plus_one_vec =
-    FILL_VEC((float)kernel_gamma_inv_dim_plus_one);
-
+#ifdef WITH_OLD_VECTORIZATION
 /**
  * @brief Computes the kernel function and its derivative (Vectorised version).
  *
@@ -369,11 +356,12 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
 
   /* Go to the range [0,1[ from [0,H[ */
   vector x;
-  x.v = u->v * kernel_gamma_inv_vec.v;
+  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
 
   /* Load x and get the interval id. */
   vector ind;
-  ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v));
+  ind.m =
+      vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v));
 
   /* load the coefficients. */
   vector c[kernel_degree + 1];
@@ -382,20 +370,36 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
       c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
 
   /* Init the iteration for Horner's scheme. */
-  w->v = (c[0].v * x.v) + c[1].v;
+  w->v = vec_fma(c[0].v, x.v, c[1].v);
   dw_dx->v = c[0].v;
 
   /* And we're off! */
   for (int k = 2; k <= kernel_degree; k++) {
-    dw_dx->v = (dw_dx->v * x.v) + w->v;
-    w->v = (x.v * w->v) + c[k].v;
+    dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+    w->v = vec_fma(x.v, w->v, c[k].v);
   }
 
   /* Return everything */
-  w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
-  dw_dx->v =
-      dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
+  w->v =
+      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
 }
+#endif
+
+#ifdef WITH_VECTORIZATION
+
+static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv);
+
+static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);
+
+static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);
+
+static const vector kernel_gamma_inv_dim_vec =
+    FILL_VEC((float)kernel_gamma_inv_dim);
+
+static const vector kernel_gamma_inv_dim_plus_one_vec =
+    FILL_VEC((float)kernel_gamma_inv_dim_plus_one);
 
 /* Define constant vectors for the Wendland C2 and Cubic Spline kernel
  * coefficients. */
@@ -468,15 +472,15 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
   w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector w2, dw_dx2;
-  vector mask_reg1, mask_reg2;
+  mask_t mask_reg;
 
-  /* Form a mask for each part of the kernel. */
-  mask_reg1.v = vec_cmp_lt(x.v, cond.v);  /* 0 < x < 0.5 */
-  mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
-  ;
+  /* Form a mask for one part of the kernel. */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
 
   /* Work out w for both regions of the kernel and combine the results together
-   * using masks. */
+   * using a mask. */
 
   /* Init the iteration for Horner's scheme. */
   w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
@@ -495,20 +499,17 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
   w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
   w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
 
-  /* Mask out unneeded values. */
-  w->v = vec_and(w->v, mask_reg1.v);
-  w2.v = vec_and(w2.v, mask_reg2.v);
-  dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
-  dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v);
-
-  /* Added both w and w2 together to form complete result. */
-  w->v = vec_add(w->v, w2.v);
-  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
+  /* Blend both kernel regions into one vector (mask out unneeded values). */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  w->v = vec_blend(mask_reg, w->v, w2.v);
+  dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
+
 #else
-#error
+#error "Vectorisation not supported for this kernel!!!"
 #endif
 
-  /* Return everything */
+  /* Return everyting */
   w->v =
       vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
   dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
@@ -580,15 +581,13 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector w_2, dw_dx_2;
   vector w2_2, dw_dx2_2;
-  vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
+  mask_t mask_reg, mask_reg_v2;
 
-  /* Form a mask for each part of the kernel. */
-  mask_reg1.v = vec_cmp_lt(x.v, cond.v);     /* 0 < x < 0.5 */
-  mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */
-  mask_reg2.v = vec_cmp_gte(x.v, cond.v);    /* 0.5 < x < 1 */
-  ;
-  mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */
-  ;
+  /* Form a mask for one part of the kernel for each vector. */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v));     /* 0.5 < x < 1 */
+  vec_create_mask(mask_reg_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */
 
   /* Work out w for both regions of the kernel and combine the results together
    * using masks. */
@@ -622,29 +621,23 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
   w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c3.v);
   w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v);
 
-  /* Mask out unneeded values. */
-  w->v = vec_and(w->v, mask_reg1.v);
-  w2->v = vec_and(w2->v, mask_reg1_v2.v);
-  w_2.v = vec_and(w_2.v, mask_reg2.v);
-  w2_2.v = vec_and(w2_2.v, mask_reg2_v2.v);
-  dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
-  dw_dx2->v = vec_and(dw_dx2->v, mask_reg1_v2.v);
-  dw_dx_2.v = vec_and(dw_dx_2.v, mask_reg2.v);
-  dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_v2.v);
-
-  /* Added both w and w2 together to form complete result. */
-  w->v = vec_add(w->v, w_2.v);
-  w2->v = vec_add(w2->v, w2_2.v);
-  dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v);
-  dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v);
+  /* Blend both kernel regions into one vector (mask out unneeded values). */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  w->v = vec_blend(mask_reg, w->v, w_2.v);
+  w2->v = vec_blend(mask_reg_v2, w2->v, w2_2.v);
+  dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx_2.v);
+  dw_dx2->v = vec_blend(mask_reg_v2, dw_dx2->v, dw_dx2_2.v);
 
   /* Return everything */
-  w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
-  w2->v = w2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
-  dw_dx->v =
-      dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
-  dw_dx2->v =
-      dw_dx2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
+  w->v =
+      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+  w2->v = vec_mul(w2->v,
+                  vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
+  dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v,
+                                         kernel_gamma_inv_dim_plus_one_vec.v));
 
 #endif
 }
@@ -676,12 +669,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
   w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector w2;
-  vector mask1, mask2;
+  mask_t mask_reg;
 
   /* Form a mask for each part of the kernel. */
-  mask1.v = vec_cmp_lt(x.v, cond.v);  /* 0 < x < 0.5 */
-  mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
-  ;
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
 
   /* Work out w for both regions of the kernel and combine the results together
    * using masks. */
@@ -698,13 +691,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
   w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
 
   /* Mask out unneeded values. */
-  w->v = vec_and(w->v, mask1.v);
-  w2.v = vec_and(w2.v, mask2.v);
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  w->v = vec_blend(mask_reg, w->v, w2.v);
 
-  /* Added both w and w2 together to form complete result. */
-  w->v = vec_add(w->v, w2.v);
 #else
-#error
+#error "Vectorisation not supported for this kernel!!!"
 #endif
 
   /* Return everything */
@@ -741,12 +733,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
 
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector dw_dx2;
-  vector mask1, mask2;
+  mask_t mask_reg;
 
   /* Form a mask for each part of the kernel. */
-  mask1.v = vec_cmp_lt(x.v, cond.v);  /* 0 < x < 0.5 */
-  mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
-  ;
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
 
   /* Work out w for both regions of the kernel and combine the results together
    * using masks. */
@@ -760,15 +752,106 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
   dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
 
   /* Mask out unneeded values. */
-  dw_dx->v = vec_and(dw_dx->v, mask1.v);
-  dw_dx2.v = vec_and(dw_dx2.v, mask2.v);
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
+
+#else
+#error "Vectorisation not supported for this kernel!!!"
+#endif
+
+  /* Mask out result for particles that lie outside of the kernel function. */
+  mask_t mask;
+  vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */
+
+  dw_dx->v = vec_and_mask(dw_dx->v, mask);
+
+  /* Return everything */
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
+}
+
+/**
+ * @brief Computes the kernel function derivative for two particles
+ * using interleaved vectors.
+ *
+ * Return 0 if $u > \\gamma = H/h$
+ *
+ * @param u The ratio of the distance to the smoothing length $u = x/h$.
+ * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
+ * @param u_2 The ratio of the distance to the smoothing length $u = x/h$ for
+ * second particle.
+ * @param dw_dx_2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for
+ * second particle.
+ */
+__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
+    vector *u, vector *dw_dx, vector *u_2, vector *dw_dx_2) {
+
+  /* Go to the range [0,1[ from [0,H[ */
+  vector x, x_2;
+  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
+  x_2.v = vec_mul(u_2->v, kernel_gamma_inv_vec.v);
+
+#ifdef WENDLAND_C2_KERNEL
+  /* Init the iteration for Horner's scheme. */
+  dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
+  dw_dx_2->v =
+      vec_fma(wendland_dwdx_const_c0.v, x_2.v, wendland_dwdx_const_c1.v);
+
+  /* Calculate the polynomial interleaving vector operations */
+  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
+  dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c2.v);
+
+  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
+  dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c3.v);
+
+  dw_dx->v = vec_mul(dw_dx->v, x.v);
+  dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v);
+
+#elif defined(CUBIC_SPLINE_KERNEL)
+  vector dw_dx2, dw_dx2_2;
+  mask_t mask_reg;
+  mask_t mask_reg_v2;
+
+  /* Form a mask for one part of the kernel. */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v));      /* 0.5 < x < 1 */
+  vec_create_mask(mask_reg_v2, vec_cmp_gte(x_2.v, cond.v)); /* 0.5 < x < 1 */
+
+  /* Work out w for both regions of the kernel and combine the results together
+   * using masks. */
+
+  /* Init the iteration for Horner's scheme. */
+  dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
+  dw_dx_2->v = vec_fma(cubic_1_dwdx_const_c0.v, x_2.v, cubic_1_dwdx_const_c1.v);
+  dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);
+  dw_dx2_2.v = vec_fma(cubic_2_dwdx_const_c0.v, x_2.v, cubic_2_dwdx_const_c1.v);
+
+  /* Calculate the polynomial interleaving vector operations. */
+  dw_dx->v = vec_mul(dw_dx->v, x.v);       /* cubic_1_dwdx_const_c2 is zero. */
+  dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v); /* cubic_1_dwdx_const_c2 is zero. */
+  dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
+  dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v);
+
+  /* Mask out unneeded values. */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
+  dw_dx_2->v = vec_blend(mask_reg_v2, dw_dx_2->v, dw_dx2_2.v);
 
-  /* Added both dwdx and dwdx2 together to form complete result. */
-  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
 #else
-#error
+#error "Vectorisation not supported for this kernel!!!"
 #endif
 
+  /* Mask out result for particles that lie outside of the kernel function. */
+  mask_t mask, mask_2;
+  vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f)));     /* x < 1 */
+  vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f))); /* x < 1 */
+
+  dw_dx->v = vec_and_mask(dw_dx->v, mask);
+  dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_2);
+
   /* Return everything */
   dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
                                        kernel_gamma_inv_dim_plus_one_vec.v));
diff --git a/src/multipole.h b/src/multipole.h
index 3932b265de2dc7416a96d0a02695fe5973b6fb4e..004757924cccb6bc2f450c19f1ccd600f50e1990 100644
--- a/src/multipole.h
+++ b/src/multipole.h
@@ -2647,31 +2647,17 @@ INLINE static void gravity_L2P(const struct grav_tensor *lb,
  * @param ma The #multipole of the first #cell.
  * @param mb The #multipole of the second #cell.
  * @param theta_crit_inv The inverse of the critical opening angle.
- * @param periodic Are we using periodic boundary conditions ?
- * @param dim The dimensions of the box.
+ * @param r2 Square of the distance (periodically wrapped) between the
+ * multipoles.
  */
 __attribute__((always_inline)) INLINE static int
 gravity_multipole_accept_rebuild(const struct gravity_tensors *const ma,
                                  const struct gravity_tensors *const mb,
-                                 double theta_crit_inv, int periodic,
-                                 const double dim[3]) {
+                                 double theta_crit_inv, double r2) {
 
   const double r_crit_a = ma->r_max_rebuild * theta_crit_inv;
   const double r_crit_b = mb->r_max_rebuild * theta_crit_inv;
 
-  double dx = ma->CoM_rebuild[0] - mb->CoM_rebuild[0];
-  double dy = ma->CoM_rebuild[1] - mb->CoM_rebuild[1];
-  double dz = ma->CoM_rebuild[2] - mb->CoM_rebuild[2];
-
-  /* Apply BC */
-  if (periodic) {
-    dx = nearest(dx, dim[0]);
-    dy = nearest(dy, dim[1]);
-    dz = nearest(dz, dim[2]);
-  }
-
-  const double r2 = dx * dx + dy * dy + dz * dz;
-
   // MATTHIEU: Make this mass-dependent ?
 
   /* Multipole acceptance criterion (Dehnen 2002, eq.10) */
@@ -2688,30 +2674,16 @@ gravity_multipole_accept_rebuild(const struct gravity_tensors *const ma,
  * @param ma The #multipole of the first #cell.
  * @param mb The #multipole of the second #cell.
  * @param theta_crit_inv The inverse of the critical opening angle.
- * @param periodic Are we using periodic boundary conditions ?
- * @param dim The dimensions of the box.
+ * @param r2 Square of the distance (periodically wrapped) between the
+ * multipoles.
  */
 __attribute__((always_inline)) INLINE static int gravity_multipole_accept(
     const struct gravity_tensors *const ma,
-    const struct gravity_tensors *const mb, double theta_crit_inv, int periodic,
-    const double dim[3]) {
+    const struct gravity_tensors *const mb, double theta_crit_inv, double r2) {
 
   const double r_crit_a = ma->r_max * theta_crit_inv;
   const double r_crit_b = mb->r_max * theta_crit_inv;
 
-  double dx = ma->CoM[0] - mb->CoM[0];
-  double dy = ma->CoM[1] - mb->CoM[1];
-  double dz = ma->CoM[2] - mb->CoM[2];
-
-  /* Apply BC */
-  if (periodic) {
-    dx = nearest(dx, dim[0]);
-    dy = nearest(dy, dim[1]);
-    dz = nearest(dz, dim[2]);
-  }
-
-  const double r2 = dx * dx + dy * dy + dz * dz;
-
   // MATTHIEU: Make this mass-dependent ?
 
   /* Multipole acceptance criterion (Dehnen 2002, eq.10) */
diff --git a/src/parser.c b/src/parser.c
index 02970c0526072013c14ee27c4486c8915ec67b25..0b608b29263342240af68fd99d2fdd3241e2a1e6 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -133,6 +133,8 @@ void parser_set_param(struct swift_params *params, const char *namevalue) {
   int updated = 0;
   for (int i = 0; i < params->paramCount; i++) {
     if (strcmp(name, params->data[i].name) == 0) {
+      message("Value of '%s' changed from '%s' to '%s'", params->data[i].name,
+              params->data[i].value, value);
       strcpy(params->data[i].value, value);
       updated = 1;
     }
diff --git a/src/runner.h b/src/runner.h
index facadf1608fb7e06af952eedbf1151fa68530bef..e33a3e380e6097a67258d116d617483caca35086 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -28,6 +28,7 @@
 
 /* Includes. */
 #include "cache.h"
+#include "gravity_cache.h"
 
 struct cell;
 struct engine;
@@ -49,7 +50,14 @@ struct runner {
   /*! The engine owing this runner. */
   struct engine *e;
 
+  /*! The particle gravity_cache of cell ci. */
+  struct gravity_cache ci_gravity_cache;
+
+  /*! The particle gravity_cache of cell cj. */
+  struct gravity_cache cj_gravity_cache;
+
 #ifdef WITH_VECTORIZATION
+
   /*! The particle cache of cell ci. */
   struct cache ci_cache;
 
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index 47ebba61d9a77486448ba2cdb25c9c0a448b9bac..0dd6b2cdfd430e471fc13087afb0b2fcde947b97 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -1037,8 +1037,8 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
       struct part *restrict pj = &parts_j[sort_j[pjd].i];
       if (!part_is_active(pj, e)) continue;
       const float hj = pj->h;
-      const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
-      if (dj > di_max) continue;
+      const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift;
+      if (dj - rshift > di_max) continue;
 
       double pjx[3];
       for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
@@ -1449,8 +1449,8 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     /* Get a hold of the jth part in cj. */
     struct part *restrict pj = &parts_j[sort_j[pjd].i];
     const float hj = pj->h;
-    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
-    if (dj > di_max) continue;
+    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift;
+    if (dj - rshift > di_max) continue;
 
     double pjx[3];
     for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
index a5b803ae62ad199dce7168aa9b9fb270d1e7042c..01ea6a073211a08430e77721f4c2e60ef7adfd04 100644
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -162,6 +162,235 @@ void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci,
 void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci,
                                 struct cell *cj, double shift[3]) {
 
+  /* Some constants */
+  const struct engine *const e = r->e;
+  struct gravity_cache *const ci_cache = &r->ci_gravity_cache;
+  struct gravity_cache *const cj_cache = &r->cj_gravity_cache;
+
+  /* Cell properties */
+  const int gcount_i = ci->gcount;
+  const int gcount_j = cj->gcount;
+  struct gpart *restrict gparts_i = ci->gparts;
+  struct gpart *restrict gparts_j = cj->gparts;
+  const int ci_active = cell_is_active(ci, e);
+  const int cj_active = cell_is_active(cj, e);
+  const double loc_i[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
+  const double loc_j[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
+  const double loc_mean[3] = {0.5 * (loc_i[0] + loc_j[0]),
+                              0.5 * (loc_i[1] + loc_j[1]),
+                              0.5 * (loc_i[2] + loc_j[2])};
+
+  /* Anything to do here ?*/
+  if (!ci_active && !cj_active) return;
+
+  /* Check that we fit in cache */
+  if (gcount_i > ci_cache->count || gcount_j > cj_cache->count)
+    error("Not enough space in the caches! gcount_i=%d gcount_j=%d", gcount_i,
+          gcount_j);
+
+  /* Computed the padded counts */
+  const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE;
+  const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE;
+
+  /* Fill the caches */
+  gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i,
+                         loc_mean);
+  gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j,
+                         loc_mean);
+
+  /* Ok... Here we go ! */
+
+  if (ci_active) {
+
+    /* Loop over all particles in ci... */
+    for (int pid = 0; pid < gcount_i; pid++) {
+
+      /* Skip inactive particles */
+      if (!gpart_is_active(&gparts_i[pid], e)) continue;
+
+      const float x_i = ci_cache->x[pid];
+      const float y_i = ci_cache->y[pid];
+      const float z_i = ci_cache->z[pid];
+
+      /* Some powers of the softening length */
+      const float h_i = ci_cache->epsilon[pid];
+      const float h2_i = h_i * h_i;
+      const float h_inv_i = 1.f / h_i;
+      const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i;
+
+      /* Local accumulators for the acceleration */
+      float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+
+      /* Make the compiler understand we are in happy vectorization land */
+      swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT);
+      swift_assume_size(gcount_padded_j, VEC_SIZE);
+
+      /* Loop over every particle in the other cell. */
+      for (int pjd = 0; pjd < gcount_padded_j; pjd++) {
+
+        /* Get info about j */
+        const float x_j = cj_cache->x[pjd];
+        const float y_j = cj_cache->y[pjd];
+        const float z_j = cj_cache->z[pjd];
+        const float mass_j = cj_cache->m[pjd];
+
+        /* Compute the pairwise (square) distance. */
+        const float dx = x_i - x_j;
+        const float dy = y_i - y_j;
+        const float dz = z_i - z_j;
+        const float r2 = dx * dx + dy * dy + dz * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        if (r2 == 0.f) error("Interacting particles with 0 distance");
+
+        /* Check that particles have been drifted to the current time */
+        if (gparts_i[pid].ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+        if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+#endif
+
+        /* Get the inverse distance */
+        const float r_inv = 1.f / sqrtf(r2);
+
+        float f_ij, W_ij;
+
+        if (r2 >= h2_i) {
+
+          /* Get Newtonian gravity */
+          f_ij = mass_j * r_inv * r_inv * r_inv;
+
+        } else {
+
+          const float r = r2 * r_inv;
+          const float ui = r * h_inv_i;
+
+          kernel_grav_eval(ui, &W_ij);
+
+          /* Get softened gravity */
+          f_ij = mass_j * h_inv3_i * W_ij;
+        }
+
+        /* Store it back */
+        a_x -= f_ij * dx;
+        a_y -= f_ij * dy;
+        a_z -= f_ij * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Update the interaction counter if it's not a padded gpart */
+        if (pjd < gcount_j) gparts_i[pid].num_interacted++;
+#endif
+      }
+
+      /* Store everything back in cache */
+      ci_cache->a_x[pid] = a_x;
+      ci_cache->a_y[pid] = a_y;
+      ci_cache->a_z[pid] = a_z;
+    }
+  }
+
+  /* Now do the opposite loop */
+  if (cj_active) {
+
+    /* Loop over all particles in ci... */
+    for (int pjd = 0; pjd < gcount_j; pjd++) {
+
+      /* Skip inactive particles */
+      if (!gpart_is_active(&gparts_j[pjd], e)) continue;
+
+      const float x_j = cj_cache->x[pjd];
+      const float y_j = cj_cache->y[pjd];
+      const float z_j = cj_cache->z[pjd];
+
+      /* Some powers of the softening length */
+      const float h_j = cj_cache->epsilon[pjd];
+      const float h2_j = h_j * h_j;
+      const float h_inv_j = 1.f / h_j;
+      const float h_inv3_j = h_inv_j * h_inv_j * h_inv_j;
+
+      /* Local accumulators for the acceleration */
+      float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+
+      /* Make the compiler understand we are in happy vectorization land */
+      swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+      swift_assume_size(gcount_padded_i, VEC_SIZE);
+
+      /* Loop over every particle in the other cell. */
+      for (int pid = 0; pid < gcount_padded_i; pid++) {
+
+        /* Get info about j */
+        const float x_i = ci_cache->x[pid];
+        const float y_i = ci_cache->y[pid];
+        const float z_i = ci_cache->z[pid];
+        const float mass_i = ci_cache->m[pid];
+
+        /* Compute the pairwise (square) distance. */
+        const float dx = x_j - x_i;
+        const float dy = y_j - y_i;
+        const float dz = z_j - z_i;
+        const float r2 = dx * dx + dy * dy + dz * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        if (r2 == 0.f) error("Interacting particles with 0 distance");
+
+        /* Check that particles have been drifted to the current time */
+        if (gparts_j[pjd].ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+        if (pid < gcount_i && gparts_i[pid].ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+#endif
+
+        /* Get the inverse distance */
+        const float r_inv = 1.f / sqrtf(r2);
+
+        float f_ji, W_ji;
+
+        if (r2 >= h2_j) {
+
+          /* Get Newtonian gravity */
+          f_ji = mass_i * r_inv * r_inv * r_inv;
+
+        } else {
+
+          const float r = r2 * r_inv;
+          const float uj = r * h_inv_j;
+
+          kernel_grav_eval(uj, &W_ji);
+
+          /* Get softened gravity */
+          f_ji = mass_i * h_inv3_j * W_ji;
+        }
+
+        /* Store it back */
+        a_x -= f_ji * dx;
+        a_y -= f_ji * dy;
+        a_z -= f_ji * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Update the interaction counter if it's not a padded gpart */
+        if (pid < gcount_i) gparts_j[pjd].num_interacted++;
+#endif
+      }
+
+      /* Store everything back in cache */
+      cj_cache->a_x[pjd] = a_x;
+      cj_cache->a_y[pjd] = a_y;
+      cj_cache->a_z[pjd] = a_z;
+    }
+  }
+
+  /* Write back to the particles */
+  if (ci_active) gravity_cache_write_back(ci_cache, gparts_i, gcount_i);
+  if (cj_active) gravity_cache_write_back(cj_cache, gparts_j, gcount_j);
+
+#ifdef MATTHIEU_OLD_STUFF
+
   /* Some constants */
   const struct engine *const e = r->e;
 
@@ -258,6 +487,7 @@ void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci,
       }
     }
   }
+#endif
 }
 
 /**
@@ -281,6 +511,251 @@ void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci,
   const double rlr = cell_width * a_smooth;
   const float rlr_inv = 1. / rlr;
 
+  /* Caches to play with */
+  struct gravity_cache *const ci_cache = &r->ci_gravity_cache;
+  struct gravity_cache *const cj_cache = &r->cj_gravity_cache;
+
+  /* Cell properties */
+  const int gcount_i = ci->gcount;
+  const int gcount_j = cj->gcount;
+  struct gpart *restrict gparts_i = ci->gparts;
+  struct gpart *restrict gparts_j = cj->gparts;
+  const int ci_active = cell_is_active(ci, e);
+  const int cj_active = cell_is_active(cj, e);
+  const double loc_i[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
+  const double loc_j[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
+  const double loc_mean[3] = {0.5 * (loc_i[0] + loc_j[0]),
+                              0.5 * (loc_i[1] + loc_j[1]),
+                              0.5 * (loc_i[2] + loc_j[2])};
+
+  /* Anything to do here ?*/
+  if (!ci_active && !cj_active) return;
+
+  /* Check that we fit in cache */
+  if (gcount_i > ci_cache->count || gcount_j > cj_cache->count)
+    error("Not enough space in the caches! gcount_i=%d gcount_j=%d", gcount_i,
+          gcount_j);
+
+  /* Computed the padded counts */
+  const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE;
+  const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE;
+
+  /* Fill the caches */
+  gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i,
+                         loc_mean);
+  gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j,
+                         loc_mean);
+
+  /* Ok... Here we go ! */
+
+  if (ci_active) {
+
+    /* Loop over all particles in ci... */
+    for (int pid = 0; pid < gcount_i; pid++) {
+
+      /* Skip inactive particles */
+      if (!gpart_is_active(&gparts_i[pid], e)) continue;
+
+      const float x_i = ci_cache->x[pid];
+      const float y_i = ci_cache->y[pid];
+      const float z_i = ci_cache->z[pid];
+
+      /* Some powers of the softening length */
+      const float h_i = ci_cache->epsilon[pid];
+      const float h2_i = h_i * h_i;
+      const float h_inv_i = 1.f / h_i;
+      const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i;
+
+      /* Local accumulators for the acceleration */
+      float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+
+      /* Make the compiler understand we are in happy vectorization land */
+      swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT);
+      swift_assume_size(gcount_padded_j, VEC_SIZE);
+
+      /* Loop over every particle in the other cell. */
+      for (int pjd = 0; pjd < gcount_padded_j; pjd++) {
+
+        /* Get info about j */
+        const float x_j = cj_cache->x[pjd];
+        const float y_j = cj_cache->y[pjd];
+        const float z_j = cj_cache->z[pjd];
+        const float mass_j = cj_cache->m[pjd];
+
+        /* Compute the pairwise (square) distance. */
+        const float dx = x_i - x_j;
+        const float dy = y_i - y_j;
+        const float dz = z_i - z_j;
+        const float r2 = dx * dx + dy * dy + dz * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        if (r2 == 0.f) error("Interacting particles with 0 distance");
+
+        /* Check that particles have been drifted to the current time */
+        if (gparts_i[pid].ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+        if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+#endif
+
+        /* Get the inverse distance */
+        const float r_inv = 1.f / sqrtf(r2);
+        const float r = r2 * r_inv;
+
+        float f_ij, W_ij, corr_lr;
+
+        if (r2 >= h2_i) {
+
+          /* Get Newtonian gravity */
+          f_ij = mass_j * r_inv * r_inv * r_inv;
+
+        } else {
+
+          const float ui = r * h_inv_i;
+
+          kernel_grav_eval(ui, &W_ij);
+
+          /* Get softened gravity */
+          f_ij = mass_j * h_inv3_i * W_ij;
+        }
+
+        /* Get long-range correction */
+        const float u_lr = r * rlr_inv;
+        kernel_long_grav_eval(u_lr, &corr_lr);
+        f_ij *= corr_lr;
+
+        /* Store it back */
+        a_x -= f_ij * dx;
+        a_y -= f_ij * dy;
+        a_z -= f_ij * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Update the interaction counter if it's not a padded gpart */
+        if (pjd < gcount_j) gparts_i[pid].num_interacted++;
+#endif
+      }
+
+      /* Store everything back in cache */
+      ci_cache->a_x[pid] = a_x;
+      ci_cache->a_y[pid] = a_y;
+      ci_cache->a_z[pid] = a_z;
+    }
+  }
+
+  /* Now do the opposite loop */
+  if (cj_active) {
+
+    /* Loop over all particles in ci... */
+    for (int pjd = 0; pjd < gcount_j; pjd++) {
+
+      /* Skip inactive particles */
+      if (!gpart_is_active(&gparts_j[pjd], e)) continue;
+
+      const float x_j = cj_cache->x[pjd];
+      const float y_j = cj_cache->y[pjd];
+      const float z_j = cj_cache->z[pjd];
+
+      /* Some powers of the softening length */
+      const float h_j = cj_cache->epsilon[pjd];
+      const float h2_j = h_j * h_j;
+      const float h_inv_j = 1.f / h_j;
+      const float h_inv3_j = h_inv_j * h_inv_j * h_inv_j;
+
+      /* Local accumulators for the acceleration */
+      float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+
+      /* Make the compiler understand we are in happy vectorization land */
+      swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+      swift_assume_size(gcount_padded_i, VEC_SIZE);
+
+      /* Loop over every particle in the other cell. */
+      for (int pid = 0; pid < gcount_padded_i; pid++) {
+
+        /* Get info about j */
+        const float x_i = ci_cache->x[pid];
+        const float y_i = ci_cache->y[pid];
+        const float z_i = ci_cache->z[pid];
+        const float mass_i = ci_cache->m[pid];
+
+        /* Compute the pairwise (square) distance. */
+        const float dx = x_j - x_i;
+        const float dy = y_j - y_i;
+        const float dz = z_j - z_i;
+        const float r2 = dx * dx + dy * dy + dz * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        if (r2 == 0.f) error("Interacting particles with 0 distance");
+
+        /* Check that particles have been drifted to the current time */
+        if (gparts_j[pjd].ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+        if (pid < gcount_i && gparts_i[pid].ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+#endif
+
+        /* Get the inverse distance */
+        const float r_inv = 1.f / sqrtf(r2);
+        const float r = r2 * r_inv;
+
+        float f_ji, W_ji, corr_lr;
+
+        if (r2 >= h2_j) {
+
+          /* Get Newtonian gravity */
+          f_ji = mass_i * r_inv * r_inv * r_inv;
+
+        } else {
+
+          const float uj = r * h_inv_j;
+
+          kernel_grav_eval(uj, &W_ji);
+
+          /* Get softened gravity */
+          f_ji = mass_i * h_inv3_j * W_ji;
+        }
+
+        /* Get long-range correction */
+        const float u_lr = r * rlr_inv;
+        kernel_long_grav_eval(u_lr, &corr_lr);
+        f_ji *= corr_lr;
+
+        /* Store it back */
+        a_x -= f_ji * dx;
+        a_y -= f_ji * dy;
+        a_z -= f_ji * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Update the interaction counter if it's not a padded gpart */
+        if (pid < gcount_i) gparts_j[pjd].num_interacted++;
+#endif
+      }
+
+      /* Store everything back in cache */
+      cj_cache->a_x[pjd] = a_x;
+      cj_cache->a_y[pjd] = a_y;
+      cj_cache->a_z[pjd] = a_z;
+    }
+  }
+
+  /* Write back to the particles */
+  if (ci_active) gravity_cache_write_back(ci_cache, gparts_i, gcount_i);
+  if (cj_active) gravity_cache_write_back(cj_cache, gparts_j, gcount_j);
+
+#ifdef MATTHIEU_OLD_STUFF
+  /* Some constants */
+  const struct engine *const e = r->e;
+  const struct space *s = e->s;
+  const double cell_width = s->width[0];
+  const double a_smooth = e->gravity_properties->a_smooth;
+  const double rlr = cell_width * a_smooth;
+  const float rlr_inv = 1. / rlr;
+
   /* Cell properties */
   const int gcount_i = ci->gcount;
   const int gcount_j = cj->gcount;
@@ -374,6 +849,8 @@ void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci,
       }
     }
   }
+
+#endif
 }
 
 /**
@@ -442,6 +919,129 @@ void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) {
  */
 void runner_doself_grav_pp_full(struct runner *r, struct cell *c) {
 
+  /* Some constants */
+  const struct engine *const e = r->e;
+  struct gravity_cache *const ci_cache = &r->ci_gravity_cache;
+
+  /* Cell properties */
+  const int gcount = c->gcount;
+  struct gpart *restrict gparts = c->gparts;
+  const int c_active = cell_is_active(c, e);
+  const double loc[3] = {c->loc[0] + 0.5 * c->width[0],
+                         c->loc[1] + 0.5 * c->width[1],
+                         c->loc[2] + 0.5 * c->width[2]};
+
+  /* Anything to do here ?*/
+  if (!c_active) return;
+
+  /* Check that we fit in cache */
+  if (gcount > ci_cache->count)
+    error("Not enough space in the cache! gcount=%d", gcount);
+
+  /* Computed the padded counts */
+  const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE;
+
+  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc);
+
+  /* Ok... Here we go ! */
+
+  /* Loop over all particles in ci... */
+  for (int pid = 0; pid < gcount; pid++) {
+
+    /* Skip inactive particles */
+    if (!gpart_is_active(&gparts[pid], e)) continue;
+
+    const float x_i = ci_cache->x[pid];
+    const float y_i = ci_cache->y[pid];
+    const float z_i = ci_cache->z[pid];
+
+    /* Some powers of the softening length */
+    const float h_i = ci_cache->epsilon[pid];
+    const float h2_i = h_i * h_i;
+    const float h_inv_i = 1.f / h_i;
+    const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i;
+
+    /* Local accumulators for the acceleration */
+    float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+
+    /* Make the compiler understand we are in happy vectorization land */
+    swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+    swift_assume_size(gcount_padded, VEC_SIZE);
+
+    /* Loop over every other particle in the cell. */
+    for (int pjd = 0; pjd < gcount_padded; pjd++) {
+
+      /* No self interaction */
+      if (pid == pjd) continue;
+
+      /* Get info about j */
+      const float x_j = ci_cache->x[pjd];
+      const float y_j = ci_cache->y[pjd];
+      const float z_j = ci_cache->z[pjd];
+      const float mass_j = ci_cache->m[pjd];
+
+      /* Compute the pairwise (square) distance. */
+      const float dx = x_i - x_j;
+      const float dy = y_i - y_j;
+      const float dz = z_i - z_j;
+      const float r2 = dx * dx + dy * dy + dz * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+      if (r2 == 0.f) error("Interacting particles with 0 distance");
+
+      /* Check that particles have been drifted to the current time */
+      if (gparts[pid].ti_drift != e->ti_current)
+        error("gpi not drifted to current time");
+      if (pjd < gcount && gparts[pjd].ti_drift != e->ti_current)
+        error("gpj not drifted to current time");
+#endif
+
+      /* Get the inverse distance */
+      const float r_inv = 1.f / sqrtf(r2);
+
+      float f_ij, W_ij;
+
+      if (r2 >= h2_i) {
+
+        /* Get Newtonian gravity */
+        f_ij = mass_j * r_inv * r_inv * r_inv;
+
+      } else {
+
+        const float r = r2 * r_inv;
+        const float ui = r * h_inv_i;
+
+        kernel_grav_eval(ui, &W_ij);
+
+        /* Get softened gravity */
+        f_ij = mass_j * h_inv3_i * W_ij;
+      }
+
+      /* Store it back */
+      a_x -= f_ij * dx;
+      a_y -= f_ij * dy;
+      a_z -= f_ij * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+      /* Update the interaction counter if it's not a padded gpart */
+      if (pjd < gcount) gparts[pid].num_interacted++;
+#endif
+    }
+
+    /* Store everything back in cache */
+    ci_cache->a_x[pid] = a_x;
+    ci_cache->a_y[pid] = a_y;
+    ci_cache->a_z[pid] = a_z;
+  }
+
+  /* Write back to the particles */
+  gravity_cache_write_back(ci_cache, gparts, gcount);
+
+#ifdef MATTHIEU_OLD_STUFF
+
   /* Some constants */
   const struct engine *const e = r->e;
 
@@ -511,6 +1111,8 @@ void runner_doself_grav_pp_full(struct runner *r, struct cell *c) {
       }
     }
   }
+
+#endif
 }
 
 /**
@@ -532,6 +1134,140 @@ void runner_doself_grav_pp_truncated(struct runner *r, struct cell *c) {
   const double rlr = cell_width * a_smooth;
   const float rlr_inv = 1. / rlr;
 
+  /* Caches to play with */
+  struct gravity_cache *const ci_cache = &r->ci_gravity_cache;
+
+  /* Cell properties */
+  const int gcount = c->gcount;
+  struct gpart *restrict gparts = c->gparts;
+  const int c_active = cell_is_active(c, e);
+  const double loc[3] = {c->loc[0] + 0.5 * c->width[0],
+                         c->loc[1] + 0.5 * c->width[1],
+                         c->loc[2] + 0.5 * c->width[2]};
+
+  /* Anything to do here ?*/
+  if (!c_active) return;
+
+  /* Check that we fit in cache */
+  if (gcount > ci_cache->count)
+    error("Not enough space in the caches! gcount=%d", gcount);
+
+  /* Computed the padded counts */
+  const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE;
+
+  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc);
+
+  /* Ok... Here we go ! */
+
+  /* Loop over all particles in ci... */
+  for (int pid = 0; pid < gcount; pid++) {
+
+    /* Skip inactive particles */
+    if (!gpart_is_active(&gparts[pid], e)) continue;
+
+    const float x_i = ci_cache->x[pid];
+    const float y_i = ci_cache->y[pid];
+    const float z_i = ci_cache->z[pid];
+
+    /* Some powers of the softening length */
+    const float h_i = ci_cache->epsilon[pid];
+    const float h2_i = h_i * h_i;
+    const float h_inv_i = 1.f / h_i;
+    const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i;
+
+    /* Local accumulators for the acceleration */
+    float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+
+    /* Make the compiler understand we are in happy vectorization land */
+    swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+    swift_assume_size(gcount_padded, VEC_SIZE);
+
+    /* Loop over every other particle in the cell. */
+    for (int pjd = 0; pjd < gcount_padded; pjd++) {
+
+      /* No self interaction */
+      if (pid == pjd) continue;
+
+      /* Get info about j */
+      const float x_j = ci_cache->x[pjd];
+      const float y_j = ci_cache->y[pjd];
+      const float z_j = ci_cache->z[pjd];
+      const float mass_j = ci_cache->m[pjd];
+
+      /* Compute the pairwise (square) distance. */
+      const float dx = x_i - x_j;
+      const float dy = y_i - y_j;
+      const float dz = z_i - z_j;
+      const float r2 = dx * dx + dy * dy + dz * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+      if (r2 == 0.f) error("Interacting particles with 0 distance");
+
+      /* Check that particles have been drifted to the current time */
+      if (gparts[pid].ti_drift != e->ti_current)
+        error("gpi not drifted to current time");
+      if (pjd < gcount && gparts[pjd].ti_drift != e->ti_current)
+        error("gpj not drifted to current time");
+#endif
+
+      /* Get the inverse distance */
+      const float r_inv = 1.f / sqrtf(r2);
+      const float r = r2 * r_inv;
+
+      float f_ij, W_ij, corr_lr;
+
+      if (r2 >= h2_i) {
+
+        /* Get Newtonian gravity */
+        f_ij = mass_j * r_inv * r_inv * r_inv;
+
+      } else {
+
+        const float ui = r * h_inv_i;
+
+        kernel_grav_eval(ui, &W_ij);
+
+        /* Get softened gravity */
+        f_ij = mass_j * h_inv3_i * W_ij;
+      }
+
+      /* Get long-range correction */
+      const float u_lr = r * rlr_inv;
+      kernel_long_grav_eval(u_lr, &corr_lr);
+      f_ij *= corr_lr;
+
+      /* Store it back */
+      a_x -= f_ij * dx;
+      a_y -= f_ij * dy;
+      a_z -= f_ij * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+      /* Update the interaction counter if it's not a padded gpart */
+      if (pjd < gcount) gparts[pid].num_interacted++;
+#endif
+    }
+
+    /* Store everything back in cache */
+    ci_cache->a_x[pid] = a_x;
+    ci_cache->a_y[pid] = a_y;
+    ci_cache->a_z[pid] = a_z;
+  }
+
+  /* Write back to the particles */
+  gravity_cache_write_back(ci_cache, gparts, gcount);
+
+#ifdef MATTHIEU_OLD_STUFF
+  /* Some constants */
+  const struct engine *const e = r->e;
+  const struct space *s = e->s;
+  const double cell_width = s->width[0];
+  const double a_smooth = e->gravity_properties->a_smooth;
+  const double rlr = cell_width * a_smooth;
+  const float rlr_inv = 1. / rlr;
+
   /* Cell properties */
   const int gcount = c->gcount;
   struct gpart *restrict gparts = c->gparts;
@@ -598,6 +1334,7 @@ void runner_doself_grav_pp_truncated(struct runner *r, struct cell *c) {
       }
     }
   }
+#endif
 }
 
 /**
@@ -666,9 +1403,12 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj,
   const struct engine *e = r->e;
   const struct space *s = e->s;
   const int periodic = s->periodic;
+  const double cell_width = s->width[0];
   const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
   const struct gravity_props *props = e->gravity_properties;
   const double theta_crit_inv = props->theta_crit_inv;
+  const double max_distance = props->a_smooth * props->r_cut_max * cell_width;
+  const double max_distance2 = max_distance * max_distance;
 
 #ifdef SWIFT_DEBUG_CHECKS
 
@@ -688,38 +1428,46 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj,
     error("cj->multipole not drifted.");
 #endif
 
-#if ICHECK > 0
-  for (int pid = 0; pid < ci->gcount; pid++) {
-
-    /* Get a hold of the ith part in ci. */
-    struct gpart *restrict gp = &ci->gparts[pid];
+  TIMER_TIC;
 
-    if (gp->id_or_neg_offset == ICHECK)
-      message("id=%lld loc=[ %f %f %f ] size= %f count= %d",
-              gp->id_or_neg_offset, cj->loc[0], cj->loc[1], cj->loc[2],
-              cj->width[0], cj->gcount);
-  }
+  /* Anything to do here? */
+  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
 
-  for (int pid = 0; pid < cj->gcount; pid++) {
+  /* Recover the multipole information */
+  struct gravity_tensors *const multi_i = ci->multipole;
+  struct gravity_tensors *const multi_j = cj->multipole;
 
-    /* Get a hold of the ith part in ci. */
-    struct gpart *restrict gp = &cj->gparts[pid];
+  /* Get the distance between the CoMs */
+  double dx = multi_i->CoM[0] - multi_j->CoM[0];
+  double dy = multi_i->CoM[1] - multi_j->CoM[1];
+  double dz = multi_i->CoM[2] - multi_j->CoM[2];
 
-    if (gp->id_or_neg_offset == ICHECK)
-      message("id=%lld loc=[ %f %f %f ] size= %f count= %d",
-              gp->id_or_neg_offset, ci->loc[0], ci->loc[1], ci->loc[2],
-              ci->width[0], ci->gcount);
+  /* Apply BC */
+  if (periodic) {
+    dx = nearest(dx, dim[0]);
+    dy = nearest(dy, dim[1]);
+    dz = nearest(dz, dim[2]);
   }
-#endif
+  const double r2 = dx * dx + dy * dy + dz * dz;
 
-  /* Anything to do here? */
-  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
+  /* Are we beyond the distance where the truncated forces are 0? */
+  if (periodic && r2 > max_distance2) {
 
-  TIMER_TIC;
+#ifdef SWIFT_DEBUG_CHECKS
+    /* Need to account for the interactions we missed */
+    if (cell_is_active(ci, e))
+      multi_i->pot.num_interacted += multi_j->m_pole.num_gpart;
+    if (cell_is_active(cj, e))
+      multi_j->pot.num_interacted += multi_i->m_pole.num_gpart;
+#endif
+    return;
+  }
+
+  /* OK, we actually need to compute this pair. Let's find the cheapest
+   * option... */
 
   /* Can we use M-M interactions ? */
-  if (gravity_multipole_accept(ci->multipole, cj->multipole, theta_crit_inv,
-                               periodic, dim)) {
+  if (gravity_multipole_accept(multi_i, multi_j, theta_crit_inv, r2)) {
 
     /* MATTHIEU: make a symmetric M-M interaction function ! */
     runner_dopair_grav_mm(r, ci, cj);
@@ -732,8 +1480,8 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj,
   /* Alright, we'll have to split and recurse. */
   else {
 
-    const double ri_max = ci->multipole->r_max;
-    const double rj_max = cj->multipole->r_max;
+    const double ri_max = multi_i->r_max;
+    const double rj_max = multi_j->r_max;
 
     /* Split the larger of the two cells and start over again */
     if (ri_max > rj_max) {
@@ -887,8 +1635,6 @@ void runner_do_grav_long_range(struct runner *r, struct cell *ci, int timer) {
   const double theta_crit_inv = props->theta_crit_inv;
   const double max_distance = props->a_smooth * props->r_cut_max * cell_width;
   const double max_distance2 = max_distance * max_distance;
-  struct gravity_tensors *const mi = ci->multipole;
-  const double CoM[3] = {mi->CoM[0], mi->CoM[1], mi->CoM[2]};
 
   TIMER_TIC;
 
@@ -903,51 +1649,80 @@ void runner_do_grav_long_range(struct runner *r, struct cell *ci, int timer) {
   if (ci->ti_old_multipole != e->ti_current)
     error("Interacting un-drifted multipole");
 
+  /* Recover the local multipole */
+  struct gravity_tensors *const multi_i = ci->multipole;
+  const double CoM_i[3] = {multi_i->CoM[0], multi_i->CoM[1], multi_i->CoM[2]};
+  const double CoM_rebuild_i[3] = {multi_i->CoM_rebuild[0],
+                                   multi_i->CoM_rebuild[1],
+                                   multi_i->CoM_rebuild[2]};
+
   /* Loop over all the top-level cells and go for a M-M interaction if
    * well-separated */
   for (int i = 0; i < nr_cells; ++i) {
 
     /* Handle on the top-level cell and it's gravity business*/
     struct cell *cj = &cells[i];
-    const struct gravity_tensors *const mj = cj->multipole;
+    const struct gravity_tensors *const multi_j = cj->multipole;
 
     /* Avoid stupid cases */
     if (ci == cj || cj->gcount == 0) continue;
 
-    /* Is this interaction part of the periodic mesh calculation ?*/
-    if (periodic) {
+    /* Get the distance between the CoMs */
+    double dx = CoM_i[0] - multi_j->CoM[0];
+    double dy = CoM_i[1] - multi_j->CoM[1];
+    double dz = CoM_i[2] - multi_j->CoM[2];
 
-      const double dx = nearest(CoM[0] - mj->CoM[0], dim[0]);
-      const double dy = nearest(CoM[1] - mj->CoM[1], dim[1]);
-      const double dz = nearest(CoM[2] - mj->CoM[2], dim[2]);
-      const double r2 = dx * dx + dy * dy + dz * dz;
+    /* Apply BC */
+    if (periodic) {
+      dx = nearest(dx, dim[0]);
+      dy = nearest(dy, dim[1]);
+      dz = nearest(dz, dim[2]);
+    }
+    const double r2 = dx * dx + dy * dy + dz * dz;
 
-      /* Are we beyond the distance where the truncated forces are 0 ?*/
-      if (r2 > max_distance2) {
+    /* Are we beyond the distance where the truncated forces are 0 ?*/
+    if (periodic && r2 > max_distance2) {
 
 #ifdef SWIFT_DEBUG_CHECKS
-        /* Need to account for the interactions we missed */
-        mi->pot.num_interacted += mj->m_pole.num_gpart;
+      /* Need to account for the interactions we missed */
+      multi_i->pot.num_interacted += multi_j->m_pole.num_gpart;
 #endif
-        continue;
-      }
+      continue;
     }
 
     /* Check the multipole acceptance criterion */
-    if (gravity_multipole_accept(mi, mj, theta_crit_inv, periodic, dim)) {
+    if (gravity_multipole_accept(multi_i, multi_j, theta_crit_inv, r2)) {
 
       /* Go for a (non-symmetric) M-M calculation */
       runner_dopair_grav_mm(r, ci, cj);
-    }
-    /* Is the criterion violated now but was OK at the last rebuild ? */
-    else if (gravity_multipole_accept_rebuild(mi, mj, theta_crit_inv, periodic,
-                                              dim)) {
 
-      /* Alright, we have to take charge of that pair in a different way. */
-      // MATTHIEU: We should actually open the tree-node here and recurse.
-      runner_dopair_grav_mm(r, ci, cj);
+    } else {
+
+      /* Let's check whether we need to still operate on this pair */
+
+      /* Get the distance between the CoMs at the last rebuild*/
+      double dx = CoM_rebuild_i[0] - multi_j->CoM_rebuild[0];
+      double dy = CoM_rebuild_i[1] - multi_j->CoM_rebuild[1];
+      double dz = CoM_rebuild_i[2] - multi_j->CoM_rebuild[2];
+
+      /* Apply BC */
+      if (periodic) {
+        dx = nearest(dx, dim[0]);
+        dy = nearest(dy, dim[1]);
+        dz = nearest(dz, dim[2]);
+      }
+      const double r2_rebuild = dx * dx + dy * dy + dz * dz;
+
+      /* Is the criterion violated now but was OK at the last rebuild ? */
+      if (gravity_multipole_accept_rebuild(multi_i, multi_j, theta_crit_inv,
+                                           r2_rebuild)) {
+
+        /* Alright, we have to take charge of that pair in a different way. */
+        // MATTHIEU: We should actually open the tree-node here and recurse.
+        runner_dopair_grav_mm(r, ci, cj);
+      }
     }
-  }
+  } /* Loop over top-level cells */
 
   if (timer) TIMER_TOC(timer_dograv_long_range);
 }
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index 5b22a67bbf757e09f144ba3accc2ef5adfb706f0..1557c1ec84863c3518f8671cdf0e9ca44732e94e 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -63,10 +63,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
     vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz,
     int *icount_align) {
 
-#ifdef HAVE_AVX512_F
-  KNL_MASK_16 knl_mask, knl_mask2;
-#endif
-  vector int_mask, int_mask2;
+  mask_t int_mask, int_mask2;
 
   /* Work out the number of remainder interactions and pad secondary cache. */
   *icount_align = icount;
@@ -75,16 +72,10 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
     int pad = (NUM_VEC_PROC * VEC_SIZE) - rem;
     *icount_align += pad;
 
-/* Initialise masks to true. */
-#ifdef HAVE_AVX512_F
-    knl_mask = 0xFFFF;
-    knl_mask2 = 0xFFFF;
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#else
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#endif
+    /* Initialise masks to true. */
+    vec_init_mask_true(int_mask);
+    vec_init_mask_true(int_mask2);
+
     /* Pad secondary cache so that there are no contributions in the interaction
      * function. */
     for (int i = icount; i < *icount_align; i++) {
@@ -100,19 +91,10 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
 
     /* Zero parts of mask that represent the padded values.*/
     if (pad < VEC_SIZE) {
-#ifdef HAVE_AVX512_F
-      knl_mask2 = knl_mask2 >> pad;
-#else
-      for (int i = VEC_SIZE - pad; i < VEC_SIZE; i++) int_mask2.i[i] = 0;
-#endif
+      vec_pad_mask(int_mask2, pad);
     } else {
-#ifdef HAVE_AVX512_F
-      knl_mask = knl_mask >> (VEC_SIZE - rem);
-      knl_mask2 = 0;
-#else
-      for (int i = rem; i < VEC_SIZE; i++) int_mask.i[i] = 0;
-      int_mask2.v = vec_setzero();
-#endif
+      vec_pad_mask(int_mask, VEC_SIZE - rem);
+      vec_zero_mask(int_mask2);
     }
 
     /* Perform remainder interaction and remove remainder from aligned
@@ -125,12 +107,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
         &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align],
         &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum,
         wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask,
-        int_mask2,
-#ifdef HAVE_AVX512_F
-        knl_mask, knl_mask2);
-#else
-        0, 0);
-#endif
+        int_mask2, 1);
   }
 }
 
@@ -144,10 +121,6 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
  * @param v_dx #vector of the x separation between two particles.
  * @param v_dy #vector of the y separation between two particles.
  * @param v_dz #vector of the z separation between two particles.
- * @param v_mj #vector of the mass of particle pj.
- * @param v_vjx #vector of x velocity of pj.
- * @param v_vjy #vector of y velocity of pj.
- * @param v_vjz #vector of z velocity of pj.
  * @param cell_cache #cache of all particles in the cell.
  * @param int_cache (return) secondary #cache of interactions between two
  * particles.
@@ -174,44 +147,33 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
  */
 __attribute__((always_inline)) INLINE static void storeInteractions(
     const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
-    vector *v_dz, vector *v_mj, vector *v_vjx, vector *v_vjy, vector *v_vjz,
-    const struct cache *const cell_cache, struct c2_cache *const int_cache,
-    int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum,
-    vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,
-    vector *curlvySum, vector *curlvzSum, vector v_hi_inv, vector v_vix,
-    vector v_viy, vector v_viz) {
+    vector *v_dz, const struct cache *const cell_cache,
+    struct c2_cache *const int_cache, int *icount, vector *rhoSum,
+    vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
+    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
+    vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) {
 
 /* Left-pack values needed into the secondary cache using the interaction mask.
  */
 #if defined(HAVE_AVX2) || defined(HAVE_AVX512_F)
-  int pack = 0;
-
-#ifdef HAVE_AVX512_F
-  pack += __builtin_popcount(mask);
-  VEC_LEFT_PACK(v_r2->v, mask, &int_cache->r2q[*icount]);
-  VEC_LEFT_PACK(v_dx->v, mask, &int_cache->dxq[*icount]);
-  VEC_LEFT_PACK(v_dy->v, mask, &int_cache->dyq[*icount]);
-  VEC_LEFT_PACK(v_dz->v, mask, &int_cache->dzq[*icount]);
-  VEC_LEFT_PACK(v_mj->v, mask, &int_cache->mq[*icount]);
-  VEC_LEFT_PACK(v_vjx->v, mask, &int_cache->vxq[*icount]);
-  VEC_LEFT_PACK(v_vjy->v, mask, &int_cache->vyq[*icount]);
-  VEC_LEFT_PACK(v_vjz->v, mask, &int_cache->vzq[*icount]);
-#else
-  vector v_mask;
-  VEC_FORM_PACKED_MASK(mask, v_mask.m, pack);
-
-  VEC_LEFT_PACK(v_r2->v, v_mask.m, &int_cache->r2q[*icount]);
-  VEC_LEFT_PACK(v_dx->v, v_mask.m, &int_cache->dxq[*icount]);
-  VEC_LEFT_PACK(v_dy->v, v_mask.m, &int_cache->dyq[*icount]);
-  VEC_LEFT_PACK(v_dz->v, v_mask.m, &int_cache->dzq[*icount]);
-  VEC_LEFT_PACK(v_mj->v, v_mask.m, &int_cache->mq[*icount]);
-  VEC_LEFT_PACK(v_vjx->v, v_mask.m, &int_cache->vxq[*icount]);
-  VEC_LEFT_PACK(v_vjy->v, v_mask.m, &int_cache->vyq[*icount]);
-  VEC_LEFT_PACK(v_vjz->v, v_mask.m, &int_cache->vzq[*icount]);
-
-#endif /* HAVE_AVX512_F */
-
-  (*icount) += pack;
+  mask_t packed_mask;
+  VEC_FORM_PACKED_MASK(mask, packed_mask);
+
+  VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]);
+  VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]);
+  VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]);
+  VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]);
+  VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask,
+                &int_cache->mq[*icount]);
+  VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask,
+                &int_cache->vxq[*icount]);
+  VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask,
+                &int_cache->vyq[*icount]);
+  VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask,
+                &int_cache->vzq[*icount]);
+
+  /* Increment interaction count by number of bits set in mask. */
+  (*icount) += __builtin_popcount(mask);
 #else
   /* Quicker to do it serially in AVX rather than use intrinsics. */
   for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
@@ -242,9 +204,9 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
                         wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum,
                         v_hi_inv, v_vix, v_viy, v_viz, &icount_align);
 
-    vector int_mask, int_mask2;
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
+    mask_t int_mask, int_mask2;
+    vec_init_mask_true(int_mask);
+    vec_init_mask_true(int_mask2);
 
     /* Perform interactions. */
     for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
@@ -253,7 +215,7 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
           &int_cache->dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
           &int_cache->vxq[pjd], &int_cache->vyq[pjd], &int_cache->vzq[pjd],
           &int_cache->mq[pjd], rhoSum, rho_dhSum, wcountSum, wcount_dhSum,
-          div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0, 0);
+          div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0);
     }
 
     /* Reset interaction count. */
@@ -262,7 +224,8 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
 }
 
 /**
- * @brief Populates the arrays max_di and max_dj with the maximum distances of
+ * @brief Populates the arrays max_index_i and max_index_j with the maximum
+ * indices of
  * particles into their neighbouring cells. Also finds the first pi that
  * interacts with any particle in cj and the last pj that interacts with any
  * particle in ci.
@@ -277,77 +240,121 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
  * @param hj_max Maximal smoothing length in cell cj
  * @param di_max Maximal position on the axis that can interact in cell ci
  * @param dj_min Minimal position on the axis that can interact in cell ci
- * @param max_di array to hold the maximum distances of pi particles into cell
+ * @param max_index_i array to hold the maximum distances of pi particles into
+ * cell
  * cj
- * @param max_dj array to hold the maximum distances of pj particles into cell
+ * @param max_index_j array to hold the maximum distances of pj particles into
+ * cell
  * cj
  * @param init_pi first pi to interact with a pj particle
  * @param init_pj last pj to interact with a pi particle
  * @param e The #engine.
  */
-__attribute__((always_inline)) INLINE static void populate_max_d_no_cache(
+__attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
     const struct cell *ci, const struct cell *cj,
     const struct entry *restrict sort_i, const struct entry *restrict sort_j,
     const float dx_max, const float rshift, const double hi_max,
     const double hj_max, const double di_max, const double dj_min,
-    float *max_di, float *max_dj, int *init_pi, int *init_pj,
+    int *max_index_i, int *max_index_j, int *init_pi, int *init_pj,
     const struct engine *e) {
 
   const struct part *restrict parts_i = ci->parts;
   const struct part *restrict parts_j = cj->parts;
 
   int first_pi = 0, last_pj = cj->count - 1;
+  int temp;
+
+  /* Find the leftmost active particle in cell i that interacts with any
+   * particle in cell j. */
+  first_pi = ci->count;
+  int active_id = first_pi - 1;
+  while (first_pi > 0 && sort_i[first_pi - 1].d + dx_max + hi_max > dj_min) {
+    first_pi--;
+    /* Store the index of the particle if it is active. */
+    if (part_is_active(&parts_i[sort_i[first_pi].i], e)) active_id = first_pi;
+  }
 
-  /* Find the first active particle in ci to interact with any particle in cj.
-   */
-  /* Populate max_di with distances. */
-  int active_id = ci->count - 1;
-  for (int k = ci->count - 1; k >= 0; k--) {
-    const struct part *pi = &parts_i[sort_i[k].i];
-    const float d = sort_i[k].d + dx_max;
-
-    // max_di[k] = d + h * kernel_gamma - rshift;
-    max_di[k] = d + hi_max;
-
-    /* If the particle is out of range set the index to
-     * the last active particle within range. */
-    if (d + hi_max < dj_min) {
-      first_pi = active_id;
-      break;
-    } else {
-      if (part_is_active(pi, e)) active_id = k;
+  /* Set the first active pi in range of any particle in cell j. */
+  first_pi = active_id;
+
+  /* Find the maximum index into cell j for each particle in range in cell i. */
+  if (first_pi < ci->count) {
+
+    /* Start from the first particle in cell j. */
+    temp = 0;
+
+    const struct part *pi = &parts_i[sort_i[first_pi].i];
+
+    /* Loop through particles in cell j until they are not in range of pi. */
+    while (temp <= cj->count &&
+           (sort_i[first_pi].d + (pi->h * kernel_gamma + dx_max - rshift) >
+            sort_j[temp].d))
+      temp++;
+
+    max_index_i[first_pi] = temp;
+
+    /* Populate max_index_i for remaining particles that are within range. */
+    for (int i = first_pi + 1; i < ci->count; i++) {
+      temp = max_index_i[i - 1];
+      pi = &parts_i[sort_i[i].i];
+
+      while (temp <= cj->count &&
+             (sort_i[i].d + (pi->h * kernel_gamma + dx_max - rshift) >
+              sort_j[temp].d))
+        temp++;
+
+      max_index_i[i] = temp;
     }
+  } else {
+    /* Make sure that max index is set to first particle in cj.*/
+    max_index_i[ci->count - 1] = 0;
   }
 
-  /* Find the maximum distance of pi particles into cj.*/
-  for (int k = first_pi + 1; k < ci->count; k++) {
-    max_di[k] = fmaxf(max_di[k - 1], max_di[k]);
+  /* Find the rightmost active particle in cell j that interacts with any
+   * particle in cell i. */
+  last_pj = -1;
+  active_id = last_pj;
+  while (last_pj < cj->count &&
+         sort_j[last_pj + 1].d - hj_max - dx_max < di_max) {
+    last_pj++;
+    /* Store the index of the particle if it is active. */
+    if (part_is_active(&parts_j[sort_j[last_pj].i], e)) active_id = last_pj;
   }
 
-  /* Find the last particle in cj to interact with any particle in ci. */
-  /* Populate max_dj with distances. */
-  active_id = 0;
-  for (int k = 0; k < cj->count; k++) {
-    const struct part *pj = &parts_j[sort_j[k].i];
-    const float d = sort_j[k].d - dx_max;
-
-    /*TODO: don't think rshift should be taken off here, waiting on Pedro. */
-    // max_dj[k] = d - h * kernel_gamma - rshift;
-    max_dj[k] = d - hj_max;
-
-    /* If the particle is out of range set the index to
-     * the last active particle within range. */
-    if (d - hj_max > di_max) {
-      last_pj = active_id;
-      break;
-    } else {
-      if (part_is_active(pj, e)) active_id = k;
-    }
-  }
+  /* Set the last active pj in range of any particle in cell i. */
+  last_pj = active_id;
+
+  /* Find the maximum index into cell i for each particle in range in cell j. */
+  if (last_pj > 0) {
+
+    /* Start from the last particle in cell i. */
+    temp = ci->count - 1;
+
+    const struct part *pj = &parts_j[sort_j[last_pj].i];
+
+    /* Loop through particles in cell i until they are not in range of pj. */
+    while (temp > 0 &&
+           sort_j[last_pj].d - dx_max - (pj->h * kernel_gamma) <
+               sort_i[temp].d - rshift)
+      temp--;
 
-  /* Find the maximum distance of pj particles into ci.*/
-  for (int k = 1; k <= last_pj; k++) {
-    max_dj[k] = fmaxf(max_dj[k - 1], max_dj[k]);
+    max_index_j[last_pj] = temp;
+
+    /* Populate max_index_j for remaining particles that are within range. */
+    for (int i = last_pj - 1; i >= 0; i--) {
+      temp = max_index_j[i + 1];
+      pj = &parts_j[sort_j[i].i];
+
+      while (temp > 0 &&
+             sort_j[i].d - dx_max - (pj->h * kernel_gamma) <
+                 sort_i[temp].d - rshift)
+        temp--;
+
+      max_index_j[i] = temp;
+    }
+  } else {
+    /* Make sure that max index is set to last particle in ci.*/
+    max_index_j[0] = ci->count - 1;
   }
 
   *init_pi = first_pi;
@@ -367,7 +374,6 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
 
 #ifdef WITH_VECTORIZATION
   const struct engine *e = r->e;
-  int doi_mask;
   struct part *restrict pi;
   int count_align;
   int num_vec_proc = NUM_VEC_PROC;
@@ -459,9 +465,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
     }
 
     vector pjx, pjy, pjz;
-    vector pjvx, pjvy, pjvz, mj;
     vector pjx2, pjy2, pjz2;
-    vector pjvx2, pjvy2, pjvz2, mj2;
 
     /* Find all of particle pi's interacions and store needed values in the
      * secondary cache.*/
@@ -471,18 +475,10 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       pjx.v = vec_load(&cell_cache->x[pjd]);
       pjy.v = vec_load(&cell_cache->y[pjd]);
       pjz.v = vec_load(&cell_cache->z[pjd]);
-      pjvx.v = vec_load(&cell_cache->vx[pjd]);
-      pjvy.v = vec_load(&cell_cache->vy[pjd]);
-      pjvz.v = vec_load(&cell_cache->vz[pjd]);
-      mj.v = vec_load(&cell_cache->m[pjd]);
 
       pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]);
       pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]);
       pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
-      pjvx2.v = vec_load(&cell_cache->vx[pjd + VEC_SIZE]);
-      pjvy2.v = vec_load(&cell_cache->vy[pjd + VEC_SIZE]);
-      pjvz2.v = vec_load(&cell_cache->vz[pjd + VEC_SIZE]);
-      mj2.v = vec_load(&cell_cache->m[pjd + VEC_SIZE]);
 
       /* Compute the pairwise distance. */
       vector v_dx_tmp, v_dy_tmp, v_dz_tmp;
@@ -502,53 +498,47 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
       v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
 
-/* Form a mask from r2 < hig2 and r2 > 0.*/
-#ifdef HAVE_AVX512_F
-      // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check;
-      KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check;
-
-      doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
-      doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
-
-      doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
-      doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
-
-      doi_mask = doi_mask & doi_mask_check;
-      doi_mask2 = doi_mask2 & doi_mask2_check;
-
-#else
-      vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check;
-      int doi_mask2;
+      /* Form a mask from r2 < hig2 and r2 > 0.*/
+      mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2,
+          v_doi_mask2_self_check;
+      int doi_mask, doi_mask_self_check, doi_mask2, doi_mask2_self_check;
 
       /* Form r2 > 0 mask and r2 < hig2 mask. */
-      v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero());
-      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+      vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero()));
+      vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));
 
       /* Form r2 > 0 mask and r2 < hig2 mask. */
-      v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero());
-      v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+      vec_create_mask(v_doi_mask2_self_check,
+                      vec_cmp_gt(v_r2_2.v, vec_setzero()));
+      vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v));
 
-      /* Combine two masks and form integer mask. */
-      doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v));
-      doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v));
-#endif /* HAVE_AVX512_F */
+      /* Form integer masks. */
+      doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check);
+      doi_mask = vec_form_int_mask(v_doi_mask);
+
+      doi_mask2_self_check = vec_form_int_mask(v_doi_mask2_self_check);
+      doi_mask2 = vec_form_int_mask(v_doi_mask2);
+
+      /* Combine the two masks. */
+      doi_mask = doi_mask & doi_mask_self_check;
+      doi_mask2 = doi_mask2 & doi_mask2_self_check;
 
       /* If there are any interactions left pack interaction values into c2
        * cache. */
       if (doi_mask) {
         storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp,
-                          &mj, &pjvx, &pjvy, &pjvz, cell_cache, &int_cache,
+                          cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum,
+                          &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
+                          &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy,
+                          v_viz);
+      }
+      if (doi_mask2) {
+        storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2,
+                          &v_dy_tmp2, &v_dz_tmp2, cell_cache, &int_cache,
                           &icount, &rhoSum, &rho_dhSum, &wcountSum,
                           &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
                           &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
       }
-      if (doi_mask2) {
-        storeInteractions(
-            doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, &v_dy_tmp2,
-            &v_dz_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2, cell_cache, &int_cache,
-            &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
-            &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
-      }
     }
 
     /* Perform padded vector remainder interactions if any are present. */
@@ -559,16 +549,9 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
 
     /* Initialise masks to true in case remainder interactions have been
      * performed. */
-    vector int_mask, int_mask2;
-#ifdef HAVE_AVX512_F
-    KNL_MASK_16 knl_mask = 0xFFFF;
-    KNL_MASK_16 knl_mask2 = 0xFFFF;
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#else
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#endif
+    mask_t int_mask, int_mask2;
+    vec_init_mask_true(int_mask);
+    vec_init_mask_true(int_mask2);
 
     /* Perform interaction with 2 vectors. */
     for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) {
@@ -578,11 +561,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
           &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
           &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
           &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
-#ifdef HAVE_AVX512_F
-          knl_mask, knl_mask2);
-#else
-          0, 0);
-#endif
+          0);
     }
 
     /* Perform horizontal adds on vector sums and store result in particle pi.
@@ -714,38 +693,19 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   }
 
   int first_pi, last_pj;
-  float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-  float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+  int *max_index_i __attribute__((aligned(sizeof(int) * VEC_SIZE)));
+  int *max_index_j __attribute__((aligned(sizeof(int) * VEC_SIZE)));
 
-  max_di = r->ci_cache.max_d;
-  max_dj = r->cj_cache.max_d;
+  max_index_i = r->ci_cache.max_index;
+  max_index_j = r->cj_cache.max_index;
 
-  /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
+  /* Find particles maximum index into cj, max_index_i[] and ci, max_index_j[].
+   */
   /* Also find the first pi that interacts with any particle in cj and the last
    * pj that interacts with any particle in ci. */
-  populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max,
-                          hj_max, di_max, dj_min, max_di, max_dj, &first_pi,
-                          &last_pj, e);
-
-  /* Find the maximum index into cj that is required by a particle in ci. */
-  /* Find the maximum index into ci that is required by a particle in cj. */
-  float di, dj;
-  int max_ind_j = count_j - 1;
-  int max_ind_i = 0;
-
-  dj = sort_j[max_ind_j].d;
-  while (max_ind_j > 0 && max_di[count_i - 1] < dj) {
-    max_ind_j--;
-
-    dj = sort_j[max_ind_j].d;
-  }
-
-  di = sort_i[max_ind_i].d;
-  while (max_ind_i < count_i - 1 && max_dj[0] > di) {
-    max_ind_i++;
-
-    di = sort_i[max_ind_i].d;
-  }
+  populate_max_index_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max,
+                              hj_max, di_max, dj_min, max_index_i, max_index_j,
+                              &first_pi, &last_pj, e);
 
   /* Limits of the outer loops. */
   int first_pi_loop = first_pi;
@@ -753,8 +713,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
   /* Take the max/min of both values calculated to work out how many particles
    * to read into the cache. */
-  last_pj = max(last_pj, max_ind_j);
-  first_pi = min(first_pi, max_ind_i);
+  last_pj = max(last_pj, max_index_i[count_i - 1]);
+  first_pi = min(first_pi, max_index_j[0]);
 
   /* Read the needed particles into the two caches. */
   int first_pi_align = first_pi;
@@ -769,7 +729,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   if (cell_is_active(ci, e)) {
 
     /* Loop over the parts in ci until nothing is within range in cj. */
-    for (int pid = count_i - 1; pid >= first_pi_loop && max_ind_j >= 0; pid--) {
+    for (int pid = count_i - 1; pid >= first_pi_loop; pid--) {
 
       /* Get a hold of the ith part in ci. */
       struct part *restrict pi = &parts_i[sort_i[pid].i];
@@ -785,13 +745,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       if (di_test < dj_min) continue;
 
       /* Determine the exit iteration of the interaction loop. */
-      dj = sort_j[max_ind_j].d;
-      while (max_ind_j > 0 && max_di[pid] < dj) {
-        max_ind_j--;
-
-        dj = sort_j[max_ind_j].d;
-      }
-      int exit_iteration = max_ind_j + 1;
+      int exit_iteration = max_index_i[pid];
 
       const float hig2 = hi * hi * kernel_gamma2;
 
@@ -866,14 +820,14 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
         v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
         v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
 
-        vector v_doi_mask;
+        mask_t v_doi_mask;
         int doi_mask;
 
         /* Form r2 < hig2 mask. */
-        v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+        vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));
 
         /* Form integer mask. */
-        doi_mask = vec_cmp_result(v_doi_mask.v);
+        doi_mask = vec_form_int_mask(v_doi_mask);
 
         /* If there are any interactions perform them. */
         if (doi_mask)
@@ -882,12 +836,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
               &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx],
               &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum,
               &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-              &curlvySum, &curlvzSum, v_doi_mask,
-#ifdef HAVE_AVX512_F
-              knl_mask);
-#else
-              0);
-#endif
+              &curlvySum, &curlvzSum, v_doi_mask);
 
       } /* loop over the parts in cj. */
 
@@ -908,7 +857,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   if (cell_is_active(cj, e)) {
 
     /* Loop over the parts in cj until nothing is within range in ci. */
-    for (int pjd = 0; pjd <= last_pj_loop && max_ind_i < count_i; pjd++) {
+    for (int pjd = 0; pjd <= last_pj_loop; pjd++) {
 
       /* Get a hold of the jth part in cj. */
       struct part *restrict pj = &parts_j[sort_j[pjd].i];
@@ -925,13 +874,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       if (dj_test > di_max) continue;
 
       /* Determine the exit iteration of the interaction loop. */
-      di = sort_i[max_ind_i].d;
-      while (max_ind_i < count_i - 1 && max_dj[pjd] > di) {
-        max_ind_i++;
-
-        di = sort_i[max_ind_i].d;
-      }
-      int exit_iteration = max_ind_i;
+      int exit_iteration = max_index_j[pjd];
 
       const float hjg2 = hj * hj * kernel_gamma2;
 
@@ -1005,14 +948,14 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
         v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
         v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
 
-        vector v_doj_mask;
+        mask_t v_doj_mask;
         int doj_mask;
 
         /* Form r2 < hig2 mask. */
-        v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v);
+        vec_create_mask(v_doj_mask, vec_cmp_lt(v_r2.v, v_hjg2.v));
 
         /* Form integer mask. */
-        doj_mask = vec_cmp_result(v_doj_mask.v);
+        doj_mask = vec_form_int_mask(v_doj_mask);
 
         /* If there are any interactions perform them. */
         if (doj_mask)
@@ -1021,12 +964,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
               &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx],
               &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum,
               &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-              &curlvySum, &curlvzSum, v_doj_mask,
-#ifdef HAVE_AVX512_F
-              knl_mask);
-#else
-              0);
-#endif
+              &curlvySum, &curlvzSum, v_doj_mask);
 
       } /* loop over the parts in ci. */
 
diff --git a/src/scheduler.c b/src/scheduler.c
index ed3c163949875a5621674fa2fbb6fa3157e65bc1..b3505bb874d08a90504dfec2b104f19dcf9dff9a 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -970,9 +970,7 @@ void scheduler_reset(struct scheduler *s, int size) {
   if (size > s->size) {
 
     /* Free existing task lists if necessary. */
-    if (s->tasks != NULL) free(s->tasks);
-    if (s->tasks_ind != NULL) free(s->tasks_ind);
-    if (s->tid_active != NULL) free(s->tid_active);
+    scheduler_free_tasks(s);
 
     /* Allocate the new lists. */
     if (posix_memalign((void *)&s->tasks, task_align,
@@ -1648,11 +1646,29 @@ void scheduler_print_tasks(const struct scheduler *s, const char *fileName) {
  */
 void scheduler_clean(struct scheduler *s) {
 
-  free(s->tasks);
-  free(s->tasks_ind);
+  scheduler_free_tasks(s);
   free(s->unlocks);
   free(s->unlock_ind);
-  free(s->tid_active);
   for (int i = 0; i < s->nr_queues; ++i) queue_clean(&s->queues[i]);
   free(s->queues);
 }
+
+/**
+ * @brief Free the task arrays allocated by this #scheduler.
+ */
+void scheduler_free_tasks(struct scheduler *s) {
+
+  if (s->tasks != NULL) {
+    free(s->tasks);
+    s->tasks = NULL;
+  }
+  if (s->tasks_ind != NULL) {
+    free(s->tasks_ind);
+    s->tasks_ind = NULL;
+  }
+  if (s->tid_active != NULL) {
+    free(s->tid_active);
+    s->tid_active = NULL;
+  }
+  s->size = 0;
+}
diff --git a/src/scheduler.h b/src/scheduler.h
index f38e5fb4d849842217756b7b93713a5e1375c9c5..ac654580b2af2ffb506dc3fd9f0b988b89effbd0 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -52,7 +52,7 @@
 
 /* Flags . */
 #define scheduler_flag_none 0
-#define scheduler_flag_steal 1
+#define scheduler_flag_steal (1 << 1)
 
 /* Data of a scheduler. */
 struct scheduler {
@@ -143,5 +143,6 @@ void scheduler_set_unlocks(struct scheduler *s);
 void scheduler_dump_queue(struct scheduler *s);
 void scheduler_print_tasks(const struct scheduler *s, const char *fileName);
 void scheduler_clean(struct scheduler *s);
+void scheduler_free_tasks(struct scheduler *s);
 
 #endif /* SWIFT_SCHEDULER_H */
diff --git a/src/space.c b/src/space.c
index fc7732f45350b37600198b057583baf5b761c28c..52a34248cd9bf38e03e476e4937fa601b0ee9222 100644
--- a/src/space.c
+++ b/src/space.c
@@ -253,6 +253,15 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements,
   }
 }
 
+/**
+ * @brief Free up any allocated cells.
+ */
+void space_free_cells(struct space *s) {
+  threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper, s->cells_top,
+                 s->nr_cells, sizeof(struct cell), 0, s);
+  s->maxdepth = 0;
+}
+
 /**
  * @brief Re-build the top-level cell grid.
  *
@@ -379,13 +388,15 @@ void space_regrid(struct space *s, int verbose) {
 
     /* Free the old cells, if they were allocated. */
     if (s->cells_top != NULL) {
-      threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper,
-                     s->cells_top, s->nr_cells, sizeof(struct cell), 0, s);
+      space_free_cells(s);
       free(s->cells_top);
       free(s->multipoles_top);
-      s->maxdepth = 0;
     }
 
+    /* Also free the task arrays, these will be regenerated and we can use the
+     * memory while copying the particle arrays. */
+    if (s->e != NULL) scheduler_free_tasks(&s->e->sched);
+
     /* Set the new cell dimensions only if smaller. */
     for (int k = 0; k < 3; k++) {
       s->cdim[k] = cdim[k];
@@ -492,9 +503,7 @@ void space_regrid(struct space *s, int verbose) {
   else { /* Otherwise, just clean up the cells. */
 
     /* Free the old cells, if they were allocated. */
-    threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper,
-                   s->cells_top, s->nr_cells, sizeof(struct cell), 0, s);
-    s->maxdepth = 0;
+    space_free_cells(s);
   }
 
   if (verbose)
diff --git a/src/space.h b/src/space.h
index 7fe8f27ecaf671738e03881d547d6cb29a61503a..dbbba714c2b3c9841905b2ba54e4f2d854b820a6 100644
--- a/src/space.h
+++ b/src/space.h
@@ -226,5 +226,6 @@ void space_check_timesteps(struct space *s);
 void space_replicate(struct space *s, int replicate, int verbose);
 void space_reset_task_counters(struct space *s);
 void space_clean(struct space *s);
+void space_free_cells(struct space *s);
 
 #endif /* SWIFT_SPACE_H */
diff --git a/src/tools.c b/src/tools.c
index 68b23e1a815b510abafbd8f7c6285d3633ba0a63..7d69ebc6c476312081d8a8c34c76c6592da5cab0 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -32,6 +32,7 @@
 #include "tools.h"
 
 /* Local includes. */
+#include "active.h"
 #include "cell.h"
 #include "error.h"
 #include "gravity.h"
@@ -183,6 +184,7 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
   float r2, hi, hj, hig2, hjg2, dx[3];
   struct part *pi, *pj;
   const double dim[3] = {r->e->s->dim[0], r->e->s->dim[1], r->e->s->dim[2]};
+  const struct engine *e = r->e;
 
   /* Implements a double-for loop and checks every interaction */
   for (int i = 0; i < ci->count; ++i) {
@@ -191,6 +193,9 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
     hi = pi->h;
     hig2 = hi * hi * kernel_gamma2;
 
+    /* Skip inactive particles. */
+    if (!part_is_active(pi, e)) continue;
+
     for (int j = 0; j < cj->count; ++j) {
 
       pj = &cj->parts[j];
@@ -219,6 +224,9 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
     hj = pj->h;
     hjg2 = hj * hj * kernel_gamma2;
 
+    /* Skip inactive particles. */
+    if (!part_is_active(pj, e)) continue;
+
     for (int i = 0; i < ci->count; ++i) {
 
       pi = &ci->parts[i];
@@ -311,6 +319,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) {
 void self_all_density(struct runner *r, struct cell *ci) {
   float r2, hi, hj, hig2, hjg2, dxi[3];  //, dxj[3];
   struct part *pi, *pj;
+  const struct engine *e = r->e;
 
   /* Implements a double-for loop and checks every interaction */
   for (int i = 0; i < ci->count; ++i) {
@@ -335,14 +344,14 @@ void self_all_density(struct runner *r, struct cell *ci) {
       }
 
       /* Hit or miss? */
-      if (r2 < hig2) {
+      if (r2 < hig2 && part_is_active(pi, e)) {
 
         /* Interact */
         runner_iact_nonsym_density(r2, dxi, hi, hj, pi, pj);
       }
 
       /* Hit or miss? */
-      if (r2 < hjg2) {
+      if (r2 < hjg2 && part_is_active(pj, e)) {
 
         dxi[0] = -dxi[0];
         dxi[1] = -dxi[1];
diff --git a/src/vector.h b/src/vector.h
index 7d82b9f5c9e2bc6e1fea9426ab3870eeec180408..6a7c6837989025785c1f9134004f2ebcc226a205 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -23,8 +23,12 @@
 /* Have I already read this file? */
 #ifndef VEC_MACRO
 
+/* Config parameters. */
 #include "../config.h"
 
+/* Local headers */
+#include "inline.h"
+
 #ifdef WITH_VECTORIZATION
 
 /* Need to check whether compiler supports this (IBM does not)
@@ -60,9 +64,13 @@
 #define vec_dbl_set(a, b, c, d, e, f, g, h) \
   _mm512_set_pd(h, g, f, e, d, c, b, a)
 #define vec_add(a, b) _mm512_add_ps(a, b)
+#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a)
 #define vec_sub(a, b) _mm512_sub_ps(a, b)
+#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b)
 #define vec_mul(a, b) _mm512_mul_ps(a, b)
+#define vec_div(a, b) _mm512_div_ps(a, b)
 #define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
+#define vec_fnma(a, b, c) _mm512_fnmadd_ps(a, b, c)
 #define vec_sqrt(a) _mm512_sqrt_ps(a)
 #define vec_rcp(a) _mm512_rcp14_ps(a)
 #define vec_rsqrt(a) _mm512_rsqrt14_ps(a)
@@ -75,7 +83,16 @@
 #define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
 #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
 #define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ)
+#define vec_cmp_result(a) ({ a; })
+#define vec_form_int_mask(a) ({ a; })
 #define vec_and(a, b) _mm512_and_ps(a, b)
+#define vec_mask_and(a, b) _mm512_kand(a, b)
+#define vec_and_mask(a, mask) _mm512_maskz_mov_ps(mask, a)
+#define vec_init_mask_true(mask) ({ mask = 0xFFFF; })
+#define vec_zero_mask(mask) ({ mask = 0; })
+#define vec_create_mask(mask, cond) ({ mask = cond; })
+#define vec_pad_mask(mask, pad) ({ mask = mask >> (pad); })
+#define vec_blend(mask, a, b) _mm512_mask_blend_ps(mask, a, b)
 #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
 #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
@@ -116,10 +133,13 @@
     for (int i = 0; i < VEC_SIZE; i++) b += a.f[i]; \
   }
 #endif
-/* Calculates the number of set bits in the mask and adds the result to an int.
- */
-#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
-  pack += __builtin_popcount(mask);
+
+/* Do nothing in the case of AVX-512 as there are already
+ * instructions for left-packing.*/
+#define VEC_FORM_PACKED_MASK(mask, packed_mask) packed_mask = mask
+
+/* Finds the horizontal maximum of vector b and returns a float. */
+#define VEC_HMAX(a, b) b = _mm512_reduce_max_ps(a.v)
 
 /* Performs a left-pack on a vector based upon a mask and returns the result. */
 #define VEC_LEFT_PACK(a, mask, result) \
@@ -142,8 +162,11 @@
 #define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
 #define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
 #define vec_add(a, b) _mm256_add_ps(a, b)
+#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b, mask.v))
 #define vec_sub(a, b) _mm256_sub_ps(a, b)
+#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b, mask.v))
 #define vec_mul(a, b) _mm256_mul_ps(a, b)
+#define vec_div(a, b) _mm256_div_ps(a, b)
 #define vec_sqrt(a) _mm256_sqrt_ps(a)
 #define vec_rcp(a) _mm256_rcp_ps(a)
 #define vec_rsqrt(a) _mm256_rsqrt_ps(a)
@@ -157,7 +180,16 @@
 #define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
 #define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
 #define vec_cmp_result(a) _mm256_movemask_ps(a)
+#define vec_form_int_mask(a) _mm256_movemask_ps(a.v)
 #define vec_and(a, b) _mm256_and_ps(a, b)
+#define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v)
+#define vec_and_mask(a, mask) _mm256_and_ps(a, mask.v)
+#define vec_init_mask_true(mask) mask.m = vec_setint1(0xFFFFFFFF)
+#define vec_create_mask(mask, cond) mask.v = cond
+#define vec_zero_mask(mask) mask.v = vec_setzero()
+#define vec_pad_mask(mask, pad) \
+  for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0
+#define vec_blend(mask, a, b) _mm256_blendv_ps(a, b, mask.v)
 #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
 #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
@@ -183,6 +215,13 @@
   a.v = _mm256_hadd_ps(a.v, a.v); \
   b += a.f[0] + a.f[4];
 
+/* Performs a horizontal maximum on the vector and takes the maximum of the
+ * result with a float, b. */
+#define VEC_HMAX(a, b)                                     \
+  {                                                        \
+    for (int k = 0; k < VEC_SIZE; k++) b = max(b, a.f[k]); \
+  }
+
 /* Returns the lower 128-bits of the 256-bit vector. */
 #define VEC_GET_LOW(a) _mm256_castps256_ps128(a)
 
@@ -192,6 +231,7 @@
 /* Check if we have AVX2 intrinsics alongside AVX */
 #ifdef HAVE_AVX2
 #define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c)
+#define vec_fnma(a, b, c) _mm256_fnmadd_ps(a, b, c)
 
 /* Used in VEC_FORM_PACKED_MASK */
 #define identity_indices 0x0706050403020100
@@ -201,19 +241,18 @@
 /* Takes an integer mask and forms a left-packed integer vector
  * containing indices of the set bits in the integer mask.
  * Also returns the total number of bits set in the mask. */
-#define VEC_FORM_PACKED_MASK(mask, v_mask, pack)                               \
+#define VEC_FORM_PACKED_MASK(mask, packed_mask)                                \
   {                                                                            \
     unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101);         \
     expanded_mask *= 0xFF;                                                     \
     unsigned long wanted_indices = _pext_u64(identity_indices, expanded_mask); \
     __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);                       \
-    v_mask = _mm256_cvtepu8_epi32(bytevec);                                    \
-    pack += __builtin_popcount(mask);                                          \
+    packed_mask.m = _mm256_cvtepu8_epi32(bytevec);                             \
   }
 
 /* Performs a left-pack on a vector based upon a mask and returns the result. */
 #define VEC_LEFT_PACK(a, mask, result) \
-  vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask), result)
+  vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask.m), result)
 #endif /* HAVE_AVX2 */
 
 /* Create an FMA using vec_add and vec_mul if AVX2 is not present. */
@@ -221,6 +260,11 @@
 #define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
 #endif
 
+/* Create a negated FMA using vec_sub and vec_mul if AVX2 is not present. */
+#ifndef vec_fnma
+#define vec_fnma(a, b, c) vec_sub(c, vec_mul(a, b))
+#endif
+
 /* Form a packed mask without intrinsics if AVX2 is not present. */
 #ifndef VEC_FORM_PACKED_MASK
 
@@ -284,6 +328,7 @@
 #define vec_add(a, b) _mm_add_ps(a, b)
 #define vec_sub(a, b) _mm_sub_ps(a, b)
 #define vec_mul(a, b) _mm_mul_ps(a, b)
+#define vec_div(a, b) _mm_div_ps(a, b)
 #define vec_sqrt(a) _mm_sqrt_ps(a)
 #define vec_rcp(a) _mm_rcp_ps(a)
 #define vec_rsqrt(a) _mm_rsqrt_ps(a)
@@ -336,6 +381,13 @@ typedef union {
   int i[VEC_SIZE];
 } vector;
 
+/* Define the mask type depending on the instruction set used. */
+#ifdef HAVE_AVX512_F
+typedef __mmask16 mask_t;
+#else
+typedef vector mask_t;
+#endif
+
 /**
  * @brief Calculates the inverse ($1/x$) of a vector using intrinsics and a
  * Newton iteration to obtain the correct level of accuracy.
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 2eec538ea50440cf5589882af52d3a0c5ca04b92..9cd6e9ab9e09935d39bf416dfbb65b83a874b382 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -21,16 +21,16 @@ AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(FFTW_LIBS
 
 # List of programs and scripts to run in the test suite
 TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetry \
-        testPair.sh testPairPerturbed.sh test27cells.sh test27cellsPerturbed.sh  \
+        testActivePair.sh test27cells.sh test27cellsPerturbed.sh  \
         testParser.sh testSPHStep test125cells.sh test125cellsPerturbed.sh testFFT \
         testAdiabaticIndex testRiemannExact testRiemannTRRS testRiemannHLLC \
-        testMatrixInversion testThreadpool testDump testLogger \
+        testMatrixInversion testThreadpool testDump testLogger testInteractions.sh \
         testVoronoi1D testVoronoi2D testVoronoi3D \
 	testPeriodicBC.sh testPeriodicBCPerturbed.sh
 
 # List of test programs to compile
 check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \
-		 testSPHStep testPair test27cells test125cells testParser \
+		 testSPHStep testActivePair test27cells test125cells testParser \
                  testKernel testFFT testInteractions testMaths \
                  testSymmetry testThreadpool benchmarkInteractions \
                  testAdiabaticIndex testRiemannExact testRiemannTRRS \
@@ -55,7 +55,7 @@ testSPHStep_SOURCES = testSPHStep.c
 
 testSingle_SOURCES = testSingle.c
 
-testPair_SOURCES = testPair.c
+testActivePair_SOURCES = testActivePair.c
 
 test27cells_SOURCES = test27cells.c
 
@@ -96,10 +96,10 @@ testDump_SOURCES = testDump.c
 testLogger_SOURCES = testLogger.c
 
 # Files necessary for distribution
-EXTRA_DIST = testReading.sh makeInput.py testPair.sh testPairPerturbed.sh \
+EXTRA_DIST = testReading.sh makeInput.py testActivePair.sh \
 	     test27cells.sh test27cellsPerturbed.sh testParser.sh testPeriodicBC.sh \
 	     testPeriodicBCPerturbed.sh test125cells.sh test125cellsPerturbed.sh testParserInput.yaml \
 	     difffloat.py tolerance_125_normal.dat tolerance_125_perturbed.dat \
-             tolerance_27_normal.dat tolerance_27_perturbed.dat tolerance_27_perturbed_h.dat \
-	     tolerance_pair_normal.dat tolerance_pair_perturbed.dat \
+             tolerance_27_normal.dat tolerance_27_perturbed.dat tolerance_27_perturbed_h.dat tolerance_27_perturbed_h2.dat \
+	     tolerance_testInteractions.dat tolerance_pair_active.dat \
 	     fft_params.yml tolerance_periodic_BC_normal.dat tolerance_periodic_BC_perturbed.dat
diff --git a/tests/benchmarkInteractions.c b/tests/benchmarkInteractions.c
index a11862ebaa345623ca65cd7025b4f4e79769d0ac..5dc7a4ae85e62313a84d0c0be88b65a1c0bb59df 100644
--- a/tests/benchmarkInteractions.c
+++ b/tests/benchmarkInteractions.c
@@ -31,6 +31,7 @@
 #define IACT runner_iact_nonsym_density
 #define IACT_VEC runner_iact_nonsym_2_vec_density
 #define IACT_NAME "test_nonsym_density"
+#define NUM_VEC_PROC_INT 2
 #endif
 
 #ifdef SYM_DENSITY
@@ -53,8 +54,9 @@
 
 #ifndef IACT
 #define IACT runner_iact_nonsym_density
-#define IACT_VEC runner_iact_nonsym_2_vec_density
+#define IACT_VEC runner_iact_nonsym_1_vec_density
 #define IACT_NAME "test_nonsym_density"
+#define NUM_VEC_PROC_INT 1
 #endif
 
 /**
@@ -368,7 +370,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
 
 /* Perform vector interaction. */
 #ifdef WITH_VECTORIZATION
-    vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec, mask, mask2;
+    vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec;
     vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
         curlvySum, curlvzSum;
 
@@ -387,28 +389,35 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
     viz_vec.v = vec_load(&vizq[0]);
 
     hi_inv_vec = vec_reciprocal(hi_vec);
-    mask.m = vec_setint1(0xFFFFFFFF);
-    mask2.m = vec_setint1(0xFFFFFFFF);
 
-#ifdef HAVE_AVX512_F
-    KNL_MASK_16 knl_mask, knl_mask2;
-    knl_mask = 0xFFFF;
-    knl_mask2 = 0xFFFF;
+    mask_t mask;
+    vec_init_mask_true(mask);
+#if (NUM_VEC_PROC_INT == 2)
+    mask_t mask2;
+    vec_init_mask_true(mask2);
 #endif
-
     const ticks vec_tic = getticks();
 
-    for (size_t i = 0; i < count; i += 2 * VEC_SIZE) {
+    for (size_t i = 0; i < count; i += NUM_VEC_PROC_INT * VEC_SIZE) {
 
+/* Interleave two vectors for interaction. */
+#if (NUM_VEC_PROC_INT == 2)
       IACT_VEC(&(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec),
                (vix_vec), (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]),
                &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, &wcountSum,
                &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum,
-               mask, mask2,
-#ifdef HAVE_AVX512_F
-               knl_mask, knl_mask2);
-#else
-               0, 0);
+               mask, mask2, 0);
+#else /* Only use one vector for interaction. */
+      vector r2, dx, dy, dz;
+      r2.v = vec_load(&(r2q[i]));
+      dx.v = vec_load(&(dxq[i]));
+      dy.v = vec_load(&(dyq[i]));
+      dz.v = vec_load(&(dzq[i]));
+
+      IACT_VEC(&r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec),
+               (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]),
+               &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
+               &curlvxSum, &curlvySum, &curlvzSum, mask);
 #endif
     }
 
diff --git a/tests/test125cells.c b/tests/test125cells.c
index 1bddd5dd2d4933e3b2d6c802d1325f3b88ba6ba5..2b35f7b5d509a7f966d07d6da82616869f29ab66 100644
--- a/tests/test125cells.c
+++ b/tests/test125cells.c
@@ -440,6 +440,7 @@ void runner_doself2_force(struct runner *r, struct cell *ci);
 /* And go... */
 int main(int argc, char *argv[]) {
 
+  engine_pin();
   size_t runs = 0, particles = 0;
   double h = 1.23485, size = 1., rho = 2.5;
   double perturbation = 0.;
diff --git a/tests/test27cells.c b/tests/test27cells.c
index c4df8356eb6494cc34b91c6ab5038b3866b14056..7ba1eec9ad279f09f63021e332dac1cfd5cc1505 100644
--- a/tests/test27cells.c
+++ b/tests/test27cells.c
@@ -122,7 +122,7 @@ struct cell *make_cell(size_t n, double *offset, double size, double h,
             break;
         }
         if (h_pert)
-          part->h = size * h * random_uniform(1.f, 1.1f) / (float)n;
+          part->h = size * h * random_uniform(1.f, h_pert) / (float)n;
         else
           part->h = size * h / (float)n;
         h_max = fmaxf(h_max, part->h);
diff --git a/tests/test27cells.sh.in b/tests/test27cells.sh.in
index a313a594c10c80598852a92e7920ac846e15ba2d..059a7a208aa8e570ad5035fac16ffd201bf3dddd 100755
--- a/tests/test27cells.sh.in
+++ b/tests/test27cells.sh.in
@@ -56,4 +56,32 @@ do
     
 done
 
+# Test for particles with random smoothing lengths
+for v in {0..3}
+do
+    echo ""
+	
+    rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat
+
+    echo "Running ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.3"
+    ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.3
+
+    if [ -e brute_force_27_standard.dat ]
+    then
+	if python @srcdir@/difffloat.py brute_force_27_standard.dat swift_dopair_27_standard.dat @srcdir@/tolerance_27_perturbed_h2.dat 6
+	then
+	    echo "Accuracy test passed"
+	else
+	    echo "Accuracy test failed"
+	    exit 1
+	fi
+    else
+	echo "Error Missing test output file"
+	exit 1
+    fi
+
+    echo "------------"
+    
+done
+
 exit $?
diff --git a/tests/test27cellsPerturbed.sh.in b/tests/test27cellsPerturbed.sh.in
index 4adbe0b9ca6d8e1cfe7df861c2bf88bf35832983..f875504e541588377ca6e40fe55681ebec3466f6 100755
--- a/tests/test27cellsPerturbed.sh.in
+++ b/tests/test27cellsPerturbed.sh.in
@@ -56,4 +56,31 @@ do
 
 done
 
+# Test for particles with random smoothing lengths
+for v in {0..3}
+do
+    echo ""
+
+    rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat
+
+    echo "Running ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.3"
+    ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.3
+
+    if [ -e brute_force_27_perturbed.dat ]
+    then
+	if python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed_h2.dat 6
+	then
+	    echo "Accuracy test passed"
+	else
+	    echo "Accuracy test failed"
+	    exit 1
+	fi
+    else
+	echo "Error Missing test output file"
+	exit 1
+    fi
+
+    echo "------------"
+
+done
 exit $?
diff --git a/tests/testActivePair.c b/tests/testActivePair.c
new file mode 100644
index 0000000000000000000000000000000000000000..1e0111b4f0e480d0f66463b4c2264cdd89bd28c8
--- /dev/null
+++ b/tests/testActivePair.c
@@ -0,0 +1,510 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#include "../config.h"
+
+/* Some standard headers. */
+#include <fenv.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+/* Local headers. */
+#include "swift.h"
+
+/**
+ * @brief Constructs a cell and all of its particle in a valid state prior to
+ * a DOPAIR or DOSELF calcuation.
+ *
+ * @param n The cube root of the number of particles.
+ * @param offset The position of the cell offset from (0,0,0).
+ * @param size The cell size.
+ * @param h The smoothing length of the particles in units of the inter-particle
+ * separation.
+ * @param density The density of the fluid.
+ * @param partId The running counter of IDs.
+ * @param pert The perturbation to apply to the particles in the cell in units
+ * of the inter-particle separation.
+ * @param h_pert The perturbation to apply to the smoothing length.
+ * @param fraction_active The fraction of particles that should be active in the
+ * cell.
+ */
+struct cell *make_cell(size_t n, double *offset, double size, double h,
+                       double density, long long *partId, double pert,
+                       double h_pert, double fraction_active) {
+  const size_t count = n * n * n;
+  const double volume = size * size * size;
+  float h_max = 0.f;
+  struct cell *cell = malloc(sizeof(struct cell));
+  bzero(cell, sizeof(struct cell));
+
+  if (posix_memalign((void **)&cell->parts, part_align,
+                     count * sizeof(struct part)) != 0) {
+    error("couldn't allocate particles, no. of particles: %d", (int)count);
+  }
+  bzero(cell->parts, count * sizeof(struct part));
+
+  /* Construct the parts */
+  struct part *part = cell->parts;
+  for (size_t x = 0; x < n; ++x) {
+    for (size_t y = 0; y < n; ++y) {
+      for (size_t z = 0; z < n; ++z) {
+        part->x[0] =
+            offset[0] +
+            size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->x[1] =
+            offset[1] +
+            size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->x[2] =
+            offset[2] +
+            size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->v[0] = random_uniform(-0.05, 0.05);
+        part->v[1] = random_uniform(-0.05, 0.05);
+        part->v[2] = random_uniform(-0.05, 0.05);
+
+        if (h_pert)
+          part->h = size * h * random_uniform(1.f, h_pert) / (float)n;
+        else
+          part->h = size * h / (float)n;
+        h_max = fmaxf(h_max, part->h);
+        part->id = ++(*partId);
+
+#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
+        part->conserved.mass = density * volume / count;
+
+#ifdef SHADOWFAX_SPH
+        double anchor[3] = {0., 0., 0.};
+        double side[3] = {1., 1., 1.};
+        voronoi_cell_init(&part->cell, part->x, anchor, side);
+#endif
+
+#else
+        part->mass = density * volume / count;
+#endif
+
+#if defined(HOPKINS_PE_SPH)
+        part->entropy = 1.f;
+        part->entropy_one_over_gamma = 1.f;
+#endif
+        if (random_uniform(0, 1.f) < fraction_active)
+          part->time_bin = 1;
+        else
+          part->time_bin = num_time_bins + 1;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        part->ti_drift = 8;
+        part->ti_kick = 8;
+#endif
+
+        ++part;
+      }
+    }
+  }
+
+  /* Cell properties */
+  cell->split = 0;
+  cell->h_max = h_max;
+  cell->count = count;
+  cell->dx_max_part = 0.;
+  cell->dx_max_sort = 0.;
+  cell->width[0] = size;
+  cell->width[1] = size;
+  cell->width[2] = size;
+  cell->loc[0] = offset[0];
+  cell->loc[1] = offset[1];
+  cell->loc[2] = offset[2];
+
+  cell->ti_old_part = 8;
+  cell->ti_end_min = 8;
+  cell->ti_end_max = 8;
+
+  shuffle_particles(cell->parts, cell->count);
+
+  cell->sorted = 0;
+  for (int k = 0; k < 13; k++) cell->sort[k] = NULL;
+
+  return cell;
+}
+
+void clean_up(struct cell *ci) {
+  free(ci->parts);
+  for (int k = 0; k < 13; k++)
+    if (ci->sort[k] != NULL) free(ci->sort[k]);
+  free(ci);
+}
+
+/**
+ * @brief Initializes all particles field to be ready for a density calculation
+ */
+void zero_particle_fields(struct cell *c) {
+  for (int pid = 0; pid < c->count; pid++) {
+    hydro_init_part(&c->parts[pid], NULL);
+  }
+}
+
+/**
+ * @brief Ends the loop by adding the appropriate coefficients
+ */
+void end_calculation(struct cell *c) {
+  for (int pid = 0; pid < c->count; pid++) {
+    hydro_end_density(&c->parts[pid]);
+
+    /* Recover the common "Neighbour number" definition */
+    c->parts[pid].density.wcount *= pow_dimension(c->parts[pid].h);
+    c->parts[pid].density.wcount *= kernel_norm;
+  }
+}
+
+/**
+ * @brief Dump all the particles to a file
+ */
+void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
+  FILE *file = fopen(fileName, "a");
+
+  /* Write header */
+  fprintf(file, "# %4s %13s\n", "ID", "wcount");
+
+  fprintf(file, "# ci --------------------------------------------\n");
+
+  for (int pid = 0; pid < ci->count; pid++) {
+    fprintf(file, "%6llu %13e\n", ci->parts[pid].id,
+            ci->parts[pid].density.wcount);
+  }
+
+  fprintf(file, "# cj --------------------------------------------\n");
+
+  for (int pjd = 0; pjd < cj->count; pjd++) {
+    fprintf(file, "%6llu %13e\n", cj->parts[pjd].id,
+            cj->parts[pjd].density.wcount);
+  }
+
+  fclose(file);
+}
+
+/* Just a forward declaration... */
+void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
+void runner_doself1_density_vec(struct runner *r, struct cell *ci);
+void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
+                                   struct cell *cj);
+
+/**
+ * @brief Computes the pair interactions of two cells using SWIFT and a brute
+ * force implementation.
+ */
+void test_pair_interactions(struct runner *runner, struct cell **ci,
+                            struct cell **cj, char *swiftOutputFileName,
+                            char *bruteForceOutputFileName) {
+
+  runner_do_sort(runner, *ci, 0x1FFF, 0, 0);
+  runner_do_sort(runner, *cj, 0x1FFF, 0, 0);
+
+  /* Zero the fields */
+  zero_particle_fields(*ci);
+  zero_particle_fields(*cj);
+
+  /* Run the test */
+  runner_dopair1_branch_density(runner, *ci, *cj);
+
+  /* Let's get physical ! */
+  end_calculation(*ci);
+  end_calculation(*cj);
+
+  /* Dump if necessary */
+  dump_particle_fields(swiftOutputFileName, *ci, *cj);
+
+  /* Now perform a brute-force version for accuracy tests */
+
+  /* Zero the fields */
+  zero_particle_fields(*ci);
+  zero_particle_fields(*cj);
+
+  /* Run the brute-force test */
+  pairs_all_density(runner, *ci, *cj);
+
+  /* Let's get physical ! */
+  end_calculation(*ci);
+  end_calculation(*cj);
+
+  dump_particle_fields(bruteForceOutputFileName, *ci, *cj);
+}
+
+/**
+ * @brief Computes the pair interactions of two cells in various configurations.
+ */
+void test_all_pair_interactions(struct runner *runner, double *offset2,
+                                size_t particles, double size, double h,
+                                double rho, long long *partId,
+                                double perturbation, double h_pert,
+                                char *swiftOutputFileName,
+                                char *bruteForceOutputFileName) {
+
+  double offset1[3] = {0, 0, 0};
+  struct cell *ci, *cj;
+
+  /* All active particles. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 1.);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 1.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* Half particles are active. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.5);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.5);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* All particles inactive. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* 10% of particles active. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.1);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.1);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* One active cell one inactive cell. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 1.0);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* One active cell one inactive cell. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 1.0);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* Smaller cells, all active. */
+  ci = make_cell(2, offset1, size, h, rho, partId, perturbation, h_pert, 1.0);
+  cj = make_cell(2, offset2, size, h, rho, partId, perturbation, h_pert, 1.0);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* Different numbers of particles in each cell. */
+  ci = make_cell(10, offset1, size, h, rho, partId, perturbation, h_pert, 0.5);
+  cj = make_cell(3, offset2, size, h, rho, partId, perturbation, h_pert, 0.75);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* One cell inactive and the other only half active. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.5);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* One cell inactive and the other only half active. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.5);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  /* Clean things to make the sanitizer happy ... */
+  clean_up(ci);
+  clean_up(cj);
+}
+
+int main(int argc, char *argv[]) {
+  size_t particles = 0, runs = 0, type = 0;
+  double h = 1.23485, size = 1., rho = 1.;
+  double perturbation = 0.1, h_pert = 1.1;
+  struct space space;
+  struct engine engine;
+  struct runner *runner;
+  char c;
+  static long long partId = 0;
+  char outputFileNameExtension[200] = "";
+  char swiftOutputFileName[200] = "";
+  char bruteForceOutputFileName[200] = "";
+
+  /* Initialize CPU frequency, this also starts time. */
+  unsigned long long cpufreq = 0;
+  clocks_set_cpufreq(cpufreq);
+
+  /* Choke on FP-exceptions */
+  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  /* Generate a RNG seed from time. */
+  unsigned int seed = time(NULL);
+
+  while ((c = getopt(argc, argv, "h:p:n:r:t:d:s:f:")) != -1) {
+    switch (c) {
+      case 'h':
+        sscanf(optarg, "%lf", &h);
+        break;
+      case 'p':
+        sscanf(optarg, "%lf", &h_pert);
+        break;
+      case 'n':
+        sscanf(optarg, "%zu", &particles);
+        break;
+      case 'r':
+        sscanf(optarg, "%zu", &runs);
+        break;
+      case 't':
+        sscanf(optarg, "%zu", &type);
+        break;
+      case 'd':
+        sscanf(optarg, "%lf", &perturbation);
+        break;
+      case 's':
+        sscanf(optarg, "%u", &seed);
+        break;
+      case 'f':
+        strcpy(outputFileNameExtension, optarg);
+        break;
+      case '?':
+        error("Unknown option.");
+        break;
+    }
+  }
+
+  if (h < 0 || particles == 0 || runs == 0 || type > 2) {
+    printf(
+        "\nUsage: %s -n PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n"
+        "\nGenerates a cell pair, filled with particles on a Cartesian grid."
+        "\nThese are then interacted using runner_dopair1_density."
+        "\n\nOptions:"
+        "\n-t TYPE=0          - cells share face (0), edge (1) or corner (2)"
+        "\n-h DISTANCE=1.2348 - smoothing length"
+        "\n-p                 - Random fractional change in h, h=h*random(1,p)"
+        "\n-d pert            - perturbation to apply to the particles [0,1["
+        "\n-s seed            - seed for RNG"
+        "\n-f fileName        - part of the file name used to save the dumps\n",
+        argv[0]);
+    exit(1);
+  }
+
+  /* Seed RNG. */
+  message("Seed used for RNG: %d", seed);
+  srand(seed);
+
+  space.periodic = 0;
+  space.dim[0] = 3.;
+  space.dim[1] = 3.;
+  space.dim[2] = 3.;
+
+  engine.s = &space;
+  engine.time = 0.1f;
+  engine.ti_current = 8;
+  engine.max_active_bin = num_time_bins;
+
+  if (posix_memalign((void **)&runner, SWIFT_STRUCT_ALIGNMENT,
+                     sizeof(struct runner)) != 0) {
+    error("couldn't allocate runner");
+  }
+
+  runner->e = &engine;
+
+  /* Create output file names. */
+  sprintf(swiftOutputFileName, "swift_dopair_%s.dat", outputFileNameExtension);
+  sprintf(bruteForceOutputFileName, "brute_force_%s.dat",
+          outputFileNameExtension);
+
+  /* Delete files if they already exist. */
+  remove(swiftOutputFileName);
+  remove(bruteForceOutputFileName);
+
+#ifdef WITH_VECTORIZATION
+  runner->ci_cache.count = 0;
+  cache_init(&runner->ci_cache, 512);
+  runner->cj_cache.count = 0;
+  cache_init(&runner->cj_cache, 512);
+#endif
+
+  double offset[3] = {1., 0., 0.};
+
+  /* Test a pair of cells face-on. */
+  test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
+                             perturbation, h_pert, swiftOutputFileName,
+                             bruteForceOutputFileName);
+
+  /* Test a pair of cells edge-on. */
+  offset[0] = 1.;
+  offset[1] = 1.;
+  offset[2] = 0.;
+  test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
+                             perturbation, h_pert, swiftOutputFileName,
+                             bruteForceOutputFileName);
+
+  /* Test a pair of cells corner-on. */
+  offset[0] = 1.;
+  offset[1] = 1.;
+  offset[2] = 1.;
+  test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
+                             perturbation, h_pert, swiftOutputFileName,
+                             bruteForceOutputFileName);
+  return 0;
+}
diff --git a/tests/testActivePair.sh.in b/tests/testActivePair.sh.in
new file mode 100755
index 0000000000000000000000000000000000000000..ff8d027a469bd9bc78286b843cf2dffd3ef27ad3
--- /dev/null
+++ b/tests/testActivePair.sh.in
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+echo ""
+
+rm -f brute_force_pair_active.dat swift_dopair_active.dat
+
+./testActivePair -n 6 -r 1 -d 0 -f active
+
+python @srcdir@/difffloat.py brute_force_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat
+
+exit $?
diff --git a/tests/testInteractions.c b/tests/testInteractions.c
index 4ce7fe40554d24551750629fa47c0bee7acdb6da..54d1f38733a1f1647331166f1a37b40ed3511419 100644
--- a/tests/testInteractions.c
+++ b/tests/testInteractions.c
@@ -17,12 +17,6 @@
  *
  ******************************************************************************/
 
-#include "../config.h"
-
-#ifndef WITH_VECTORIZATION
-int main() { return 0; }
-#else
-
 #include <fenv.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -30,15 +24,17 @@ int main() { return 0; }
 #include <unistd.h>
 #include "swift.h"
 
+#ifdef WITH_VECTORIZATION
+
 #define array_align sizeof(float) * VEC_SIZE
 #define ACC_THRESHOLD 1e-5
 
-/* Typdef function pointers for serial and vectorised versions of the
- * interaction functions. */
-typedef void (*serial_interaction)(float, float *, float, float, struct part *,
-                                   struct part *);
-typedef void (*vec_interaction)(float *, float *, float *, float *,
-                                struct part **, struct part **);
+#ifndef IACT
+#define IACT runner_iact_nonsym_density
+#define IACT_VEC runner_iact_nonsym_1_vec_density
+#define IACT_NAME "test_nonsym_density"
+#define NUM_VEC_PROC_INT 1
+#endif
 
 /**
  * @brief Constructs an array of particles in a valid state prior to
@@ -74,7 +70,10 @@ struct part *make_particles(size_t count, double *offset, double spacing,
 
   p->h = h;
   p->id = ++(*partId);
+
+#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH)
   p->mass = 1.0f;
+#endif
 
   /* Place rest of particles around the test particle
    * with random position within a unit sphere. */
@@ -93,7 +92,9 @@ struct part *make_particles(size_t count, double *offset, double spacing,
 
     p->h = h;
     p->id = ++(*partId);
+#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH)
     p->mass = 1.0f;
+#endif
   }
   return particles;
 }
@@ -103,6 +104,7 @@ struct part *make_particles(size_t count, double *offset, double spacing,
  */
 void prepare_force(struct part *parts, size_t count) {
 
+#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) && !defined(MINIMAL_SPH)
   struct part *p;
   for (size_t i = 0; i < count; ++i) {
     p = &parts[i];
@@ -113,6 +115,7 @@ void prepare_force(struct part *parts, size_t count) {
     p->force.v_sig = 0.0f;
     p->force.h_dt = 0.0f;
   }
+#endif
 }
 
 /**
@@ -122,25 +125,26 @@ void dump_indv_particle_fields(char *fileName, struct part *p) {
 
   FILE *file = fopen(fileName, "a");
 
+  /* Write header */
   fprintf(file,
-          "%6llu %10f %10f %10f %10f %10f %10f %10e %10e %10e %13e %13e %13e "
-          "%13e %13e %13e %13e "
-          "%13e %13e %13e %10f\n",
+          "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
+          "%13e %13e %13e\n",
           p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2],
-          p->a_hydro[0], p->a_hydro[1], p->a_hydro[2], p->rho,
-          p->density.rho_dh, p->density.wcount, p->density.wcount_dh,
-          p->force.h_dt, p->force.v_sig,
-#if defined(GADGET2_SPH)
-          p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
-          p->density.rot_v[2], p->entropy_dt
-#elif defined(DEFAULT_SPH)
-          p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
-          p->density.rot_v[2], 0.
+          hydro_get_density(p),
+#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
+          0.f,
 #else
+          p->density.rho_dh,
+#endif
+          p->density.wcount, p->density.wcount_dh,
+#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH)
           p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
           p->density.rot_v[2]
+#else
+          0., 0., 0., 0.
 #endif
           );
+
   fclose(file);
 }
 
@@ -152,13 +156,10 @@ void write_header(char *fileName) {
   FILE *file = fopen(fileName, "w");
   /* Write header */
   fprintf(file,
-          "# %4s %10s %10s %10s %10s %10s %10s %10s %10s %10s %13s %13s %13s "
-          "%13s %13s %13s %13s"
-          "%13s %13s %13s %13s\n",
-          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "a_x", "a_y",
-          "a_z", "rho", "rho_dh", "wcount", "wcount_dh", "dh/dt", "v_sig",
-          "div_v", "curl_vx", "curl_vy", "curl_vz", "dS/dt");
-  fprintf(file, "\n# PARTICLES BEFORE INTERACTION:\n");
+          "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s "
+          "%13s %13s %13s\n",
+          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh",
+          "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz");
   fclose(file);
 }
 
@@ -187,8 +188,8 @@ int check_results(struct part serial_test_part, struct part *serial_parts,
 }
 
 /*
- * @brief Calls the serial and vectorised version of an interaction
- * function given by the function pointers.
+ * @brief Calls the serial and vectorised version of the non-symmetrical density
+ * interaction.
  *
  * @param test_part Particle that will be updated
  * @param parts Particle array to be interacted
@@ -196,16 +197,15 @@ int check_results(struct part serial_test_part, struct part *serial_parts,
  * @param serial_inter_func Serial interaction function to be called
  * @param vec_inter_func Vectorised interaction function to be called
  * @param runs No. of times to call interactions
+ * @param num_vec_proc No. of vectors to use to process interaction
  *
  */
 void test_interactions(struct part test_part, struct part *parts, size_t count,
-                       serial_interaction serial_inter_func,
-                       vec_interaction vec_inter_func, char *filePrefix,
-                       size_t runs) {
+                       char *filePrefix, int runs, int num_vec_proc) {
 
-  ticks serial_time = 0, vec_time = 0;
+  ticks serial_time = 0;
+  ticks vec_time = 0;
 
-  FILE *file;
   char serial_filename[200] = "";
   char vec_filename[200] = "";
 
@@ -217,64 +217,68 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
   write_header(serial_filename);
   write_header(vec_filename);
 
-  /* Test particle at the center of a unit sphere. */
   struct part pi_serial, pi_vec;
-
-  /* Remaining particles in the sphere that will interact with test particle. */
   struct part pj_serial[count], pj_vec[count];
 
-  /* Stores the separation, smoothing length and pointers to particles
-   * needed for the vectorised interaction. */
+  float r2[count] __attribute__((aligned(array_align)));
+  float dx[3 * count] __attribute__((aligned(array_align)));
+
+  struct part *piq[count], *pjq[count];
+  for (size_t k = 0; k < count; k++) {
+    piq[k] = NULL;
+    pjq[k] = NULL;
+  }
+
   float r2q[count] __attribute__((aligned(array_align)));
   float hiq[count] __attribute__((aligned(array_align)));
-  float hjq[count] __attribute__((aligned(array_align)));
-  float dxq[3 * count] __attribute__((aligned(array_align)));
-  struct part *piq[count], *pjq[count];
+  float dxq[count] __attribute__((aligned(array_align)));
+
+  float dyq[count] __attribute__((aligned(array_align)));
+  float dzq[count] __attribute__((aligned(array_align)));
+  float mjq[count] __attribute__((aligned(array_align)));
+  float vixq[count] __attribute__((aligned(array_align)));
+  float viyq[count] __attribute__((aligned(array_align)));
+  float vizq[count] __attribute__((aligned(array_align)));
+  float vjxq[count] __attribute__((aligned(array_align)));
+  float vjyq[count] __attribute__((aligned(array_align)));
+  float vjzq[count] __attribute__((aligned(array_align)));
 
   /* Call serial interaction a set number of times. */
-  for (size_t k = 0; k < runs; k++) {
+  for (int k = 0; k < runs; k++) {
     /* Reset particle to initial setup */
     pi_serial = test_part;
     for (size_t i = 0; i < count; i++) pj_serial[i] = parts[i];
 
-    /* Only dump data on first run. */
-    if (k == 0) {
-      /* Dump state of particles before serial interaction. */
-      dump_indv_particle_fields(serial_filename, &pi_serial);
-      for (size_t i = 0; i < count; i++)
-        dump_indv_particle_fields(serial_filename, &pj_serial[i]);
-    }
-
     /* Perform serial interaction */
     for (size_t i = 0; i < count; i++) {
       /* Compute the pairwise distance. */
-      float r2 = 0.0f;
-      float dx[3];
-      for (size_t k = 0; k < 3; k++) {
-        dx[k] = pi_serial.x[k] - pj_serial[i].x[k];
-        r2 += dx[k] * dx[k];
+      r2[i] = 0.0f;
+      for (int k = 0; k < 3; k++) {
+        int ind = (3 * i) + k;
+        dx[ind] = pi_serial.x[k] - pj_serial[i].x[k];
+        r2[i] += dx[ind] * dx[ind];
       }
+    }
 
-      const ticks tic = getticks();
-
-      serial_inter_func(r2, dx, pi_serial.h, pj_serial[i].h, &pi_serial,
-                        &pj_serial[i]);
-
-      serial_time += getticks() - tic;
+    const ticks tic = getticks();
+/* Perform serial interaction */
+#ifdef __ICC
+#pragma novector
+#endif
+    for (size_t i = 0; i < count; i++) {
+      IACT(r2[i], &(dx[3 * i]), pi_serial.h, pj_serial[i].h, &pi_serial,
+           &pj_serial[i]);
     }
+    serial_time += getticks() - tic;
   }
 
-  file = fopen(serial_filename, "a");
-  fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n");
-  fclose(file);
-
   /* Dump result of serial interaction. */
   dump_indv_particle_fields(serial_filename, &pi_serial);
   for (size_t i = 0; i < count; i++)
     dump_indv_particle_fields(serial_filename, &pj_serial[i]);
 
   /* Call vector interaction a set number of times. */
-  for (size_t k = 0; k < runs; k++) {
+  for (int k = 0; k < runs; k++) {
     /* Reset particle to initial setup */
     pi_vec = test_part;
     for (size_t i = 0; i < count; i++) pj_vec[i] = parts[i];
@@ -284,45 +288,92 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
       /* Compute the pairwise distance. */
       float r2 = 0.0f;
       float dx[3];
-      for (size_t k = 0; k < 3; k++) {
+      for (int k = 0; k < 3; k++) {
         dx[k] = pi_vec.x[k] - pj_vec[i].x[k];
         r2 += dx[k] * dx[k];
       }
 
       r2q[i] = r2;
-      dxq[3 * i + 0] = dx[0];
-      dxq[3 * i + 1] = dx[1];
-      dxq[3 * i + 2] = dx[2];
+      dxq[i] = dx[0];
       hiq[i] = pi_vec.h;
-      hjq[i] = pj_vec[i].h;
       piq[i] = &pi_vec;
       pjq[i] = &pj_vec[i];
-    }
 
-    /* Only dump data on first run. */
-    if (k == 0) {
-      /* Dump state of particles before vector interaction. */
-      dump_indv_particle_fields(vec_filename, piq[0]);
-      for (size_t i = 0; i < count; i++)
-        dump_indv_particle_fields(vec_filename, pjq[i]);
+      dyq[i] = dx[1];
+      dzq[i] = dx[2];
+      mjq[i] = pj_vec[i].mass;
+      vixq[i] = pi_vec.v[0];
+      viyq[i] = pi_vec.v[1];
+      vizq[i] = pi_vec.v[2];
+      vjxq[i] = pj_vec[i].v[0];
+      vjyq[i] = pj_vec[i].v[1];
+      vjzq[i] = pj_vec[i].v[2];
     }
 
+    /* Perform vector interaction. */
+    vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec;
+    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
+        curlvySum, curlvzSum;
+    mask_t mask, mask2;
+
+    rhoSum.v = vec_set1(0.f);
+    rho_dhSum.v = vec_set1(0.f);
+    wcountSum.v = vec_set1(0.f);
+    wcount_dhSum.v = vec_set1(0.f);
+    div_vSum.v = vec_set1(0.f);
+    curlvxSum.v = vec_set1(0.f);
+    curlvySum.v = vec_set1(0.f);
+    curlvzSum.v = vec_set1(0.f);
+
+    hi_vec.v = vec_load(&hiq[0]);
+    vix_vec.v = vec_load(&vixq[0]);
+    viy_vec.v = vec_load(&viyq[0]);
+    viz_vec.v = vec_load(&vizq[0]);
+
+    hi_inv_vec = vec_reciprocal(hi_vec);
+    vec_init_mask_true(mask);
+    vec_init_mask_true(mask2);
+
     const ticks vec_tic = getticks();
 
-    /* Perform vector interaction. */
-    for (size_t i = 0; i < count; i += VEC_SIZE) {
-      vec_inter_func(&(r2q[i]), &(dxq[3 * i]), &(hiq[i]), &(hjq[i]), &(piq[i]),
-                     &(pjq[i]));
+    for (size_t i = 0; i < count; i += num_vec_proc * VEC_SIZE) {
+
+      /* Interleave two vectors for interaction. */
+      if (num_vec_proc == 2) {
+        runner_iact_nonsym_2_vec_density(
+            &(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec), (vix_vec),
+            (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]),
+            &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
+            &curlvxSum, &curlvySum, &curlvzSum, mask, mask2, 0);
+      } else { /* Only use one vector for interaction. */
+
+        vector r2, dx, dy, dz;
+        r2.v = vec_load(&(r2q[i]));
+        dx.v = vec_load(&(dxq[i]));
+        dy.v = vec_load(&(dyq[i]));
+        dz.v = vec_load(&(dzq[i]));
+
+        runner_iact_nonsym_1_vec_density(
+            &r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec), (viz_vec),
+            &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum,
+            &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
+            &curlvzSum, mask);
+      }
     }
 
+    VEC_HADD(rhoSum, piq[0]->rho);
+    VEC_HADD(rho_dhSum, piq[0]->density.rho_dh);
+    VEC_HADD(wcountSum, piq[0]->density.wcount);
+    VEC_HADD(wcount_dhSum, piq[0]->density.wcount_dh);
+    VEC_HADD(div_vSum, piq[0]->density.div_v);
+    VEC_HADD(curlvxSum, piq[0]->density.rot_v[0]);
+    VEC_HADD(curlvySum, piq[0]->density.rot_v[1]);
+    VEC_HADD(curlvzSum, piq[0]->density.rot_v[2]);
+
     vec_time += getticks() - vec_tic;
   }
 
-  file = fopen(vec_filename, "a");
-  fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n");
-  fclose(file);
-
-  /* Dump result of vector interaction. */
+  /* Dump result of serial interaction. */
   dump_indv_particle_fields(vec_filename, piq[0]);
   for (size_t i = 0; i < count; i++)
     dump_indv_particle_fields(vec_filename, pjq[i]);
@@ -334,6 +385,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
   message("The serial interactions took     : %15lli ticks.",
           serial_time / runs);
   message("The vectorised interactions took : %15lli ticks.", vec_time / runs);
+  message("Speed up: %15fx.", (double)(serial_time) / vec_time);
 }
 
 /* And go... */
@@ -386,62 +438,22 @@ int main(int argc, char *argv[]) {
 
   /* Build the infrastructure */
   static long long partId = 0;
-  struct part density_test_particle, force_test_particle;
-  struct part *density_particles =
-      make_particles(count, offset, spacing, h, &partId);
-  struct part *force_particles =
-      make_particles(count, offset, spacing, h, &partId);
-  prepare_force(force_particles, count);
-
-  /* Define which interactions to call */
-  serial_interaction serial_inter_func = &runner_iact_nonsym_density;
-  vec_interaction vec_inter_func = &runner_iact_nonsym_vec_density;
-
-  density_test_particle = density_particles[0];
+  struct part test_particle;
+  struct part *particles = make_particles(count, offset, spacing, h, &partId);
+
+  test_particle = particles[0];
   /* Call the non-sym density test. */
-  message("Testing non-symmetrical density interaction...");
-  test_interactions(density_test_particle, &density_particles[1], count - 1,
-                    serial_inter_func, vec_inter_func, "test_nonsym_density",
-                    runs);
-
-  density_particles = make_particles(count, offset, spacing, h, &partId);
-
-  /* Re-assign function pointers. */
-  serial_inter_func = &runner_iact_density;
-  vec_inter_func = &runner_iact_vec_density;
-
-  density_test_particle = density_particles[0];
-  /* Call the symmetrical density test. */
-  message("Testing symmetrical density interaction...");
-  test_interactions(density_test_particle, &density_particles[1], count - 1,
-                    serial_inter_func, vec_inter_func, "test_sym_density",
-                    runs);
-
-  /* Re-assign function pointers. */
-  serial_inter_func = &runner_iact_nonsym_force;
-  vec_inter_func = &runner_iact_nonsym_vec_force;
-
-  force_test_particle = force_particles[0];
-  /* Call the test non-sym force test. */
-  message("Testing non-symmetrical force interaction...");
-  test_interactions(force_test_particle, &force_particles[1], count - 1,
-                    serial_inter_func, vec_inter_func, "test_nonsym_force",
-                    runs);
-
-  force_particles = make_particles(count, offset, spacing, h, &partId);
-  prepare_force(force_particles, count);
-
-  /* Re-assign function pointers. */
-  serial_inter_func = &runner_iact_force;
-  vec_inter_func = &runner_iact_vec_force;
-
-  force_test_particle = force_particles[0];
-  /* Call the test symmetrical force test. */
-  message("Testing symmetrical force interaction...");
-  test_interactions(force_test_particle, &force_particles[1], count - 1,
-                    serial_inter_func, vec_inter_func, "test_sym_force", runs);
+  message("Testing %s interaction...", IACT_NAME);
+  test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs,
+                    1);
+  test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs,
+                    2);
 
   return 0;
 }
 
-#endif /* WITH_VECTORIZATION */
+#else
+
+int main() { return 1; }
+
+#endif
diff --git a/tests/testInteractions.sh.in b/tests/testInteractions.sh.in
new file mode 100644
index 0000000000000000000000000000000000000000..4b002c56e37eff417c673ddac2e44b3edf17683a
--- /dev/null
+++ b/tests/testInteractions.sh.in
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+echo ""
+
+rm -f test_nonsym_density_serial.dat test_nonsym_density_vec.dat
+
+echo "Running ./testInteractions"
+
+./testInteractions
+
+if [ $? != 0 ]; then
+  echo "testInteractions is redundant when vectorisation is disabled"
+else
+  if [ -e test_nonsym_density_serial.dat ]
+  then
+    if python @srcdir@/difffloat.py test_nonsym_density_serial.dat test_nonsym_density_vec.dat @srcdir@/tolerance_testInteractions.dat
+    then
+      echo "Accuracy test passed"
+    else
+      echo "Accuracy test failed"
+      exit 1
+    fi
+  else
+    echo "Error Missing test output file"
+    exit 1
+  fi
+fi
+
+echo "------------"
diff --git a/tests/testKernel.c b/tests/testKernel.c
index a2744119a527cc842cdd4711056eee7a7d7b4270..0658639070526f28ce1bceefc54d3f2d7a3ae765 100644
--- a/tests/testKernel.c
+++ b/tests/testKernel.c
@@ -68,7 +68,7 @@ int main() {
       vx.f[j] = (i + j) * 2.25f / numPoints;
     }
 
-    vx_h.v = vx.v / vec_set1(h);
+    vx_h.v = vec_mul(vx.v, vec_set1(1.f / h));
 
     kernel_deval_1_vec(&vx_h, &W_vec, &dW_vec);
 
@@ -106,8 +106,8 @@ int main() {
       vx_2.f[j] = (i + j) * 2.25f / numPoints;
     }
 
-    vx_h.v = vx.v / vec_set1(h);
-    vx_h_2.v = vx_2.v / vec_set1(h);
+    vx_h.v = vec_mul(vx.v, vec_set1(1.f / h));
+    vx_h_2.v = vec_mul(vx_2.v, vec_set1(1.f / h));
 
     kernel_deval_2_vec(&vx_h, &W_vec, &dW_vec, &vx_h_2, &W_vec_2, &dW_vec_2);
 
diff --git a/tests/testPair.c b/tests/testPair.c
deleted file mode 100644
index e0e813f31531fb1c8d2e24e417a43ac4d07ecf7e..0000000000000000000000000000000000000000
--- a/tests/testPair.c
+++ /dev/null
@@ -1,349 +0,0 @@
-/*******************************************************************************
- * This file is part of SWIFT.
- * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- ******************************************************************************/
-#include "../config.h"
-
-/* Some standard headers. */
-#include <fenv.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-/* Local headers. */
-#include "swift.h"
-
-#if defined(WITH_VECTORIZATION)
-#define DOSELF1 runner_doself1_density_vec
-#define DOPAIR1 runner_dopair1_branch_density
-#define DOSELF1_NAME "runner_doself1_density_vec"
-#define DOPAIR1_NAME "runner_dopair1_density_vec"
-#endif
-
-#ifndef DOSELF1
-#define DOSELF1 runner_doself1_density
-#define DOSELF1_NAME "runner_doself1_density"
-#endif
-
-#ifndef DOPAIR1
-#define DOPAIR1 runner_dopair1_branch_density
-#define DOPAIR1_NAME "runner_dopair1_density"
-#endif
-
-/* n is both particles per axis and box size:
- * particles are generated on a mesh with unit spacing
- */
-struct cell *make_cell(size_t n, double *offset, double size, double h,
-                       double density, unsigned long long *partId,
-                       double pert) {
-  const size_t count = n * n * n;
-  const double volume = size * size * size;
-  struct cell *cell = malloc(sizeof(struct cell));
-  bzero(cell, sizeof(struct cell));
-
-  if (posix_memalign((void **)&cell->parts, part_align,
-                     count * sizeof(struct part)) != 0) {
-    error("couldn't allocate particles, no. of particles: %d", (int)count);
-  }
-  bzero(cell->parts, count * sizeof(struct part));
-
-  /* Construct the parts */
-  struct part *part = cell->parts;
-  for (size_t x = 0; x < n; ++x) {
-    for (size_t y = 0; y < n; ++y) {
-      for (size_t z = 0; z < n; ++z) {
-        part->x[0] =
-            offset[0] +
-            size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
-        part->x[1] =
-            offset[1] +
-            size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
-        part->x[2] =
-            offset[2] +
-            size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
-        // part->v[0] = part->x[0] - 1.5;
-        // part->v[1] = part->x[1] - 1.5;
-        // part->v[2] = part->x[2] - 1.5;
-        part->v[0] = random_uniform(-0.05, 0.05);
-        part->v[1] = random_uniform(-0.05, 0.05);
-        part->v[2] = random_uniform(-0.05, 0.05);
-        part->h = size * h / (float)n;
-        part->id = ++(*partId);
-#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
-        part->conserved.mass = density * volume / count;
-#else
-        part->mass = density * volume / count;
-#endif
-        part->time_bin = 1;
-
-#ifdef SWIFT_DEBUG_CHECKS
-        part->ti_drift = 8;
-        part->ti_kick = 8;
-#endif
-
-        ++part;
-      }
-    }
-  }
-
-  /* Cell properties */
-  cell->split = 0;
-  cell->h_max = h;
-  cell->count = count;
-  cell->dx_max_part = 0.;
-  cell->dx_max_sort = 0.;
-  cell->width[0] = n;
-  cell->width[1] = n;
-  cell->width[2] = n;
-  cell->loc[0] = offset[0];
-  cell->loc[1] = offset[1];
-  cell->loc[2] = offset[2];
-
-  cell->ti_old_part = 8;
-  cell->ti_end_min = 8;
-  cell->ti_end_max = 8;
-
-  shuffle_particles(cell->parts, cell->count);
-
-  cell->sorted = 0;
-  for (int k = 0; k < 13; k++) cell->sort[k] = NULL;
-
-  return cell;
-}
-
-void clean_up(struct cell *ci) {
-  free(ci->parts);
-  for (int k = 0; k < 13; k++)
-    if (ci->sort[k] != NULL) free(ci->sort[k]);
-  free(ci);
-}
-
-/**
- * @brief Initializes all particles field to be ready for a density calculation
- */
-void zero_particle_fields(struct cell *c) {
-  for (int pid = 0; pid < c->count; pid++) {
-    hydro_init_part(&c->parts[pid], NULL);
-  }
-}
-
-/**
- * @brief Dump all the particles to a file
- */
-void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
-  FILE *file = fopen(fileName, "w");
-
-  /* Write header */
-  fprintf(file,
-          "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s "
-          "%13s %13s %13s\n",
-          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh",
-          "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz");
-
-  fprintf(file, "# ci --------------------------------------------\n");
-
-  for (int pid = 0; pid < ci->count; pid++) {
-    fprintf(file,
-            "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
-            "%13e %13e %13e\n",
-            ci->parts[pid].id, ci->parts[pid].x[0], ci->parts[pid].x[1],
-            ci->parts[pid].x[2], ci->parts[pid].v[0], ci->parts[pid].v[1],
-            ci->parts[pid].v[2], hydro_get_density(&ci->parts[pid]),
-#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
-            0.f,
-#else
-            ci->parts[pid].density.rho_dh,
-#endif
-            ci->parts[pid].density.wcount, ci->parts[pid].density.wcount_dh,
-#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH)
-            ci->parts[pid].density.div_v, ci->parts[pid].density.rot_v[0],
-            ci->parts[pid].density.rot_v[1], ci->parts[pid].density.rot_v[2]
-#else
-            0., 0., 0., 0.
-#endif
-            );
-  }
-
-  fprintf(file, "# cj --------------------------------------------\n");
-
-  for (int pjd = 0; pjd < cj->count; pjd++) {
-    fprintf(file,
-            "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
-            "%13e %13e %13e\n",
-            cj->parts[pjd].id, cj->parts[pjd].x[0], cj->parts[pjd].x[1],
-            cj->parts[pjd].x[2], cj->parts[pjd].v[0], cj->parts[pjd].v[1],
-            cj->parts[pjd].v[2], hydro_get_density(&cj->parts[pjd]),
-#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
-            0.f,
-#else
-            cj->parts[pjd].density.rho_dh,
-#endif
-            cj->parts[pjd].density.wcount, cj->parts[pjd].density.wcount_dh,
-#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH)
-            cj->parts[pjd].density.div_v, cj->parts[pjd].density.rot_v[0],
-            cj->parts[pjd].density.rot_v[1], cj->parts[pjd].density.rot_v[2]
-#else
-            0., 0., 0., 0.
-#endif
-            );
-  }
-
-  fclose(file);
-}
-
-/* Just a forward declaration... */
-void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
-void runner_doself1_density_vec(struct runner *r, struct cell *ci);
-void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
-                                   struct cell *cj);
-
-int main(int argc, char *argv[]) {
-  size_t particles = 0, runs = 0, volume, type = 0;
-  double offset[3] = {0, 0, 0}, h = 1.1255, size = 1., rho = 1.;
-  double perturbation = 0.;
-  struct cell *ci, *cj;
-  struct space space;
-  struct engine engine;
-  struct runner runner;
-  char c;
-  static unsigned long long partId = 0;
-  char outputFileNameExtension[200] = "";
-  char outputFileName[200] = "";
-  ticks tic, toc, time;
-
-  /* Initialize CPU frequency, this also starts time. */
-  unsigned long long cpufreq = 0;
-  clocks_set_cpufreq(cpufreq);
-
-  /* Choke on FP-exceptions */
-  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
-
-  srand(0);
-
-  while ((c = getopt(argc, argv, "h:p:r:t:d:f:")) != -1) {
-    switch (c) {
-      case 'h':
-        sscanf(optarg, "%lf", &h);
-        break;
-      case 'p':
-        sscanf(optarg, "%zu", &particles);
-        break;
-      case 'r':
-        sscanf(optarg, "%zu", &runs);
-        break;
-      case 't':
-        sscanf(optarg, "%zu", &type);
-        break;
-      case 'd':
-        sscanf(optarg, "%lf", &perturbation);
-        break;
-      case 'f':
-        strcpy(outputFileNameExtension, optarg);
-        break;
-      case '?':
-        error("Unknown option.");
-        break;
-    }
-  }
-
-  if (h < 0 || particles == 0 || runs == 0 || type > 2) {
-    printf(
-        "\nUsage: %s -p PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n"
-        "\nGenerates a cell pair, filled with particles on a Cartesian grid."
-        "\nThese are then interacted using runner_dopair1_density."
-        "\n\nOptions:"
-        "\n-t TYPE=0          - cells share face (0), edge (1) or corner (2)"
-        "\n-h DISTANCE=1.1255 - smoothing length"
-        "\n-d pert            - perturbation to apply to the particles [0,1["
-        "\n-f fileName        - part of the file name used to save the dumps\n",
-        argv[0]);
-    exit(1);
-  }
-
-  space.periodic = 0;
-
-  engine.s = &space;
-  engine.time = 0.1f;
-  engine.ti_current = 8;
-  engine.max_active_bin = num_time_bins;
-  runner.e = &engine;
-
-  volume = particles * particles * particles;
-  message("particles: %zu B\npositions: 0 B", 2 * volume * sizeof(struct part));
-
-  ci = make_cell(particles, offset, size, h, rho, &partId, perturbation);
-  for (size_t i = 0; i < type + 1; ++i) offset[i] = 1.;
-  cj = make_cell(particles, offset, size, h, rho, &partId, perturbation);
-
-  runner_do_sort(&runner, ci, 0x1FFF, 0, 0);
-  runner_do_sort(&runner, cj, 0x1FFF, 0, 0);
-
-  time = 0;
-  for (size_t i = 0; i < runs; ++i) {
-    /* Zero the fields */
-    zero_particle_fields(ci);
-    zero_particle_fields(cj);
-
-    tic = getticks();
-
-#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION)
-    /* Run the test */
-    DOPAIR1(&runner, ci, cj);
-#endif
-
-    toc = getticks();
-    time += toc - tic;
-
-    /* Dump if necessary */
-    if (i % 50 == 0) {
-      sprintf(outputFileName, "swift_dopair_%s.dat", outputFileNameExtension);
-      dump_particle_fields(outputFileName, ci, cj);
-    }
-  }
-
-  /* Output timing */
-  message("SWIFT calculation took       %lli ticks.", time / runs);
-
-  /* Now perform a brute-force version for accuracy tests */
-
-  /* Zero the fields */
-  zero_particle_fields(ci);
-  zero_particle_fields(cj);
-
-  tic = getticks();
-
-#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION)
-  /* Run the brute-force test */
-  pairs_all_density(&runner, ci, cj);
-#endif
-
-  toc = getticks();
-
-  /* Dump */
-  sprintf(outputFileName, "brute_force_%s.dat", outputFileNameExtension);
-  dump_particle_fields(outputFileName, ci, cj);
-
-  /* Output timing */
-  message("Brute force calculation took %lli ticks.", toc - tic);
-
-  /* Clean things to make the sanitizer happy ... */
-  clean_up(ci);
-  clean_up(cj);
-
-  return 0;
-}
diff --git a/tests/testPair.sh.in b/tests/testPair.sh.in
deleted file mode 100755
index bd7051b060c4acab6cf5a164af1914715856849b..0000000000000000000000000000000000000000
--- a/tests/testPair.sh.in
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-echo ""
-
-rm -f brute_force_standard.dat swift_dopair_standard.dat
-
-./testPair -p 6 -r 1 -d 0 -f standard
-
-python @srcdir@/difffloat.py brute_force_standard.dat swift_dopair_standard.dat @srcdir@/tolerance_pair_normal.dat
-
-exit $?
diff --git a/tests/testPairPerturbed.sh.in b/tests/testPairPerturbed.sh.in
deleted file mode 100755
index 9f214e25a098448a906f9da307ea569e327cfdea..0000000000000000000000000000000000000000
--- a/tests/testPairPerturbed.sh.in
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-echo ""
-
-rm -f brute_force_perturbed.dat swift_dopair_perturbed.dat
-
-./testPair -p 6 -r 1 -d 0.1 -f perturbed
-
-python @srcdir@/difffloat.py brute_force_perturbed.dat swift_dopair_perturbed.dat @srcdir@/tolerance_pair_perturbed.dat
-
-exit $?
diff --git a/tests/testPeriodicBC.c b/tests/testPeriodicBC.c
index 495442efac853517891c1efdec54f1f5e8e14e75..6fa2dc607b996b9e8508338a9806633c5a4d1a89 100644
--- a/tests/testPeriodicBC.c
+++ b/tests/testPeriodicBC.c
@@ -517,7 +517,8 @@ int main(int argc, char *argv[]) {
 
   const int half_dim = (dim - 1) / 2;
 
-  /* Test the periodic boundary conditions for each of the 8 corners. */
+  /* Test the periodic boundary conditions for each of the 8 corners. Interact
+   * each corner with all of its 26 neighbours.*/
   test_boundary_conditions(cells, runner, 0, 0, 0, dim, swiftOutputFileName,
                            bruteForceOutputFileName);
   test_boundary_conditions(cells, runner, dim - 1, 0, 0, dim,
diff --git a/tests/tolerance_27_perturbed_h.dat b/tests/tolerance_27_perturbed_h.dat
index 495c865783ae4fc6cab76f11e945efe9ac21870f..5142c2a2090e15381a19b2bc71e253a482973b11 100644
--- a/tests/tolerance_27_perturbed_h.dat
+++ b/tests/tolerance_27_perturbed_h.dat
@@ -1,4 +1,4 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   2e-6       1e-4	    5e-4       1.2e-2		 1e-5	     3e-6	   3e-6		 7e-6
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1.4e-2	    1e-5       2e-3		 2.5e-4	     3e-3	   3e-3	 	 3e-3
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1e-6	    1e-6       1e0		 1e-6	     4e-6	   4e-6		 4e-6
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         2.4e-6       1e-4	        5e-4       1.2e-2		         1e-5	       3e-6	         3e-6		       8e-6
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1.2e-6	      1.4e-2	      1e-5       2e-3		           2.5e-4	     3e-3	         3e-3	 	       3e-3
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1e-6	      1e-6	        1e-6       1e0		           1e-6	       4e-6	         4e-6		       4e-6
diff --git a/tests/tolerance_27_perturbed_h2.dat b/tests/tolerance_27_perturbed_h2.dat
new file mode 100644
index 0000000000000000000000000000000000000000..23f6a5006124f6233aebd111005760a5dcc5b6a3
--- /dev/null
+++ b/tests/tolerance_27_perturbed_h2.dat
@@ -0,0 +1,4 @@
+#   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         3e-6       1e-4	        5e-4       1.5e-2		         1.4e-5	       3e-6	         3e-6		       9e-6
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1.5e-6	    1.57e-2	      1e-5       4.74e-3		       3.89e-4	     3e-3	         3e-3	 	       3e-3
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1e-6	      1e-6	        1e-6       1e0		           1e-6	         4e-6	         4e-6		       4e-6
diff --git a/tests/tolerance_pair_active.dat b/tests/tolerance_pair_active.dat
new file mode 100644
index 0000000000000000000000000000000000000000..b07697a686eb7801326ceaf77cf93fb3a1491c2e
--- /dev/null
+++ b/tests/tolerance_pair_active.dat
@@ -0,0 +1,4 @@
+#   ID     wcount
+    0	     1e-2
+    0	     1e-2
+    0	     1e-2
diff --git a/tests/tolerance_pair_perturbed.dat b/tests/tolerance_pair_perturbed.dat
deleted file mode 100644
index 59a80a7b1fa0db373b7cb43b9793146330b17d53..0000000000000000000000000000000000000000
--- a/tests/tolerance_pair_perturbed.dat
+++ /dev/null
@@ -1,4 +0,0 @@
-#   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      1e-5	    2e-5       3e-2		 1e-5	     1e-5	   1e-5		 1e-5
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-3	      4e-4	    8e-3       2e-2		 1e-4	     1.6e-4	   1.6e-4	 1.6e-4
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1e-6	    1e-6       1e-6		 1e-6	     1e-6	   1e-6		 1e-6
diff --git a/tests/tolerance_pair_normal.dat b/tests/tolerance_testInteractions.dat
similarity index 75%
rename from tests/tolerance_pair_normal.dat
rename to tests/tolerance_testInteractions.dat
index 3fca591dce8156a08c970e4a6579b56b2de12553..ebb376bf26bfdc0fb2107ab720bbf9eca5a35bce 100644
--- a/tests/tolerance_pair_normal.dat
+++ b/tests/tolerance_testInteractions.dat
@@ -1,4 +1,4 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      1e-5	    2e-5       3e-2		 1e-5	     1e-5	   1e-5		 1e-5
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      1.2e-5	    1e-5       1e-2		 1e-4	     1e-4	   1e-4		 1e-4
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      4e-5	    4e-4       1e-2		 1e-5	     6e-6	   6e-6		 6e-6
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1.2e-4	  1e-4       2e-4		 2e-4	     1e-4	   1e-4	 	 1e-4
     0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1e-6	    1e-6       1e-6		 1e-6	     1e-6	   1e-6		 1e-6