diff --git a/.gitignore b/.gitignore
index 0c4d8bddadb17afcc29aa3dc1d4dba6cb6fd2f5b..e3e17bb82a01d9af0ace6ed72d196cf2dba242f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,7 +34,7 @@ examples/*/*/*.txt
 examples/*/*/used_parameters.yml
 examples/*/gravity_checks_*.dat
 
-tests/testPair
+tests/testActivePair
 tests/brute_force_periodic_BC_standard.dat
 tests/swift_periodic_BC_standard.dat
 tests/brute_force_periodic_BC_pertrubed.dat
@@ -54,6 +54,11 @@ tests/brute_force_125_standard.dat
 tests/swift_dopair_125_standard.dat
 tests/brute_force_125_perturbed.dat
 tests/swift_dopair_125_perturbed.dat
+tests/brute_force_active.dat
+tests/brute_force_periodic_BC_perturbed.dat
+tests/swift_dopair_active.dat
+tests/test_nonsym_density_serial.dat
+tests/test_nonsym_density_vec.dat
 tests/testGreetings
 tests/testReading
 tests/input.hdf5
@@ -75,8 +80,6 @@ tests/test27cells.sh
 tests/test27cellsPerturbed.sh
 tests/test125cells.sh
 tests/test125cellsPerturbed.sh
-tests/testPair.sh
-tests/testPairPerturbed.sh
 tests/testParser.sh
 tests/testReading.sh
 tests/testAdiabaticIndex
diff --git a/configure.ac b/configure.ac
index 95519863f6e303cde51bbef7e2046faa314f46ee..74fede99f4fbf578af4e703cedaa42f2c278b037 100644
--- a/configure.ac
+++ b/configure.ac
@@ -861,14 +861,14 @@ AM_CONDITIONAL([HAVE_DOXYGEN], [test "$ac_cv_path_ac_pt_DX_DOXYGEN" != ""])
 # Handle .in files.
 AC_CONFIG_FILES([Makefile src/Makefile examples/Makefile doc/Makefile doc/Doxyfile tests/Makefile])
 AC_CONFIG_FILES([tests/testReading.sh], [chmod +x tests/testReading.sh])
-AC_CONFIG_FILES([tests/testPair.sh], [chmod +x tests/testPair.sh])
-AC_CONFIG_FILES([tests/testPairPerturbed.sh], [chmod +x tests/testPairPerturbed.sh])
+AC_CONFIG_FILES([tests/testActivePair.sh], [chmod +x tests/testActivePair.sh])
 AC_CONFIG_FILES([tests/test27cells.sh], [chmod +x tests/test27cells.sh])
 AC_CONFIG_FILES([tests/test27cellsPerturbed.sh], [chmod +x tests/test27cellsPerturbed.sh])
 AC_CONFIG_FILES([tests/test125cells.sh], [chmod +x tests/test125cells.sh])
 AC_CONFIG_FILES([tests/test125cellsPerturbed.sh], [chmod +x tests/test125cellsPerturbed.sh])
 AC_CONFIG_FILES([tests/testPeriodicBC.sh], [chmod +x tests/testPeriodicBC.sh])
 AC_CONFIG_FILES([tests/testPeriodicBCPerturbed.sh], [chmod +x tests/testPeriodicBCPerturbed.sh])
+AC_CONFIG_FILES([tests/testInteractions.sh], [chmod +x tests/testInteractions.sh])
 AC_CONFIG_FILES([tests/testParser.sh], [chmod +x tests/testParser.sh])
 
 # Save the compilation options
diff --git a/src/align.h b/src/align.h
index 2e541043cc3866d76979277f3c143a45657684ca..54435c4c9baa1ce9dc511e2903b7e2be2d6655de 100644
--- a/src/align.h
+++ b/src/align.h
@@ -58,6 +58,22 @@
 #define swift_align_information(array, alignment) ;
 #endif
 
+/**
+ * @brief Macro to create a restrict pointer to an array and tell the compiler
+ * that the given array has the specified
+ * alignment.
+ *
+ * Note that this turns into a no-op but gives information to the compiler.
+ *
+ * @param array The array.
+ * @param ptr Pointer to array
+ * @param type Type of array
+ * @param alignment The alignment in bytes of the array.
+ */
+#define swift_declare_aligned_ptr(type, array, ptr, alignment) \
+  type *restrict array = ptr;                                  \
+  swift_align_information(array, alignment);
+
 /**
  * @brief Macro to tell the compiler that a given number is 0 modulo a given
  * size.
diff --git a/src/cache.h b/src/cache.h
index 1b6d89f03fafa1595cb9bdf857c9f2bc1cba316e..170eb8f5cb0939e85647bbb4622d66cc78ef3dbe 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -23,6 +23,7 @@
 #include "../config.h"
 
 /* Local headers */
+#include "align.h"
 #include "cell.h"
 #include "error.h"
 #include "part.h"
@@ -30,9 +31,7 @@
 #include "vector.h"
 
 #define NUM_VEC_PROC 2
-#define CACHE_ALIGN 64
 #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)
-#define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE
 
 #ifdef WITH_VECTORIZATION
 /* Cache struct to hold a local copy of a cells' particle
@@ -40,31 +39,31 @@
 struct cache {
 
   /* Particle x position. */
-  float *restrict x __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict x SWIFT_CACHE_ALIGN;
 
   /* Particle y position. */
-  float *restrict y __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict y SWIFT_CACHE_ALIGN;
 
   /* Particle z position. */
-  float *restrict z __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict z SWIFT_CACHE_ALIGN;
 
   /* Particle smoothing length. */
-  float *restrict h __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict h SWIFT_CACHE_ALIGN;
 
   /* Particle mass. */
-  float *restrict m __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict m SWIFT_CACHE_ALIGN;
 
   /* Particle x velocity. */
-  float *restrict vx __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vx SWIFT_CACHE_ALIGN;
 
   /* Particle y velocity. */
-  float *restrict vy __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vy SWIFT_CACHE_ALIGN;
 
   /* Particle z velocity. */
-  float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vz SWIFT_CACHE_ALIGN;
 
-  /* Maximum distance of particles into neighbouring cell. */
-  float *restrict max_d __attribute__((aligned(CACHE_ALIGN)));
+  /* Maximum index into neighbouring cell for particles that are in range. */
+  int *restrict max_index SWIFT_CACHE_ALIGN;
 
   /* Cache size. */
   int count;
@@ -75,28 +74,28 @@ struct cache {
 struct c2_cache {
 
   /* Separation between two particles squared. */
-  float r2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* x separation between two particles. */
-  float dxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* y separation between two particles. */
-  float dyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* z separation between two particles. */
-  float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* Mass of particle pj. */
-  float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* x velocity of particle pj. */
-  float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* y velocity of particle pj. */
-  float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* z velocity of particle pj. */
-  float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 };
 
 /**
@@ -111,9 +110,10 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
   /* Align cache on correct byte boundary and pad cache size to be a multiple of
    * the vector size
    * and include 2 vector lengths for remainder operations. */
-  unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
+  size_t pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
   if (rem > 0) pad += VEC_SIZE - rem;
-  unsigned int sizeBytes = (count + pad) * sizeof(float);
+  size_t sizeBytes = (count + pad) * sizeof(float);
+  size_t sizeIntBytes = (count + pad) * sizeof(int);
   int error = 0;
 
   /* Free memory if cache has already been allocated. */
@@ -126,18 +126,19 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
     free(c->vy);
     free(c->vz);
     free(c->h);
-    free(c->max_d);
+    free(c->max_index);
   }
 
-  error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes);
+  error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT,
+                          sizeIntBytes);
 
   if (error != 0)
     error("Couldn't allocate cache, no. of particles: %d", (int)count);
@@ -151,156 +152,43 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
  * @param ci_cache The cache.
  */
 __attribute__((always_inline)) INLINE void cache_read_particles(
-    const struct cell *const ci, struct cache *const ci_cache) {
+    const struct cell *restrict const ci,
+    struct cache *restrict const ci_cache) {
 
 #if defined(GADGET2_SPH)
 
-/* Shift the particles positions to a local frame so single precision can be
- * used instead of double precision. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
-  for (int i = 0; i < ci->count; i++) {
-    ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
-    ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1];
-    ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2];
-    ci_cache->h[i] = ci->parts[i].h;
-
-    ci_cache->m[i] = ci->parts[i].mass;
-    ci_cache->vx[i] = ci->parts[i].v[0];
-    ci_cache->vy[i] = ci->parts[i].v[1];
-    ci_cache->vz[i] = ci->parts[i].v[2];
-  }
-
-#endif
-}
-
-/**
- * @brief Populate cache by reading in the particles from two cells in unsorted
- * order.
- *
- * @param ci The i #cell.
- * @param cj The j #cell.
- * @param ci_cache The cache for cell ci.
- * @param cj_cache The cache for cell cj.
- * @param shift The amount to shift the particle positions to account for BCs
- */
-__attribute__((always_inline)) INLINE void cache_read_two_cells(
-    const struct cell *const ci, const struct cell *const cj,
-    struct cache *const ci_cache, struct cache *const cj_cache,
-    const double *const shift) {
-
-  /* Shift the particles positions to a local frame (ci frame) so single
-   * precision can be
-   * used instead of double precision. Also shift the cell ci, particles
-   * positions due to BCs but leave cell cj. */
-  for (int i = 0; i < ci->count; i++) {
-    ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[i] = ci->parts[i].h;
-
-    ci_cache->m[i] = ci->parts[i].mass;
-    ci_cache->vx[i] = ci->parts[i].v[0];
-    ci_cache->vy[i] = ci->parts[i].v[1];
-    ci_cache->vz[i] = ci->parts[i].v[2];
-  }
-
-  for (int i = 0; i < cj->count; i++) {
-    cj_cache->x[i] = cj->parts[i].x[0] - ci->loc[0];
-    cj_cache->y[i] = cj->parts[i].x[1] - ci->loc[1];
-    cj_cache->z[i] = cj->parts[i].x[2] - ci->loc[2];
-    cj_cache->h[i] = cj->parts[i].h;
-
-    cj_cache->m[i] = cj->parts[i].mass;
-    cj_cache->vx[i] = cj->parts[i].v[0];
-    cj_cache->vy[i] = cj->parts[i].v[1];
-    cj_cache->vz[i] = cj->parts[i].v[2];
-  }
-}
-
-__attribute__((always_inline)) INLINE void cache_read_cell_sorted(
-    const struct cell *const ci, struct cache *const ci_cache,
-    const struct entry *restrict sort_i, double *const loc,
-    double *const shift) {
-
-  int idx;
-/* Shift the particles positions to a local frame (ci frame) so single precision
- * can be
- * used instead of double precision. Also shift the cell ci, particles positions
- * due to BCs but leave cell cj. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma simd
-#endif
-  for (int i = 0; i < ci->count; i++) {
-    idx = sort_i[i].i;
-
-    ci_cache->x[i] = ci->parts[idx].x[0] - loc[0] - shift[0];
-    ci_cache->y[i] = ci->parts[idx].x[1] - loc[1] - shift[1];
-    ci_cache->z[i] = ci->parts[idx].x[2] - loc[2] - shift[2];
-    ci_cache->h[i] = ci->parts[idx].h;
-
-    ci_cache->m[i] = ci->parts[idx].mass;
-    ci_cache->vx[i] = ci->parts[idx].v[0];
-    ci_cache->vy[i] = ci->parts[idx].v[1];
-    ci_cache->vz[i] = ci->parts[idx].v[2];
-  }
-}
-
-/**
- * @brief Populate cache by reading in the particles from two cells in sorted
- * order.
- *
- * @param ci The i #cell.
- * @param cj The j #cell.
- * @param ci_cache The #cache for cell ci.
- * @param cj_cache The #cache for cell cj.
- * @param sort_i The array of sorted particle indices for cell ci.
- * @param sort_j The array of sorted particle indices for cell ci.
- * @param shift The amount to shift the particle positions to account for BCs
- */
-__attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
-    const struct cell *const ci, const struct cell *const cj,
-    struct cache *const ci_cache, struct cache *const cj_cache,
-    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
-    const double *const shift) {
-
-  int idx;
-/* Shift the particles positions to a local frame (ci frame) so single precision
- * can be
- * used instead of double precision. Also shift the cell ci, particles positions
- * due to BCs but leave cell cj. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma simd
-#endif
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
+
+  const struct part *restrict parts = ci->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
+  /* Shift the particles positions to a local frame so single precision can be
+   * used instead of double precision. */
   for (int i = 0; i < ci->count; i++) {
-    idx = sort_i[i].i;
-    ci_cache->x[i] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[i] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[i] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[i] = ci->parts[idx].h;
-
-    ci_cache->m[i] = ci->parts[idx].mass;
-    ci_cache->vx[i] = ci->parts[idx].v[0];
-    ci_cache->vy[i] = ci->parts[idx].v[1];
-    ci_cache->vz[i] = ci->parts[idx].v[2];
+    x[i] = (float)(parts[i].x[0] - loc[0]);
+    y[i] = (float)(parts[i].x[1] - loc[1]);
+    z[i] = (float)(parts[i].x[2] - loc[2]);
+    h[i] = parts[i].h;
+
+    m[i] = parts[i].mass;
+    vx[i] = parts[i].v[0];
+    vy[i] = parts[i].v[1];
+    vz[i] = parts[i].v[2];
   }
 
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma simd
 #endif
-  for (int i = 0; i < cj->count; i++) {
-    idx = sort_j[i].i;
-    cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
-    cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
-    cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
-    cj_cache->h[i] = cj->parts[idx].h;
-
-    cj_cache->m[i] = cj->parts[idx].mass;
-    cj_cache->vx[i] = cj->parts[idx].v[0];
-    cj_cache->vy[i] = cj->parts[idx].v[1];
-    cj_cache->vz[i] = cj->parts[idx].v[2];
-  }
 }
 
 /**
@@ -321,13 +209,13 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
  * interaction.
  */
 __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
-    const struct cell *const ci, const struct cell *const cj,
-    struct cache *const ci_cache, struct cache *const cj_cache,
-    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
-    const double *const shift, int *first_pi, int *last_pj,
-    const int num_vec_proc) {
+    const struct cell *restrict const ci, const struct cell *restrict const cj,
+    struct cache *restrict const ci_cache,
+    struct cache *restrict const cj_cache, const struct entry *restrict sort_i,
+    const struct entry *restrict sort_j, const double *restrict const shift,
+    int *first_pi, int *last_pj, const int num_vec_proc) {
 
-  int idx, ci_cache_idx;
+  int idx;
   /* Pad number of particles read to the vector size. */
   int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
   if (rem != 0) {
@@ -345,74 +233,97 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
 
   int first_pi_align = *first_pi;
   int last_pj_align = *last_pj;
-
-/* Shift the particles positions to a local frame (ci frame) so single precision
- * can be
- * used instead of double precision. Also shift the cell ci, particles positions
- * due to BCs but leave cell cj. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
-  for (int i = first_pi_align; i < ci->count; i++) {
-    /* Make sure ci_cache is filled from the first element. */
-    ci_cache_idx = i - first_pi_align;
-    idx = sort_i[i].i;
-    ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[ci_cache_idx] = ci->parts[idx].h;
-
-    ci_cache->m[ci_cache_idx] = ci->parts[idx].mass;
-    ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0];
-    ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
-    ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
+  const struct part *restrict parts_i = ci->parts;
+  const struct part *restrict parts_j = cj->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
+
+  int ci_cache_count = ci->count - first_pi_align;
+  /* Shift the particles positions to a local frame (ci frame) so single
+   * precision
+   * can be
+   * used instead of double precision. Also shift the cell ci, particles
+   * positions
+   * due to BCs but leave cell cj. */
+  for (int i = 0; i < ci_cache_count; i++) {
+    idx = sort_i[i + first_pi_align].i;
+    x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]);
+    y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]);
+    z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]);
+    h[i] = parts_i[idx].h;
+
+    m[i] = parts_i[idx].mass;
+    vx[i] = parts_i[idx].v[0];
+    vy[i] = parts_i[idx].v[1];
+    vz[i] = parts_i[idx].v[2];
   }
 
   /* Pad cache with fake particles that exist outside the cell so will not
    * interact.*/
-  float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0];
+  float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0];
   for (int i = ci->count - first_pi_align;
        i < ci->count - first_pi_align + VEC_SIZE; i++) {
-    ci_cache->x[i] = fake_pix;
-    ci_cache->y[i] = 1.f;
-    ci_cache->z[i] = 1.f;
-    ci_cache->h[i] = 1.f;
-
-    ci_cache->m[i] = 1.f;
-    ci_cache->vx[i] = 1.f;
-    ci_cache->vy[i] = 1.f;
-    ci_cache->vz[i] = 1.f;
+    x[i] = fake_pix;
+    y[i] = 1.f;
+    z[i] = 1.f;
+    h[i] = 1.f;
+
+    m[i] = 1.f;
+    vx[i] = 1.f;
+    vy[i] = 1.f;
+    vz[i] = 1.f;
   }
 
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
+
   for (int i = 0; i <= last_pj_align; i++) {
     idx = sort_j[i].i;
-    cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
-    cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
-    cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
-    cj_cache->h[i] = cj->parts[idx].h;
-
-    cj_cache->m[i] = cj->parts[idx].mass;
-    cj_cache->vx[i] = cj->parts[idx].v[0];
-    cj_cache->vy[i] = cj->parts[idx].v[1];
-    cj_cache->vz[i] = cj->parts[idx].v[2];
+    xj[i] = (float)(parts_j[idx].x[0] - loc[0]);
+    yj[i] = (float)(parts_j[idx].x[1] - loc[1]);
+    zj[i] = (float)(parts_j[idx].x[2] - loc[2]);
+    hj[i] = parts_j[idx].h;
+
+    mj[i] = parts_j[idx].mass;
+    vxj[i] = parts_j[idx].v[0];
+    vyj[i] = parts_j[idx].v[1];
+    vzj[i] = parts_j[idx].v[2];
   }
 
   /* Pad cache with fake particles that exist outside the cell so will not
    * interact.*/
   float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
   for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
-    cj_cache->x[i] = fake_pjx;
-    cj_cache->y[i] = 1.f;
-    cj_cache->z[i] = 1.f;
-    cj_cache->h[i] = 1.f;
-
-    cj_cache->m[i] = 1.f;
-    cj_cache->vx[i] = 1.f;
-    cj_cache->vy[i] = 1.f;
-    cj_cache->vz[i] = 1.f;
+    xj[i] = fake_pjx;
+    yj[i] = 1.f;
+    zj[i] = 1.f;
+    hj[i] = 1.f;
+
+    mj[i] = 1.f;
+    vxj[i] = 1.f;
+    vyj[i] = 1.f;
+    vzj[i] = 1.f;
   }
 }
 
@@ -430,7 +341,7 @@ static INLINE void cache_clean(struct cache *c) {
     free(c->vy);
     free(c->vz);
     free(c->h);
-    free(c->max_d);
+    free(c->max_index);
   }
 }
 
diff --git a/src/dimension.h b/src/dimension.h
index 4ee9377b4f9608b2549f3dfee2a683b61a71c76a..0b2093d718a61c6ce850db1970412af3e2e462b9 100644
--- a/src/dimension.h
+++ b/src/dimension.h
@@ -275,11 +275,11 @@ __attribute__((always_inline)) INLINE static vector pow_dimension_vec(
 
 #if defined(HYDRO_DIMENSION_3D)
 
-  return (vector)(x.v * x.v * x.v);
+  return (vector)(vec_mul(vec_mul(x.v, x.v), x.v));
 
 #elif defined(HYDRO_DIMENSION_2D)
 
-  return (vector)(x.v * x.v);
+  return (vector)(vec_mul(x.v, x.v));
 
 #elif defined(HYDRO_DIMENSION_1D)
 
@@ -304,16 +304,16 @@ __attribute__((always_inline)) INLINE static vector pow_dimension_plus_one_vec(
 
 #if defined(HYDRO_DIMENSION_3D)
 
-  const vector x2 = (vector)(x.v * x.v);
-  return (vector)(x2.v * x2.v);
+  const vector x2 = (vector)(vec_mul(x.v, x.v));
+  return (vector)(vec_mul(x2.v, x2.v));
 
 #elif defined(HYDRO_DIMENSION_2D)
 
-  return (vector)(x.v * x.v * x.v);
+  return (vector)(vec_mul(x.v, vec_mul(x.v, x.v)));
 
 #elif defined(HYDRO_DIMENSION_1D)
 
-  return (vector)(x.v * x.v);
+  return (vector)(vec_mul(x.v, x.v));
 
 #else
 
diff --git a/src/engine.c b/src/engine.c
index 99c4baee1e6856fa08789b7b81103970ded3d57e..d20dab45643d0d61f9246d24aaa670a35a14e1fd 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -227,8 +227,8 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
                               /* implicit = */ 1, c, NULL);
         engine_add_ghosts(e, c, c->ghost_in, c->ghost_out);
 
-/* Generate the extra ghost task. */
 #ifdef EXTRA_HYDRO_LOOP
+        /* Generate the extra ghost task. */
         c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost,
                                            task_subtype_none, 0, 0, c, NULL);
 #endif
@@ -4545,8 +4545,8 @@ void engine_init(struct engine *e, struct space *s,
 
   /* Init the scheduler with enough tasks for the initial sorting tasks. */
   const int nr_tasks = 2 * s->tot_cells + 2 * e->nr_threads;
-  scheduler_init(&e->sched, e->s, nr_tasks, nr_queues, scheduler_flag_steal,
-                 e->nodeID, &e->threadpool);
+  scheduler_init(&e->sched, e->s, nr_tasks, nr_queues,
+                 (policy & scheduler_flag_steal), e->nodeID, &e->threadpool);
 
   /* Allocate and init the threads. */
   if ((e->runners = (struct runner *)malloc(sizeof(struct runner) *
diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index 25eb1e4b76d088d7dcd3d4ba57fe5901188be000..9e06fec92f9667d21ce7dc196ba26f9870cb24f4 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -388,7 +388,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
                                  vector *wcountSum, vector *wcount_dhSum,
                                  vector *div_vSum, vector *curlvxSum,
                                  vector *curlvySum, vector *curlvzSum,
-                                 vector mask, int knlMask) {
+                                 mask_t mask) {
 
   vector r, ri, ui, wi, wi_dx;
   vector mj;
@@ -432,49 +432,24 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
   curlvry.v = vec_mul(curlvry.v, ri.v);
   curlvrz.v = vec_mul(curlvrz.v, ri.v);
 
-/* Mask updates to intermediate vector sums for particle pi. */
-#ifdef HAVE_AVX512_F
-  rhoSum->v =
-      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
+  vector wcount_dh_update;
+  wcount_dh_update.v =
+      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v));
 
+  /* Mask updates to intermediate vector sums for particle pi. */
+  rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
   rho_dhSum->v =
-      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
-                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                               vec_mul(ui.v, wi_dx.v))));
-
-  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
-
-  wcount_dhSum->v = _mm512_mask_sub_ps(
-      rho_dhSum->v, knlMask, rho_dhSum->v,
-      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)));
-
-  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
-                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
-
-  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
-                                    curlvxSum->v);
-
-  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
-                                    curlvySum->v);
-
-  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
-                                    curlvzSum->v);
-#else
-  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
-  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                                vec_mul(ui.v, wi_dx.v))),
-                          mask.v);
-  wcountSum->v += vec_and(wi.v, mask.v);
-  wcount_dhSum->v -= vec_and(
-      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v);
-  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
-  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
-  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
-  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
-#endif
+      vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask);
+  wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
+  wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask);
+  div_vSum->v =
+      vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
+  curlvxSum->v = vec_mask_add(curlvxSum->v,
+                              vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
+  curlvySum->v = vec_mask_add(curlvySum->v,
+                              vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
+  curlvzSum->v = vec_mask_add(curlvzSum->v,
+                              vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
 }
 
 /**
@@ -482,12 +457,14 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
  * (non-symmetric vectorized version).
  */
 __attribute__((always_inline)) INLINE static void
-runner_iact_nonsym_2_vec_density(
-    float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix,
-    vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
-    vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
-    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
-    vector mask, vector mask2, int knlMask, int knlMask2) {
+runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz,
+                                 vector hi_inv, vector vix, vector viy,
+                                 vector viz, float *Vjx, float *Vjy, float *Vjz,
+                                 float *Mj, vector *rhoSum, vector *rho_dhSum,
+                                 vector *wcountSum, vector *wcount_dhSum,
+                                 vector *div_vSum, vector *curlvxSum,
+                                 vector *curlvySum, vector *curlvzSum,
+                                 mask_t mask, mask_t mask2, short mask_cond) {
 
   vector r, ri, r2, ui, wi, wi_dx;
   vector mj;
@@ -567,88 +544,66 @@ runner_iact_nonsym_2_vec_density(
   curlvrz.v = vec_mul(curlvrz.v, ri.v);
   curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);
 
-/* Mask updates to intermediate vector sums for particle pi. */
-#ifdef HAVE_AVX512_F
-  rhoSum->v =
-      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
-  rhoSum->v =
-      _mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v);
-
-  rho_dhSum->v =
-      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
-                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                               vec_mul(ui.v, wi_dx.v))));
-  rho_dhSum->v =
-      _mm512_mask_sub_ps(rho_dhSum->v, knlMask2, rho_dhSum->v,
-                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
-                                               vec_mul(ui2.v, wi_dx2.v))));
-
-  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
-  wcountSum->v =
-      _mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v);
-
-  wcount_dhSum->v = _mm512_mask_sub_ps(
-      rho_dhSum->v, knlMask, rho_dhSum->v,
-      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)));
-
-  wcount_dhSum->v = _mm512_mask_sub_ps(
-      rho_dhSum->v, knlMask2, rho_dhSum->v,
-      vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)));
-
-  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
-                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
-  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v,
-                                   vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
-
-  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
-                                    curlvxSum->v);
-  curlvxSum->v = _mm512_mask_add_ps(
-      curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)),
-      curlvxSum->v);
-
-  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
-                                    curlvySum->v);
-  curlvySum->v = _mm512_mask_add_ps(
-      curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)),
-      curlvySum->v);
-
-  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
-                                    curlvzSum->v);
-  curlvzSum->v = _mm512_mask_add_ps(
-      curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)),
-      curlvzSum->v);
-#else
-  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
-  rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v), mask2.v);
-  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                                vec_mul(ui.v, wi_dx.v))),
-                          mask.v);
-  rho_dhSum->v -=
-      vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
-                                     vec_mul(ui2.v, wi_dx2.v))),
-              mask2.v);
-  wcountSum->v += vec_and(wi.v, mask.v);
-  wcountSum->v += vec_and(wi2.v, mask2.v);
-  wcount_dhSum->v -= vec_and(
-      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v);
-  wcount_dhSum->v -= vec_and(
-      vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)),
-      mask2.v);
-  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
-  div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2.v);
-  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
-  curlvxSum->v +=
-      vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2.v);
-  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
-  curlvySum->v +=
-      vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2.v);
-  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
-  curlvzSum->v +=
-      vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2.v);
-#endif
+  vector wcount_dh_update, wcount_dh_update2;
+  wcount_dh_update.v =
+      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v));
+  wcount_dh_update2.v =
+      vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v));
+
+  /* Mask updates to intermediate vector sums for particle pi. */
+  /* Mask only when needed. */
+  if (mask_cond) {
+    rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
+    rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
+    rho_dhSum->v =
+        vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask);
+    rho_dhSum->v =
+        vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v), mask2);
+    wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
+    wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
+    wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask);
+    wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update2.v, mask2);
+    div_vSum->v = vec_mask_sub(div_vSum->v,
+                               vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
+    div_vSum->v = vec_mask_sub(
+        div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
+    curlvxSum->v = vec_mask_add(
+        curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
+    curlvxSum->v = vec_mask_add(
+        curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
+    curlvySum->v = vec_mask_add(
+        curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
+    curlvySum->v = vec_mask_add(
+        curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
+    curlvzSum->v = vec_mask_add(
+        curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
+    curlvzSum->v = vec_mask_add(
+        curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
+  } else {
+    rhoSum->v = vec_add(rhoSum->v, vec_mul(mj.v, wi.v));
+    rhoSum->v = vec_add(rhoSum->v, vec_mul(mj2.v, wi2.v));
+    rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v));
+    rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v));
+    wcountSum->v = vec_add(wcountSum->v, wi.v);
+    wcountSum->v = vec_add(wcountSum->v, wi2.v);
+    wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update.v);
+    wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update2.v);
+    div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
+    div_vSum->v =
+        vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
+    curlvxSum->v =
+        vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)));
+    curlvxSum->v =
+        vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)));
+    curlvySum->v =
+        vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)));
+    curlvySum->v =
+        vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)));
+    curlvzSum->v =
+        vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)));
+    curlvzSum->v =
+        vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)));
+  }
 }
 #endif
 
@@ -755,7 +710,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
 
-#ifdef WITH_VECTORIZATION
+#ifdef WITH_OLD_VECTORIZATION
 
   vector r, r2, ri;
   vector xi, xj;
@@ -1037,7 +992,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
 
-#ifdef WITH_VECTORIZATION
+#ifdef WITH_OLD_VECTORIZATION
 
   vector r, r2, ri;
   vector xi, xj;
diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h
index f634a59d7ee769951e6560d46a92053c144cc766..5355c15f8f1a0d0c3d811c7039da04caf0522cc9 100644
--- a/src/kernel_hydro.h
+++ b/src/kernel_hydro.h
@@ -341,20 +341,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx(
 
 /* ------------------------------------------------------------------------- */
 
-#ifdef WITH_VECTORIZATION
-
-static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv);
-
-static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);
-
-static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);
-
-static const vector kernel_gamma_inv_dim_vec =
-    FILL_VEC((float)kernel_gamma_inv_dim);
-
-static const vector kernel_gamma_inv_dim_plus_one_vec =
-    FILL_VEC((float)kernel_gamma_inv_dim_plus_one);
-
+#ifdef WITH_OLD_VECTORIZATION
 /**
  * @brief Computes the kernel function and its derivative (Vectorised version).
  *
@@ -369,11 +356,12 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
 
   /* Go to the range [0,1[ from [0,H[ */
   vector x;
-  x.v = u->v * kernel_gamma_inv_vec.v;
+  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
 
   /* Load x and get the interval id. */
   vector ind;
-  ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v));
+  ind.m =
+      vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v));
 
   /* load the coefficients. */
   vector c[kernel_degree + 1];
@@ -382,20 +370,36 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
       c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
 
   /* Init the iteration for Horner's scheme. */
-  w->v = (c[0].v * x.v) + c[1].v;
+  w->v = vec_fma(c[0].v, x.v, c[1].v);
   dw_dx->v = c[0].v;
 
   /* And we're off! */
   for (int k = 2; k <= kernel_degree; k++) {
-    dw_dx->v = (dw_dx->v * x.v) + w->v;
-    w->v = (x.v * w->v) + c[k].v;
+    dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+    w->v = vec_fma(x.v, w->v, c[k].v);
   }
 
   /* Return everything */
-  w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
-  dw_dx->v =
-      dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
+  w->v =
+      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
 }
+#endif
+
+#ifdef WITH_VECTORIZATION
+
+static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv);
+
+static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);
+
+static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);
+
+static const vector kernel_gamma_inv_dim_vec =
+    FILL_VEC((float)kernel_gamma_inv_dim);
+
+static const vector kernel_gamma_inv_dim_plus_one_vec =
+    FILL_VEC((float)kernel_gamma_inv_dim_plus_one);
 
 /* Define constant vectors for the Wendland C2 and Cubic Spline kernel
  * coefficients. */
@@ -468,15 +472,15 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
   w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector w2, dw_dx2;
-  vector mask_reg1, mask_reg2;
+  mask_t mask_reg;
 
-  /* Form a mask for each part of the kernel. */
-  mask_reg1.v = vec_cmp_lt(x.v, cond.v);  /* 0 < x < 0.5 */
-  mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
-  ;
+  /* Form a mask for one part of the kernel. */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
 
   /* Work out w for both regions of the kernel and combine the results together
-   * using masks. */
+   * using a mask. */
 
   /* Init the iteration for Horner's scheme. */
   w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
@@ -495,20 +499,17 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
   w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
   w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
 
-  /* Mask out unneeded values. */
-  w->v = vec_and(w->v, mask_reg1.v);
-  w2.v = vec_and(w2.v, mask_reg2.v);
-  dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
-  dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v);
-
-  /* Added both w and w2 together to form complete result. */
-  w->v = vec_add(w->v, w2.v);
-  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
+  /* Blend both kernel regions into one vector (mask out unneeded values). */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  w->v = vec_blend(mask_reg, w->v, w2.v);
+  dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
+
 #else
-#error
+#error "Vectorisation not supported for this kernel!!!"
 #endif
 
-  /* Return everything */
+  /* Return everyting */
   w->v =
       vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
   dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
@@ -580,15 +581,13 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector w_2, dw_dx_2;
   vector w2_2, dw_dx2_2;
-  vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
+  mask_t mask_reg, mask_reg_v2;
 
-  /* Form a mask for each part of the kernel. */
-  mask_reg1.v = vec_cmp_lt(x.v, cond.v);     /* 0 < x < 0.5 */
-  mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */
-  mask_reg2.v = vec_cmp_gte(x.v, cond.v);    /* 0.5 < x < 1 */
-  ;
-  mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */
-  ;
+  /* Form a mask for one part of the kernel for each vector. */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v));     /* 0.5 < x < 1 */
+  vec_create_mask(mask_reg_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */
 
   /* Work out w for both regions of the kernel and combine the results together
    * using masks. */
@@ -622,29 +621,23 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
   w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c3.v);
   w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v);
 
-  /* Mask out unneeded values. */
-  w->v = vec_and(w->v, mask_reg1.v);
-  w2->v = vec_and(w2->v, mask_reg1_v2.v);
-  w_2.v = vec_and(w_2.v, mask_reg2.v);
-  w2_2.v = vec_and(w2_2.v, mask_reg2_v2.v);
-  dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
-  dw_dx2->v = vec_and(dw_dx2->v, mask_reg1_v2.v);
-  dw_dx_2.v = vec_and(dw_dx_2.v, mask_reg2.v);
-  dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_v2.v);
-
-  /* Added both w and w2 together to form complete result. */
-  w->v = vec_add(w->v, w_2.v);
-  w2->v = vec_add(w2->v, w2_2.v);
-  dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v);
-  dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v);
+  /* Blend both kernel regions into one vector (mask out unneeded values). */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  w->v = vec_blend(mask_reg, w->v, w_2.v);
+  w2->v = vec_blend(mask_reg_v2, w2->v, w2_2.v);
+  dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx_2.v);
+  dw_dx2->v = vec_blend(mask_reg_v2, dw_dx2->v, dw_dx2_2.v);
 
   /* Return everything */
-  w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
-  w2->v = w2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
-  dw_dx->v =
-      dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
-  dw_dx2->v =
-      dw_dx2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
+  w->v =
+      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+  w2->v = vec_mul(w2->v,
+                  vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
+  dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v,
+                                         kernel_gamma_inv_dim_plus_one_vec.v));
 
 #endif
 }
@@ -676,12 +669,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
   w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector w2;
-  vector mask1, mask2;
+  mask_t mask_reg;
 
   /* Form a mask for each part of the kernel. */
-  mask1.v = vec_cmp_lt(x.v, cond.v);  /* 0 < x < 0.5 */
-  mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
-  ;
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
 
   /* Work out w for both regions of the kernel and combine the results together
    * using masks. */
@@ -698,13 +691,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
   w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
 
   /* Mask out unneeded values. */
-  w->v = vec_and(w->v, mask1.v);
-  w2.v = vec_and(w2.v, mask2.v);
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  w->v = vec_blend(mask_reg, w->v, w2.v);
 
-  /* Added both w and w2 together to form complete result. */
-  w->v = vec_add(w->v, w2.v);
 #else
-#error
+#error "Vectorisation not supported for this kernel!!!"
 #endif
 
   /* Return everything */
@@ -741,12 +733,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
 
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector dw_dx2;
-  vector mask1, mask2;
+  mask_t mask_reg;
 
   /* Form a mask for each part of the kernel. */
-  mask1.v = vec_cmp_lt(x.v, cond.v);  /* 0 < x < 0.5 */
-  mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
-  ;
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
 
   /* Work out w for both regions of the kernel and combine the results together
    * using masks. */
@@ -760,15 +752,106 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
   dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
 
   /* Mask out unneeded values. */
-  dw_dx->v = vec_and(dw_dx->v, mask1.v);
-  dw_dx2.v = vec_and(dw_dx2.v, mask2.v);
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
+
+#else
+#error "Vectorisation not supported for this kernel!!!"
+#endif
+
+  /* Mask out result for particles that lie outside of the kernel function. */
+  mask_t mask;
+  vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */
+
+  dw_dx->v = vec_and_mask(dw_dx->v, mask);
+
+  /* Return everything */
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
+}
+
+/**
+ * @brief Computes the kernel function derivative for two particles
+ * using interleaved vectors.
+ *
+ * Return 0 if $u > \\gamma = H/h$
+ *
+ * @param u The ratio of the distance to the smoothing length $u = x/h$.
+ * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
+ * @param u_2 The ratio of the distance to the smoothing length $u = x/h$ for
+ * second particle.
+ * @param dw_dx_2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for
+ * second particle.
+ */
+__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
+    vector *u, vector *dw_dx, vector *u_2, vector *dw_dx_2) {
+
+  /* Go to the range [0,1[ from [0,H[ */
+  vector x, x_2;
+  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
+  x_2.v = vec_mul(u_2->v, kernel_gamma_inv_vec.v);
+
+#ifdef WENDLAND_C2_KERNEL
+  /* Init the iteration for Horner's scheme. */
+  dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
+  dw_dx_2->v =
+      vec_fma(wendland_dwdx_const_c0.v, x_2.v, wendland_dwdx_const_c1.v);
+
+  /* Calculate the polynomial interleaving vector operations */
+  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
+  dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c2.v);
+
+  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
+  dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c3.v);
+
+  dw_dx->v = vec_mul(dw_dx->v, x.v);
+  dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v);
+
+#elif defined(CUBIC_SPLINE_KERNEL)
+  vector dw_dx2, dw_dx2_2;
+  mask_t mask_reg;
+  mask_t mask_reg_v2;
+
+  /* Form a mask for one part of the kernel. */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v));      /* 0.5 < x < 1 */
+  vec_create_mask(mask_reg_v2, vec_cmp_gte(x_2.v, cond.v)); /* 0.5 < x < 1 */
+
+  /* Work out w for both regions of the kernel and combine the results together
+   * using masks. */
+
+  /* Init the iteration for Horner's scheme. */
+  dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
+  dw_dx_2->v = vec_fma(cubic_1_dwdx_const_c0.v, x_2.v, cubic_1_dwdx_const_c1.v);
+  dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);
+  dw_dx2_2.v = vec_fma(cubic_2_dwdx_const_c0.v, x_2.v, cubic_2_dwdx_const_c1.v);
+
+  /* Calculate the polynomial interleaving vector operations. */
+  dw_dx->v = vec_mul(dw_dx->v, x.v);       /* cubic_1_dwdx_const_c2 is zero. */
+  dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v); /* cubic_1_dwdx_const_c2 is zero. */
+  dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
+  dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v);
+
+  /* Mask out unneeded values. */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
+  dw_dx_2->v = vec_blend(mask_reg_v2, dw_dx_2->v, dw_dx2_2.v);
 
-  /* Added both dwdx and dwdx2 together to form complete result. */
-  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
 #else
-#error
+#error "Vectorisation not supported for this kernel!!!"
 #endif
 
+  /* Mask out result for particles that lie outside of the kernel function. */
+  mask_t mask, mask_2;
+  vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f)));     /* x < 1 */
+  vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f))); /* x < 1 */
+
+  dw_dx->v = vec_and_mask(dw_dx->v, mask);
+  dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_2);
+
   /* Return everything */
   dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
                                        kernel_gamma_inv_dim_plus_one_vec.v));
diff --git a/src/parser.c b/src/parser.c
index 641e2fdac6c286e0c1e1a1c07f4badcb591112c2..0b608b29263342240af68fd99d2fdd3241e2a1e6 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -133,8 +133,8 @@ void parser_set_param(struct swift_params *params, const char *namevalue) {
   int updated = 0;
   for (int i = 0; i < params->paramCount; i++) {
     if (strcmp(name, params->data[i].name) == 0) {
-      message("Using the value '%s' instead of '%s' for the parameter '%s'",
-              value, params->data[i].value, params->data[i].name);
+      message("Value of '%s' changed from '%s' to '%s'", params->data[i].name,
+              params->data[i].value, value);
       strcpy(params->data[i].value, value);
       updated = 1;
     }
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index 47ebba61d9a77486448ba2cdb25c9c0a448b9bac..906e9ad35572b643a5c2126f3dc25f67532e30a4 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -1037,9 +1037,9 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
       struct part *restrict pj = &parts_j[sort_j[pjd].i];
       if (!part_is_active(pj, e)) continue;
       const float hj = pj->h;
-      const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
-      if (dj > di_max) continue;
-
+      const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift;
+      if (dj - rshift > di_max) continue;
+      
       double pjx[3];
       for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
       const float hjg2 = hj * hj * kernel_gamma2;
@@ -1449,9 +1449,9 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     /* Get a hold of the jth part in cj. */
     struct part *restrict pj = &parts_j[sort_j[pjd].i];
     const float hj = pj->h;
-    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
-    if (dj > di_max) continue;
-
+    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift;
+    if (dj - rshift > di_max) continue;
+    
     double pjx[3];
     for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
     const float hjg2 = hj * hj * kernel_gamma2;
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index 5b22a67bbf757e09f144ba3accc2ef5adfb706f0..1557c1ec84863c3518f8671cdf0e9ca44732e94e 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -63,10 +63,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
     vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz,
     int *icount_align) {
 
-#ifdef HAVE_AVX512_F
-  KNL_MASK_16 knl_mask, knl_mask2;
-#endif
-  vector int_mask, int_mask2;
+  mask_t int_mask, int_mask2;
 
   /* Work out the number of remainder interactions and pad secondary cache. */
   *icount_align = icount;
@@ -75,16 +72,10 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
     int pad = (NUM_VEC_PROC * VEC_SIZE) - rem;
     *icount_align += pad;
 
-/* Initialise masks to true. */
-#ifdef HAVE_AVX512_F
-    knl_mask = 0xFFFF;
-    knl_mask2 = 0xFFFF;
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#else
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#endif
+    /* Initialise masks to true. */
+    vec_init_mask_true(int_mask);
+    vec_init_mask_true(int_mask2);
+
     /* Pad secondary cache so that there are no contributions in the interaction
      * function. */
     for (int i = icount; i < *icount_align; i++) {
@@ -100,19 +91,10 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
 
     /* Zero parts of mask that represent the padded values.*/
     if (pad < VEC_SIZE) {
-#ifdef HAVE_AVX512_F
-      knl_mask2 = knl_mask2 >> pad;
-#else
-      for (int i = VEC_SIZE - pad; i < VEC_SIZE; i++) int_mask2.i[i] = 0;
-#endif
+      vec_pad_mask(int_mask2, pad);
     } else {
-#ifdef HAVE_AVX512_F
-      knl_mask = knl_mask >> (VEC_SIZE - rem);
-      knl_mask2 = 0;
-#else
-      for (int i = rem; i < VEC_SIZE; i++) int_mask.i[i] = 0;
-      int_mask2.v = vec_setzero();
-#endif
+      vec_pad_mask(int_mask, VEC_SIZE - rem);
+      vec_zero_mask(int_mask2);
     }
 
     /* Perform remainder interaction and remove remainder from aligned
@@ -125,12 +107,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
         &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align],
         &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum,
         wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask,
-        int_mask2,
-#ifdef HAVE_AVX512_F
-        knl_mask, knl_mask2);
-#else
-        0, 0);
-#endif
+        int_mask2, 1);
   }
 }
 
@@ -144,10 +121,6 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
  * @param v_dx #vector of the x separation between two particles.
  * @param v_dy #vector of the y separation between two particles.
  * @param v_dz #vector of the z separation between two particles.
- * @param v_mj #vector of the mass of particle pj.
- * @param v_vjx #vector of x velocity of pj.
- * @param v_vjy #vector of y velocity of pj.
- * @param v_vjz #vector of z velocity of pj.
  * @param cell_cache #cache of all particles in the cell.
  * @param int_cache (return) secondary #cache of interactions between two
  * particles.
@@ -174,44 +147,33 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
  */
 __attribute__((always_inline)) INLINE static void storeInteractions(
     const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
-    vector *v_dz, vector *v_mj, vector *v_vjx, vector *v_vjy, vector *v_vjz,
-    const struct cache *const cell_cache, struct c2_cache *const int_cache,
-    int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum,
-    vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,
-    vector *curlvySum, vector *curlvzSum, vector v_hi_inv, vector v_vix,
-    vector v_viy, vector v_viz) {
+    vector *v_dz, const struct cache *const cell_cache,
+    struct c2_cache *const int_cache, int *icount, vector *rhoSum,
+    vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
+    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
+    vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) {
 
 /* Left-pack values needed into the secondary cache using the interaction mask.
  */
 #if defined(HAVE_AVX2) || defined(HAVE_AVX512_F)
-  int pack = 0;
-
-#ifdef HAVE_AVX512_F
-  pack += __builtin_popcount(mask);
-  VEC_LEFT_PACK(v_r2->v, mask, &int_cache->r2q[*icount]);
-  VEC_LEFT_PACK(v_dx->v, mask, &int_cache->dxq[*icount]);
-  VEC_LEFT_PACK(v_dy->v, mask, &int_cache->dyq[*icount]);
-  VEC_LEFT_PACK(v_dz->v, mask, &int_cache->dzq[*icount]);
-  VEC_LEFT_PACK(v_mj->v, mask, &int_cache->mq[*icount]);
-  VEC_LEFT_PACK(v_vjx->v, mask, &int_cache->vxq[*icount]);
-  VEC_LEFT_PACK(v_vjy->v, mask, &int_cache->vyq[*icount]);
-  VEC_LEFT_PACK(v_vjz->v, mask, &int_cache->vzq[*icount]);
-#else
-  vector v_mask;
-  VEC_FORM_PACKED_MASK(mask, v_mask.m, pack);
-
-  VEC_LEFT_PACK(v_r2->v, v_mask.m, &int_cache->r2q[*icount]);
-  VEC_LEFT_PACK(v_dx->v, v_mask.m, &int_cache->dxq[*icount]);
-  VEC_LEFT_PACK(v_dy->v, v_mask.m, &int_cache->dyq[*icount]);
-  VEC_LEFT_PACK(v_dz->v, v_mask.m, &int_cache->dzq[*icount]);
-  VEC_LEFT_PACK(v_mj->v, v_mask.m, &int_cache->mq[*icount]);
-  VEC_LEFT_PACK(v_vjx->v, v_mask.m, &int_cache->vxq[*icount]);
-  VEC_LEFT_PACK(v_vjy->v, v_mask.m, &int_cache->vyq[*icount]);
-  VEC_LEFT_PACK(v_vjz->v, v_mask.m, &int_cache->vzq[*icount]);
-
-#endif /* HAVE_AVX512_F */
-
-  (*icount) += pack;
+  mask_t packed_mask;
+  VEC_FORM_PACKED_MASK(mask, packed_mask);
+
+  VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]);
+  VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]);
+  VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]);
+  VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]);
+  VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask,
+                &int_cache->mq[*icount]);
+  VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask,
+                &int_cache->vxq[*icount]);
+  VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask,
+                &int_cache->vyq[*icount]);
+  VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask,
+                &int_cache->vzq[*icount]);
+
+  /* Increment interaction count by number of bits set in mask. */
+  (*icount) += __builtin_popcount(mask);
 #else
   /* Quicker to do it serially in AVX rather than use intrinsics. */
   for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
@@ -242,9 +204,9 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
                         wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum,
                         v_hi_inv, v_vix, v_viy, v_viz, &icount_align);
 
-    vector int_mask, int_mask2;
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
+    mask_t int_mask, int_mask2;
+    vec_init_mask_true(int_mask);
+    vec_init_mask_true(int_mask2);
 
     /* Perform interactions. */
     for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
@@ -253,7 +215,7 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
           &int_cache->dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
           &int_cache->vxq[pjd], &int_cache->vyq[pjd], &int_cache->vzq[pjd],
           &int_cache->mq[pjd], rhoSum, rho_dhSum, wcountSum, wcount_dhSum,
-          div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0, 0);
+          div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0);
     }
 
     /* Reset interaction count. */
@@ -262,7 +224,8 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
 }
 
 /**
- * @brief Populates the arrays max_di and max_dj with the maximum distances of
+ * @brief Populates the arrays max_index_i and max_index_j with the maximum
+ * indices of
  * particles into their neighbouring cells. Also finds the first pi that
  * interacts with any particle in cj and the last pj that interacts with any
  * particle in ci.
@@ -277,77 +240,121 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
  * @param hj_max Maximal smoothing length in cell cj
  * @param di_max Maximal position on the axis that can interact in cell ci
  * @param dj_min Minimal position on the axis that can interact in cell ci
- * @param max_di array to hold the maximum distances of pi particles into cell
+ * @param max_index_i array to hold the maximum distances of pi particles into
+ * cell
  * cj
- * @param max_dj array to hold the maximum distances of pj particles into cell
+ * @param max_index_j array to hold the maximum distances of pj particles into
+ * cell
  * cj
  * @param init_pi first pi to interact with a pj particle
  * @param init_pj last pj to interact with a pi particle
  * @param e The #engine.
  */
-__attribute__((always_inline)) INLINE static void populate_max_d_no_cache(
+__attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
     const struct cell *ci, const struct cell *cj,
     const struct entry *restrict sort_i, const struct entry *restrict sort_j,
     const float dx_max, const float rshift, const double hi_max,
     const double hj_max, const double di_max, const double dj_min,
-    float *max_di, float *max_dj, int *init_pi, int *init_pj,
+    int *max_index_i, int *max_index_j, int *init_pi, int *init_pj,
     const struct engine *e) {
 
   const struct part *restrict parts_i = ci->parts;
   const struct part *restrict parts_j = cj->parts;
 
   int first_pi = 0, last_pj = cj->count - 1;
+  int temp;
+
+  /* Find the leftmost active particle in cell i that interacts with any
+   * particle in cell j. */
+  first_pi = ci->count;
+  int active_id = first_pi - 1;
+  while (first_pi > 0 && sort_i[first_pi - 1].d + dx_max + hi_max > dj_min) {
+    first_pi--;
+    /* Store the index of the particle if it is active. */
+    if (part_is_active(&parts_i[sort_i[first_pi].i], e)) active_id = first_pi;
+  }
 
-  /* Find the first active particle in ci to interact with any particle in cj.
-   */
-  /* Populate max_di with distances. */
-  int active_id = ci->count - 1;
-  for (int k = ci->count - 1; k >= 0; k--) {
-    const struct part *pi = &parts_i[sort_i[k].i];
-    const float d = sort_i[k].d + dx_max;
-
-    // max_di[k] = d + h * kernel_gamma - rshift;
-    max_di[k] = d + hi_max;
-
-    /* If the particle is out of range set the index to
-     * the last active particle within range. */
-    if (d + hi_max < dj_min) {
-      first_pi = active_id;
-      break;
-    } else {
-      if (part_is_active(pi, e)) active_id = k;
+  /* Set the first active pi in range of any particle in cell j. */
+  first_pi = active_id;
+
+  /* Find the maximum index into cell j for each particle in range in cell i. */
+  if (first_pi < ci->count) {
+
+    /* Start from the first particle in cell j. */
+    temp = 0;
+
+    const struct part *pi = &parts_i[sort_i[first_pi].i];
+
+    /* Loop through particles in cell j until they are not in range of pi. */
+    while (temp <= cj->count &&
+           (sort_i[first_pi].d + (pi->h * kernel_gamma + dx_max - rshift) >
+            sort_j[temp].d))
+      temp++;
+
+    max_index_i[first_pi] = temp;
+
+    /* Populate max_index_i for remaining particles that are within range. */
+    for (int i = first_pi + 1; i < ci->count; i++) {
+      temp = max_index_i[i - 1];
+      pi = &parts_i[sort_i[i].i];
+
+      while (temp <= cj->count &&
+             (sort_i[i].d + (pi->h * kernel_gamma + dx_max - rshift) >
+              sort_j[temp].d))
+        temp++;
+
+      max_index_i[i] = temp;
     }
+  } else {
+    /* Make sure that max index is set to first particle in cj.*/
+    max_index_i[ci->count - 1] = 0;
   }
 
-  /* Find the maximum distance of pi particles into cj.*/
-  for (int k = first_pi + 1; k < ci->count; k++) {
-    max_di[k] = fmaxf(max_di[k - 1], max_di[k]);
+  /* Find the rightmost active particle in cell j that interacts with any
+   * particle in cell i. */
+  last_pj = -1;
+  active_id = last_pj;
+  while (last_pj < cj->count &&
+         sort_j[last_pj + 1].d - hj_max - dx_max < di_max) {
+    last_pj++;
+    /* Store the index of the particle if it is active. */
+    if (part_is_active(&parts_j[sort_j[last_pj].i], e)) active_id = last_pj;
   }
 
-  /* Find the last particle in cj to interact with any particle in ci. */
-  /* Populate max_dj with distances. */
-  active_id = 0;
-  for (int k = 0; k < cj->count; k++) {
-    const struct part *pj = &parts_j[sort_j[k].i];
-    const float d = sort_j[k].d - dx_max;
-
-    /*TODO: don't think rshift should be taken off here, waiting on Pedro. */
-    // max_dj[k] = d - h * kernel_gamma - rshift;
-    max_dj[k] = d - hj_max;
-
-    /* If the particle is out of range set the index to
-     * the last active particle within range. */
-    if (d - hj_max > di_max) {
-      last_pj = active_id;
-      break;
-    } else {
-      if (part_is_active(pj, e)) active_id = k;
-    }
-  }
+  /* Set the last active pj in range of any particle in cell i. */
+  last_pj = active_id;
+
+  /* Find the maximum index into cell i for each particle in range in cell j. */
+  if (last_pj > 0) {
+
+    /* Start from the last particle in cell i. */
+    temp = ci->count - 1;
+
+    const struct part *pj = &parts_j[sort_j[last_pj].i];
+
+    /* Loop through particles in cell i until they are not in range of pj. */
+    while (temp > 0 &&
+           sort_j[last_pj].d - dx_max - (pj->h * kernel_gamma) <
+               sort_i[temp].d - rshift)
+      temp--;
 
-  /* Find the maximum distance of pj particles into ci.*/
-  for (int k = 1; k <= last_pj; k++) {
-    max_dj[k] = fmaxf(max_dj[k - 1], max_dj[k]);
+    max_index_j[last_pj] = temp;
+
+    /* Populate max_index_j for remaining particles that are within range. */
+    for (int i = last_pj - 1; i >= 0; i--) {
+      temp = max_index_j[i + 1];
+      pj = &parts_j[sort_j[i].i];
+
+      while (temp > 0 &&
+             sort_j[i].d - dx_max - (pj->h * kernel_gamma) <
+                 sort_i[temp].d - rshift)
+        temp--;
+
+      max_index_j[i] = temp;
+    }
+  } else {
+    /* Make sure that max index is set to last particle in ci.*/
+    max_index_j[0] = ci->count - 1;
   }
 
   *init_pi = first_pi;
@@ -367,7 +374,6 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
 
 #ifdef WITH_VECTORIZATION
   const struct engine *e = r->e;
-  int doi_mask;
   struct part *restrict pi;
   int count_align;
   int num_vec_proc = NUM_VEC_PROC;
@@ -459,9 +465,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
     }
 
     vector pjx, pjy, pjz;
-    vector pjvx, pjvy, pjvz, mj;
     vector pjx2, pjy2, pjz2;
-    vector pjvx2, pjvy2, pjvz2, mj2;
 
     /* Find all of particle pi's interacions and store needed values in the
      * secondary cache.*/
@@ -471,18 +475,10 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       pjx.v = vec_load(&cell_cache->x[pjd]);
       pjy.v = vec_load(&cell_cache->y[pjd]);
       pjz.v = vec_load(&cell_cache->z[pjd]);
-      pjvx.v = vec_load(&cell_cache->vx[pjd]);
-      pjvy.v = vec_load(&cell_cache->vy[pjd]);
-      pjvz.v = vec_load(&cell_cache->vz[pjd]);
-      mj.v = vec_load(&cell_cache->m[pjd]);
 
       pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]);
       pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]);
       pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
-      pjvx2.v = vec_load(&cell_cache->vx[pjd + VEC_SIZE]);
-      pjvy2.v = vec_load(&cell_cache->vy[pjd + VEC_SIZE]);
-      pjvz2.v = vec_load(&cell_cache->vz[pjd + VEC_SIZE]);
-      mj2.v = vec_load(&cell_cache->m[pjd + VEC_SIZE]);
 
       /* Compute the pairwise distance. */
       vector v_dx_tmp, v_dy_tmp, v_dz_tmp;
@@ -502,53 +498,47 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
       v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
 
-/* Form a mask from r2 < hig2 and r2 > 0.*/
-#ifdef HAVE_AVX512_F
-      // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check;
-      KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check;
-
-      doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
-      doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
-
-      doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
-      doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
-
-      doi_mask = doi_mask & doi_mask_check;
-      doi_mask2 = doi_mask2 & doi_mask2_check;
-
-#else
-      vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check;
-      int doi_mask2;
+      /* Form a mask from r2 < hig2 and r2 > 0.*/
+      mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2,
+          v_doi_mask2_self_check;
+      int doi_mask, doi_mask_self_check, doi_mask2, doi_mask2_self_check;
 
       /* Form r2 > 0 mask and r2 < hig2 mask. */
-      v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero());
-      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+      vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero()));
+      vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));
 
       /* Form r2 > 0 mask and r2 < hig2 mask. */
-      v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero());
-      v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+      vec_create_mask(v_doi_mask2_self_check,
+                      vec_cmp_gt(v_r2_2.v, vec_setzero()));
+      vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v));
 
-      /* Combine two masks and form integer mask. */
-      doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v));
-      doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v));
-#endif /* HAVE_AVX512_F */
+      /* Form integer masks. */
+      doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check);
+      doi_mask = vec_form_int_mask(v_doi_mask);
+
+      doi_mask2_self_check = vec_form_int_mask(v_doi_mask2_self_check);
+      doi_mask2 = vec_form_int_mask(v_doi_mask2);
+
+      /* Combine the two masks. */
+      doi_mask = doi_mask & doi_mask_self_check;
+      doi_mask2 = doi_mask2 & doi_mask2_self_check;
 
       /* If there are any interactions left pack interaction values into c2
        * cache. */
       if (doi_mask) {
         storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp,
-                          &mj, &pjvx, &pjvy, &pjvz, cell_cache, &int_cache,
+                          cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum,
+                          &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
+                          &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy,
+                          v_viz);
+      }
+      if (doi_mask2) {
+        storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2,
+                          &v_dy_tmp2, &v_dz_tmp2, cell_cache, &int_cache,
                           &icount, &rhoSum, &rho_dhSum, &wcountSum,
                           &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
                           &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
       }
-      if (doi_mask2) {
-        storeInteractions(
-            doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, &v_dy_tmp2,
-            &v_dz_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2, cell_cache, &int_cache,
-            &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
-            &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
-      }
     }
 
     /* Perform padded vector remainder interactions if any are present. */
@@ -559,16 +549,9 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
 
     /* Initialise masks to true in case remainder interactions have been
      * performed. */
-    vector int_mask, int_mask2;
-#ifdef HAVE_AVX512_F
-    KNL_MASK_16 knl_mask = 0xFFFF;
-    KNL_MASK_16 knl_mask2 = 0xFFFF;
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#else
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#endif
+    mask_t int_mask, int_mask2;
+    vec_init_mask_true(int_mask);
+    vec_init_mask_true(int_mask2);
 
     /* Perform interaction with 2 vectors. */
     for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) {
@@ -578,11 +561,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
           &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
           &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
           &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
-#ifdef HAVE_AVX512_F
-          knl_mask, knl_mask2);
-#else
-          0, 0);
-#endif
+          0);
     }
 
     /* Perform horizontal adds on vector sums and store result in particle pi.
@@ -714,38 +693,19 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   }
 
   int first_pi, last_pj;
-  float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-  float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+  int *max_index_i __attribute__((aligned(sizeof(int) * VEC_SIZE)));
+  int *max_index_j __attribute__((aligned(sizeof(int) * VEC_SIZE)));
 
-  max_di = r->ci_cache.max_d;
-  max_dj = r->cj_cache.max_d;
+  max_index_i = r->ci_cache.max_index;
+  max_index_j = r->cj_cache.max_index;
 
-  /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
+  /* Find particles maximum index into cj, max_index_i[] and ci, max_index_j[].
+   */
   /* Also find the first pi that interacts with any particle in cj and the last
    * pj that interacts with any particle in ci. */
-  populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max,
-                          hj_max, di_max, dj_min, max_di, max_dj, &first_pi,
-                          &last_pj, e);
-
-  /* Find the maximum index into cj that is required by a particle in ci. */
-  /* Find the maximum index into ci that is required by a particle in cj. */
-  float di, dj;
-  int max_ind_j = count_j - 1;
-  int max_ind_i = 0;
-
-  dj = sort_j[max_ind_j].d;
-  while (max_ind_j > 0 && max_di[count_i - 1] < dj) {
-    max_ind_j--;
-
-    dj = sort_j[max_ind_j].d;
-  }
-
-  di = sort_i[max_ind_i].d;
-  while (max_ind_i < count_i - 1 && max_dj[0] > di) {
-    max_ind_i++;
-
-    di = sort_i[max_ind_i].d;
-  }
+  populate_max_index_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max,
+                              hj_max, di_max, dj_min, max_index_i, max_index_j,
+                              &first_pi, &last_pj, e);
 
   /* Limits of the outer loops. */
   int first_pi_loop = first_pi;
@@ -753,8 +713,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
   /* Take the max/min of both values calculated to work out how many particles
    * to read into the cache. */
-  last_pj = max(last_pj, max_ind_j);
-  first_pi = min(first_pi, max_ind_i);
+  last_pj = max(last_pj, max_index_i[count_i - 1]);
+  first_pi = min(first_pi, max_index_j[0]);
 
   /* Read the needed particles into the two caches. */
   int first_pi_align = first_pi;
@@ -769,7 +729,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   if (cell_is_active(ci, e)) {
 
     /* Loop over the parts in ci until nothing is within range in cj. */
-    for (int pid = count_i - 1; pid >= first_pi_loop && max_ind_j >= 0; pid--) {
+    for (int pid = count_i - 1; pid >= first_pi_loop; pid--) {
 
       /* Get a hold of the ith part in ci. */
       struct part *restrict pi = &parts_i[sort_i[pid].i];
@@ -785,13 +745,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       if (di_test < dj_min) continue;
 
       /* Determine the exit iteration of the interaction loop. */
-      dj = sort_j[max_ind_j].d;
-      while (max_ind_j > 0 && max_di[pid] < dj) {
-        max_ind_j--;
-
-        dj = sort_j[max_ind_j].d;
-      }
-      int exit_iteration = max_ind_j + 1;
+      int exit_iteration = max_index_i[pid];
 
       const float hig2 = hi * hi * kernel_gamma2;
 
@@ -866,14 +820,14 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
         v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
         v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
 
-        vector v_doi_mask;
+        mask_t v_doi_mask;
         int doi_mask;
 
         /* Form r2 < hig2 mask. */
-        v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+        vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));
 
         /* Form integer mask. */
-        doi_mask = vec_cmp_result(v_doi_mask.v);
+        doi_mask = vec_form_int_mask(v_doi_mask);
 
         /* If there are any interactions perform them. */
         if (doi_mask)
@@ -882,12 +836,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
               &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx],
               &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum,
               &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-              &curlvySum, &curlvzSum, v_doi_mask,
-#ifdef HAVE_AVX512_F
-              knl_mask);
-#else
-              0);
-#endif
+              &curlvySum, &curlvzSum, v_doi_mask);
 
       } /* loop over the parts in cj. */
 
@@ -908,7 +857,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   if (cell_is_active(cj, e)) {
 
     /* Loop over the parts in cj until nothing is within range in ci. */
-    for (int pjd = 0; pjd <= last_pj_loop && max_ind_i < count_i; pjd++) {
+    for (int pjd = 0; pjd <= last_pj_loop; pjd++) {
 
       /* Get a hold of the jth part in cj. */
       struct part *restrict pj = &parts_j[sort_j[pjd].i];
@@ -925,13 +874,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       if (dj_test > di_max) continue;
 
       /* Determine the exit iteration of the interaction loop. */
-      di = sort_i[max_ind_i].d;
-      while (max_ind_i < count_i - 1 && max_dj[pjd] > di) {
-        max_ind_i++;
-
-        di = sort_i[max_ind_i].d;
-      }
-      int exit_iteration = max_ind_i;
+      int exit_iteration = max_index_j[pjd];
 
       const float hjg2 = hj * hj * kernel_gamma2;
 
@@ -1005,14 +948,14 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
         v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
         v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
 
-        vector v_doj_mask;
+        mask_t v_doj_mask;
         int doj_mask;
 
         /* Form r2 < hig2 mask. */
-        v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v);
+        vec_create_mask(v_doj_mask, vec_cmp_lt(v_r2.v, v_hjg2.v));
 
         /* Form integer mask. */
-        doj_mask = vec_cmp_result(v_doj_mask.v);
+        doj_mask = vec_form_int_mask(v_doj_mask);
 
         /* If there are any interactions perform them. */
         if (doj_mask)
@@ -1021,12 +964,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
               &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx],
               &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum,
               &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-              &curlvySum, &curlvzSum, v_doj_mask,
-#ifdef HAVE_AVX512_F
-              knl_mask);
-#else
-              0);
-#endif
+              &curlvySum, &curlvzSum, v_doj_mask);
 
       } /* loop over the parts in ci. */
 
diff --git a/src/scheduler.h b/src/scheduler.h
index cdb3a28f1a3482e934480b0c0ccd18f18b66d437..ac654580b2af2ffb506dc3fd9f0b988b89effbd0 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -52,7 +52,7 @@
 
 /* Flags . */
 #define scheduler_flag_none 0
-#define scheduler_flag_steal 1
+#define scheduler_flag_steal (1 << 1)
 
 /* Data of a scheduler. */
 struct scheduler {
diff --git a/src/tools.c b/src/tools.c
index 3f866710c387a858c23ed052be1ad463222601d5..3ee55db3d5f5348699372d2620b6d15af38b23d0 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -32,6 +32,7 @@
 #include "tools.h"
 
 /* Local includes. */
+#include "active.h"
 #include "cell.h"
 #include "error.h"
 #include "gravity.h"
@@ -183,6 +184,7 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
   float r2, hi, hj, hig2, hjg2, dx[3];
   struct part *pi, *pj;
   const double dim[3] = {r->e->s->dim[0], r->e->s->dim[1], r->e->s->dim[2]};
+  const struct engine *e = r->e;
 
   /* Implements a double-for loop and checks every interaction */
   for (int i = 0; i < ci->count; ++i) {
@@ -191,6 +193,9 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
     hi = pi->h;
     hig2 = hi * hi * kernel_gamma2;
 
+    /* Skip inactive particles. */
+    if (!part_is_active(pi, e)) continue;
+
     for (int j = 0; j < cj->count; ++j) {
 
       pj = &cj->parts[j];
@@ -219,6 +224,9 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
     hj = pj->h;
     hjg2 = hj * hj * kernel_gamma2;
 
+    /* Skip inactive particles. */
+    if (!part_is_active(pj, e)) continue;
+
     for (int i = 0; i < ci->count; ++i) {
 
       pi = &ci->parts[i];
@@ -311,6 +319,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) {
 void self_all_density(struct runner *r, struct cell *ci) {
   float r2, hi, hj, hig2, hjg2, dxi[3];  //, dxj[3];
   struct part *pi, *pj;
+  const struct engine *e = r->e;
 
   /* Implements a double-for loop and checks every interaction */
   for (int i = 0; i < ci->count; ++i) {
@@ -335,14 +344,14 @@ void self_all_density(struct runner *r, struct cell *ci) {
       }
 
       /* Hit or miss? */
-      if (r2 < hig2) {
+      if (r2 < hig2 && part_is_active(pi, e)) {
 
         /* Interact */
         runner_iact_nonsym_density(r2, dxi, hi, hj, pi, pj);
       }
 
       /* Hit or miss? */
-      if (r2 < hjg2) {
+      if (r2 < hjg2 && part_is_active(pj, e)) {
 
         dxi[0] = -dxi[0];
         dxi[1] = -dxi[1];
diff --git a/src/vector.h b/src/vector.h
index 6d5389039c028e4307d4e559dd6567cce6c25392..6a7c6837989025785c1f9134004f2ebcc226a205 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -64,9 +64,13 @@
 #define vec_dbl_set(a, b, c, d, e, f, g, h) \
   _mm512_set_pd(h, g, f, e, d, c, b, a)
 #define vec_add(a, b) _mm512_add_ps(a, b)
+#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a)
 #define vec_sub(a, b) _mm512_sub_ps(a, b)
+#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b)
 #define vec_mul(a, b) _mm512_mul_ps(a, b)
+#define vec_div(a, b) _mm512_div_ps(a, b)
 #define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
+#define vec_fnma(a, b, c) _mm512_fnmadd_ps(a, b, c)
 #define vec_sqrt(a) _mm512_sqrt_ps(a)
 #define vec_rcp(a) _mm512_rcp14_ps(a)
 #define vec_rsqrt(a) _mm512_rsqrt14_ps(a)
@@ -79,7 +83,16 @@
 #define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
 #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
 #define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ)
+#define vec_cmp_result(a) ({ a; })
+#define vec_form_int_mask(a) ({ a; })
 #define vec_and(a, b) _mm512_and_ps(a, b)
+#define vec_mask_and(a, b) _mm512_kand(a, b)
+#define vec_and_mask(a, mask) _mm512_maskz_mov_ps(mask, a)
+#define vec_init_mask_true(mask) ({ mask = 0xFFFF; })
+#define vec_zero_mask(mask) ({ mask = 0; })
+#define vec_create_mask(mask, cond) ({ mask = cond; })
+#define vec_pad_mask(mask, pad) ({ mask = mask >> (pad); })
+#define vec_blend(mask, a, b) _mm512_mask_blend_ps(mask, a, b)
 #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
 #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
@@ -120,10 +133,13 @@
     for (int i = 0; i < VEC_SIZE; i++) b += a.f[i]; \
   }
 #endif
-/* Calculates the number of set bits in the mask and adds the result to an int.
- */
-#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
-  pack += __builtin_popcount(mask);
+
+/* Do nothing in the case of AVX-512 as there are already
+ * instructions for left-packing.*/
+#define VEC_FORM_PACKED_MASK(mask, packed_mask) packed_mask = mask
+
+/* Finds the horizontal maximum of vector b and returns a float. */
+#define VEC_HMAX(a, b) b = _mm512_reduce_max_ps(a.v)
 
 /* Performs a left-pack on a vector based upon a mask and returns the result. */
 #define VEC_LEFT_PACK(a, mask, result) \
@@ -146,8 +162,11 @@
 #define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
 #define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
 #define vec_add(a, b) _mm256_add_ps(a, b)
+#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b, mask.v))
 #define vec_sub(a, b) _mm256_sub_ps(a, b)
+#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b, mask.v))
 #define vec_mul(a, b) _mm256_mul_ps(a, b)
+#define vec_div(a, b) _mm256_div_ps(a, b)
 #define vec_sqrt(a) _mm256_sqrt_ps(a)
 #define vec_rcp(a) _mm256_rcp_ps(a)
 #define vec_rsqrt(a) _mm256_rsqrt_ps(a)
@@ -161,7 +180,16 @@
 #define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
 #define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
 #define vec_cmp_result(a) _mm256_movemask_ps(a)
+#define vec_form_int_mask(a) _mm256_movemask_ps(a.v)
 #define vec_and(a, b) _mm256_and_ps(a, b)
+#define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v)
+#define vec_and_mask(a, mask) _mm256_and_ps(a, mask.v)
+#define vec_init_mask_true(mask) mask.m = vec_setint1(0xFFFFFFFF)
+#define vec_create_mask(mask, cond) mask.v = cond
+#define vec_zero_mask(mask) mask.v = vec_setzero()
+#define vec_pad_mask(mask, pad) \
+  for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0
+#define vec_blend(mask, a, b) _mm256_blendv_ps(a, b, mask.v)
 #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
 #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
@@ -187,6 +215,13 @@
   a.v = _mm256_hadd_ps(a.v, a.v); \
   b += a.f[0] + a.f[4];
 
+/* Performs a horizontal maximum on the vector and takes the maximum of the
+ * result with a float, b. */
+#define VEC_HMAX(a, b)                                     \
+  {                                                        \
+    for (int k = 0; k < VEC_SIZE; k++) b = max(b, a.f[k]); \
+  }
+
 /* Returns the lower 128-bits of the 256-bit vector. */
 #define VEC_GET_LOW(a) _mm256_castps256_ps128(a)
 
@@ -196,6 +231,7 @@
 /* Check if we have AVX2 intrinsics alongside AVX */
 #ifdef HAVE_AVX2
 #define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c)
+#define vec_fnma(a, b, c) _mm256_fnmadd_ps(a, b, c)
 
 /* Used in VEC_FORM_PACKED_MASK */
 #define identity_indices 0x0706050403020100
@@ -205,19 +241,18 @@
 /* Takes an integer mask and forms a left-packed integer vector
  * containing indices of the set bits in the integer mask.
  * Also returns the total number of bits set in the mask. */
-#define VEC_FORM_PACKED_MASK(mask, v_mask, pack)                               \
+#define VEC_FORM_PACKED_MASK(mask, packed_mask)                                \
   {                                                                            \
     unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101);         \
     expanded_mask *= 0xFF;                                                     \
     unsigned long wanted_indices = _pext_u64(identity_indices, expanded_mask); \
     __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);                       \
-    v_mask = _mm256_cvtepu8_epi32(bytevec);                                    \
-    pack += __builtin_popcount(mask);                                          \
+    packed_mask.m = _mm256_cvtepu8_epi32(bytevec);                             \
   }
 
 /* Performs a left-pack on a vector based upon a mask and returns the result. */
 #define VEC_LEFT_PACK(a, mask, result) \
-  vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask), result)
+  vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask.m), result)
 #endif /* HAVE_AVX2 */
 
 /* Create an FMA using vec_add and vec_mul if AVX2 is not present. */
@@ -225,6 +260,11 @@
 #define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
 #endif
 
+/* Create a negated FMA using vec_sub and vec_mul if AVX2 is not present. */
+#ifndef vec_fnma
+#define vec_fnma(a, b, c) vec_sub(c, vec_mul(a, b))
+#endif
+
 /* Form a packed mask without intrinsics if AVX2 is not present. */
 #ifndef VEC_FORM_PACKED_MASK
 
@@ -288,6 +328,7 @@
 #define vec_add(a, b) _mm_add_ps(a, b)
 #define vec_sub(a, b) _mm_sub_ps(a, b)
 #define vec_mul(a, b) _mm_mul_ps(a, b)
+#define vec_div(a, b) _mm_div_ps(a, b)
 #define vec_sqrt(a) _mm_sqrt_ps(a)
 #define vec_rcp(a) _mm_rcp_ps(a)
 #define vec_rsqrt(a) _mm_rsqrt_ps(a)
@@ -340,6 +381,13 @@ typedef union {
   int i[VEC_SIZE];
 } vector;
 
+/* Define the mask type depending on the instruction set used. */
+#ifdef HAVE_AVX512_F
+typedef __mmask16 mask_t;
+#else
+typedef vector mask_t;
+#endif
+
 /**
  * @brief Calculates the inverse ($1/x$) of a vector using intrinsics and a
  * Newton iteration to obtain the correct level of accuracy.
diff --git a/tests/Makefile.am b/tests/Makefile.am
index c4ed2512fc49276ad09dd1c4ef7a0f28207a1f77..bf4c205b5ab5000e246693c4d45772d16f67a5e5 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -21,16 +21,16 @@ AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(FFTW_LIBS
 
 # List of programs and scripts to run in the test suite
 TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetry \
-        testPair.sh testPairPerturbed.sh test27cells.sh test27cellsPerturbed.sh  \
+        testActivePair.sh test27cells.sh test27cellsPerturbed.sh  \
         testParser.sh testSPHStep test125cells.sh test125cellsPerturbed.sh testFFT \
         testAdiabaticIndex testRiemannExact testRiemannTRRS testRiemannHLLC \
-        testMatrixInversion testThreadpool testDump testLogger \
+        testMatrixInversion testThreadpool testDump testLogger testInteractions.sh \
         testVoronoi1D testVoronoi2D testVoronoi3D testGravityDerivatives \
 	testPeriodicBC.sh testPeriodicBCPerturbed.sh
 
 # List of test programs to compile
 check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \
-		 testSPHStep testPair test27cells test125cells testParser \
+		 testSPHStep testActivePair test27cells test125cells testParser \
                  testKernel testFFT testInteractions testMaths \
                  testSymmetry testThreadpool benchmarkInteractions \
                  testAdiabaticIndex testRiemannExact testRiemannTRRS \
@@ -56,7 +56,7 @@ testSPHStep_SOURCES = testSPHStep.c
 
 testSingle_SOURCES = testSingle.c
 
-testPair_SOURCES = testPair.c
+testActivePair_SOURCES = testActivePair.c
 
 test27cells_SOURCES = test27cells.c
 
@@ -99,10 +99,10 @@ testLogger_SOURCES = testLogger.c
 testGravityDerivatives_SOURCES = testGravityDerivatives.c
 
 # Files necessary for distribution
-EXTRA_DIST = testReading.sh makeInput.py testPair.sh testPairPerturbed.sh \
+EXTRA_DIST = testReading.sh makeInput.py testActivePair.sh \
 	     test27cells.sh test27cellsPerturbed.sh testParser.sh testPeriodicBC.sh \
 	     testPeriodicBCPerturbed.sh test125cells.sh test125cellsPerturbed.sh testParserInput.yaml \
 	     difffloat.py tolerance_125_normal.dat tolerance_125_perturbed.dat \
-             tolerance_27_normal.dat tolerance_27_perturbed.dat tolerance_27_perturbed_h.dat \
-	     tolerance_pair_normal.dat tolerance_pair_perturbed.dat \
+             tolerance_27_normal.dat tolerance_27_perturbed.dat tolerance_27_perturbed_h.dat tolerance_27_perturbed_h2.dat \
+	     tolerance_testInteractions.dat tolerance_pair_active.dat \
 	     fft_params.yml tolerance_periodic_BC_normal.dat tolerance_periodic_BC_perturbed.dat
diff --git a/tests/benchmarkInteractions.c b/tests/benchmarkInteractions.c
index a11862ebaa345623ca65cd7025b4f4e79769d0ac..5dc7a4ae85e62313a84d0c0be88b65a1c0bb59df 100644
--- a/tests/benchmarkInteractions.c
+++ b/tests/benchmarkInteractions.c
@@ -31,6 +31,7 @@
 #define IACT runner_iact_nonsym_density
 #define IACT_VEC runner_iact_nonsym_2_vec_density
 #define IACT_NAME "test_nonsym_density"
+#define NUM_VEC_PROC_INT 2
 #endif
 
 #ifdef SYM_DENSITY
@@ -53,8 +54,9 @@
 
 #ifndef IACT
 #define IACT runner_iact_nonsym_density
-#define IACT_VEC runner_iact_nonsym_2_vec_density
+#define IACT_VEC runner_iact_nonsym_1_vec_density
 #define IACT_NAME "test_nonsym_density"
+#define NUM_VEC_PROC_INT 1
 #endif
 
 /**
@@ -368,7 +370,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
 
 /* Perform vector interaction. */
 #ifdef WITH_VECTORIZATION
-    vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec, mask, mask2;
+    vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec;
     vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
         curlvySum, curlvzSum;
 
@@ -387,28 +389,35 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
     viz_vec.v = vec_load(&vizq[0]);
 
     hi_inv_vec = vec_reciprocal(hi_vec);
-    mask.m = vec_setint1(0xFFFFFFFF);
-    mask2.m = vec_setint1(0xFFFFFFFF);
 
-#ifdef HAVE_AVX512_F
-    KNL_MASK_16 knl_mask, knl_mask2;
-    knl_mask = 0xFFFF;
-    knl_mask2 = 0xFFFF;
+    mask_t mask;
+    vec_init_mask_true(mask);
+#if (NUM_VEC_PROC_INT == 2)
+    mask_t mask2;
+    vec_init_mask_true(mask2);
 #endif
-
     const ticks vec_tic = getticks();
 
-    for (size_t i = 0; i < count; i += 2 * VEC_SIZE) {
+    for (size_t i = 0; i < count; i += NUM_VEC_PROC_INT * VEC_SIZE) {
 
+/* Interleave two vectors for interaction. */
+#if (NUM_VEC_PROC_INT == 2)
       IACT_VEC(&(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec),
                (vix_vec), (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]),
                &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, &wcountSum,
                &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum,
-               mask, mask2,
-#ifdef HAVE_AVX512_F
-               knl_mask, knl_mask2);
-#else
-               0, 0);
+               mask, mask2, 0);
+#else /* Only use one vector for interaction. */
+      vector r2, dx, dy, dz;
+      r2.v = vec_load(&(r2q[i]));
+      dx.v = vec_load(&(dxq[i]));
+      dy.v = vec_load(&(dyq[i]));
+      dz.v = vec_load(&(dzq[i]));
+
+      IACT_VEC(&r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec),
+               (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]),
+               &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
+               &curlvxSum, &curlvySum, &curlvzSum, mask);
 #endif
     }
 
diff --git a/tests/test125cells.c b/tests/test125cells.c
index 1bddd5dd2d4933e3b2d6c802d1325f3b88ba6ba5..2b35f7b5d509a7f966d07d6da82616869f29ab66 100644
--- a/tests/test125cells.c
+++ b/tests/test125cells.c
@@ -440,6 +440,7 @@ void runner_doself2_force(struct runner *r, struct cell *ci);
 /* And go... */
 int main(int argc, char *argv[]) {
 
+  engine_pin();
   size_t runs = 0, particles = 0;
   double h = 1.23485, size = 1., rho = 2.5;
   double perturbation = 0.;
diff --git a/tests/test27cells.c b/tests/test27cells.c
index c4df8356eb6494cc34b91c6ab5038b3866b14056..7ba1eec9ad279f09f63021e332dac1cfd5cc1505 100644
--- a/tests/test27cells.c
+++ b/tests/test27cells.c
@@ -122,7 +122,7 @@ struct cell *make_cell(size_t n, double *offset, double size, double h,
             break;
         }
         if (h_pert)
-          part->h = size * h * random_uniform(1.f, 1.1f) / (float)n;
+          part->h = size * h * random_uniform(1.f, h_pert) / (float)n;
         else
           part->h = size * h / (float)n;
         h_max = fmaxf(h_max, part->h);
diff --git a/tests/test27cells.sh.in b/tests/test27cells.sh.in
index a313a594c10c80598852a92e7920ac846e15ba2d..059a7a208aa8e570ad5035fac16ffd201bf3dddd 100755
--- a/tests/test27cells.sh.in
+++ b/tests/test27cells.sh.in
@@ -56,4 +56,32 @@ do
     
 done
 
+# Test for particles with random smoothing lengths
+for v in {0..3}
+do
+    echo ""
+	
+    rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat
+
+    echo "Running ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.3"
+    ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.3
+
+    if [ -e brute_force_27_standard.dat ]
+    then
+	if python @srcdir@/difffloat.py brute_force_27_standard.dat swift_dopair_27_standard.dat @srcdir@/tolerance_27_perturbed_h2.dat 6
+	then
+	    echo "Accuracy test passed"
+	else
+	    echo "Accuracy test failed"
+	    exit 1
+	fi
+    else
+	echo "Error Missing test output file"
+	exit 1
+    fi
+
+    echo "------------"
+    
+done
+
 exit $?
diff --git a/tests/test27cellsPerturbed.sh.in b/tests/test27cellsPerturbed.sh.in
index 4adbe0b9ca6d8e1cfe7df861c2bf88bf35832983..f875504e541588377ca6e40fe55681ebec3466f6 100755
--- a/tests/test27cellsPerturbed.sh.in
+++ b/tests/test27cellsPerturbed.sh.in
@@ -56,4 +56,31 @@ do
 
 done
 
+# Test for particles with random smoothing lengths
+for v in {0..3}
+do
+    echo ""
+
+    rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat
+
+    echo "Running ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.3"
+    ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.3
+
+    if [ -e brute_force_27_perturbed.dat ]
+    then
+	if python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed_h2.dat 6
+	then
+	    echo "Accuracy test passed"
+	else
+	    echo "Accuracy test failed"
+	    exit 1
+	fi
+    else
+	echo "Error Missing test output file"
+	exit 1
+    fi
+
+    echo "------------"
+
+done
 exit $?
diff --git a/tests/testActivePair.c b/tests/testActivePair.c
new file mode 100644
index 0000000000000000000000000000000000000000..1e0111b4f0e480d0f66463b4c2264cdd89bd28c8
--- /dev/null
+++ b/tests/testActivePair.c
@@ -0,0 +1,510 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#include "../config.h"
+
+/* Some standard headers. */
+#include <fenv.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+/* Local headers. */
+#include "swift.h"
+
+/**
+ * @brief Constructs a cell and all of its particle in a valid state prior to
+ * a DOPAIR or DOSELF calcuation.
+ *
+ * @param n The cube root of the number of particles.
+ * @param offset The position of the cell offset from (0,0,0).
+ * @param size The cell size.
+ * @param h The smoothing length of the particles in units of the inter-particle
+ * separation.
+ * @param density The density of the fluid.
+ * @param partId The running counter of IDs.
+ * @param pert The perturbation to apply to the particles in the cell in units
+ * of the inter-particle separation.
+ * @param h_pert The perturbation to apply to the smoothing length.
+ * @param fraction_active The fraction of particles that should be active in the
+ * cell.
+ */
+struct cell *make_cell(size_t n, double *offset, double size, double h,
+                       double density, long long *partId, double pert,
+                       double h_pert, double fraction_active) {
+  const size_t count = n * n * n;
+  const double volume = size * size * size;
+  float h_max = 0.f;
+  struct cell *cell = malloc(sizeof(struct cell));
+  bzero(cell, sizeof(struct cell));
+
+  if (posix_memalign((void **)&cell->parts, part_align,
+                     count * sizeof(struct part)) != 0) {
+    error("couldn't allocate particles, no. of particles: %d", (int)count);
+  }
+  bzero(cell->parts, count * sizeof(struct part));
+
+  /* Construct the parts */
+  struct part *part = cell->parts;
+  for (size_t x = 0; x < n; ++x) {
+    for (size_t y = 0; y < n; ++y) {
+      for (size_t z = 0; z < n; ++z) {
+        part->x[0] =
+            offset[0] +
+            size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->x[1] =
+            offset[1] +
+            size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->x[2] =
+            offset[2] +
+            size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->v[0] = random_uniform(-0.05, 0.05);
+        part->v[1] = random_uniform(-0.05, 0.05);
+        part->v[2] = random_uniform(-0.05, 0.05);
+
+        if (h_pert)
+          part->h = size * h * random_uniform(1.f, h_pert) / (float)n;
+        else
+          part->h = size * h / (float)n;
+        h_max = fmaxf(h_max, part->h);
+        part->id = ++(*partId);
+
+#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
+        part->conserved.mass = density * volume / count;
+
+#ifdef SHADOWFAX_SPH
+        double anchor[3] = {0., 0., 0.};
+        double side[3] = {1., 1., 1.};
+        voronoi_cell_init(&part->cell, part->x, anchor, side);
+#endif
+
+#else
+        part->mass = density * volume / count;
+#endif
+
+#if defined(HOPKINS_PE_SPH)
+        part->entropy = 1.f;
+        part->entropy_one_over_gamma = 1.f;
+#endif
+        if (random_uniform(0, 1.f) < fraction_active)
+          part->time_bin = 1;
+        else
+          part->time_bin = num_time_bins + 1;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        part->ti_drift = 8;
+        part->ti_kick = 8;
+#endif
+
+        ++part;
+      }
+    }
+  }
+
+  /* Cell properties */
+  cell->split = 0;
+  cell->h_max = h_max;
+  cell->count = count;
+  cell->dx_max_part = 0.;
+  cell->dx_max_sort = 0.;
+  cell->width[0] = size;
+  cell->width[1] = size;
+  cell->width[2] = size;
+  cell->loc[0] = offset[0];
+  cell->loc[1] = offset[1];
+  cell->loc[2] = offset[2];
+
+  cell->ti_old_part = 8;
+  cell->ti_end_min = 8;
+  cell->ti_end_max = 8;
+
+  shuffle_particles(cell->parts, cell->count);
+
+  cell->sorted = 0;
+  for (int k = 0; k < 13; k++) cell->sort[k] = NULL;
+
+  return cell;
+}
+
+void clean_up(struct cell *ci) {
+  free(ci->parts);
+  for (int k = 0; k < 13; k++)
+    if (ci->sort[k] != NULL) free(ci->sort[k]);
+  free(ci);
+}
+
+/**
+ * @brief Initializes all particles field to be ready for a density calculation
+ */
+void zero_particle_fields(struct cell *c) {
+  for (int pid = 0; pid < c->count; pid++) {
+    hydro_init_part(&c->parts[pid], NULL);
+  }
+}
+
+/**
+ * @brief Ends the loop by adding the appropriate coefficients
+ */
+void end_calculation(struct cell *c) {
+  for (int pid = 0; pid < c->count; pid++) {
+    hydro_end_density(&c->parts[pid]);
+
+    /* Recover the common "Neighbour number" definition */
+    c->parts[pid].density.wcount *= pow_dimension(c->parts[pid].h);
+    c->parts[pid].density.wcount *= kernel_norm;
+  }
+}
+
+/**
+ * @brief Dump all the particles to a file
+ */
+void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
+  FILE *file = fopen(fileName, "a");
+
+  /* Write header */
+  fprintf(file, "# %4s %13s\n", "ID", "wcount");
+
+  fprintf(file, "# ci --------------------------------------------\n");
+
+  for (int pid = 0; pid < ci->count; pid++) {
+    fprintf(file, "%6llu %13e\n", ci->parts[pid].id,
+            ci->parts[pid].density.wcount);
+  }
+
+  fprintf(file, "# cj --------------------------------------------\n");
+
+  for (int pjd = 0; pjd < cj->count; pjd++) {
+    fprintf(file, "%6llu %13e\n", cj->parts[pjd].id,
+            cj->parts[pjd].density.wcount);
+  }
+
+  fclose(file);
+}
+
+/* Just a forward declaration... */
+void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
+void runner_doself1_density_vec(struct runner *r, struct cell *ci);
+void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
+                                   struct cell *cj);
+
+/**
+ * @brief Computes the pair interactions of two cells using SWIFT and a brute
+ * force implementation.
+ */
+void test_pair_interactions(struct runner *runner, struct cell **ci,
+                            struct cell **cj, char *swiftOutputFileName,
+                            char *bruteForceOutputFileName) {
+
+  runner_do_sort(runner, *ci, 0x1FFF, 0, 0);
+  runner_do_sort(runner, *cj, 0x1FFF, 0, 0);
+
+  /* Zero the fields */
+  zero_particle_fields(*ci);
+  zero_particle_fields(*cj);
+
+  /* Run the test */
+  runner_dopair1_branch_density(runner, *ci, *cj);
+
+  /* Let's get physical ! */
+  end_calculation(*ci);
+  end_calculation(*cj);
+
+  /* Dump if necessary */
+  dump_particle_fields(swiftOutputFileName, *ci, *cj);
+
+  /* Now perform a brute-force version for accuracy tests */
+
+  /* Zero the fields */
+  zero_particle_fields(*ci);
+  zero_particle_fields(*cj);
+
+  /* Run the brute-force test */
+  pairs_all_density(runner, *ci, *cj);
+
+  /* Let's get physical ! */
+  end_calculation(*ci);
+  end_calculation(*cj);
+
+  dump_particle_fields(bruteForceOutputFileName, *ci, *cj);
+}
+
+/**
+ * @brief Computes the pair interactions of two cells in various configurations.
+ */
+void test_all_pair_interactions(struct runner *runner, double *offset2,
+                                size_t particles, double size, double h,
+                                double rho, long long *partId,
+                                double perturbation, double h_pert,
+                                char *swiftOutputFileName,
+                                char *bruteForceOutputFileName) {
+
+  double offset1[3] = {0, 0, 0};
+  struct cell *ci, *cj;
+
+  /* All active particles. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 1.);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 1.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* Half particles are active. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.5);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.5);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* All particles inactive. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* 10% of particles active. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.1);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.1);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* One active cell one inactive cell. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 1.0);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* One active cell one inactive cell. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 1.0);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* Smaller cells, all active. */
+  ci = make_cell(2, offset1, size, h, rho, partId, perturbation, h_pert, 1.0);
+  cj = make_cell(2, offset2, size, h, rho, partId, perturbation, h_pert, 1.0);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* Different numbers of particles in each cell. */
+  ci = make_cell(10, offset1, size, h, rho, partId, perturbation, h_pert, 0.5);
+  cj = make_cell(3, offset2, size, h, rho, partId, perturbation, h_pert, 0.75);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* One cell inactive and the other only half active. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.5);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* One cell inactive and the other only half active. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.5);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  /* Clean things to make the sanitizer happy ... */
+  clean_up(ci);
+  clean_up(cj);
+}
+
+int main(int argc, char *argv[]) {
+  size_t particles = 0, runs = 0, type = 0;
+  double h = 1.23485, size = 1., rho = 1.;
+  double perturbation = 0.1, h_pert = 1.1;
+  struct space space;
+  struct engine engine;
+  struct runner *runner;
+  char c;
+  static long long partId = 0;
+  char outputFileNameExtension[200] = "";
+  char swiftOutputFileName[200] = "";
+  char bruteForceOutputFileName[200] = "";
+
+  /* Initialize CPU frequency, this also starts time. */
+  unsigned long long cpufreq = 0;
+  clocks_set_cpufreq(cpufreq);
+
+  /* Choke on FP-exceptions */
+  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  /* Generate a RNG seed from time. */
+  unsigned int seed = time(NULL);
+
+  while ((c = getopt(argc, argv, "h:p:n:r:t:d:s:f:")) != -1) {
+    switch (c) {
+      case 'h':
+        sscanf(optarg, "%lf", &h);
+        break;
+      case 'p':
+        sscanf(optarg, "%lf", &h_pert);
+        break;
+      case 'n':
+        sscanf(optarg, "%zu", &particles);
+        break;
+      case 'r':
+        sscanf(optarg, "%zu", &runs);
+        break;
+      case 't':
+        sscanf(optarg, "%zu", &type);
+        break;
+      case 'd':
+        sscanf(optarg, "%lf", &perturbation);
+        break;
+      case 's':
+        sscanf(optarg, "%u", &seed);
+        break;
+      case 'f':
+        strcpy(outputFileNameExtension, optarg);
+        break;
+      case '?':
+        error("Unknown option.");
+        break;
+    }
+  }
+
+  if (h < 0 || particles == 0 || runs == 0 || type > 2) {
+    printf(
+        "\nUsage: %s -n PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n"
+        "\nGenerates a cell pair, filled with particles on a Cartesian grid."
+        "\nThese are then interacted using runner_dopair1_density."
+        "\n\nOptions:"
+        "\n-t TYPE=0          - cells share face (0), edge (1) or corner (2)"
+        "\n-h DISTANCE=1.2348 - smoothing length"
+        "\n-p                 - Random fractional change in h, h=h*random(1,p)"
+        "\n-d pert            - perturbation to apply to the particles [0,1["
+        "\n-s seed            - seed for RNG"
+        "\n-f fileName        - part of the file name used to save the dumps\n",
+        argv[0]);
+    exit(1);
+  }
+
+  /* Seed RNG. */
+  message("Seed used for RNG: %d", seed);
+  srand(seed);
+
+  space.periodic = 0;
+  space.dim[0] = 3.;
+  space.dim[1] = 3.;
+  space.dim[2] = 3.;
+
+  engine.s = &space;
+  engine.time = 0.1f;
+  engine.ti_current = 8;
+  engine.max_active_bin = num_time_bins;
+
+  if (posix_memalign((void **)&runner, SWIFT_STRUCT_ALIGNMENT,
+                     sizeof(struct runner)) != 0) {
+    error("couldn't allocate runner");
+  }
+
+  runner->e = &engine;
+
+  /* Create output file names. */
+  sprintf(swiftOutputFileName, "swift_dopair_%s.dat", outputFileNameExtension);
+  sprintf(bruteForceOutputFileName, "brute_force_%s.dat",
+          outputFileNameExtension);
+
+  /* Delete files if they already exist. */
+  remove(swiftOutputFileName);
+  remove(bruteForceOutputFileName);
+
+#ifdef WITH_VECTORIZATION
+  runner->ci_cache.count = 0;
+  cache_init(&runner->ci_cache, 512);
+  runner->cj_cache.count = 0;
+  cache_init(&runner->cj_cache, 512);
+#endif
+
+  double offset[3] = {1., 0., 0.};
+
+  /* Test a pair of cells face-on. */
+  test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
+                             perturbation, h_pert, swiftOutputFileName,
+                             bruteForceOutputFileName);
+
+  /* Test a pair of cells edge-on. */
+  offset[0] = 1.;
+  offset[1] = 1.;
+  offset[2] = 0.;
+  test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
+                             perturbation, h_pert, swiftOutputFileName,
+                             bruteForceOutputFileName);
+
+  /* Test a pair of cells corner-on. */
+  offset[0] = 1.;
+  offset[1] = 1.;
+  offset[2] = 1.;
+  test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
+                             perturbation, h_pert, swiftOutputFileName,
+                             bruteForceOutputFileName);
+  return 0;
+}
diff --git a/tests/testActivePair.sh.in b/tests/testActivePair.sh.in
new file mode 100755
index 0000000000000000000000000000000000000000..ff8d027a469bd9bc78286b843cf2dffd3ef27ad3
--- /dev/null
+++ b/tests/testActivePair.sh.in
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+echo ""
+
+rm -f brute_force_pair_active.dat swift_dopair_active.dat
+
+./testActivePair -n 6 -r 1 -d 0 -f active
+
+python @srcdir@/difffloat.py brute_force_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat
+
+exit $?
diff --git a/tests/testInteractions.c b/tests/testInteractions.c
index 4ce7fe40554d24551750629fa47c0bee7acdb6da..54d1f38733a1f1647331166f1a37b40ed3511419 100644
--- a/tests/testInteractions.c
+++ b/tests/testInteractions.c
@@ -17,12 +17,6 @@
  *
  ******************************************************************************/
 
-#include "../config.h"
-
-#ifndef WITH_VECTORIZATION
-int main() { return 0; }
-#else
-
 #include <fenv.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -30,15 +24,17 @@ int main() { return 0; }
 #include <unistd.h>
 #include "swift.h"
 
+#ifdef WITH_VECTORIZATION
+
 #define array_align sizeof(float) * VEC_SIZE
 #define ACC_THRESHOLD 1e-5
 
-/* Typdef function pointers for serial and vectorised versions of the
- * interaction functions. */
-typedef void (*serial_interaction)(float, float *, float, float, struct part *,
-                                   struct part *);
-typedef void (*vec_interaction)(float *, float *, float *, float *,
-                                struct part **, struct part **);
+#ifndef IACT
+#define IACT runner_iact_nonsym_density
+#define IACT_VEC runner_iact_nonsym_1_vec_density
+#define IACT_NAME "test_nonsym_density"
+#define NUM_VEC_PROC_INT 1
+#endif
 
 /**
  * @brief Constructs an array of particles in a valid state prior to
@@ -74,7 +70,10 @@ struct part *make_particles(size_t count, double *offset, double spacing,
 
   p->h = h;
   p->id = ++(*partId);
+
+#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH)
   p->mass = 1.0f;
+#endif
 
   /* Place rest of particles around the test particle
    * with random position within a unit sphere. */
@@ -93,7 +92,9 @@ struct part *make_particles(size_t count, double *offset, double spacing,
 
     p->h = h;
     p->id = ++(*partId);
+#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH)
     p->mass = 1.0f;
+#endif
   }
   return particles;
 }
@@ -103,6 +104,7 @@ struct part *make_particles(size_t count, double *offset, double spacing,
  */
 void prepare_force(struct part *parts, size_t count) {
 
+#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) && !defined(MINIMAL_SPH)
   struct part *p;
   for (size_t i = 0; i < count; ++i) {
     p = &parts[i];
@@ -113,6 +115,7 @@ void prepare_force(struct part *parts, size_t count) {
     p->force.v_sig = 0.0f;
     p->force.h_dt = 0.0f;
   }
+#endif
 }
 
 /**
@@ -122,25 +125,26 @@ void dump_indv_particle_fields(char *fileName, struct part *p) {
 
   FILE *file = fopen(fileName, "a");
 
+  /* Write header */
   fprintf(file,
-          "%6llu %10f %10f %10f %10f %10f %10f %10e %10e %10e %13e %13e %13e "
-          "%13e %13e %13e %13e "
-          "%13e %13e %13e %10f\n",
+          "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
+          "%13e %13e %13e\n",
           p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2],
-          p->a_hydro[0], p->a_hydro[1], p->a_hydro[2], p->rho,
-          p->density.rho_dh, p->density.wcount, p->density.wcount_dh,
-          p->force.h_dt, p->force.v_sig,
-#if defined(GADGET2_SPH)
-          p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
-          p->density.rot_v[2], p->entropy_dt
-#elif defined(DEFAULT_SPH)
-          p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
-          p->density.rot_v[2], 0.
+          hydro_get_density(p),
+#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
+          0.f,
 #else
+          p->density.rho_dh,
+#endif
+          p->density.wcount, p->density.wcount_dh,
+#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH)
           p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
           p->density.rot_v[2]
+#else
+          0., 0., 0., 0.
 #endif
           );
+
   fclose(file);
 }
 
@@ -152,13 +156,10 @@ void write_header(char *fileName) {
   FILE *file = fopen(fileName, "w");
   /* Write header */
   fprintf(file,
-          "# %4s %10s %10s %10s %10s %10s %10s %10s %10s %10s %13s %13s %13s "
-          "%13s %13s %13s %13s"
-          "%13s %13s %13s %13s\n",
-          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "a_x", "a_y",
-          "a_z", "rho", "rho_dh", "wcount", "wcount_dh", "dh/dt", "v_sig",
-          "div_v", "curl_vx", "curl_vy", "curl_vz", "dS/dt");
-  fprintf(file, "\n# PARTICLES BEFORE INTERACTION:\n");
+          "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s "
+          "%13s %13s %13s\n",
+          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh",
+          "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz");
   fclose(file);
 }
 
@@ -187,8 +188,8 @@ int check_results(struct part serial_test_part, struct part *serial_parts,
 }
 
 /*
- * @brief Calls the serial and vectorised version of an interaction
- * function given by the function pointers.
+ * @brief Calls the serial and vectorised version of the non-symmetrical density
+ * interaction.
  *
  * @param test_part Particle that will be updated
  * @param parts Particle array to be interacted
@@ -196,16 +197,15 @@ int check_results(struct part serial_test_part, struct part *serial_parts,
  * @param serial_inter_func Serial interaction function to be called
  * @param vec_inter_func Vectorised interaction function to be called
  * @param runs No. of times to call interactions
+ * @param num_vec_proc No. of vectors to use to process interaction
  *
  */
 void test_interactions(struct part test_part, struct part *parts, size_t count,
-                       serial_interaction serial_inter_func,
-                       vec_interaction vec_inter_func, char *filePrefix,
-                       size_t runs) {
+                       char *filePrefix, int runs, int num_vec_proc) {
 
-  ticks serial_time = 0, vec_time = 0;
+  ticks serial_time = 0;
+  ticks vec_time = 0;
 
-  FILE *file;
   char serial_filename[200] = "";
   char vec_filename[200] = "";
 
@@ -217,64 +217,68 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
   write_header(serial_filename);
   write_header(vec_filename);
 
-  /* Test particle at the center of a unit sphere. */
   struct part pi_serial, pi_vec;
-
-  /* Remaining particles in the sphere that will interact with test particle. */
   struct part pj_serial[count], pj_vec[count];
 
-  /* Stores the separation, smoothing length and pointers to particles
-   * needed for the vectorised interaction. */
+  float r2[count] __attribute__((aligned(array_align)));
+  float dx[3 * count] __attribute__((aligned(array_align)));
+
+  struct part *piq[count], *pjq[count];
+  for (size_t k = 0; k < count; k++) {
+    piq[k] = NULL;
+    pjq[k] = NULL;
+  }
+
   float r2q[count] __attribute__((aligned(array_align)));
   float hiq[count] __attribute__((aligned(array_align)));
-  float hjq[count] __attribute__((aligned(array_align)));
-  float dxq[3 * count] __attribute__((aligned(array_align)));
-  struct part *piq[count], *pjq[count];
+  float dxq[count] __attribute__((aligned(array_align)));
+
+  float dyq[count] __attribute__((aligned(array_align)));
+  float dzq[count] __attribute__((aligned(array_align)));
+  float mjq[count] __attribute__((aligned(array_align)));
+  float vixq[count] __attribute__((aligned(array_align)));
+  float viyq[count] __attribute__((aligned(array_align)));
+  float vizq[count] __attribute__((aligned(array_align)));
+  float vjxq[count] __attribute__((aligned(array_align)));
+  float vjyq[count] __attribute__((aligned(array_align)));
+  float vjzq[count] __attribute__((aligned(array_align)));
 
   /* Call serial interaction a set number of times. */
-  for (size_t k = 0; k < runs; k++) {
+  for (int k = 0; k < runs; k++) {
     /* Reset particle to initial setup */
     pi_serial = test_part;
     for (size_t i = 0; i < count; i++) pj_serial[i] = parts[i];
 
-    /* Only dump data on first run. */
-    if (k == 0) {
-      /* Dump state of particles before serial interaction. */
-      dump_indv_particle_fields(serial_filename, &pi_serial);
-      for (size_t i = 0; i < count; i++)
-        dump_indv_particle_fields(serial_filename, &pj_serial[i]);
-    }
-
     /* Perform serial interaction */
     for (size_t i = 0; i < count; i++) {
       /* Compute the pairwise distance. */
-      float r2 = 0.0f;
-      float dx[3];
-      for (size_t k = 0; k < 3; k++) {
-        dx[k] = pi_serial.x[k] - pj_serial[i].x[k];
-        r2 += dx[k] * dx[k];
+      r2[i] = 0.0f;
+      for (int k = 0; k < 3; k++) {
+        int ind = (3 * i) + k;
+        dx[ind] = pi_serial.x[k] - pj_serial[i].x[k];
+        r2[i] += dx[ind] * dx[ind];
       }
+    }
 
-      const ticks tic = getticks();
-
-      serial_inter_func(r2, dx, pi_serial.h, pj_serial[i].h, &pi_serial,
-                        &pj_serial[i]);
-
-      serial_time += getticks() - tic;
+    const ticks tic = getticks();
+/* Perform serial interaction */
+#ifdef __ICC
+#pragma novector
+#endif
+    for (size_t i = 0; i < count; i++) {
+      IACT(r2[i], &(dx[3 * i]), pi_serial.h, pj_serial[i].h, &pi_serial,
+           &pj_serial[i]);
     }
+    serial_time += getticks() - tic;
   }
 
-  file = fopen(serial_filename, "a");
-  fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n");
-  fclose(file);
-
   /* Dump result of serial interaction. */
   dump_indv_particle_fields(serial_filename, &pi_serial);
   for (size_t i = 0; i < count; i++)
     dump_indv_particle_fields(serial_filename, &pj_serial[i]);
 
   /* Call vector interaction a set number of times. */
-  for (size_t k = 0; k < runs; k++) {
+  for (int k = 0; k < runs; k++) {
     /* Reset particle to initial setup */
     pi_vec = test_part;
     for (size_t i = 0; i < count; i++) pj_vec[i] = parts[i];
@@ -284,45 +288,92 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
       /* Compute the pairwise distance. */
       float r2 = 0.0f;
       float dx[3];
-      for (size_t k = 0; k < 3; k++) {
+      for (int k = 0; k < 3; k++) {
         dx[k] = pi_vec.x[k] - pj_vec[i].x[k];
         r2 += dx[k] * dx[k];
       }
 
       r2q[i] = r2;
-      dxq[3 * i + 0] = dx[0];
-      dxq[3 * i + 1] = dx[1];
-      dxq[3 * i + 2] = dx[2];
+      dxq[i] = dx[0];
       hiq[i] = pi_vec.h;
-      hjq[i] = pj_vec[i].h;
       piq[i] = &pi_vec;
       pjq[i] = &pj_vec[i];
-    }
 
-    /* Only dump data on first run. */
-    if (k == 0) {
-      /* Dump state of particles before vector interaction. */
-      dump_indv_particle_fields(vec_filename, piq[0]);
-      for (size_t i = 0; i < count; i++)
-        dump_indv_particle_fields(vec_filename, pjq[i]);
+      dyq[i] = dx[1];
+      dzq[i] = dx[2];
+      mjq[i] = pj_vec[i].mass;
+      vixq[i] = pi_vec.v[0];
+      viyq[i] = pi_vec.v[1];
+      vizq[i] = pi_vec.v[2];
+      vjxq[i] = pj_vec[i].v[0];
+      vjyq[i] = pj_vec[i].v[1];
+      vjzq[i] = pj_vec[i].v[2];
     }
 
+    /* Perform vector interaction. */
+    vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec;
+    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
+        curlvySum, curlvzSum;
+    mask_t mask, mask2;
+
+    rhoSum.v = vec_set1(0.f);
+    rho_dhSum.v = vec_set1(0.f);
+    wcountSum.v = vec_set1(0.f);
+    wcount_dhSum.v = vec_set1(0.f);
+    div_vSum.v = vec_set1(0.f);
+    curlvxSum.v = vec_set1(0.f);
+    curlvySum.v = vec_set1(0.f);
+    curlvzSum.v = vec_set1(0.f);
+
+    hi_vec.v = vec_load(&hiq[0]);
+    vix_vec.v = vec_load(&vixq[0]);
+    viy_vec.v = vec_load(&viyq[0]);
+    viz_vec.v = vec_load(&vizq[0]);
+
+    hi_inv_vec = vec_reciprocal(hi_vec);
+    vec_init_mask_true(mask);
+    vec_init_mask_true(mask2);
+
     const ticks vec_tic = getticks();
 
-    /* Perform vector interaction. */
-    for (size_t i = 0; i < count; i += VEC_SIZE) {
-      vec_inter_func(&(r2q[i]), &(dxq[3 * i]), &(hiq[i]), &(hjq[i]), &(piq[i]),
-                     &(pjq[i]));
+    for (size_t i = 0; i < count; i += num_vec_proc * VEC_SIZE) {
+
+      /* Interleave two vectors for interaction. */
+      if (num_vec_proc == 2) {
+        runner_iact_nonsym_2_vec_density(
+            &(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec), (vix_vec),
+            (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]),
+            &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
+            &curlvxSum, &curlvySum, &curlvzSum, mask, mask2, 0);
+      } else { /* Only use one vector for interaction. */
+
+        vector r2, dx, dy, dz;
+        r2.v = vec_load(&(r2q[i]));
+        dx.v = vec_load(&(dxq[i]));
+        dy.v = vec_load(&(dyq[i]));
+        dz.v = vec_load(&(dzq[i]));
+
+        runner_iact_nonsym_1_vec_density(
+            &r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec), (viz_vec),
+            &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum,
+            &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
+            &curlvzSum, mask);
+      }
     }
 
+    VEC_HADD(rhoSum, piq[0]->rho);
+    VEC_HADD(rho_dhSum, piq[0]->density.rho_dh);
+    VEC_HADD(wcountSum, piq[0]->density.wcount);
+    VEC_HADD(wcount_dhSum, piq[0]->density.wcount_dh);
+    VEC_HADD(div_vSum, piq[0]->density.div_v);
+    VEC_HADD(curlvxSum, piq[0]->density.rot_v[0]);
+    VEC_HADD(curlvySum, piq[0]->density.rot_v[1]);
+    VEC_HADD(curlvzSum, piq[0]->density.rot_v[2]);
+
     vec_time += getticks() - vec_tic;
   }
 
-  file = fopen(vec_filename, "a");
-  fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n");
-  fclose(file);
-
-  /* Dump result of vector interaction. */
+  /* Dump result of serial interaction. */
   dump_indv_particle_fields(vec_filename, piq[0]);
   for (size_t i = 0; i < count; i++)
     dump_indv_particle_fields(vec_filename, pjq[i]);
@@ -334,6 +385,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
   message("The serial interactions took     : %15lli ticks.",
           serial_time / runs);
   message("The vectorised interactions took : %15lli ticks.", vec_time / runs);
+  message("Speed up: %15fx.", (double)(serial_time) / vec_time);
 }
 
 /* And go... */
@@ -386,62 +438,22 @@ int main(int argc, char *argv[]) {
 
   /* Build the infrastructure */
   static long long partId = 0;
-  struct part density_test_particle, force_test_particle;
-  struct part *density_particles =
-      make_particles(count, offset, spacing, h, &partId);
-  struct part *force_particles =
-      make_particles(count, offset, spacing, h, &partId);
-  prepare_force(force_particles, count);
-
-  /* Define which interactions to call */
-  serial_interaction serial_inter_func = &runner_iact_nonsym_density;
-  vec_interaction vec_inter_func = &runner_iact_nonsym_vec_density;
-
-  density_test_particle = density_particles[0];
+  struct part test_particle;
+  struct part *particles = make_particles(count, offset, spacing, h, &partId);
+
+  test_particle = particles[0];
   /* Call the non-sym density test. */
-  message("Testing non-symmetrical density interaction...");
-  test_interactions(density_test_particle, &density_particles[1], count - 1,
-                    serial_inter_func, vec_inter_func, "test_nonsym_density",
-                    runs);
-
-  density_particles = make_particles(count, offset, spacing, h, &partId);
-
-  /* Re-assign function pointers. */
-  serial_inter_func = &runner_iact_density;
-  vec_inter_func = &runner_iact_vec_density;
-
-  density_test_particle = density_particles[0];
-  /* Call the symmetrical density test. */
-  message("Testing symmetrical density interaction...");
-  test_interactions(density_test_particle, &density_particles[1], count - 1,
-                    serial_inter_func, vec_inter_func, "test_sym_density",
-                    runs);
-
-  /* Re-assign function pointers. */
-  serial_inter_func = &runner_iact_nonsym_force;
-  vec_inter_func = &runner_iact_nonsym_vec_force;
-
-  force_test_particle = force_particles[0];
-  /* Call the test non-sym force test. */
-  message("Testing non-symmetrical force interaction...");
-  test_interactions(force_test_particle, &force_particles[1], count - 1,
-                    serial_inter_func, vec_inter_func, "test_nonsym_force",
-                    runs);
-
-  force_particles = make_particles(count, offset, spacing, h, &partId);
-  prepare_force(force_particles, count);
-
-  /* Re-assign function pointers. */
-  serial_inter_func = &runner_iact_force;
-  vec_inter_func = &runner_iact_vec_force;
-
-  force_test_particle = force_particles[0];
-  /* Call the test symmetrical force test. */
-  message("Testing symmetrical force interaction...");
-  test_interactions(force_test_particle, &force_particles[1], count - 1,
-                    serial_inter_func, vec_inter_func, "test_sym_force", runs);
+  message("Testing %s interaction...", IACT_NAME);
+  test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs,
+                    1);
+  test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs,
+                    2);
 
   return 0;
 }
 
-#endif /* WITH_VECTORIZATION */
+#else
+
+int main() { return 1; }
+
+#endif
diff --git a/tests/testInteractions.sh.in b/tests/testInteractions.sh.in
new file mode 100644
index 0000000000000000000000000000000000000000..4b002c56e37eff417c673ddac2e44b3edf17683a
--- /dev/null
+++ b/tests/testInteractions.sh.in
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+echo ""
+
+rm -f test_nonsym_density_serial.dat test_nonsym_density_vec.dat
+
+echo "Running ./testInteractions"
+
+./testInteractions
+
+if [ $? != 0 ]; then
+  echo "testInteractions is redundant when vectorisation is disabled"
+else
+  if [ -e test_nonsym_density_serial.dat ]
+  then
+    if python @srcdir@/difffloat.py test_nonsym_density_serial.dat test_nonsym_density_vec.dat @srcdir@/tolerance_testInteractions.dat
+    then
+      echo "Accuracy test passed"
+    else
+      echo "Accuracy test failed"
+      exit 1
+    fi
+  else
+    echo "Error Missing test output file"
+    exit 1
+  fi
+fi
+
+echo "------------"
diff --git a/tests/testKernel.c b/tests/testKernel.c
index a2744119a527cc842cdd4711056eee7a7d7b4270..0658639070526f28ce1bceefc54d3f2d7a3ae765 100644
--- a/tests/testKernel.c
+++ b/tests/testKernel.c
@@ -68,7 +68,7 @@ int main() {
       vx.f[j] = (i + j) * 2.25f / numPoints;
     }
 
-    vx_h.v = vx.v / vec_set1(h);
+    vx_h.v = vec_mul(vx.v, vec_set1(1.f / h));
 
     kernel_deval_1_vec(&vx_h, &W_vec, &dW_vec);
 
@@ -106,8 +106,8 @@ int main() {
       vx_2.f[j] = (i + j) * 2.25f / numPoints;
     }
 
-    vx_h.v = vx.v / vec_set1(h);
-    vx_h_2.v = vx_2.v / vec_set1(h);
+    vx_h.v = vec_mul(vx.v, vec_set1(1.f / h));
+    vx_h_2.v = vec_mul(vx_2.v, vec_set1(1.f / h));
 
     kernel_deval_2_vec(&vx_h, &W_vec, &dW_vec, &vx_h_2, &W_vec_2, &dW_vec_2);
 
diff --git a/tests/testPair.c b/tests/testPair.c
deleted file mode 100644
index 0e0b36a4c30abc5a8d37060c70788669f5b5710d..0000000000000000000000000000000000000000
--- a/tests/testPair.c
+++ /dev/null
@@ -1,352 +0,0 @@
-/*******************************************************************************
- * This file is part of SWIFT.
- * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- ******************************************************************************/
-#include "../config.h"
-
-/* Some standard headers. */
-#include <fenv.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-/* Local headers. */
-#include "swift.h"
-
-#if defined(WITH_VECTORIZATION)
-#define DOSELF1 runner_doself1_density_vec
-#define DOPAIR1 runner_dopair1_branch_density
-#define DOSELF1_NAME "runner_doself1_density_vec"
-#define DOPAIR1_NAME "runner_dopair1_density_vec"
-#endif
-
-#ifndef DOSELF1
-#define DOSELF1 runner_doself1_density
-#define DOSELF1_NAME "runner_doself1_density"
-#endif
-
-#ifndef DOPAIR1
-#define DOPAIR1 runner_dopair1_branch_density
-#define DOPAIR1_NAME "runner_dopair1_density"
-#endif
-
-/* n is both particles per axis and box size:
- * particles are generated on a mesh with unit spacing
- */
-struct cell *make_cell(size_t n, double *offset, double size, double h,
-                       double density, unsigned long long *partId,
-                       double pert) {
-  const size_t count = n * n * n;
-  const double volume = size * size * size;
-  struct cell *cell = malloc(sizeof(struct cell));
-  bzero(cell, sizeof(struct cell));
-
-  if (posix_memalign((void **)&cell->parts, part_align,
-                     count * sizeof(struct part)) != 0) {
-    error("couldn't allocate particles, no. of particles: %d", (int)count);
-  }
-  bzero(cell->parts, count * sizeof(struct part));
-
-  /* Construct the parts */
-  struct part *part = cell->parts;
-  for (size_t x = 0; x < n; ++x) {
-    for (size_t y = 0; y < n; ++y) {
-      for (size_t z = 0; z < n; ++z) {
-        part->x[0] =
-            offset[0] +
-            size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
-        part->x[1] =
-            offset[1] +
-            size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
-        part->x[2] =
-            offset[2] +
-            size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
-        // part->v[0] = part->x[0] - 1.5;
-        // part->v[1] = part->x[1] - 1.5;
-        // part->v[2] = part->x[2] - 1.5;
-        part->v[0] = random_uniform(-0.05, 0.05);
-        part->v[1] = random_uniform(-0.05, 0.05);
-        part->v[2] = random_uniform(-0.05, 0.05);
-        part->h = size * h / (float)n;
-        part->id = ++(*partId);
-#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
-        part->conserved.mass = density * volume / count;
-#else
-        part->mass = density * volume / count;
-#endif
-        part->time_bin = 1;
-
-#ifdef SWIFT_DEBUG_CHECKS
-        part->ti_drift = 8;
-        part->ti_kick = 8;
-#endif
-
-        ++part;
-      }
-    }
-  }
-
-  /* Cell properties */
-  cell->split = 0;
-  cell->h_max = h;
-  cell->count = count;
-  cell->dx_max_part = 0.;
-  cell->dx_max_sort = 0.;
-  cell->width[0] = n;
-  cell->width[1] = n;
-  cell->width[2] = n;
-  cell->loc[0] = offset[0];
-  cell->loc[1] = offset[1];
-  cell->loc[2] = offset[2];
-
-  cell->ti_old_part = 8;
-  cell->ti_end_min = 8;
-  cell->ti_end_max = 8;
-
-  shuffle_particles(cell->parts, cell->count);
-
-  cell->sorted = 0;
-  for (int k = 0; k < 13; k++) cell->sort[k] = NULL;
-
-  return cell;
-}
-
-void clean_up(struct cell *ci) {
-  free(ci->parts);
-  for (int k = 0; k < 13; k++)
-    if (ci->sort[k] != NULL) free(ci->sort[k]);
-  free(ci);
-}
-
-/**
- * @brief Initializes all particles field to be ready for a density calculation
- */
-void zero_particle_fields(struct cell *c) {
-  for (int pid = 0; pid < c->count; pid++) {
-    hydro_init_part(&c->parts[pid], NULL);
-  }
-}
-
-/**
- * @brief Dump all the particles to a file
- */
-void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
-  FILE *file = fopen(fileName, "w");
-
-  /* Write header */
-  fprintf(file,
-          "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s "
-          "%13s %13s %13s\n",
-          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh",
-          "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz");
-
-  fprintf(file, "# ci --------------------------------------------\n");
-
-  for (int pid = 0; pid < ci->count; pid++) {
-    fprintf(file,
-            "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
-            "%13e %13e %13e\n",
-            ci->parts[pid].id, ci->parts[pid].x[0], ci->parts[pid].x[1],
-            ci->parts[pid].x[2], ci->parts[pid].v[0], ci->parts[pid].v[1],
-            ci->parts[pid].v[2], hydro_get_density(&ci->parts[pid]),
-#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
-            0.f,
-#else
-            ci->parts[pid].density.rho_dh,
-#endif
-            ci->parts[pid].density.wcount, ci->parts[pid].density.wcount_dh,
-#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH)
-            ci->parts[pid].density.div_v, ci->parts[pid].density.rot_v[0],
-            ci->parts[pid].density.rot_v[1], ci->parts[pid].density.rot_v[2]
-#else
-            0., 0., 0., 0.
-#endif
-            );
-  }
-
-  fprintf(file, "# cj --------------------------------------------\n");
-
-  for (int pjd = 0; pjd < cj->count; pjd++) {
-    fprintf(file,
-            "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
-            "%13e %13e %13e\n",
-            cj->parts[pjd].id, cj->parts[pjd].x[0], cj->parts[pjd].x[1],
-            cj->parts[pjd].x[2], cj->parts[pjd].v[0], cj->parts[pjd].v[1],
-            cj->parts[pjd].v[2], hydro_get_density(&cj->parts[pjd]),
-#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
-            0.f,
-#else
-            cj->parts[pjd].density.rho_dh,
-#endif
-            cj->parts[pjd].density.wcount, cj->parts[pjd].density.wcount_dh,
-#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH)
-            cj->parts[pjd].density.div_v, cj->parts[pjd].density.rot_v[0],
-            cj->parts[pjd].density.rot_v[1], cj->parts[pjd].density.rot_v[2]
-#else
-            0., 0., 0., 0.
-#endif
-            );
-  }
-
-  fclose(file);
-}
-
-/* Just a forward declaration... */
-void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
-void runner_doself1_density_vec(struct runner *r, struct cell *ci);
-void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
-                                   struct cell *cj);
-
-int main(int argc, char *argv[]) {
-  size_t particles = 0, runs = 0, volume, type = 0;
-  double offset[3] = {0, 0, 0}, h = 1.1255, size = 1., rho = 1.;
-  double perturbation = 0.;
-  struct cell *ci, *cj;
-  struct space space;
-  struct engine engine;
-  struct runner runner;
-  char c;
-  static unsigned long long partId = 0;
-  char outputFileNameExtension[200] = "";
-  char outputFileName[200] = "";
-  ticks tic, toc, time;
-
-  /* Initialize CPU frequency, this also starts time. */
-  unsigned long long cpufreq = 0;
-  clocks_set_cpufreq(cpufreq);
-
-  /* Choke on FP-exceptions */
-  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
-
-  srand(0);
-
-  while ((c = getopt(argc, argv, "h:p:r:t:d:f:")) != -1) {
-    switch (c) {
-      case 'h':
-        sscanf(optarg, "%lf", &h);
-        break;
-      case 'p':
-        sscanf(optarg, "%zu", &particles);
-        break;
-      case 'r':
-        sscanf(optarg, "%zu", &runs);
-        break;
-      case 't':
-        sscanf(optarg, "%zu", &type);
-        break;
-      case 'd':
-        sscanf(optarg, "%lf", &perturbation);
-        break;
-      case 'f':
-        strcpy(outputFileNameExtension, optarg);
-        break;
-      case '?':
-        error("Unknown option.");
-        break;
-    }
-  }
-
-  if (h < 0 || particles == 0 || runs == 0 || type > 2) {
-    printf(
-        "\nUsage: %s -p PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n"
-        "\nGenerates a cell pair, filled with particles on a Cartesian grid."
-        "\nThese are then interacted using runner_dopair1_density."
-        "\n\nOptions:"
-        "\n-t TYPE=0          - cells share face (0), edge (1) or corner (2)"
-        "\n-h DISTANCE=1.1255 - smoothing length"
-        "\n-d pert            - perturbation to apply to the particles [0,1["
-        "\n-f fileName        - part of the file name used to save the dumps\n",
-        argv[0]);
-    exit(1);
-  }
-
-  space.periodic = 0;
-  space.dim[0] = 3.;
-  space.dim[1] = 3.;
-  space.dim[2] = 3.;
-
-  engine.s = &space;
-  engine.time = 0.1f;
-  engine.ti_current = 8;
-  engine.max_active_bin = num_time_bins;
-  runner.e = &engine;
-
-  volume = particles * particles * particles;
-  message("particles: %zu B\npositions: 0 B", 2 * volume * sizeof(struct part));
-
-  ci = make_cell(particles, offset, size, h, rho, &partId, perturbation);
-  for (size_t i = 0; i < type + 1; ++i) offset[i] = 1.;
-  cj = make_cell(particles, offset, size, h, rho, &partId, perturbation);
-
-  runner_do_sort(&runner, ci, 0x1FFF, 0, 0);
-  runner_do_sort(&runner, cj, 0x1FFF, 0, 0);
-
-  time = 0;
-  for (size_t i = 0; i < runs; ++i) {
-    /* Zero the fields */
-    zero_particle_fields(ci);
-    zero_particle_fields(cj);
-
-    tic = getticks();
-
-#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION)
-    /* Run the test */
-    DOPAIR1(&runner, ci, cj);
-#endif
-
-    toc = getticks();
-    time += toc - tic;
-
-    /* Dump if necessary */
-    if (i % 50 == 0) {
-      sprintf(outputFileName, "swift_dopair_%s.dat", outputFileNameExtension);
-      dump_particle_fields(outputFileName, ci, cj);
-    }
-  }
-
-  /* Output timing */
-  message("SWIFT calculation took       %lli ticks.", time / runs);
-
-  /* Now perform a brute-force version for accuracy tests */
-
-  /* Zero the fields */
-  zero_particle_fields(ci);
-  zero_particle_fields(cj);
-
-  tic = getticks();
-
-#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION)
-  /* Run the brute-force test */
-  pairs_all_density(&runner, ci, cj);
-#endif
-
-  toc = getticks();
-
-  /* Dump */
-  sprintf(outputFileName, "brute_force_%s.dat", outputFileNameExtension);
-  dump_particle_fields(outputFileName, ci, cj);
-
-  /* Output timing */
-  message("Brute force calculation took %lli ticks.", toc - tic);
-
-  /* Clean things to make the sanitizer happy ... */
-  clean_up(ci);
-  clean_up(cj);
-
-  return 0;
-}
diff --git a/tests/testPair.sh.in b/tests/testPair.sh.in
deleted file mode 100755
index bd7051b060c4acab6cf5a164af1914715856849b..0000000000000000000000000000000000000000
--- a/tests/testPair.sh.in
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-echo ""
-
-rm -f brute_force_standard.dat swift_dopair_standard.dat
-
-./testPair -p 6 -r 1 -d 0 -f standard
-
-python @srcdir@/difffloat.py brute_force_standard.dat swift_dopair_standard.dat @srcdir@/tolerance_pair_normal.dat
-
-exit $?
diff --git a/tests/testPairPerturbed.sh.in b/tests/testPairPerturbed.sh.in
deleted file mode 100755
index 9f214e25a098448a906f9da307ea569e327cfdea..0000000000000000000000000000000000000000
--- a/tests/testPairPerturbed.sh.in
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-echo ""
-
-rm -f brute_force_perturbed.dat swift_dopair_perturbed.dat
-
-./testPair -p 6 -r 1 -d 0.1 -f perturbed
-
-python @srcdir@/difffloat.py brute_force_perturbed.dat swift_dopair_perturbed.dat @srcdir@/tolerance_pair_perturbed.dat
-
-exit $?
diff --git a/tests/testPeriodicBC.c b/tests/testPeriodicBC.c
index 495442efac853517891c1efdec54f1f5e8e14e75..6fa2dc607b996b9e8508338a9806633c5a4d1a89 100644
--- a/tests/testPeriodicBC.c
+++ b/tests/testPeriodicBC.c
@@ -517,7 +517,8 @@ int main(int argc, char *argv[]) {
 
   const int half_dim = (dim - 1) / 2;
 
-  /* Test the periodic boundary conditions for each of the 8 corners. */
+  /* Test the periodic boundary conditions for each of the 8 corners. Interact
+   * each corner with all of its 26 neighbours.*/
   test_boundary_conditions(cells, runner, 0, 0, 0, dim, swiftOutputFileName,
                            bruteForceOutputFileName);
   test_boundary_conditions(cells, runner, dim - 1, 0, 0, dim,
diff --git a/tests/tolerance_27_perturbed_h.dat b/tests/tolerance_27_perturbed_h.dat
index 495c865783ae4fc6cab76f11e945efe9ac21870f..5142c2a2090e15381a19b2bc71e253a482973b11 100644
--- a/tests/tolerance_27_perturbed_h.dat
+++ b/tests/tolerance_27_perturbed_h.dat
@@ -1,4 +1,4 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   2e-6       1e-4	    5e-4       1.2e-2		 1e-5	     3e-6	   3e-6		 7e-6
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1.4e-2	    1e-5       2e-3		 2.5e-4	     3e-3	   3e-3	 	 3e-3
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1e-6	    1e-6       1e0		 1e-6	     4e-6	   4e-6		 4e-6
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         2.4e-6       1e-4	        5e-4       1.2e-2		         1e-5	       3e-6	         3e-6		       8e-6
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1.2e-6	      1.4e-2	      1e-5       2e-3		           2.5e-4	     3e-3	         3e-3	 	       3e-3
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1e-6	      1e-6	        1e-6       1e0		           1e-6	       4e-6	         4e-6		       4e-6
diff --git a/tests/tolerance_27_perturbed_h2.dat b/tests/tolerance_27_perturbed_h2.dat
new file mode 100644
index 0000000000000000000000000000000000000000..23f6a5006124f6233aebd111005760a5dcc5b6a3
--- /dev/null
+++ b/tests/tolerance_27_perturbed_h2.dat
@@ -0,0 +1,4 @@
+#   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         3e-6       1e-4	        5e-4       1.5e-2		         1.4e-5	       3e-6	         3e-6		       9e-6
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1.5e-6	    1.57e-2	      1e-5       4.74e-3		       3.89e-4	     3e-3	         3e-3	 	       3e-3
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1e-6	      1e-6	        1e-6       1e0		           1e-6	         4e-6	         4e-6		       4e-6
diff --git a/tests/tolerance_pair_active.dat b/tests/tolerance_pair_active.dat
new file mode 100644
index 0000000000000000000000000000000000000000..b07697a686eb7801326ceaf77cf93fb3a1491c2e
--- /dev/null
+++ b/tests/tolerance_pair_active.dat
@@ -0,0 +1,4 @@
+#   ID     wcount
+    0	     1e-2
+    0	     1e-2
+    0	     1e-2
diff --git a/tests/tolerance_pair_perturbed.dat b/tests/tolerance_pair_perturbed.dat
deleted file mode 100644
index 59a80a7b1fa0db373b7cb43b9793146330b17d53..0000000000000000000000000000000000000000
--- a/tests/tolerance_pair_perturbed.dat
+++ /dev/null
@@ -1,4 +0,0 @@
-#   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      1e-5	    2e-5       3e-2		 1e-5	     1e-5	   1e-5		 1e-5
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-3	      4e-4	    8e-3       2e-2		 1e-4	     1.6e-4	   1.6e-4	 1.6e-4
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1e-6	    1e-6       1e-6		 1e-6	     1e-6	   1e-6		 1e-6
diff --git a/tests/tolerance_pair_normal.dat b/tests/tolerance_testInteractions.dat
similarity index 75%
rename from tests/tolerance_pair_normal.dat
rename to tests/tolerance_testInteractions.dat
index 3fca591dce8156a08c970e4a6579b56b2de12553..ebb376bf26bfdc0fb2107ab720bbf9eca5a35bce 100644
--- a/tests/tolerance_pair_normal.dat
+++ b/tests/tolerance_testInteractions.dat
@@ -1,4 +1,4 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      1e-5	    2e-5       3e-2		 1e-5	     1e-5	   1e-5		 1e-5
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      1.2e-5	    1e-5       1e-2		 1e-4	     1e-4	   1e-4		 1e-4
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      4e-5	    4e-4       1e-2		 1e-5	     6e-6	   6e-6		 6e-6
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1.2e-4	  1e-4       2e-4		 2e-4	     1e-4	   1e-4	 	 1e-4
     0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1e-6	    1e-6       1e-6		 1e-6	     1e-6	   1e-6		 1e-6