Split the P2P and M2P into two separate (vectorizable) loops in the leaf-leaf...

Split the P2P and M2P into two separate (vectorizable) loops in the leaf-leaf gravity P-P interactions.

Split the P2P and M2P into two separate (vectorizable) loops in the leaf-leaf...
4cb04d51 · Matthieu Schaller · c08fce03 · 4cb04d51 · 4cb04d51 · 4cb04d51
Commit 4cb04d51 authored 7 years ago by Matthieu Schaller
--- a/src/gravity_cache.h
+++ b/src/gravity_cache.h
@@ -59,6 +59,12 @@ struct gravity_cache {
  /*! #gpart z acceleration. */
  float *restrict a_z SWIFT_CACHE_ALIGN;

+  /*! Is this #gpart active ? */
+  int *restrict active SWIFT_CACHE_ALIGN;
+
+  /*! Can this #gpart use a M2P interaction ? */
+  int *restrict use_mpole SWIFT_CACHE_ALIGN;
+
  /*! Cache size */
  int count;
 };
@@ -79,6 +85,8 @@ static INLINE void gravity_cache_clean(struct gravity_cache *c) {
    free(c->a_x);
    free(c->a_y);
    free(c->a_z);
+    free(c->active);
+    free(c->use_mpole);
  }
  c->count = 0;
 }
@@ -97,24 +105,26 @@ static INLINE void gravity_cache_init(struct gravity_cache *c, int count) {

  /* Size of the gravity cache */
  const int padded_count = count - (count % VEC_SIZE) + VEC_SIZE;
-  const size_t sizeBytes = padded_count * sizeof(float);
+  const size_t sizeBytesF = padded_count * sizeof(float);
+  const size_t sizeBytesI = padded_count * sizeof(int);

  /* Delete old stuff if any */
  gravity_cache_clean(c);

-  int error = 0;
-  error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
-  error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
-  error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
-  error +=
-      posix_memalign((void **)&c->epsilon, SWIFT_CACHE_ALIGNMENT, sizeBytes);
-  error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
-  error += posix_memalign((void **)&c->a_x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
-  error += posix_memalign((void **)&c->a_y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
-  error += posix_memalign((void **)&c->a_z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
-
-  if (error != 0)
-    error("Couldn't allocate gravity cache, size: %d", padded_count);
+  int e = 0;
+  e += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytesF);
+  e += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytesF);
+  e += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytesF);
+  e += posix_memalign((void **)&c->epsilon, SWIFT_CACHE_ALIGNMENT, sizeBytesF);
+  e += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytesF);
+  e += posix_memalign((void **)&c->a_x, SWIFT_CACHE_ALIGNMENT, sizeBytesF);
+  e += posix_memalign((void **)&c->a_y, SWIFT_CACHE_ALIGNMENT, sizeBytesF);
+  e += posix_memalign((void **)&c->a_z, SWIFT_CACHE_ALIGNMENT, sizeBytesF);
+  e += posix_memalign((void **)&c->active, SWIFT_CACHE_ALIGNMENT, sizeBytesI);
+  e +=
+      posix_memalign((void **)&c->use_mpole, SWIFT_CACHE_ALIGNMENT, sizeBytesI);
+
+  if (e != 0) error("Couldn't allocate gravity cache, size: %d", padded_count);

  c->count = padded_count;
 }
@@ -129,9 +139,11 @@ static INLINE void gravity_cache_init(struct gravity_cache *c, int count) {
 * multiple of the vector length.
 * @param shift A shift to apply to all the particles.
 */
-__attribute__((always_inline)) INLINE void gravity_cache_populate(
-    struct gravity_cache *c, const struct gpart *restrict gparts, int gcount,
-    int gcount_padded, const double shift[3]) {
+__attribute__((always_inline)) INLINE static void gravity_cache_populate(
+    timebin_t max_active_bin, struct gravity_cache *c,
+    const struct gpart *restrict gparts, int gcount, int gcount_padded,
+    const double shift[3], const float CoM[3], float r_max2,
+    float theta_crit2) {

  /* Make the compiler understand we are in happy vectorization land */
  swift_declare_aligned_ptr(float, x, c->x, SWIFT_CACHE_ALIGNMENT);
@@ -139,6 +151,9 @@ __attribute__((always_inline)) INLINE void gravity_cache_populate(
  swift_declare_aligned_ptr(float, z, c->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, epsilon, c->epsilon, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, c->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(int, active, c->active, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(int, use_mpole, c->use_mpole,
+                            SWIFT_CACHE_ALIGNMENT);
  swift_assume_size(gcount_padded, VEC_SIZE);

  /* Fill the input caches */
@@ -148,6 +163,14 @@ __attribute__((always_inline)) INLINE void gravity_cache_populate(
    z[i] = (float)(gparts[i].x[2] - shift[2]);
    epsilon[i] = gparts[i].epsilon;
    m[i] = gparts[i].mass;
+    active[i] = (int)(gparts[i].time_bin <= max_active_bin);
+
+    /* Check whether we can use the multipole instead of P-P */
+    const float dx = x[i] - CoM[0];
+    const float dy = y[i] - CoM[1];
+    const float dz = z[i] - CoM[2];
+    const float r2 = dx * dx + dy * dy + dz * dz;
+    use_mpole[i] = gravity_M2P_accept(r_max2, theta_crit2, r2);
  }

 #ifdef SWIFT_DEBUG_CHECKS
@@ -161,21 +184,26 @@ __attribute__((always_inline)) INLINE void gravity_cache_populate(
    z[i] = 0.f;
    epsilon[i] = 0.f;
    m[i] = 0.f;
+    active[i] = 0;
+    use_mpole[i] = 0;
  }
 }

 /**
- * @brief Fills a #gravity_cache structure with some #gpart.
+ * @brief Fills a #gravity_cache structure with some #gpart and shift them.
 *
 * @param c The #gravity_cache to fill.
 * @param gparts The #gpart array to read from.
 * @param gcount The number of particles to read.
 * @param gcount_padded The number of particle to read padded to the next
 * multiple of the vector length.
+ * @param shift A shift to apply to all the particles.
 */
-__attribute__((always_inline)) INLINE void gravity_cache_populate_no_shift(
-    struct gravity_cache *c, const struct gpart *restrict gparts, int gcount,
-    int gcount_padded) {
+__attribute__((always_inline)) INLINE static void
+gravity_cache_populate_no_mpole(timebin_t max_active_bin,
+                                struct gravity_cache *c,
+                                const struct gpart *restrict gparts, int gcount,
+                                int gcount_padded, const double shift[3]) {

  /* Make the compiler understand we are in happy vectorization land */
  swift_declare_aligned_ptr(float, x, c->x, SWIFT_CACHE_ALIGNMENT);
@@ -183,15 +211,17 @@ __attribute__((always_inline)) INLINE void gravity_cache_populate_no_shift(
  swift_declare_aligned_ptr(float, z, c->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, epsilon, c->epsilon, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, c->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(int, active, c->active, SWIFT_CACHE_ALIGNMENT);
  swift_assume_size(gcount_padded, VEC_SIZE);

  /* Fill the input caches */
  for (int i = 0; i < gcount; ++i) {
-    x[i] = (float)(gparts[i].x[0]);
-    y[i] = (float)(gparts[i].x[1]);
-    z[i] = (float)(gparts[i].x[2]);
+    x[i] = (float)(gparts[i].x[0] - shift[0]);
+    y[i] = (float)(gparts[i].x[1] - shift[1]);
+    z[i] = (float)(gparts[i].x[2] - shift[2]);
    epsilon[i] = gparts[i].epsilon;
    m[i] = gparts[i].mass;
+    active[i] = (int)(gparts[i].time_bin <= max_active_bin);
  }

 #ifdef SWIFT_DEBUG_CHECKS
@@ -205,6 +235,7 @@ __attribute__((always_inline)) INLINE void gravity_cache_populate_no_shift(
    z[i] = 0.f;
    epsilon[i] = 0.f;
    m[i] = 0.f;
+    active[i] = 0;
  }
 }

@@ -219,18 +250,18 @@ __attribute__((always_inline)) INLINE void gravity_cache_write_back(
    const struct gravity_cache *c, struct gpart *restrict gparts, int gcount) {

  /* Make the compiler understand we are in happy vectorization land */
-  float *restrict a_x = c->a_x;
-  float *restrict a_y = c->a_y;
-  float *restrict a_z = c->a_z;
-  swift_align_information(a_x, SWIFT_CACHE_ALIGNMENT);
-  swift_align_information(a_y, SWIFT_CACHE_ALIGNMENT);
-  swift_align_information(a_z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, a_x, c->a_x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, a_y, c->a_y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, a_z, c->a_z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(int, active, c->active, SWIFT_CACHE_ALIGNMENT);

  /* Write stuff back to the particles */
  for (int i = 0; i < gcount; ++i) {
-    gparts[i].a_grav[0] += a_x[i];
-    gparts[i].a_grav[1] += a_y[i];
-    gparts[i].a_grav[2] += a_z[i];
+    if (active[i]) {
+      gparts[i].a_grav[0] += a_x[i];
+      gparts[i].a_grav[1] += a_y[i];
+      gparts[i].a_grav[2] += a_z[i];
+    }
  }
 }


--- a/src/multipole.h
+++ b/src/multipole.h
@@ -2411,4 +2411,26 @@ __attribute__((always_inline)) INLINE static int gravity_multipole_accept(
  return (r2 * theta_crit2 > size2);
 }

+/**
+ * @brief Checks whether a particle-cell interaction can be appromixated by a
+ * M2P
+ * interaction using the distance and cell radius.
+ *
+ * We use the multipole acceptance criterion of Dehnen, 2002, JCoPh, Volume 179,
+ * Issue 1, pp.27-42, equation 10.
+ *
+ * @param r_max2 The square of the size of the multipole.
+ * @param theta_crit2 The square of the critical opening angle.
+ * @param r2 Square of the distance (periodically wrapped) between the
+ * multipoles.
+ */
+__attribute__((always_inline)) INLINE static int gravity_M2P_accept(
+    float r_max2, float theta_crit2, float r2) {
+
+  // MATTHIEU: Make this mass-dependent ?
+
+  /* Multipole acceptance criterion (Dehnen 2002, eq.10) */
+  return (r2 * theta_crit2 > r_max2);
+}
+
 #endif /* SWIFT_MULTIPOLE_H */
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -155,345 +155,313 @@ void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci,
  TIMER_TOC(timer_dopair_grav_mm);
 }

-/**
- * @brief Computes the interaction of all the particles in a cell with all the
- * particles of another cell using the full Newtonian potential
- *
- * @param r The #runner.
- * @param ci The first #cell.
- * @param cj The other #cell.
- * @param shift The distance vector (periodically wrapped) between the cell
- * centres.
- */
-void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci,
-                                struct cell *cj, double shift[3]) {
-
-  /* Some constants */
-  struct gravity_cache *const ci_cache = &r->ci_gravity_cache;
-  struct gravity_cache *const cj_cache = &r->cj_gravity_cache;
-  const struct engine *const e = r->e;
-  const struct gravity_props *props = e->gravity_properties;
-  const float theta_crit2 = props->theta_crit2;
-
-  /* Cell properties */
-  const int gcount_i = ci->gcount;
-  const int gcount_j = cj->gcount;
-  struct gpart *restrict gparts_i = ci->gparts;
-  struct gpart *restrict gparts_j = cj->gparts;
-  const int ci_active = cell_is_active(ci, e);
-  const int cj_active = cell_is_active(cj, e);
-  const double loc_i[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
-  const double loc_j[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
-  const double loc_mean[3] = {0.5 * (loc_i[0] + loc_j[0]),
-                              0.5 * (loc_i[1] + loc_j[1]),
-                              0.5 * (loc_i[2] + loc_j[2])};
-
-  /* Anything to do here ?*/
-  if (!ci_active && !cj_active) return;
+static INLINE void runner_dopair_grav_pp_full(const struct engine *e,
+                                              struct gravity_cache *ci_cache,
+                                              struct gravity_cache *cj_cache,
+                                              int gcount_i, int gcount_j,
+                                              int gcount_padded_j,
+                                              struct gpart *restrict gparts_i,
+                                              struct gpart *restrict gparts_j) {

-  /* Check that we fit in cache */
-  if (gcount_i > ci_cache->count || gcount_j > cj_cache->count)
-    error("Not enough space in the caches! gcount_i=%d gcount_j=%d", gcount_i,
-          gcount_j);
+  /* Loop over all particles in ci... */
+  for (int pid = 0; pid < gcount_i; pid++) {

-  /* Computed the padded counts */
-  const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE;
-  const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE;
+    /* Skip inactive particles */
+    if (!ci_cache->active[pid]) continue;

-  /* Fill the caches */
-  gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i,
-                         loc_mean);
-  gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j,
-                         loc_mean);
+    /* Skip particle that can use the multipole */
+    if (ci_cache->use_mpole[pid]) continue;

-  /* Recover the multipole info and shift the CoM locations */
-  const float rmax_i = ci->multipole->r_max;
-  const float rmax_j = cj->multipole->r_max;
-  const struct multipole *multi_i = &ci->multipole->m_pole;
-  const struct multipole *multi_j = &cj->multipole->m_pole;
-  const float CoM_i[3] = {ci->multipole->CoM[0] - loc_mean[0],
-                          ci->multipole->CoM[1] - loc_mean[1],
-                          ci->multipole->CoM[2] - loc_mean[2]};
-  const float CoM_j[3] = {cj->multipole->CoM[0] - loc_mean[0],
-                          cj->multipole->CoM[1] - loc_mean[1],
-                          cj->multipole->CoM[2] - loc_mean[2]};
+#ifdef SWIFT_DEBUG_CHECKS
+    if (!gpart_is_active(&gparts_i[pid], e))
+      error("Active particle went through the cache");
+#endif

-  /* Ok... Here we go ! */
+    const float x_i = ci_cache->x[pid];
+    const float y_i = ci_cache->y[pid];
+    const float z_i = ci_cache->z[pid];

-  if (ci_active) {
+    /* Some powers of the softening length */
+    const float h_i = ci_cache->epsilon[pid];
+    const float h2_i = h_i * h_i;
+    const float h_inv_i = 1.f / h_i;
+    const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i;

-    /* Loop over all particles in ci... */
-    for (int pid = 0; pid < gcount_i; pid++) {
+    /* Local accumulators for the acceleration */
+    float a_x = 0.f, a_y = 0.f, a_z = 0.f;

-      /* Skip inactive particles */
-      if (!gpart_is_active(&gparts_i[pid], e)) continue;
+    /* Make the compiler understand we are in happy vectorization land */
+    swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT);
+    swift_assume_size(gcount_padded_j, VEC_SIZE);

-      const float x_i = ci_cache->x[pid];
-      const float y_i = ci_cache->y[pid];
-      const float z_i = ci_cache->z[pid];
+    /* Loop over every particle in the other cell. */
+    for (int pjd = 0; pjd < gcount_padded_j; pjd++) {

-      /* Some powers of the softening length */
-      const float h_i = ci_cache->epsilon[pid];
-      const float h2_i = h_i * h_i;
-      const float h_inv_i = 1.f / h_i;
-      const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i;
+      /* Get info about j */
+      const float x_j = cj_cache->x[pjd];
+      const float y_j = cj_cache->y[pjd];
+      const float z_j = cj_cache->z[pjd];
+      const float mass_j = cj_cache->m[pjd];

-      /* Distance to the multipole in cj */
-      const float dx = x_i - CoM_j[0];
-      const float dy = y_i - CoM_j[1];
-      const float dz = z_i - CoM_j[2];
+      /* Compute the pairwise (square) distance. */
+      const float dx = x_i - x_j;
+      const float dy = y_i - y_j;
+      const float dz = z_i - z_j;
      const float r2 = dx * dx + dy * dy + dz * dz;

-      /* Can we use the multipole in cj ? */
-      if (gravity_multipole_accept(0., rmax_j, theta_crit2, r2)) {
+#ifdef SWIFT_DEBUG_CHECKS
+      if (r2 == 0.f) error("Interacting particles with 0 distance");

-        /* Interact! */
-        float f_x, f_y, f_z;
-        runner_iact_grav_pm(dx, dy, dz, r2, h_i, h_inv_i, multi_j, &f_x, &f_y,
-                            &f_z);
+      /* Check that particles have been drifted to the current time */
+      if (gparts_i[pid].ti_drift != e->ti_current)
+        error("gpi not drifted to current time");
+      if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current)
+        error("gpj not drifted to current time");
+#endif
+
+      /* Interact! */
+      float f_ij;
+      runner_iact_grav_pp_full(r2, h2_i, h_inv_i, h_inv3_i, mass_j, &f_ij);

-        /* Store it back */
-        ci_cache->a_x[pid] = f_x;
-        ci_cache->a_y[pid] = f_y;
-        ci_cache->a_z[pid] = f_z;
+      /* Store it back */
+      a_x -= f_ij * dx;
+      a_y -= f_ij * dy;
+      a_z -= f_ij * dz;

 #ifdef SWIFT_DEBUG_CHECKS
-        /* Update the interaction counter */
-        gparts_i[pid].num_interacted += cj->multipole->m_pole.num_gpart;
+      /* Update the interaction counter if it's not a padded gpart */
+      if (pjd < gcount_j) gparts_i[pid].num_interacted++;
 #endif
-        /* Done with this particle */
-        continue;
-      }
-
-      /* Ok, as of here we have no choice but directly interact everything */
+    }

-      /* Local accumulators for the acceleration */
-      float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+    /* Store everything back in cache */
+    ci_cache->a_x[pid] = a_x;
+    ci_cache->a_y[pid] = a_y;
+    ci_cache->a_z[pid] = a_z;
+  }
+}

-      /* Make the compiler understand we are in happy vectorization land */
-      swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT);
-      swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT);
-      swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT);
-      swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT);
-      swift_assume_size(gcount_padded_j, VEC_SIZE);
+static INLINE void runner_dopair_grav_pp_truncated(
+    const struct engine *e, const float rlr_inv, struct gravity_cache *ci_cache,
+    struct gravity_cache *cj_cache, int gcount_i, int gcount_j,
+    int gcount_padded_j, struct gpart *restrict gparts_i,
+    struct gpart *restrict gparts_j) {

-      /* Loop over every particle in the other cell. */
-      for (int pjd = 0; pjd < gcount_padded_j; pjd++) {
+  /* Loop over all particles in ci... */
+  for (int pid = 0; pid < gcount_i; pid++) {

-        /* Get info about j */
-        const float x_j = cj_cache->x[pjd];
-        const float y_j = cj_cache->y[pjd];
-        const float z_j = cj_cache->z[pjd];
-        const float mass_j = cj_cache->m[pjd];
+    /* Skip inactive particles */
+    if (!ci_cache->active[pid]) continue;

-        /* Compute the pairwise (square) distance. */
-        const float dx = x_i - x_j;
-        const float dy = y_i - y_j;
-        const float dz = z_i - z_j;
-        const float r2 = dx * dx + dy * dy + dz * dz;
+    /* Skip particle that can use the multipole */
+    if (ci_cache->use_mpole[pid]) continue;

 #ifdef SWIFT_DEBUG_CHECKS
-        if (r2 == 0.f) error("Interacting particles with 0 distance");
-
-        /* Check that particles have been drifted to the current time */
-        if (gparts_i[pid].ti_drift != e->ti_current)
-          error("gpi not drifted to current time");
-        if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current)
-          error("gpj not drifted to current time");
+    if (!gpart_is_active(&gparts_i[pid], e))
+      error("Active particle went through the cache");
 #endif

-        /* Interact! */
-        float f_ij;
-        runner_iact_grav_pp_full(r2, h2_i, h_inv_i, h_inv3_i, mass_j, &f_ij);
-
-        /* Store it back */
-        a_x -= f_ij * dx;
-        a_y -= f_ij * dy;
-        a_z -= f_ij * dz;
-
-#ifdef SWIFT_DEBUG_CHECKS
-        /* Update the interaction counter if it's not a padded gpart */
-        if (pjd < gcount_j) gparts_i[pid].num_interacted++;
-#endif
-      }
+    const float x_i = ci_cache->x[pid];
+    const float y_i = ci_cache->y[pid];
+    const float z_i = ci_cache->z[pid];

-      /* Store everything back in cache */
-      ci_cache->a_x[pid] = a_x;
-      ci_cache->a_y[pid] = a_y;
-      ci_cache->a_z[pid] = a_z;
-    }
-  }
+    /* Some powers of the softening length */
+    const float h_i = ci_cache->epsilon[pid];
+    const float h2_i = h_i * h_i;
+    const float h_inv_i = 1.f / h_i;
+    const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i;

-  /* Now do the opposite loop */
-  if (cj_active) {
+    /* Local accumulators for the acceleration */
+    float a_x = 0.f, a_y = 0.f, a_z = 0.f;

-    /* Loop over all particles in cj... */
-    for (int pjd = 0; pjd < gcount_j; pjd++) {
+    /* Make the compiler understand we are in happy vectorization land */
+    swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT);
+    swift_assume_size(gcount_padded_j, VEC_SIZE);

-      /* Skip inactive particles */
-      if (!gpart_is_active(&gparts_j[pjd], e)) continue;
+    /* Loop over every particle in the other cell. */
+    for (int pjd = 0; pjd < gcount_padded_j; pjd++) {

+      /* Get info about j */
      const float x_j = cj_cache->x[pjd];
      const float y_j = cj_cache->y[pjd];
      const float z_j = cj_cache->z[pjd];
+      const float mass_j = cj_cache->m[pjd];

-      /* Some powers of the softening length */
-      const float h_j = cj_cache->epsilon[pjd];
-      const float h2_j = h_j * h_j;
-      const float h_inv_j = 1.f / h_j;
-      const float h_inv3_j = h_inv_j * h_inv_j * h_inv_j;
-
-      /* Distance to the multipole in ci */
-      const float dx = x_j - CoM_i[0];
-      const float dy = y_j - CoM_i[1];
-      const float dz = z_j - CoM_i[2];
+      /* Compute the pairwise (square) distance. */
+      const float dx = x_i - x_j;
+      const float dy = y_i - y_j;
+      const float dz = z_i - z_j;
      const float r2 = dx * dx + dy * dy + dz * dz;

-      /* Can we use the multipole in cj ? */
-      if (gravity_multipole_accept(0., rmax_i, theta_crit2, r2)) {
+#ifdef SWIFT_DEBUG_CHECKS
+      if (r2 == 0.f) error("Interacting particles with 0 distance");

-        /* Interact! */
-        float f_x, f_y, f_z;
-        runner_iact_grav_pm(dx, dy, dz, r2, h_j, h_inv_j, multi_i, &f_x, &f_y,
-                            &f_z);
+      /* Check that particles have been drifted to the current time */
+      if (gparts_i[pid].ti_drift != e->ti_current)
+        error("gpi not drifted to current time");
+      if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current)
+        error("gpj not drifted to current time");
+#endif
+
+      /* Interact! */
+      float f_ij;
+      runner_iact_grav_pp_truncated(r2, h2_i, h_inv_i, h_inv3_i, mass_j,
+                                    rlr_inv, &f_ij);

-        /* Store it back */
-        cj_cache->a_x[pjd] = f_x;
-        cj_cache->a_y[pjd] = f_y;
-        cj_cache->a_z[pjd] = f_z;
+      /* Store it back */
+      a_x -= f_ij * dx;
+      a_y -= f_ij * dy;
+      a_z -= f_ij * dz;

 #ifdef SWIFT_DEBUG_CHECKS
-        /* Update the interaction counter */
-        gparts_j[pjd].num_interacted += ci->multipole->m_pole.num_gpart;
+      /* Update the interaction counter if it's not a padded gpart */
+      if (pjd < gcount_j) gparts_i[pid].num_interacted++;
 #endif
-        /* Done with this particle */
-        continue;
-      }
-
-      /* Ok, as of here we have no choice but directly interact everything */
+    }

-      /* Local accumulators for the acceleration */
-      float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+    /* Store everything back in cache */
+    ci_cache->a_x[pid] = a_x;
+    ci_cache->a_y[pid] = a_y;
+    ci_cache->a_z[pid] = a_z;
+  }
+}

-      /* Make the compiler understand we are in happy vectorization land */
-      swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT);
-      swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT);
-      swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT);
-      swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT);
-      swift_assume_size(gcount_padded_i, VEC_SIZE);
+static INLINE void runner_dopair_grav_pm(
+    const struct engine *restrict e, struct gravity_cache *ci_cache,
+    int gcount_i, int gcount_padded_i, struct gpart *restrict gparts_i,
+    const float CoM_j[3], const struct multipole *restrict multi_j,
+    struct cell *restrict cj) {
+
+  /* Make the compiler understand we are in happy vectorization land */
+  swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(ci_cache->epsilon, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(ci_cache->a_x, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(ci_cache->a_y, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(ci_cache->a_z, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(ci_cache->active, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(ci_cache->use_mpole, SWIFT_CACHE_ALIGNMENT);
+  swift_assume_size(gcount_padded_i, VEC_SIZE);

-      /* Loop over every particle in the other cell. */
-      for (int pid = 0; pid < gcount_padded_i; pid++) {
+  /* Loop over all particles in ci... */
+  for (int pid = 0; pid < gcount_padded_i; pid++) {

-        /* Get info about j */
-        const float x_i = ci_cache->x[pid];
-        const float y_i = ci_cache->y[pid];
-        const float z_i = ci_cache->z[pid];
-        const float mass_i = ci_cache->m[pid];
+    /* Skip inactive particles */
+    if (!ci_cache->active[pid]) continue;

-        /* Compute the pairwise (square) distance. */
-        const float dx = x_j - x_i;
-        const float dy = y_j - y_i;
-        const float dz = z_j - z_i;
-        const float r2 = dx * dx + dy * dy + dz * dz;
+    /* Skip particle that cannot use the multipole */
+    if (!ci_cache->use_mpole[pid]) continue;

 #ifdef SWIFT_DEBUG_CHECKS
-        if (r2 == 0.f) error("Interacting particles with 0 distance");
-
-        /* Check that particles have been drifted to the current time */
-        if (gparts_j[pjd].ti_drift != e->ti_current)
-          error("gpj not drifted to current time");
-        if (pid < gcount_i && gparts_i[pid].ti_drift != e->ti_current)
-          error("gpi not drifted to current time");
+    if (pid < gcount_i && !gpart_is_active(&gparts_i[pid], e))
+      error("Active particle went through the cache");
 #endif

-        /* Interact! */
-        float f_ji;
-        runner_iact_grav_pp_full(r2, h2_j, h_inv_j, h_inv3_j, mass_i, &f_ji);
+    const float x_i = ci_cache->x[pid];
+    const float y_i = ci_cache->y[pid];
+    const float z_i = ci_cache->z[pid];

-        /* Store it back */
-        a_x -= f_ji * dx;
-        a_y -= f_ji * dy;
-        a_z -= f_ji * dz;
+    /* Some powers of the softening length */
+    const float h_i = ci_cache->epsilon[pid];
+    const float h_inv_i = 1.f / h_i;
+
+    /* Distance to the Multipole */
+    const float dx = x_i - CoM_j[0];
+    const float dy = y_i - CoM_j[1];
+    const float dz = z_i - CoM_j[2];
+    const float r2 = dx * dx + dy * dy + dz * dz;
+
+    /* Interact! */
+    float f_x, f_y, f_z;
+    runner_iact_grav_pm(dx, dy, dz, r2, h_i, h_inv_i, multi_j, &f_x, &f_y,
+                        &f_z);
+
+    /* Store it back */
+    ci_cache->a_x[pid] = f_x;
+    ci_cache->a_y[pid] = f_y;
+    ci_cache->a_z[pid] = f_z;

 #ifdef SWIFT_DEBUG_CHECKS
-        /* Update the interaction counter if it's not a padded gpart */
-        if (pid < gcount_i) gparts_j[pjd].num_interacted++;
+    /* Update the interaction counter */
+    if (pid < gcount_i)
+      gparts_i[pid].num_interacted += cj->multipole->m_pole.num_gpart;
 #endif
-      }
-
-      /* Store everything back in cache */
-      cj_cache->a_x[pjd] = a_x;
-      cj_cache->a_y[pjd] = a_y;
-      cj_cache->a_z[pjd] = a_z;
-    }
  }
-
-  /* Write back to the particles */
-  if (ci_active) gravity_cache_write_back(ci_cache, gparts_i, gcount_i);
-  if (cj_active) gravity_cache_write_back(cj_cache, gparts_j, gcount_j);
 }

 /**
 * @brief Computes the interaction of all the particles in a cell with all the
- * particles of another cell using the truncated Newtonian potential
+ * particles of another cell (switching function between full and truncated).
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param cj The other #cell.
- * @param shift The distance vector (periodically wrapped) between the cell
- * centres.
 */
-void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci,
-                                     struct cell *cj, double shift[3]) {
+void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) {

-  /* Some constants */
-  const struct engine *const e = r->e;
+  const struct engine *e = r->e;
+
+  TIMER_TIC;
+
+  /* Anything to do here? */
+  const int ci_active = cell_is_active(ci, e);
+  const int cj_active = cell_is_active(cj, e);
+  if (!ci_active && !cj_active) return;
+
+  /* Check that we are not doing something stupid */
+  if (ci->split || cj->split) error("Running P-P on splitable cells");
+
+  /* Let's start by drifting things */
+  if (!cell_are_gpart_drifted(ci, e)) error("Un-drifted gparts");
+  if (!cell_are_gpart_drifted(cj, e)) error("Un-drifted gparts");
+
+  /* Recover some useful constants */
  const struct space *s = e->s;
+  const int periodic = s->periodic;
  const double cell_width = s->width[0];
-  const double a_smooth = e->gravity_properties->a_smooth;
+  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
  const float theta_crit2 = e->gravity_properties->theta_crit2;
+  const double a_smooth = e->gravity_properties->a_smooth;
+  const double r_cut_min = e->gravity_properties->r_cut_min;
  const double rlr = cell_width * a_smooth;
+  const double min_trunc = rlr * r_cut_min;
  const float rlr_inv = 1. / rlr;

  /* Caches to play with */
  struct gravity_cache *const ci_cache = &r->ci_gravity_cache;
  struct gravity_cache *const cj_cache = &r->cj_gravity_cache;

-  /* Cell properties */
+  /* Centre of the cell pais */
+  const double loc_mean[3] = {0.5 * (ci->loc[0] + cj->loc[0]),
+                              0.5 * (ci->loc[1] + cj->loc[1]),
+                              0.5 * (ci->loc[2] + cj->loc[2])};
+  // MATTHIEU deal with periodicity
+
+  /* Star by constructing particle caches */
+
+  /* Computed the padded counts */
  const int gcount_i = ci->gcount;
  const int gcount_j = cj->gcount;
-  struct gpart *restrict gparts_i = ci->gparts;
-  struct gpart *restrict gparts_j = cj->gparts;
-  const int ci_active = cell_is_active(ci, e);
-  const int cj_active = cell_is_active(cj, e);
-  const double loc_i[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
-  const double loc_j[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
-  const double loc_mean[3] = {0.5 * (loc_i[0] + loc_j[0]),
-                              0.5 * (loc_i[1] + loc_j[1]),
-                              0.5 * (loc_i[2] + loc_j[2])};
-
-  /* Anything to do here ?*/
-  if (!ci_active && !cj_active) return;
+  const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE;
+  const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE;

  /* Check that we fit in cache */
  if (gcount_i > ci_cache->count || gcount_j > cj_cache->count)
    error("Not enough space in the caches! gcount_i=%d gcount_j=%d", gcount_i,
          gcount_j);

-  /* Computed the padded counts */
-  const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE;
-  const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE;
-
-  /* Fill the caches */
-  gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i,
-                         loc_mean);
-  gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j,
-                         loc_mean);
-
  /* Recover the multipole info and shift the CoM locations */
  const float rmax_i = ci->multipole->r_max;
  const float rmax_j = cj->multipole->r_max;
+  const float rmax2_i = rmax_i * rmax_i;
+  const float rmax2_j = rmax_j * rmax_j;
  const struct multipole *multi_i = &ci->multipole->m_pole;
  const struct multipole *multi_j = &cj->multipole->m_pole;
  const float CoM_i[3] = {ci->multipole->CoM[0] - loc_mean[0],
@@ -502,264 +470,48 @@ void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci,
  const float CoM_j[3] = {cj->multipole->CoM[0] - loc_mean[0],
                          cj->multipole->CoM[1] - loc_mean[1],
                          cj->multipole->CoM[2] - loc_mean[2]};
+  // MATTHIEU deal with periodicity

-  /* Ok... Here we go ! */
-
-  if (ci_active) {
-
-    /* Loop over all particles in ci... */
-    for (int pid = 0; pid < gcount_i; pid++) {
-
-      /* Skip inactive particles */
-      if (!gpart_is_active(&gparts_i[pid], e)) continue;
-
-      const float x_i = ci_cache->x[pid];
-      const float y_i = ci_cache->y[pid];
-      const float z_i = ci_cache->z[pid];
-
-      /* Some powers of the softening length */
-      const float h_i = ci_cache->epsilon[pid];
-      const float h2_i = h_i * h_i;
-      const float h_inv_i = 1.f / h_i;
-      const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i;
-
-      /* Distance to the multipole in cj */
-      const float dx = x_i - CoM_j[0];
-      const float dy = y_i - CoM_j[1];
-      const float dz = z_i - CoM_j[2];
-      const float r2 = dx * dx + dy * dy + dz * dz;
-
-      /* Can we use the multipole in cj ? */
-      if (gravity_multipole_accept(0., rmax_j, theta_crit2, r2)) {
-
-        /* Interact! */
-        float f_x, f_y, f_z;
-        runner_iact_grav_pm(dx, dy, dz, r2, h_i, h_inv_i, multi_j, &f_x, &f_y,
-                            &f_z);
-
-        /* Store it back */
-        ci_cache->a_x[pid] = f_x;
-        ci_cache->a_y[pid] = f_y;
-        ci_cache->a_z[pid] = f_z;
-
-#ifdef SWIFT_DEBUG_CHECKS
-        /* Update the interaction counter */
-        gparts_i[pid].num_interacted += cj->multipole->m_pole.num_gpart;
-#endif
-        /* Done with this particle */
-        continue;
-      }
-
-      /* Ok, as of here we have no choice but directly interact everything */
-
-      /* Local accumulators for the acceleration */
-      float a_x = 0.f, a_y = 0.f, a_z = 0.f;
-
-      /* Make the compiler understand we are in happy vectorization land */
-      swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT);
-      swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT);
-      swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT);
-      swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT);
-      swift_assume_size(gcount_padded_j, VEC_SIZE);
-
-      /* Loop over every particle in the other cell. */
-      for (int pjd = 0; pjd < gcount_padded_j; pjd++) {
-
-        /* Get info about j */
-        const float x_j = cj_cache->x[pjd];
-        const float y_j = cj_cache->y[pjd];
-        const float z_j = cj_cache->z[pjd];
-        const float mass_j = cj_cache->m[pjd];
-
-        /* Compute the pairwise (square) distance. */
-        const float dx = x_i - x_j;
-        const float dy = y_i - y_j;
-        const float dz = z_i - z_j;
-        const float r2 = dx * dx + dy * dy + dz * dz;
-
-#ifdef SWIFT_DEBUG_CHECKS
-        if (r2 == 0.f) error("Interacting particles with 0 distance");
+  /* Fill the caches */
+  gravity_cache_populate(e->max_active_bin, ci_cache, ci->gparts, gcount_i,
+                         gcount_padded_i, loc_mean, CoM_j, rmax2_j,
+                         theta_crit2);
+  gravity_cache_populate(e->max_active_bin, cj_cache, cj->gparts, gcount_j,
+                         gcount_padded_j, loc_mean, CoM_i, rmax2_i,
+                         theta_crit2);

-        /* Check that particles have been drifted to the current time */
-        if (gparts_i[pid].ti_drift != e->ti_current)
-          error("gpi not drifted to current time");
-        if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current)
-          error("gpj not drifted to current time");
-#endif
+  /* Can we use the Newtonian version or do we need the truncated one ? */
+  if (!periodic) {

-        /* Interact! */
-        float f_ij;
-        runner_iact_grav_pp_truncated(r2, h2_i, h_inv_i, h_inv3_i, mass_j,
-                                      rlr_inv, &f_ij);
+    /* Not periodic -> Can always use Newtonian potential */

-        /* Store it back */
-        a_x -= f_ij * dx;
-        a_y -= f_ij * dy;
-        a_z -= f_ij * dz;
+    /* Let's updated the active cell(s) only */
+    if (ci_active) {

-#ifdef SWIFT_DEBUG_CHECKS
-        /* Update the interaction counter if it's not a padded gpart */
-        if (pjd < gcount_j) gparts_i[pid].num_interacted++;
-#endif
-      }
+      /* First the P2P */
+      runner_dopair_grav_pp_full(e, ci_cache, cj_cache, gcount_i, gcount_j,
+                                 gcount_padded_j, ci->gparts, cj->gparts);

-      /* Store everything back in cache */
-      ci_cache->a_x[pid] = a_x;
-      ci_cache->a_y[pid] = a_y;
-      ci_cache->a_z[pid] = a_z;
+      /* Then the M2P */
+      runner_dopair_grav_pm(e, ci_cache, gcount_i, gcount_padded_i, ci->gparts,
+                            CoM_j, multi_j, cj);
    }
-  }
-
-  /* Now do the opposite loop */
-  if (cj_active) {
-
-    /* Loop over all particles in cj... */
-    for (int pjd = 0; pjd < gcount_j; pjd++) {
-
-      /* Skip inactive particles */
-      if (!gpart_is_active(&gparts_j[pjd], e)) continue;
-
-      const float x_j = cj_cache->x[pjd];
-      const float y_j = cj_cache->y[pjd];
-      const float z_j = cj_cache->z[pjd];
-
-      /* Some powers of the softening length */
-      const float h_j = cj_cache->epsilon[pjd];
-      const float h2_j = h_j * h_j;
-      const float h_inv_j = 1.f / h_j;
-      const float h_inv3_j = h_inv_j * h_inv_j * h_inv_j;
-
-      /* Distance to the multipole in ci */
-      const float dx = x_j - CoM_i[0];
-      const float dy = y_j - CoM_i[1];
-      const float dz = z_j - CoM_i[2];
-      const float r2 = dx * dx + dy * dy + dz * dz;
-
-      /* Can we use the multipole in cj ? */
-      if (gravity_multipole_accept(0., rmax_i, theta_crit2, r2)) {
-
-        /* Interact! */
-        float f_x, f_y, f_z;
-        runner_iact_grav_pm(dx, dy, dz, r2, h_j, h_inv_j, multi_i, &f_x, &f_y,
-                            &f_z);
-
-        /* Store it back */
-        cj_cache->a_x[pjd] = f_x;
-        cj_cache->a_y[pjd] = f_y;
-        cj_cache->a_z[pjd] = f_z;
-
-#ifdef SWIFT_DEBUG_CHECKS
-        /* Update the interaction counter */
-        gparts_j[pjd].num_interacted += ci->multipole->m_pole.num_gpart;
-#endif
-        /* Done with this particle */
-        continue;
-      }
-
-      /* Ok, as of here we have no choice but directly interact everything */
-
-      /* Local accumulators for the acceleration */
-      float a_x = 0.f, a_y = 0.f, a_z = 0.f;
-
-      /* Make the compiler understand we are in happy vectorization land */
-      swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT);
-      swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT);
-      swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT);
-      swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT);
-      swift_assume_size(gcount_padded_i, VEC_SIZE);
-
-      /* Loop over every particle in the other cell. */
-      for (int pid = 0; pid < gcount_padded_i; pid++) {
-
-        /* Get info about j */
-        const float x_i = ci_cache->x[pid];
-        const float y_i = ci_cache->y[pid];
-        const float z_i = ci_cache->z[pid];
-        const float mass_i = ci_cache->m[pid];
-
-        /* Compute the pairwise (square) distance. */
-        const float dx = x_j - x_i;
-        const float dy = y_j - y_i;
-        const float dz = z_j - z_i;
-        const float r2 = dx * dx + dy * dy + dz * dz;
-
-#ifdef SWIFT_DEBUG_CHECKS
-        if (r2 == 0.f) error("Interacting particles with 0 distance");
-
-        /* Check that particles have been drifted to the current time */
-        if (gparts_j[pjd].ti_drift != e->ti_current)
-          error("gpj not drifted to current time");
-        if (pid < gcount_i && gparts_i[pid].ti_drift != e->ti_current)
-          error("gpi not drifted to current time");
-#endif
-
-        /* Interact! */
-        float f_ji;
-        runner_iact_grav_pp_truncated(r2, h2_j, h_inv_j, h_inv3_j, mass_i,
-                                      rlr_inv, &f_ji);
-
-        /* Store it back */
-        a_x -= f_ji * dx;
-        a_y -= f_ji * dy;
-        a_z -= f_ji * dz;
-
-#ifdef SWIFT_DEBUG_CHECKS
-        /* Update the interaction counter if it's not a padded gpart */
-        if (pid < gcount_i) gparts_j[pjd].num_interacted++;
-#endif
-      }
-
-      /* Store everything back in cache */
-      cj_cache->a_x[pjd] = a_x;
-      cj_cache->a_y[pjd] = a_y;
-      cj_cache->a_z[pjd] = a_z;
+    if (cj_active) {
+
+      /* First the P2P */
+      runner_dopair_grav_pp_full(e, cj_cache, ci_cache, gcount_j, gcount_i,
+                                 gcount_padded_i, cj->gparts, ci->gparts);
+      /* Then the M2P */
+      runner_dopair_grav_pm(e, cj_cache, gcount_j, gcount_padded_j, cj->gparts,
+                            CoM_i, multi_i, ci);
    }
-  }
-
-  /* Write back to the particles */
-  if (ci_active) gravity_cache_write_back(ci_cache, gparts_i, gcount_i);
-  if (cj_active) gravity_cache_write_back(cj_cache, gparts_j, gcount_j);
-}
-
-/**
- * @brief Computes the interaction of all the particles in a cell with all the
- * particles of another cell (switching function between full and truncated).
- *
- * @param r The #runner.
- * @param ci The first #cell.
- * @param cj The other #cell.
- */
-void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) {
-
-  /* Some properties of the space */
-  const struct engine *e = r->e;
-  const struct space *s = e->s;
-  const int periodic = s->periodic;
-  const double cell_width = s->width[0];
-  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
-  const double a_smooth = e->gravity_properties->a_smooth;
-  const double r_cut_min = e->gravity_properties->r_cut_min;
-  const double min_trunc = cell_width * r_cut_min * a_smooth;
-  double shift[3] = {0.0, 0.0, 0.0};

-  TIMER_TIC;
+  } else { /* Periodic BC */

-  /* Anything to do here? */
-  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
-
-  /* Check that we are not doing something stupid */
-  if (ci->split || cj->split) error("Running P-P on splitable cells");
-
-  /* Let's start by drifting things */
-  if (!cell_are_gpart_drifted(ci, e)) error("Un-drifted gparts");
-  if (!cell_are_gpart_drifted(cj, e)) error("Un-drifted gparts");
-
-  /* Can we use the Newtonian version or do we need the truncated one ? */
-  if (!periodic) {
-    runner_dopair_grav_pp_full(r, ci, cj, shift);
-  } else {
+    // MATTHIEU deal with periodicity

    /* Get the relative distance between the pairs, wrapping. */
+    double shift[3] = {0.0, 0.0, 0.0};
    shift[0] = nearest(cj->loc[0] - ci->loc[0], dim[0]);
    shift[1] = nearest(cj->loc[1] - ci->loc[1], dim[1]);
    shift[2] = nearest(cj->loc[2] - ci->loc[2], dim[2]);
@@ -767,15 +519,40 @@ void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) {
        shift[0] * shift[0] + shift[1] * shift[1] + shift[2] * shift[2];

    /* Get the maximal distance between any two particles */
-    const double max_r = sqrt(r2) + ci->multipole->r_max + cj->multipole->r_max;
+    const double max_r = sqrt(r2) + rmax_i + rmax_j;

    /* Do we need to use the truncated interactions ? */
-    if (max_r > min_trunc)
-      runner_dopair_grav_pp_truncated(r, ci, cj, shift);
-    else
-      runner_dopair_grav_pp_full(r, ci, cj, shift);
+    if (max_r > min_trunc) {
+
+      /* Periodic but far-away cells must use the truncated potential */
+
+      /* Let's updated the active cell(s) only */
+      if (ci_active)
+        runner_dopair_grav_pp_truncated(e, rlr_inv, ci_cache, cj_cache,
+                                        gcount_i, gcount_j, gcount_padded_j,
+                                        ci->gparts, cj->gparts);
+      if (cj_active)
+        runner_dopair_grav_pp_truncated(e, rlr_inv, cj_cache, ci_cache,
+                                        gcount_j, gcount_i, gcount_padded_i,
+                                        cj->gparts, ci->gparts);
+    } else {
+
+      /* Periodic but close-by cells can use the full Newtonian potential */
+
+      /* Let's updated the active cell(s) only */
+      if (ci_active)
+        runner_dopair_grav_pp_full(e, ci_cache, cj_cache, gcount_i, gcount_j,
+                                   gcount_padded_j, ci->gparts, cj->gparts);
+      if (cj_active)
+        runner_dopair_grav_pp_full(e, cj_cache, ci_cache, gcount_j, gcount_i,
+                                   gcount_padded_i, cj->gparts, ci->gparts);
+    }
  }

+  /* Write back to the particles */
+  if (ci_active) gravity_cache_write_back(ci_cache, ci->gparts, gcount_i);
+  if (cj_active) gravity_cache_write_back(cj_cache, cj->gparts, gcount_j);
+
  TIMER_TOC(timer_dopair_grav_pp);
 }

@@ -812,7 +589,8 @@ void runner_doself_grav_pp_full(struct runner *r, struct cell *c) {
  /* Computed the padded counts */
  const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE;

-  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc);
+  gravity_cache_populate_no_mpole(e->max_active_bin, ci_cache, gparts, gcount,
+                                  gcount_padded, loc);

  /* Ok... Here we go ! */

@@ -820,7 +598,7 @@ void runner_doself_grav_pp_full(struct runner *r, struct cell *c) {
  for (int pid = 0; pid < gcount; pid++) {

    /* Skip inactive particles */
-    if (!gpart_is_active(&gparts[pid], e)) continue;
+    if (!ci_cache->active[pid]) continue;

    const float x_i = ci_cache->x[pid];
    const float y_i = ci_cache->y[pid];
@@ -935,7 +713,8 @@ void runner_doself_grav_pp_truncated(struct runner *r, struct cell *c) {
  /* Computed the padded counts */
  const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE;

-  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc);
+  gravity_cache_populate_no_mpole(e->max_active_bin, ci_cache, gparts, gcount,
+                                  gcount_padded, loc);

  /* Ok... Here we go ! */

@@ -943,7 +722,7 @@ void runner_doself_grav_pp_truncated(struct runner *r, struct cell *c) {
  for (int pid = 0; pid < gcount; pid++) {

    /* Skip inactive particles */
-    if (!gpart_is_active(&gparts[pid], e)) continue;
+    if (!ci_cache->active[pid]) continue;

    const float x_i = ci_cache->x[pid];
    const float y_i = ci_cache->y[pid];