diff --git a/examples/test.c b/examples/test.c
index e798e69cd8c1cabff9cb531e23819f11e5d5277f..188b20c4118bf6aec607566ff9859803892b9d9a 100644
--- a/examples/test.c
+++ b/examples/test.c
@@ -795,7 +795,7 @@ int main ( int argc , char *argv[] ) {
     /* Initialize the engine with this space. */
     tic = getticks();
     message( "nr_nodes is %i." , nr_nodes );
-    engine_init( &e , &s , dt_max , nr_threads , nr_queues , nr_nodes , myrank , ENGINE_POLICY | engine_policy_steal );
+    engine_init( &e , &s , dt_max , nr_threads , nr_queues , nr_nodes , myrank , ENGINE_POLICY | engine_policy_steal | engine_policy_paranoid );
     if ( myrank == 0 )
         message( "engine_init took %.3f ms." , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); fflush(stdout);
 
@@ -849,12 +849,12 @@ int main ( int argc , char *argv[] ) {
     
         /* Repartition the space amongst the nodes? */
         #if defined(WITH_MPI) && defined(HAVE_METIS)
-            if ( j == 2 )
+            if ( j % 100 == 2 )
                 e.forcerepart = 1;
         #endif
         
         /* Force a rebuild for testing. */
-        /* if ( j % 4 == 1 )
+        /* if ( j % 4 == 3 )
             e.forcerebuild = 1; */
         
         // message( "starting run %i/%i (t=%.3e) with %i threads and %i queues..." , j+1 , runs , e.time , e.nr_threads , e.nr_queues ); fflush(stdout);
diff --git a/src/atomic.h b/src/atomic.h
index df6e5aaeed4db12653530d6a5dec8ee3042f02f7..16b268c4c799cd1ca8c38a3382df912a9d618614 100644
--- a/src/atomic.h
+++ b/src/atomic.h
@@ -1,26 +1,30 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_ATOMIC_H
+#define SWIFT_ATOMIC_H
 
-
+/* Includes. */
 #include "inline.h"
-    
-#define atomic_add(v,i) __sync_fetch_and_add( v , i )
-#define atomic_inc(v) atomic_add( v , 1 )
-#define atomic_dec(v) atomic_add( v , -1 )
-#define atomic_cas(v,o,n) __sync_val_compare_and_swap( v , o , n )
+
+#define atomic_add(v, i) __sync_fetch_and_add(v, i)
+#define atomic_inc(v) atomic_add(v, 1)
+#define atomic_dec(v) atomic_add(v, -1)
+#define atomic_cas(v, o, n) __sync_val_compare_and_swap(v, o, n)
+
+#endif /* SWIFT_ATOMIC_H */
diff --git a/src/cell.c b/src/cell.c
index 13e1055649dd8f5b06fdad112102e25821b44850..87b51ac82cade8a7e4302e52b7a4e55e5d612aa2 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -1,85 +1,76 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
 
 /* Some standard headers. */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <pthread.h>
 #include <float.h>
 #include <limits.h>
 #include <math.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
-    #include <mpi.h>
+#include <mpi.h>
 #endif
 
 /* Switch off timers. */
 #ifdef TIMER
-    #undef TIMER
+#undef TIMER
 #endif
 
+/* This object's header. */
+#include "cell.h"
+
 /* Local headers. */
-#include "const.h"
 #include "atomic.h"
-#include "cycle.h"
-#include "lock.h"
-#include "task.h"
-#include "timers.h"
-#include "part.h"
-#include "space.h"
-#include "multipole.h"
-#include "cell.h"
 #include "error.h"
-#include "inline.h"
+#include "space.h"
+#include "timers.h"
 
 /* Global variables. */
 int cell_next_tag = 0;
 
-
 /**
  * @brief Get the size of the cell subtree.
  *
  * @param c The #cell.
  */
- 
-int cell_getsize ( struct cell *c ) {
-
-    int k, count = 1;
-    
-    /* Sum up the progeny if split. */
-    if ( c->split )
-        for ( k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL )
-                count += cell_getsize( c->progeny[k] );
-                
-    /* Return the final count. */
-    return count;
 
-    }
+int cell_getsize(struct cell *c) {
+
+  int k, count = 1;
 
+  /* Sum up the progeny if split. */
+  if (c->split)
+    for (k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) count += cell_getsize(c->progeny[k]);
 
-/** 
+  /* Return the final count. */
+  return count;
+}
+
+/**
  * @brief Unpack the data of a given cell and its sub-cells.
  *
  * @param pc An array of packed #pcell.
@@ -88,52 +79,47 @@ int cell_getsize ( struct cell *c ) {
  *
  * @return The number of cells created.
  */
- 
-int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s ) {
-
-    int k, count = 1;
-    struct cell *temp;
-    
-    /* Unpack the current pcell. */
-    c->h_max = pc->h_max;
-    c->dt_min = FLT_MAX; // pc->dt_min;
-    c->dt_max = FLT_MAX; // pc->dt_max;
-    c->count = pc->count;
-    c->tag = pc->tag;
-    
-    /* Fill the progeny recursively, depth-first. */
-    for ( k = 0 ; k < 8 ; k++ )
-        if ( pc->progeny[k] >= 0 ) {
-            temp = space_getcell( s );
-            temp->count = 0;
-            temp->loc[0] = c->loc[0];
-            temp->loc[1] = c->loc[1];
-            temp->loc[2] = c->loc[2];
-            temp->h[0] = c->h[0]/2;
-            temp->h[1] = c->h[1]/2;
-            temp->h[2] = c->h[2]/2;
-            temp->dmin = c->dmin/2;
-            if ( k & 4 )
-                temp->loc[0] += temp->h[0];
-            if ( k & 2 )
-                temp->loc[1] += temp->h[1];
-            if ( k & 1 )
-                temp->loc[2] += temp->h[2];
-            temp->depth = c->depth + 1;
-            temp->split = 0;
-            temp->dx_max = 0.0;
-            temp->nodeID = c->nodeID;
-            temp->parent = c;
-            c->progeny[k] = temp;
-            c->split = 1;
-            count += cell_unpack( &pc[ pc->progeny[k] ] , temp , s );
-            }
-            
-    /* Return the total number of unpacked cells. */
-    return count;
 
+int cell_unpack(struct pcell *pc, struct cell *c, struct space *s) {
+
+  int k, count = 1;
+  struct cell *temp;
+
+  /* Unpack the current pcell. */
+  c->h_max = pc->h_max;
+  c->dt_min = FLT_MAX;  // pc->dt_min;
+  c->dt_max = FLT_MAX;  // pc->dt_max;
+  c->count = pc->count;
+  c->tag = pc->tag;
+
+  /* Fill the progeny recursively, depth-first. */
+  for (k = 0; k < 8; k++)
+    if (pc->progeny[k] >= 0) {
+      temp = space_getcell(s);
+      temp->count = 0;
+      temp->loc[0] = c->loc[0];
+      temp->loc[1] = c->loc[1];
+      temp->loc[2] = c->loc[2];
+      temp->h[0] = c->h[0] / 2;
+      temp->h[1] = c->h[1] / 2;
+      temp->h[2] = c->h[2] / 2;
+      temp->dmin = c->dmin / 2;
+      if (k & 4) temp->loc[0] += temp->h[0];
+      if (k & 2) temp->loc[1] += temp->h[1];
+      if (k & 1) temp->loc[2] += temp->h[2];
+      temp->depth = c->depth + 1;
+      temp->split = 0;
+      temp->dx_max = 0.0;
+      temp->nodeID = c->nodeID;
+      temp->parent = c;
+      c->progeny[k] = temp;
+      c->split = 1;
+      count += cell_unpack(&pc[pc->progeny[k]], temp, s);
     }
 
+  /* Return the total number of unpacked cells. */
+  return count;
+}
 
 /**
  * @brief Link the cells recursively to the given part array.
@@ -144,23 +130,20 @@ int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s ) {
  * @return The number of particles linked.
  */
 
-int cell_link ( struct cell *c , struct part *parts ) {
-
-    int k, ind = 0;
-    
-    c->parts = parts;
-    
-    /* Fill the progeny recursively, depth-first. */
-    if ( c->split )
-        for ( k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL )
-                ind += cell_link( c->progeny[k] , &parts[ind] );
-            
-    /* Return the total number of unpacked cells. */
-    return c->count;
+int cell_link(struct cell *c, struct part *parts) {
 
-    }
+  int k, ind = 0;
+
+  c->parts = parts;
 
+  /* Fill the progeny recursively, depth-first. */
+  if (c->split)
+    for (k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) ind += cell_link(c->progeny[k], &parts[ind]);
+
+  /* Return the number of particles linked. */
+  return c->count;
+}
 
 /**
  * @brief Pack the data of the given cell and all it's sub-cells.
@@ -171,402 +154,394 @@ int cell_link ( struct cell *c , struct part *parts ) {
  *
  * @return The number of packed cells.
  */
- 
-int cell_pack ( struct cell *c , struct pcell *pc ) {
-
-    int k, count = 1;
-    
-    /* Start by packing the data of the current cell. */
-    pc->h_max = c->h_max;
-    pc->dt_min = c->dt_min;
-    pc->dt_max = c->dt_max;
-    pc->count = c->count;
-    c->tag = pc->tag = cell_next_tag++;
-    
-    /* Fill in the progeny, depth-first recursion. */
-    for ( k = 0 ; k < 8 ; k++ )
-        if ( c->progeny[k] != NULL ) {
-            pc->progeny[k] = count;
-            count += cell_pack( c->progeny[k] , &pc[count] );
-            }
-        else
-            pc->progeny[k] = -1;
-            
-    /* Return the number of packed cells used. */
-    return count;
 
-    }
+int cell_pack(struct cell *c, struct pcell *pc) {
+
+  int k, count = 1;
+
+  /* Start by packing the data of the current cell. */
+  pc->h_max = c->h_max;
+  pc->dt_min = c->dt_min;
+  pc->dt_max = c->dt_max;
+  pc->count = c->count;
+  c->tag = pc->tag = atomic_inc(&cell_next_tag) % cell_max_tag;
 
+  /* Fill in the progeny, depth-first recursion. */
+  for (k = 0; k < 8; k++)
+    if (c->progeny[k] != NULL) {
+      pc->progeny[k] = count;
+      count += cell_pack(c->progeny[k], &pc[count]);
+    } else
+      pc->progeny[k] = -1;
+
+  /* Return the number of packed cells used. */
+  return count;
+}
 
 /**
  * @brief Lock a cell and hold its parents.
  *
  * @param c The #cell.
  */
- 
-int cell_locktree( struct cell *c ) {
-
-    struct cell *finger, *finger2;
-    TIMER_TIC
-
-    /* First of all, try to lock this cell. */
-    if ( c->hold || lock_trylock( &c->lock ) != 0 ) {
-        TIMER_TOC(timer_locktree);
-        return 1;
-        }
-        
-    /* Did somebody hold this cell in the meantime? */
-    if ( c->hold ) {
-        
-        /* Unlock this cell. */
-        if ( lock_unlock( &c->lock ) != 0 )
-            error( "Failed to unlock cell." );
-            
-        /* Admit defeat. */
-        TIMER_TOC(timer_locktree);
-        return 1;
-    
-        }
-        
-    /* Climb up the tree and lock/hold/unlock. */
-    for ( finger = c->parent ; finger != NULL ; finger = finger->parent ) {
-    
-        /* Lock this cell. */
-        if ( lock_trylock( &finger->lock ) != 0 )
-            break;
-            
-        /* Increment the hold. */
-        atomic_inc( &finger->hold );
-        
-        /* Unlock the cell. */
-        if ( lock_unlock( &finger->lock ) != 0 )
-            error( "Failed to unlock cell." );
-    
-        }
-        
-    /* If we reached the top of the tree, we're done. */
-    if ( finger == NULL ) {
-        TIMER_TOC(timer_locktree);
-        return 0;
-        }
-        
-    /* Otherwise, we hit a snag. */
-    else {
-    
-        /* Undo the holds up to finger. */
-        for ( finger2 = c->parent ; finger2 != finger ; finger2 = finger2->parent )
-            __sync_fetch_and_sub( &finger2->hold , 1 );
-            
-        /* Unlock this cell. */
-        if ( lock_unlock( &c->lock ) != 0 )
-            error( "Failed to unlock cell." );
-            
-        /* Admit defeat. */
-        TIMER_TOC(timer_locktree);
-        return 1;
-    
-        }
 
-    }
-    
-    
-int cell_glocktree( struct cell *c ) {
-
-    struct cell *finger, *finger2;
-    TIMER_TIC
-
-    /* First of all, try to lock this cell. */
-    if ( c->ghold || lock_trylock( &c->glock ) != 0 ) {
-        TIMER_TOC(timer_locktree);
-        return 1;
-        }
-        
-    /* Did somebody hold this cell in the meantime? */
-    if ( c->ghold ) {
-        
-        /* Unlock this cell. */
-        if ( lock_unlock( &c->glock ) != 0 )
-            error( "Failed to unlock cell." );
-            
-        /* Admit defeat. */
-        TIMER_TOC(timer_locktree);
-        return 1;
-    
-        }
-        
-    /* Climb up the tree and lock/hold/unlock. */
-    for ( finger = c->parent ; finger != NULL ; finger = finger->parent ) {
-    
-        /* Lock this cell. */
-        if ( lock_trylock( &finger->glock ) != 0 )
-            break;
-            
-        /* Increment the hold. */
-        __sync_fetch_and_add( &finger->ghold , 1 );
-        
-        /* Unlock the cell. */
-        if ( lock_unlock( &finger->glock ) != 0 )
-            error( "Failed to unlock cell." );
-    
-        }
-        
-    /* If we reached the top of the tree, we're done. */
-    if ( finger == NULL ) {
-        TIMER_TOC(timer_locktree);
-        return 0;
-        }
-        
-    /* Otherwise, we hit a snag. */
-    else {
-    
-        /* Undo the holds up to finger. */
-        for ( finger2 = c->parent ; finger2 != finger ; finger2 = finger2->parent )
-            __sync_fetch_and_sub( &finger2->ghold , 1 );
-            
-        /* Unlock this cell. */
-        if ( lock_unlock( &c->glock ) != 0 )
-            error( "Failed to unlock cell." );
-            
-        /* Admit defeat. */
-        TIMER_TOC(timer_locktree);
-        return 1;
-    
-        }
+int cell_locktree(struct cell *c) {
+
+  struct cell *finger, *finger2;
+  TIMER_TIC
+
+  /* First of all, try to lock this cell. */
+  if (c->hold || lock_trylock(&c->lock) != 0) {
+    TIMER_TOC(timer_locktree);
+    return 1;
+  }
+
+  /* Did somebody hold this cell in the meantime? */
+  if (c->hold) {
+
+    /* Unlock this cell. */
+    if (lock_unlock(&c->lock) != 0) error("Failed to unlock cell.");
+
+    /* Admit defeat. */
+    TIMER_TOC(timer_locktree);
+    return 1;
+  }
+
+  /* Climb up the tree and lock/hold/unlock. */
+  for (finger = c->parent; finger != NULL; finger = finger->parent) {
+
+    /* Lock this cell. */
+    if (lock_trylock(&finger->lock) != 0) break;
+
+    /* Increment the hold. */
+    atomic_inc(&finger->hold);
+
+    /* Unlock the cell. */
+    if (lock_unlock(&finger->lock) != 0) error("Failed to unlock cell.");
+  }
+
+  /* If we reached the top of the tree, we're done. */
+  if (finger == NULL) {
+    TIMER_TOC(timer_locktree);
+    return 0;
+  }
+
+  /* Otherwise, we hit a snag. */
+  else {
+
+    /* Undo the holds up to finger. */
+    for (finger2 = c->parent; finger2 != finger; finger2 = finger2->parent)
+      __sync_fetch_and_sub(&finger2->hold, 1);
+
+    /* Unlock this cell. */
+    if (lock_unlock(&c->lock) != 0) error("Failed to unlock cell.");
+
+    /* Admit defeat. */
+    TIMER_TOC(timer_locktree);
+    return 1;
+  }
+}
+
+int cell_glocktree(struct cell *c) {
+
+  struct cell *finger, *finger2;
+  TIMER_TIC
+
+  /* First of all, try to lock this cell. */
+  if (c->ghold || lock_trylock(&c->glock) != 0) {
+    TIMER_TOC(timer_locktree);
+    return 1;
+  }
+
+  /* Did somebody hold this cell in the meantime? */
+  if (c->ghold) {
+
+    /* Unlock this cell. */
+    if (lock_unlock(&c->glock) != 0) error("Failed to unlock cell.");
+
+    /* Admit defeat. */
+    TIMER_TOC(timer_locktree);
+    return 1;
+  }
+
+  /* Climb up the tree and lock/hold/unlock. */
+  for (finger = c->parent; finger != NULL; finger = finger->parent) {
+
+    /* Lock this cell. */
+    if (lock_trylock(&finger->glock) != 0) break;
+
+    /* Increment the hold. */
+    __sync_fetch_and_add(&finger->ghold, 1);
+
+    /* Unlock the cell. */
+    if (lock_unlock(&finger->glock) != 0) error("Failed to unlock cell.");
+  }
+
+  /* If we reached the top of the tree, we're done. */
+  if (finger == NULL) {
+    TIMER_TOC(timer_locktree);
+    return 0;
+  }
+
+  /* Otherwise, we hit a snag. */
+  else {
+
+    /* Undo the holds up to finger. */
+    for (finger2 = c->parent; finger2 != finger; finger2 = finger2->parent)
+      __sync_fetch_and_sub(&finger2->ghold, 1);
+
+    /* Unlock this cell. */
+    if (lock_unlock(&c->glock) != 0) error("Failed to unlock cell.");
+
+    /* Admit defeat. */
+    TIMER_TOC(timer_locktree);
+    return 1;
+  }
+}
 
-    }
-    
-    
 /**
  * @brief Unock a cell's parents.
  *
  * @param c The #cell.
  */
- 
-void cell_unlocktree( struct cell *c ) {
-
-    struct cell *finger;
-    TIMER_TIC
-
-    /* First of all, try to unlock this cell. */
-    if ( lock_unlock( &c->lock ) != 0 )
-        error( "Failed to unlock cell." );
-        
-    /* Climb up the tree and unhold the parents. */
-    for ( finger = c->parent ; finger != NULL ; finger = finger->parent )
-        __sync_fetch_and_sub( &finger->hold , 1 );
-        
-    TIMER_TOC(timer_locktree);
-        
-    }
-    
-    
-void cell_gunlocktree( struct cell *c ) {
-
-    struct cell *finger;
-    TIMER_TIC
-
-    /* First of all, try to unlock this cell. */
-    if ( lock_unlock( &c->glock ) != 0 )
-        error( "Failed to unlock cell." );
-        
-    /* Climb up the tree and unhold the parents. */
-    for ( finger = c->parent ; finger != NULL ; finger = finger->parent )
-        __sync_fetch_and_sub( &finger->ghold , 1 );
-        
-    TIMER_TOC(timer_locktree);
-        
-    }
-    
-    
+
+void cell_unlocktree(struct cell *c) {
+
+  struct cell *finger;
+  TIMER_TIC
+
+  /* First of all, try to unlock this cell. */
+  if (lock_unlock(&c->lock) != 0) error("Failed to unlock cell.");
+
+  /* Climb up the tree and unhold the parents. */
+  for (finger = c->parent; finger != NULL; finger = finger->parent)
+    __sync_fetch_and_sub(&finger->hold, 1);
+
+  TIMER_TOC(timer_locktree);
+}
+
+void cell_gunlocktree(struct cell *c) {
+
+  struct cell *finger;
+  TIMER_TIC
+
+  /* First of all, try to unlock this cell. */
+  if (lock_unlock(&c->glock) != 0) error("Failed to unlock cell.");
+
+  /* Climb up the tree and unhold the parents. */
+  for (finger = c->parent; finger != NULL; finger = finger->parent)
+    __sync_fetch_and_sub(&finger->ghold, 1);
+
+  TIMER_TOC(timer_locktree);
+}
+
 /**
  * @brief Sort the parts into eight bins along the given pivots.
  *
  * @param c The #cell array to be sorted.
  */
- 
-void cell_split ( struct cell *c  ) {
-
-    int i, j, k, count = c->count, gcount = c->gcount;
-    struct part temp, *parts = c->parts;
-    struct xpart xtemp, *xparts = c->xparts;
-    struct gpart gtemp, *gparts = c->gparts;
-    int left[8], right[8];
-    double pivot[3];
-    
-    /* Init the pivots. */
-    for ( k = 0 ; k < 3 ; k++ )
-        pivot[k] = c->loc[k] + c->h[k]/2;
-    
-    /* Split along the x-axis. */
-    i = 0; j = count - 1;
-    while ( i <= j ) {
-        while ( i <= count-1 && parts[i].x[0] <= pivot[0] )
-            i += 1;
-        while ( j >= 0 && parts[j].x[0] > pivot[0] )
-            j -= 1;
-        if ( i < j ) {
-            temp = parts[i]; parts[i] = parts[j]; parts[j] = temp;
-            xtemp = xparts[i]; xparts[i] = xparts[j]; xparts[j] = xtemp;
-            }
-        }
-    /* for ( k = 0 ; k <= j ; k++ )
-        if ( parts[k].x[0] > pivot[0] )
-            error( "cell_split: sorting failed." );
-    for ( k = i ; k < count ; k++ )
-        if ( parts[k].x[0] < pivot[0] )
-            error( "cell_split: sorting failed." ); */
-    left[1] = i; right[1] = count - 1;
-    left[0] = 0; right[0] = j;
-    
-    /* Split along the y axis, twice. */
-    for ( k = 1 ; k >= 0 ; k-- ) {
-        i = left[k]; j = right[k];
-        while ( i <= j ) {
-            while ( i <= right[k] && parts[i].x[1] <= pivot[1] )
-                i += 1;
-            while ( j >= left[k] && parts[j].x[1] > pivot[1] )
-                j -= 1;
-            if ( i < j ) {
-                temp = parts[i]; parts[i] = parts[j]; parts[j] = temp;
-                xtemp = xparts[i]; xparts[i] = xparts[j]; xparts[j] = xtemp;
-                }
-            }
-        /* for ( int kk = left[k] ; kk <= j ; kk++ )
-            if ( parts[kk].x[1] > pivot[1] ) {
-                message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
-                error( "sorting failed (left)." );
-                }
-        for ( int kk = i ; kk <= right[k] ; kk++ )
-            if ( parts[kk].x[1] < pivot[1] )
-                error( "sorting failed (right)." ); */
-        left[2*k+1] = i; right[2*k+1] = right[k];
-        left[2*k] = left[k]; right[2*k] = j;
-        }
-
-    /* Split along the z axis, four times. */
-    for ( k = 3 ; k >= 0 ; k-- ) {
-        i = left[k]; j = right[k];
-        while ( i <= j ) {
-            while ( i <= right[k] && parts[i].x[2] <= pivot[2] )
-                i += 1;
-            while ( j >= left[k] && parts[j].x[2] > pivot[2] )
-                j -= 1;
-            if ( i < j ) {
-                temp = parts[i]; parts[i] = parts[j]; parts[j] = temp;
-                xtemp = xparts[i]; xparts[i] = xparts[j]; xparts[j] = xtemp;
-                }
-            }
-        /* for ( int kk = left[k] ; kk <= j ; kk++ )
-            if ( parts[kk].x[2] > pivot[2] ) {
-                message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
-                error( "sorting failed (left)." );
-                }
-        for ( int kk = i ; kk <= right[k] ; kk++ )
-            if ( parts[kk].x[2] < pivot[2] ) {
-                message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
-                error( "sorting failed (right)." );
-                } */
-        left[2*k+1] = i; right[2*k+1] = right[k];
-        left[2*k] = left[k]; right[2*k] = j;
-        }
-        
-    /* Store the counts and offsets. */
-    for ( k = 0 ; k < 8 ; k++ ) {
-        c->progeny[k]->count = right[k] - left[k] + 1;
-        c->progeny[k]->parts = &c->parts[ left[k] ];
-        c->progeny[k]->xparts = &c->xparts[ left[k] ];
-        }
-        
-    /* Re-link the gparts. */
-    for ( k = 0 ; k < count ; k++ )
-        if ( parts[k].gpart != NULL )
-            parts[k].gpart->part = &parts[k];
-        
-    /* Verify that _all_ the parts have been assigned to a cell. */
-    /* for ( k = 1 ; k < 8 ; k++ )
-        if ( &c->progeny[k-1]->parts[ c->progeny[k-1]->count ] != c->progeny[k]->parts )
-            error( "Particle sorting failed (internal consistency)." );
-    if ( c->progeny[0]->parts != c->parts )
-        error( "Particle sorting failed (left edge)." );
-    if ( &c->progeny[7]->parts[ c->progeny[7]->count ] != &c->parts[ count ] )
-        error( "Particle sorting failed (right edge)." ); */
-        
-    /* Verify a few sub-cells. */
-    /* for ( k = 0 ; k < c->progeny[0]->count ; k++ )
-        if ( c->progeny[0]->parts[k].x[0] > pivot[0] ||
-             c->progeny[0]->parts[k].x[1] > pivot[1] ||
-             c->progeny[0]->parts[k].x[2] > pivot[2] )
-            error( "Sorting failed (progeny=0)." );
-    for ( k = 0 ; k < c->progeny[1]->count ; k++ )
-        if ( c->progeny[1]->parts[k].x[0] > pivot[0] ||
-             c->progeny[1]->parts[k].x[1] > pivot[1] ||
-             c->progeny[1]->parts[k].x[2] <= pivot[2] )
-            error( "Sorting failed (progeny=1)." );
-    for ( k = 0 ; k < c->progeny[2]->count ; k++ )
-        if ( c->progeny[2]->parts[k].x[0] > pivot[0] ||
-             c->progeny[2]->parts[k].x[1] <= pivot[1] ||
-             c->progeny[2]->parts[k].x[2] > pivot[2] )
-            error( "Sorting failed (progeny=2)." ); */
-
-    /* Now do the same song and dance for the gparts. */
-
-    /* Split along the x-axis. */
-    i = 0; j = gcount - 1;
-    while ( i <= j ) {
-        while ( i <= gcount-1 && gparts[i].x[0] <= pivot[0] )
-            i += 1;
-        while ( j >= 0 && gparts[j].x[0] > pivot[0] )
-            j -= 1;
-        if ( i < j ) {
-            gtemp = gparts[i]; gparts[i] = gparts[j]; gparts[j] = gtemp;
-            }
-        }
-    left[1] = i; right[1] = gcount - 1;
-    left[0] = 0; right[0] = j;
-    
-    /* Split along the y axis, twice. */
-    for ( k = 1 ; k >= 0 ; k-- ) {
-        i = left[k]; j = right[k];
-        while ( i <= j ) {
-            while ( i <= right[k] && gparts[i].x[1] <= pivot[1] )
-                i += 1;
-            while ( j >= left[k] && gparts[j].x[1] > pivot[1] )
-                j -= 1;
-            if ( i < j ) {
-                gtemp = gparts[i]; gparts[i] = gparts[j]; gparts[j] = gtemp;
-                }
+
+void cell_split(struct cell *c) {
+
+  int i, j, k, count = c->count, gcount = c->gcount;
+  struct part temp, *parts = c->parts;
+  struct xpart xtemp, *xparts = c->xparts;
+  struct gpart gtemp, *gparts = c->gparts;
+  int left[8], right[8];
+  double pivot[3];
+
+  /* Init the pivots. */
+  for (k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->h[k] / 2;
+
+  /* Split along the x-axis. */
+  i = 0;
+  j = count - 1;
+  while (i <= j) {
+    while (i <= count - 1 && parts[i].x[0] <= pivot[0]) i += 1;
+    while (j >= 0 && parts[j].x[0] > pivot[0]) j -= 1;
+    if (i < j) {
+      temp = parts[i];
+      parts[i] = parts[j];
+      parts[j] = temp;
+      xtemp = xparts[i];
+      xparts[i] = xparts[j];
+      xparts[j] = xtemp;
+    }
+  }
+  /* for ( k = 0 ; k <= j ; k++ )
+      if ( parts[k].x[0] > pivot[0] )
+          error( "cell_split: sorting failed." );
+  for ( k = i ; k < count ; k++ )
+      if ( parts[k].x[0] < pivot[0] )
+          error( "cell_split: sorting failed." ); */
+  left[1] = i;
+  right[1] = count - 1;
+  left[0] = 0;
+  right[0] = j;
+
+  /* Split along the y axis, twice. */
+  for (k = 1; k >= 0; k--) {
+    i = left[k];
+    j = right[k];
+    while (i <= j) {
+      while (i <= right[k] && parts[i].x[1] <= pivot[1]) i += 1;
+      while (j >= left[k] && parts[j].x[1] > pivot[1]) j -= 1;
+      if (i < j) {
+        temp = parts[i];
+        parts[i] = parts[j];
+        parts[j] = temp;
+        xtemp = xparts[i];
+        xparts[i] = xparts[j];
+        xparts[j] = xtemp;
+      }
+    }
+    /* for ( int kk = left[k] ; kk <= j ; kk++ )
+        if ( parts[kk].x[1] > pivot[1] ) {
+            message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
+            error( "sorting failed (left)." );
             }
-        left[2*k+1] = i; right[2*k+1] = right[k];
-        left[2*k] = left[k]; right[2*k] = j;
-        }
-
-    /* Split along the z axis, four times. */
-    for ( k = 3 ; k >= 0 ; k-- ) {
-        i = left[k]; j = right[k];
-        while ( i <= j ) {
-            while ( i <= right[k] && gparts[i].x[2] <= pivot[2] )
-                i += 1;
-            while ( j >= left[k] && gparts[j].x[2] > pivot[2] )
-                j -= 1;
-            if ( i < j ) {
-                gtemp = gparts[i]; gparts[i] = gparts[j]; gparts[j] = gtemp;
-                }
+    for ( int kk = i ; kk <= right[k] ; kk++ )
+        if ( parts[kk].x[1] < pivot[1] )
+            error( "sorting failed (right)." ); */
+    left[2 * k + 1] = i;
+    right[2 * k + 1] = right[k];
+    left[2 * k] = left[k];
+    right[2 * k] = j;
+  }
+
+  /* Split along the z axis, four times. */
+  for (k = 3; k >= 0; k--) {
+    i = left[k];
+    j = right[k];
+    while (i <= j) {
+      while (i <= right[k] && parts[i].x[2] <= pivot[2]) i += 1;
+      while (j >= left[k] && parts[j].x[2] > pivot[2]) j -= 1;
+      if (i < j) {
+        temp = parts[i];
+        parts[i] = parts[j];
+        parts[j] = temp;
+        xtemp = xparts[i];
+        xparts[i] = xparts[j];
+        xparts[j] = xtemp;
+      }
+    }
+    /* for ( int kk = left[k] ; kk <= j ; kk++ )
+        if ( parts[kk].x[2] > pivot[2] ) {
+            message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
+            error( "sorting failed (left)." );
             }
-        left[2*k+1] = i; right[2*k+1] = right[k];
-        left[2*k] = left[k]; right[2*k] = j;
-        }
-        
-    /* Store the counts and offsets. */
-    for ( k = 0 ; k < 8 ; k++ ) {
-        c->progeny[k]->gcount = right[k] - left[k] + 1;
-        c->progeny[k]->gparts = &c->gparts[ left[k] ];
-        }
-        
-    /* Re-link the parts. */
-    for ( k = 0 ; k < gcount ; k++ )
-        if ( gparts[k].id > 0 )
-            gparts[k].part->gpart = &gparts[k];
-        
+    for ( int kk = i ; kk <= right[k] ; kk++ )
+        if ( parts[kk].x[2] < pivot[2] ) {
+            message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
+            error( "sorting failed (right)." );
+            } */
+    left[2 * k + 1] = i;
+    right[2 * k + 1] = right[k];
+    left[2 * k] = left[k];
+    right[2 * k] = j;
+  }
+
+  /* Store the counts and offsets. */
+  for (k = 0; k < 8; k++) {
+    c->progeny[k]->count = right[k] - left[k] + 1;
+    c->progeny[k]->parts = &c->parts[left[k]];
+    c->progeny[k]->xparts = &c->xparts[left[k]];
+  }
+
+  /* Re-link the gparts. */
+  for (k = 0; k < count; k++)
+    if (parts[k].gpart != NULL) parts[k].gpart->part = &parts[k];
+
+  /* Verify that _all_ the parts have been assigned to a cell. */
+  /* for ( k = 1 ; k < 8 ; k++ )
+      if ( &c->progeny[k-1]->parts[ c->progeny[k-1]->count ] !=
+  c->progeny[k]->parts )
+          error( "Particle sorting failed (internal consistency)." );
+  if ( c->progeny[0]->parts != c->parts )
+      error( "Particle sorting failed (left edge)." );
+  if ( &c->progeny[7]->parts[ c->progeny[7]->count ] != &c->parts[ count ] )
+      error( "Particle sorting failed (right edge)." ); */
+
+  /* Verify a few sub-cells. */
+  /* for ( k = 0 ; k < c->progeny[0]->count ; k++ )
+      if ( c->progeny[0]->parts[k].x[0] > pivot[0] ||
+           c->progeny[0]->parts[k].x[1] > pivot[1] ||
+           c->progeny[0]->parts[k].x[2] > pivot[2] )
+          error( "Sorting failed (progeny=0)." );
+  for ( k = 0 ; k < c->progeny[1]->count ; k++ )
+      if ( c->progeny[1]->parts[k].x[0] > pivot[0] ||
+           c->progeny[1]->parts[k].x[1] > pivot[1] ||
+           c->progeny[1]->parts[k].x[2] <= pivot[2] )
+          error( "Sorting failed (progeny=1)." );
+  for ( k = 0 ; k < c->progeny[2]->count ; k++ )
+      if ( c->progeny[2]->parts[k].x[0] > pivot[0] ||
+           c->progeny[2]->parts[k].x[1] <= pivot[1] ||
+           c->progeny[2]->parts[k].x[2] > pivot[2] )
+          error( "Sorting failed (progeny=2)." ); */
+
+  /* Now do the same song and dance for the gparts. */
+
+  /* Split along the x-axis. */
+  i = 0;
+  j = gcount - 1;
+  while (i <= j) {
+    while (i <= gcount - 1 && gparts[i].x[0] <= pivot[0]) i += 1;
+    while (j >= 0 && gparts[j].x[0] > pivot[0]) j -= 1;
+    if (i < j) {
+      gtemp = gparts[i];
+      gparts[i] = gparts[j];
+      gparts[j] = gtemp;
     }
-
-
+  }
+  left[1] = i;
+  right[1] = gcount - 1;
+  left[0] = 0;
+  right[0] = j;
+
+  /* Split along the y axis, twice. */
+  for (k = 1; k >= 0; k--) {
+    i = left[k];
+    j = right[k];
+    while (i <= j) {
+      while (i <= right[k] && gparts[i].x[1] <= pivot[1]) i += 1;
+      while (j >= left[k] && gparts[j].x[1] > pivot[1]) j -= 1;
+      if (i < j) {
+        gtemp = gparts[i];
+        gparts[i] = gparts[j];
+        gparts[j] = gtemp;
+      }
+    }
+    left[2 * k + 1] = i;
+    right[2 * k + 1] = right[k];
+    left[2 * k] = left[k];
+    right[2 * k] = j;
+  }
+
+  /* Split along the z axis, four times. */
+  for (k = 3; k >= 0; k--) {
+    i = left[k];
+    j = right[k];
+    while (i <= j) {
+      while (i <= right[k] && gparts[i].x[2] <= pivot[2]) i += 1;
+      while (j >= left[k] && gparts[j].x[2] > pivot[2]) j -= 1;
+      if (i < j) {
+        gtemp = gparts[i];
+        gparts[i] = gparts[j];
+        gparts[j] = gtemp;
+      }
+    }
+    left[2 * k + 1] = i;
+    right[2 * k + 1] = right[k];
+    left[2 * k] = left[k];
+    right[2 * k] = j;
+  }
+
+  /* Store the counts and offsets. */
+  for (k = 0; k < 8; k++) {
+    c->progeny[k]->gcount = right[k] - left[k] + 1;
+    c->progeny[k]->gparts = &c->gparts[left[k]];
+  }
+
+  /* Re-link the parts. */
+  for (k = 0; k < gcount; k++)
+    if (gparts[k].id > 0) gparts[k].part->gpart = &gparts[k];
+}
diff --git a/src/cell.h b/src/cell.h
index 43dedefbb6c079b726ed1cbb5d4cfe0b39e368a2..7a5353bcae8cbb7ab4d50a546665a1a774a46aea 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -1,165 +1,173 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_CELL_H
+#define SWIFT_CELL_H
 
-/* Some constants. */
-#define cell_sid_dt                 13
+/* Includes. */
+#include "lock.h"
+#include "multipole.h"
+#include "part.h"
+
+/* Forward declaration of space, needed for cell_unpack. */
+struct space;
 
+/* Some constants. */
+#define cell_sid_dt 13
+#define cell_max_tag (1 << 16)
 
 /* Global variables. */
 extern int cell_next_tag;
 
-
 /* Packed cell. */
 struct pcell {
 
-    /* Stats on this cell's particles. */
-    double h_max, dt_min, dt_max;
-    
-    /* Number of particles in this cell. */
-    int count;
-    
-    /* tag used for MPI communication. */
-    int tag;
+  /* Stats on this cell's particles. */
+  double h_max, dt_min, dt_max;
+
+  /* Number of particles in this cell. */
+  int count;
 
-    /* Relative indices of the cell's progeny. */
-    int progeny[8];
-    
-    };
+  /* tag used for MPI communication. */
+  int tag;
 
+  /* Relative indices of the cell's progeny. */
+  int progeny[8];
+};
 
 /* Structure to store the data of a single cell. */
 struct cell {
 
-    /* The cell location on the grid. */
-    double loc[3];
-    
-    /* The cell dimensions. */
-    double h[3];
-    
-    /* Max radii in this cell. */
-    double h_max;
-    
-    /* Minimum and maximum dt in this cell. */
-    double dt_min, dt_max;
-    
-    /* Minimum dimension, i.e. smallest edge of this cell. */
-    float dmin;
-    
-    /* Maximum slack allowed for particle movement. */
-    float slack;
-    
-    /* Maximum particle movement in this cell. */
-    float dx_max;
-    
-    /* The depth of this cell in the tree. */
-    int depth, split, maxdepth;
-    
-    /* Nr of parts. */
-    int count, gcount;
-    
-    /* Pointers to the particle data. */
-    struct part *parts;
-    
-    /* Pointers to the extra particle data. */
-    struct xpart *xparts;
-    
-    /* Pointers to the gravity particle data. */
-    struct gpart *gparts;
-    
-    /* Pointers for the sorted indices. */
-    struct entry *sort, *gsort;
-    unsigned int sorted, gsorted;
-    
-    /* Pointers to the next level of cells. */
-    struct cell *progeny[8];
-    
-    /* Parent cell. */
-    struct cell *parent;
-    
-    /* Super cell, i.e. the highest-level supercell that has interactions. */
-    struct cell *super;
-    
-    /* The task computing this cell's sorts. */
-    struct task *sorts, *gsorts;
-    int sortsize, gsortsize;
-    
-    /* The tasks computing this cell's density. */
-    struct link *density, *force, *grav;
-    int nr_density, nr_force, nr_grav;
-    
-    /* The ghost task to link density to interactions. */
-    struct task *ghost, *kick1, *kick2;
-    
-    /* Task receiving data. */
-    struct task *recv_xv, *recv_rho;
-    
-    /* Tasks for gravity tree. */
-    struct task *grav_up, *grav_down;
-    
-    /* Number of tasks that are associated with this cell. */
-    int nr_tasks;
-    
-    /* Is the data of this cell being used in a sub-cell? */
-    int hold, ghold;
-    
-    /* Spin lock for various uses. */
-    lock_type lock, glock;
-    
-    /* ID of the previous owner, e.g. runner. */
-    int owner;
-    
-    /* Momentum of particles in cell. */
-    float mom[3], ang[3];
-    
-    /* Potential and kinetic energy of particles in this cell. */
-    double epot, ekin;
-    
-    /* Number of particles updated in this cell. */
-    int updated;
-    
-    /* Linking pointer for "memory management". */
-    struct cell *next;
-    
-    /* ID of the node this cell lives on. */
-    int nodeID;
-    
-    /* Bit mask of the proxies this cell is registered with. */
-    unsigned long long int sendto;
-    
-    /* Pointer to this cell's packed representation. */
-    struct pcell *pcell;
-    int pcell_size;
-    int tag;
-    
-    /* This cell's multipole. */
-    struct multipole multipole;
-    
-    } __attribute__((aligned (64)));
+  /* The cell location on the grid. */
+  double loc[3];
+
+  /* The cell dimensions. */
+  double h[3];
+
+  /* Max radii in this cell. */
+  double h_max;
+
+  /* Minimum and maximum dt in this cell. */
+  double dt_min, dt_max;
+
+  /* Minimum dimension, i.e. smallest edge of this cell. */
+  float dmin;
+
+  /* Maximum slack allowed for particle movement. */
+  float slack;
+
+  /* Maximum particle movement in this cell. */
+  float dx_max;
+
+  /* The depth of this cell in the tree. */
+  int depth, split, maxdepth;
+
+  /* Nr of parts. */
+  int count, gcount;
+
+  /* Pointers to the particle data. */
+  struct part *parts;
+
+  /* Pointers to the extra particle data. */
+  struct xpart *xparts;
+
+  /* Pointers to the gravity particle data. */
+  struct gpart *gparts;
+
+  /* Pointers for the sorted indices. */
+  struct entry *sort, *gsort;
+  unsigned int sorted, gsorted;
+
+  /* Pointers to the next level of cells. */
+  struct cell *progeny[8];
+
+  /* Parent cell. */
+  struct cell *parent;
 
+  /* Super cell, i.e. the highest-level supercell that has interactions. */
+  struct cell *super;
+
+  /* The task computing this cell's sorts. */
+  struct task *sorts, *gsorts;
+  int sortsize, gsortsize;
+
+  /* The tasks computing this cell's density. */
+  struct link *density, *force, *grav;
+  int nr_density, nr_force, nr_grav;
+
+  /* The ghost task to link density to interactions. */
+  struct task *ghost, *kick1, *kick2;
+
+  /* Task receiving data. */
+  struct task *recv_xv, *recv_rho;
+
+  /* Tasks for gravity tree. */
+  struct task *grav_up, *grav_down;
+
+  /* Number of tasks that are associated with this cell. */
+  int nr_tasks;
+
+  /* Is the data of this cell being used in a sub-cell? */
+  int hold, ghold;
+
+  /* Spin lock for various uses. */
+  lock_type lock, glock;
+
+  /* ID of the previous owner, e.g. runner. */
+  int owner;
+
+  /* Momentum of particles in cell. */
+  float mom[3], ang[3];
+
+  /* Potential and kinetic energy of particles in this cell. */
+  double epot, ekin;
+
+  /* Number of particles updated in this cell. */
+  int updated;
+
+  /* Linking pointer for "memory management". */
+  struct cell *next;
+
+  /* ID of the node this cell lives on. */
+  int nodeID;
+
+  /* Bit mask of the proxies this cell is registered with. */
+  unsigned long long int sendto;
+
+  /* Pointer to this cell's packed representation. */
+  struct pcell *pcell;
+  int pcell_size;
+  int tag;
+
+  /* This cell's multipole. */
+  struct multipole multipole;
+
+} __attribute__((aligned(64)));
 
 /* Function prototypes. */
-void cell_split ( struct cell *c  );
-int cell_locktree( struct cell *c );
-void cell_unlocktree( struct cell *c );
-int cell_glocktree( struct cell *c );
-void cell_gunlocktree( struct cell *c );
-int cell_pack ( struct cell *c , struct pcell *pc );
-int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s );
-int cell_getsize ( struct cell *c );
-int cell_link ( struct cell *c , struct part *parts );
+void cell_split(struct cell *c);
+int cell_locktree(struct cell *c);
+void cell_unlocktree(struct cell *c);
+int cell_glocktree(struct cell *c);
+void cell_gunlocktree(struct cell *c);
+int cell_pack(struct cell *c, struct pcell *pc);
+int cell_unpack(struct pcell *pc, struct cell *c, struct space *s);
+int cell_getsize(struct cell *c);
+int cell_link(struct cell *c, struct part *parts);
+
+#endif /* SWIFT_CELL_H */
diff --git a/src/common_io.c b/src/common_io.c
index e2e29a596701281fb307ab256721d7809d2c1419..64194e5829658a0c3f0f1aa7d9951d73f3eed377 100644
--- a/src/common_io.c
+++ b/src/common_io.c
@@ -2,98 +2,110 @@
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
 
-
 #if defined(HAVE_HDF5)
 
 /* Some standard headers. */
+#include <hdf5.h>
+#include <math.h>
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <stddef.h>
-#include <hdf5.h>
-#include <math.h>
+
+/* MPI headers. */
 #ifdef WITH_MPI
 #include <mpi.h>
 #endif
 
+/* This object's header. */
+#include "common_io.h"
+
+/* Local includes. */
 #include "const.h"
-#include "cycle.h"
-#include "lock.h"
-#include "task.h"
-#include "part.h"
-#include "space.h"
-#include "scheduler.h"
-#include "engine.h"
 #include "error.h"
 #include "kernel.h"
-#include "common_io.h"
-
-
 
 /**
- * @brief Converts a C data type to the HDF5 equivalent. 
+ * @brief Converts a C data type to the HDF5 equivalent.
  *
  * This function is a trivial wrapper around the HDF5 types but allows
- * to change the exact storage types matching the code types in a transparent way.
+ * to change the exact storage types matching the code types in a transparent
+ * way.
  */
-hid_t hdf5Type(enum DATA_TYPE type)
-{
-  switch(type)
-    {
-    case INT: return H5T_NATIVE_INT;
-    case UINT: return H5T_NATIVE_UINT;
-    case LONG: return H5T_NATIVE_LONG;
-    case ULONG: return H5T_NATIVE_ULONG;
-    case LONGLONG: return H5T_NATIVE_LLONG;
-    case ULONGLONG: return H5T_NATIVE_ULLONG;
-    case FLOAT: return H5T_NATIVE_FLOAT;
-    case DOUBLE: return H5T_NATIVE_DOUBLE;
-    case CHAR: return H5T_C_S1;
-    default: error("Unknown type"); return 0;
-    }
+hid_t hdf5Type(enum DATA_TYPE type) {
+  switch (type) {
+    case INT:
+      return H5T_NATIVE_INT;
+    case UINT:
+      return H5T_NATIVE_UINT;
+    case LONG:
+      return H5T_NATIVE_LONG;
+    case ULONG:
+      return H5T_NATIVE_ULONG;
+    case LONGLONG:
+      return H5T_NATIVE_LLONG;
+    case ULONGLONG:
+      return H5T_NATIVE_ULLONG;
+    case FLOAT:
+      return H5T_NATIVE_FLOAT;
+    case DOUBLE:
+      return H5T_NATIVE_DOUBLE;
+    case CHAR:
+      return H5T_C_S1;
+    default:
+      error("Unknown type");
+      return 0;
+  }
 }
 
 /**
  * @brief Returns the memory size of the data type
  */
-size_t sizeOfType(enum DATA_TYPE type)
-{
-  switch(type)
-    {
-    case INT: return sizeof(int);
-    case UINT: return sizeof(unsigned int);
-    case LONG: return sizeof(long);
-    case ULONG: return sizeof(unsigned long);
-    case LONGLONG: return sizeof(long long);
-    case ULONGLONG: return sizeof(unsigned long long);
-    case FLOAT: return sizeof(float);
-    case DOUBLE: return sizeof(double);
-    case CHAR: return sizeof(char);
-    default: error("Unknown type"); return 0;
-    }
+size_t sizeOfType(enum DATA_TYPE type) {
+  switch (type) {
+    case INT:
+      return sizeof(int);
+    case UINT:
+      return sizeof(unsigned int);
+    case LONG:
+      return sizeof(long);
+    case ULONG:
+      return sizeof(unsigned long);
+    case LONGLONG:
+      return sizeof(long long);
+    case ULONGLONG:
+      return sizeof(unsigned long long);
+    case FLOAT:
+      return sizeof(float);
+    case DOUBLE:
+      return sizeof(double);
+    case CHAR:
+      return sizeof(char);
+    default:
+      error("Unknown type");
+      return 0;
+  }
 }
 
-
-
 /**
  * @brief Reads an attribute from a given HDF5 group.
  *
@@ -104,21 +116,18 @@ size_t sizeOfType(enum DATA_TYPE type)
  *
  * Calls #error() if an error occurs.
  */
-void readAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data)
-{
-  hid_t h_attr=0, h_err=0;
+void readAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data) {
+  hid_t h_attr = 0, h_err = 0;
 
   h_attr = H5Aopen(grp, name, H5P_DEFAULT);
-  if(h_attr < 0)
-    {
-      error( "Error while opening attribute '%s'" , name );
-    }
+  if (h_attr < 0) {
+    error("Error while opening attribute '%s'", name);
+  }
 
   h_err = H5Aread(h_attr, hdf5Type(type), data);
-  if(h_err < 0)
-    {
-      error( "Error while reading attribute '%s'" , name );
-    }
+  if (h_err < 0) {
+    error("Error while reading attribute '%s'", name);
+  }
 
   H5Aclose(h_attr);
 }
@@ -134,34 +143,30 @@ void readAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data)
  *
  * Calls #error() if an error occurs.
  */
-void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data, int num)
-{
-  hid_t h_space=0, h_attr=0, h_err=0;
-  hsize_t dim[1]={num};
+void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data,
+                    int num) {
+  hid_t h_space = 0, h_attr = 0, h_err = 0;
+  hsize_t dim[1] = {num};
 
   h_space = H5Screate(H5S_SIMPLE);
-  if(h_space < 0)
-    {
-      error( "Error while creating dataspace for attribute '%s'." , name );
-    }
+  if (h_space < 0) {
+    error("Error while creating dataspace for attribute '%s'.", name);
+  }
 
   h_err = H5Sset_extent_simple(h_space, 1, dim, NULL);
-  if(h_err < 0)
-    {
-      error( "Error while changing dataspace shape for attribute '%s'." , name );
-    }
+  if (h_err < 0) {
+    error("Error while changing dataspace shape for attribute '%s'.", name);
+  }
 
   h_attr = H5Acreate1(grp, name, hdf5Type(type), h_space, H5P_DEFAULT);
-  if(h_attr < 0)
-    {
-      error( "Error while creating attribute '%s'.", name );
-    }
+  if (h_attr < 0) {
+    error("Error while creating attribute '%s'.", name);
+  }
 
   h_err = H5Awrite(h_attr, hdf5Type(type), data);
-  if(h_err < 0)
-    {
-      error( "Error while reading attribute '%s'." , name );
-    }
+  if (h_err < 0) {
+    error("Error while writing attribute '%s'.", name);
+  }
 
   H5Sclose(h_space);
   H5Aclose(h_attr);
@@ -177,39 +182,33 @@ void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data, int
  *
  * Calls #error() if an error occurs.
  */
-void writeStringAttribute(hid_t grp, char* name, char* str, int length)
-{
-  hid_t h_space=0, h_attr=0, h_err=0, h_type=0;
+void writeStringAttribute(hid_t grp, char* name, char* str, int length) {
+  hid_t h_space = 0, h_attr = 0, h_err = 0, h_type = 0;
 
   h_space = H5Screate(H5S_SCALAR);
-  if(h_space < 0)
-    {
-      error( "Error while creating dataspace for attribute '%s'." , name );
-    }
+  if (h_space < 0) {
+    error("Error while creating dataspace for attribute '%s'.", name);
+  }
 
   h_type = H5Tcopy(H5T_C_S1);
-  if(h_type < 0)
-    {
-      error( "Error while copying datatype 'H5T_C_S1'." );
-    }
+  if (h_type < 0) {
+    error("Error while copying datatype 'H5T_C_S1'.");
+  }
 
   h_err = H5Tset_size(h_type, length);
-  if(h_err < 0)
-    {
-      error( "Error while resizing attribute tyep to '%i'." , length );
-    }
+  if (h_err < 0) {
+    error("Error while resizing attribute type to '%i'.", length);
+  }
 
   h_attr = H5Acreate1(grp, name, h_type, h_space, H5P_DEFAULT);
-  if(h_attr < 0)
-    {
-      error( "Error while creating attribute '%s'." , name );
-    }
+  if (h_attr < 0) {
+    error("Error while creating attribute '%s'.", name);
+  }
 
-  h_err = H5Awrite(h_attr, h_type, str );
-  if(h_err < 0)
-    {
-      error( "Error while reading attribute '%s'." , name );
-    }
+  h_err = H5Awrite(h_attr, h_type, str);
+  if (h_err < 0) {
+    error("Error while writing attribute '%s'.", name);
+  }
 
   H5Tclose(h_type);
   H5Sclose(h_space);
@@ -222,8 +221,7 @@ void writeStringAttribute(hid_t grp, char* name, char* str, int length)
  * @param name The name of the attribute
  * @param data The value to write
  */
-void writeAttribute_d(hid_t grp, char* name, double data)
-{
+void writeAttribute_d(hid_t grp, char* name, double data) {
   writeAttribute(grp, name, DOUBLE, &data, 1);
 }
 
@@ -233,8 +231,7 @@ void writeAttribute_d(hid_t grp, char* name, double data)
  * @param name The name of the attribute
  * @param data The value to write
  */
-void writeAttribute_f(hid_t grp, char* name, float data)
-{
+void writeAttribute_f(hid_t grp, char* name, float data) {
   writeAttribute(grp, name, FLOAT, &data, 1);
 }
 
@@ -245,8 +242,7 @@ void writeAttribute_f(hid_t grp, char* name, float data)
  * @param data The value to write
  */
 
-void writeAttribute_i(hid_t grp, char* name, int data)
-{
+void writeAttribute_i(hid_t grp, char* name, int data) {
   writeAttribute(grp, name, INT, &data, 1);
 }
 
@@ -256,8 +252,7 @@ void writeAttribute_i(hid_t grp, char* name, int data)
  * @param name The name of the attribute
  * @param data The value to write
  */
-void writeAttribute_l(hid_t grp, char* name, long data)
-{
+void writeAttribute_l(hid_t grp, char* name, long data) {
   writeAttribute(grp, name, LONG, &data, 1);
 }
 
@@ -267,26 +262,24 @@ void writeAttribute_l(hid_t grp, char* name, long data)
  * @param name The name of the attribute
  * @param str The string to write
  */
-void writeAttribute_s(hid_t grp, char* name, char* str)
-{
+void writeAttribute_s(hid_t grp, char* name, char* str) {
   writeStringAttribute(grp, name, str, strlen(str));
 }
 
-
-/* ------------------------------------------------------------------------------------------------ 
- * This part writes the XMF file descriptor enabling a visualisation through ParaView
- * ------------------------------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------------------------------
+ * This part writes the XMF file descriptor enabling a visualisation through
+ * ParaView
+ * ------------------------------------------------------------------------------------------------
+ */
 /**
  * @brief Writes the current model of SPH to the file
  * @param h_file The (opened) HDF5 file in which to write
  */
-void writeSPHflavour(hid_t h_file)
-{
-  hid_t h_grpsph=0;
+void writeSPHflavour(hid_t h_file) {
+  hid_t h_grpsph = 0;
 
   h_grpsph = H5Gcreate1(h_file, "/SPH", 0);
-  if(h_grpsph < 0)
-    error("Error while creating SPH group");
+  if (h_grpsph < 0) error("Error while creating SPH group");
 
   writeAttribute_f(h_grpsph, "Kernel eta", const_eta_kernel);
   writeAttribute_f(h_grpsph, "Weighted N_ngb", kernel_nwneigh);
@@ -294,24 +287,33 @@ void writeSPHflavour(hid_t h_file)
   writeAttribute_f(h_grpsph, "Hydro gamma", const_hydro_gamma);
 
 #ifdef LEGACY_GADGET2_SPH
-  writeAttribute_s(h_grpsph, "Thermal Conductivity Model", "(No treatment) Legacy Gadget-2 as in Springel (2005)");  
-  writeAttribute_s(h_grpsph, "Viscosity Model", "Legacy Gadget-2 as in Springel (2005)");  
-  writeAttribute_f(h_grpsph, "Viscosity alpha", const_viscosity_alpha);  
-  writeAttribute_f(h_grpsph, "Viscosity beta", 3.f);  
+  writeAttribute_s(h_grpsph, "Thermal Conductivity Model",
+                   "(No treatment) Legacy Gadget-2 as in Springel (2005)");
+  writeAttribute_s(h_grpsph, "Viscosity Model",
+                   "Legacy Gadget-2 as in Springel (2005)");
+  writeAttribute_f(h_grpsph, "Viscosity alpha", const_viscosity_alpha);
+  writeAttribute_f(h_grpsph, "Viscosity beta", 3.f);
 #else
-  writeAttribute_s(h_grpsph, "Thermal Conductivity Model", "Price (2008) without switch");  
-  writeAttribute_f(h_grpsph, "Thermal Conductivity alpha", const_conductivity_alpha);  
-  writeAttribute_s(h_grpsph, "Viscosity Model", "Morris & Monaghan (1997), Rosswog, Davies, Thielemann & Piran (2000) with additional Balsara (1995) switch");  
-  writeAttribute_f(h_grpsph, "Viscosity alpha_min", const_viscosity_alpha_min);  
-  writeAttribute_f(h_grpsph, "Viscosity alpha_max", const_viscosity_alpha_max);  
-  writeAttribute_f(h_grpsph, "Viscosity beta", 2.f);  
-  writeAttribute_f(h_grpsph, "Viscosity decay length", const_viscosity_length);  
+  writeAttribute_s(h_grpsph, "Thermal Conductivity Model",
+                   "Price (2008) without switch");
+  writeAttribute_f(h_grpsph, "Thermal Conductivity alpha",
+                   const_conductivity_alpha);
+  writeAttribute_s(h_grpsph, "Viscosity Model",
+                   "Morris & Monaghan (1997), Rosswog, Davies, Thielemann & "
+                   "Piran (2000) with additional Balsara (1995) switch");
+  writeAttribute_f(h_grpsph, "Viscosity alpha_min", const_viscosity_alpha_min);
+  writeAttribute_f(h_grpsph, "Viscosity alpha_max", const_viscosity_alpha_max);
+  writeAttribute_f(h_grpsph, "Viscosity beta", 2.f);
+  writeAttribute_f(h_grpsph, "Viscosity decay length", const_viscosity_length);
 #endif
 
-  writeAttribute_f(h_grpsph, "CFL parameter", const_cfl);  
-  writeAttribute_f(h_grpsph, "Maximal ln(Delta h) change over dt", const_ln_max_h_change);  
-  writeAttribute_f(h_grpsph, "Maximal Delta h change over dt", exp(const_ln_max_h_change));  
-  writeAttribute_f(h_grpsph, "Maximal Delta u change over dt", const_max_u_change);  
+  writeAttribute_f(h_grpsph, "CFL parameter", const_cfl);
+  writeAttribute_f(h_grpsph, "Maximal ln(Delta h) change over dt",
+                   const_ln_max_h_change);
+  writeAttribute_f(h_grpsph, "Maximal Delta h change over dt",
+                   exp(const_ln_max_h_change));
+  writeAttribute_f(h_grpsph, "Maximal Delta u change over dt",
+                   const_max_u_change);
   writeAttribute_s(h_grpsph, "Kernel", kernel_name);
 
   H5Gclose(h_grpsph);
@@ -322,25 +324,26 @@ void writeSPHflavour(hid_t h_file)
  * @param h_file The (opened) HDF5 file in which to write
  * @param us The UnitSystem used in the run
  */
-void writeUnitSystem(hid_t h_file, struct UnitSystem* us)
-{
-  hid_t h_grpunit=0;
+void writeUnitSystem(hid_t h_file, struct UnitSystem* us) {
+  hid_t h_grpunit = 0;
 
   h_grpunit = H5Gcreate1(h_file, "/Units", 0);
-  if(h_grpunit < 0)
-    error("Error while creating Unit System group");
-
-  writeAttribute_d(h_grpunit, "Unit mass in cgs (U_M)", getBaseUnit(us, UNIT_MASS));
-  writeAttribute_d(h_grpunit, "Unit length in cgs (U_L)", getBaseUnit(us, UNIT_LENGTH));
-  writeAttribute_d(h_grpunit, "Unit time in cgs (U_t)", getBaseUnit(us, UNIT_TIME));
-  writeAttribute_d(h_grpunit, "Unit current in cgs (U_I)", getBaseUnit(us, UNIT_CURRENT));
-  writeAttribute_d(h_grpunit, "Unit temperature in cgs (U_T)", getBaseUnit(us, UNIT_TEMPERATURE));  
+  if (h_grpunit < 0) error("Error while creating Unit System group");
+
+  writeAttribute_d(h_grpunit, "Unit mass in cgs (U_M)",
+                   getBaseUnit(us, UNIT_MASS));
+  writeAttribute_d(h_grpunit, "Unit length in cgs (U_L)",
+                   getBaseUnit(us, UNIT_LENGTH));
+  writeAttribute_d(h_grpunit, "Unit time in cgs (U_t)",
+                   getBaseUnit(us, UNIT_TIME));
+  writeAttribute_d(h_grpunit, "Unit current in cgs (U_I)",
+                   getBaseUnit(us, UNIT_CURRENT));
+  writeAttribute_d(h_grpunit, "Unit temperature in cgs (U_T)",
+                   getBaseUnit(us, UNIT_TEMPERATURE));
 
   H5Gclose(h_grpunit);
 }
 
-
-
 /**
  * @brief Prepares the XMF file for the new entry
  *
@@ -348,67 +351,63 @@ void writeUnitSystem(hid_t h_file, struct UnitSystem* us)
  *
  * @todo Use a proper XML library to avoid stupid copies.
  */
-FILE* prepareXMFfile()
-{
+FILE* prepareXMFfile() {
   char buffer[1024];
 
   FILE* xmfFile = fopen("output.xmf", "r");
   FILE* tempFile = fopen("output_temp.xmf", "w");
 
-  if(xmfFile == NULL)
-    error("Unable to open current XMF file.");
-
-  if(tempFile == NULL)
-    error("Unable to open temporary file.");
+  if (xmfFile == NULL) error("Unable to open current XMF file.");
 
+  if (tempFile == NULL) error("Unable to open temporary file.");
 
   /* First we make a temporary copy of the XMF file and count the lines */
   int counter = 0;
-  while (fgets(buffer, 1024, xmfFile) != NULL)
-    {
-      counter++;
-      fprintf(tempFile, "%s", buffer);
-    }
+  while (fgets(buffer, 1024, xmfFile) != NULL) {
+    counter++;
+    fprintf(tempFile, "%s", buffer);
+  }
   fclose(tempFile);
   fclose(xmfFile);
-  
+
   /* We then copy the XMF file back up to the closing lines */
   xmfFile = fopen("output.xmf", "w");
   tempFile = fopen("output_temp.xmf", "r");
 
-  if(xmfFile == NULL)
-    error("Unable to open current XMF file.");
+  if (xmfFile == NULL) error("Unable to open current XMF file.");
 
-  if(tempFile == NULL)
-    error("Unable to open temporary file.");
+  if (tempFile == NULL) error("Unable to open temporary file.");
 
   int i = 0;
-  while (fgets(buffer, 1024, tempFile) != NULL && i < counter - 3)
-    {
-      i++;
-      fprintf(xmfFile, "%s", buffer);
-    }
+  while (fgets(buffer, 1024, tempFile) != NULL && i < counter - 3) {
+    i++;
+    fprintf(xmfFile, "%s", buffer);
+  }
   fprintf(xmfFile, "\n");
   fclose(tempFile);
   remove("output_temp.xmf");
- 
+
   return xmfFile;
 }
 
 /**
  * @brief Writes the begin of the XMF file
  *
- * @todo Exploit the XML nature of the XMF format to write a proper XML writer and simplify all the XMF-related stuff.
+ * @todo Exploit the XML nature of the XMF format to write a proper XML writer
+ * and simplify all the XMF-related stuff.
  */
-void createXMFfile()
-{
+void createXMFfile() {
   FILE* xmfFile = fopen("output.xmf", "w");
 
   fprintf(xmfFile, "<?xml version=\"1.0\" ?> \n");
   fprintf(xmfFile, "<!DOCTYPE Xdmf SYSTEM \"Xdmf.dtd\" []> \n");
-  fprintf(xmfFile, "<Xdmf xmlns:xi=\"http://www.w3.org/2003/XInclude\" Version=\"2.1\">\n");
+  fprintf(
+      xmfFile,
+      "<Xdmf xmlns:xi=\"http://www.w3.org/2003/XInclude\" Version=\"2.1\">\n");
   fprintf(xmfFile, "<Domain>\n");
-  fprintf(xmfFile, "<Grid Name=\"TimeSeries\" GridType=\"Collection\" CollectionType=\"Temporal\">\n\n");
+  fprintf(xmfFile,
+          "<Grid Name=\"TimeSeries\" GridType=\"Collection\" "
+          "CollectionType=\"Temporal\">\n\n");
 
   fprintf(xmfFile, "</Grid>\n");
   fprintf(xmfFile, "</Domain>\n");
@@ -417,48 +416,52 @@ void createXMFfile()
   fclose(xmfFile);
 }
 
-
 /**
- * @brief Writes the part of the XMF entry presenting the geometry of the snapshot
+ * @brief Writes the part of the XMF entry presenting the geometry of the
+ * snapshot
  *
  * @param xmfFile The file to write in.
  * @param Nparts The number of particles.
  * @param hdfFileName The name of the HDF5 file corresponding to this output.
  * @param time The current simulation time.
  */
-void writeXMFheader(FILE* xmfFile, long long Nparts, char* hdfFileName, float time)
-{
+void writeXMFheader(FILE* xmfFile, long long Nparts, char* hdfFileName,
+                    float time) {
   /* Write end of file */
-  
-  fprintf(xmfFile, "<Grid GridType=\"Collection\" CollectionType=\"Spatial\">\n");
+
+  fprintf(xmfFile,
+          "<Grid GridType=\"Collection\" CollectionType=\"Spatial\">\n");
   fprintf(xmfFile, "<Time Type=\"Single\" Value=\"%f\"/>\n", time);
   fprintf(xmfFile, "<Grid Name=\"Gas\" GridType=\"Uniform\">\n");
-  fprintf(xmfFile, "<Topology TopologyType=\"Polyvertex\" Dimensions=\"%lld\"/>\n", Nparts);
+  fprintf(xmfFile,
+          "<Topology TopologyType=\"Polyvertex\" Dimensions=\"%lld\"/>\n",
+          Nparts);
   fprintf(xmfFile, "<Geometry GeometryType=\"XYZ\">\n");
-  fprintf(xmfFile, "<DataItem Dimensions=\"%lld 3\" NumberType=\"Double\" Precision=\"8\" Format=\"HDF\">%s:/PartType0/Coordinates</DataItem>\n", Nparts, hdfFileName);
+  fprintf(xmfFile,
+          "<DataItem Dimensions=\"%lld 3\" NumberType=\"Double\" "
+          "Precision=\"8\" "
+          "Format=\"HDF\">%s:/PartType0/Coordinates</DataItem>\n",
+          Nparts, hdfFileName);
   fprintf(xmfFile, "</Geometry>");
 }
 
-
 /**
  * @brief Writes the end of the XMF file (closes all open markups)
  *
  * @param xmfFile The file to write in.
  */
-void writeXMFfooter(FILE* xmfFile)
-{
+void writeXMFfooter(FILE* xmfFile) {
   /* Write end of the section of this time step */
-  
+
   fprintf(xmfFile, "\n</Grid>\n");
   fprintf(xmfFile, "</Grid>\n");
   fprintf(xmfFile, "\n</Grid>\n");
   fprintf(xmfFile, "</Domain>\n");
   fprintf(xmfFile, "</Xdmf>\n");
-  
+
   fclose(xmfFile);
 }
 
-
 /**
  * @brief Writes the lines corresponding to an array of the HDF5 output
  *
@@ -471,15 +474,22 @@ void writeXMFfooter(FILE* xmfFile)
  *
  * @todo Treat the types in a better way.
  */
-void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N, int dim, enum DATA_TYPE type )
-{
-  fprintf(xmfFile, "<Attribute Name=\"%s\" AttributeType=\"%s\" Center=\"Node\">\n", name, dim == 1 ? "Scalar": "Vector");
-  if(dim == 1)
-    fprintf(xmfFile, "<DataItem Dimensions=\"%lld\" NumberType=\"Double\" Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n", N, type==FLOAT ? 4:8, fileName, name);
+void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N,
+                  int dim, enum DATA_TYPE type) {
+  fprintf(xmfFile,
+          "<Attribute Name=\"%s\" AttributeType=\"%s\" Center=\"Node\">\n",
+          name, dim == 1 ? "Scalar" : "Vector");
+  if (dim == 1)
+    fprintf(xmfFile,
+            "<DataItem Dimensions=\"%lld\" NumberType=\"Double\" "
+            "Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n",
+            N, type == FLOAT ? 4 : 8, fileName, name);
   else
-    fprintf(xmfFile, "<DataItem Dimensions=\"%lld %d\" NumberType=\"Double\" Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n", N, dim, type==FLOAT ? 4:8, fileName, name);
+    fprintf(xmfFile,
+            "<DataItem Dimensions=\"%lld %d\" NumberType=\"Double\" "
+            "Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n",
+            N, dim, type == FLOAT ? 4 : 8, fileName, name);
   fprintf(xmfFile, "</Attribute>\n");
 }
 
-
 #endif
diff --git a/src/common_io.h b/src/common_io.h
index 0c098f597f7acd7f8a084becb1afaadda09c381a..7aacd9ad6e02b5bf2f9b4dd325d52f0af34d2f4f 100644
--- a/src/common_io.h
+++ b/src/common_io.h
@@ -2,52 +2,66 @@
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_COMMON_IO_H
+#define SWIFT_COMMON_IO_H
 
 /* Config parameters. */
 #include "../config.h"
 
+/* Includes. */
 #include "units.h"
 
 #if defined(HAVE_HDF5)
 
-
 /**
  * @brief The different types of data used in the GADGET IC files.
  *
  * (This is admittedly a poor substitute to C++ templates...)
  */
-enum DATA_TYPE{INT, LONG, LONGLONG, UINT, ULONG, ULONGLONG, FLOAT, DOUBLE, CHAR};
+enum DATA_TYPE {
+  INT,
+  LONG,
+  LONGLONG,
+  UINT,
+  ULONG,
+  ULONGLONG,
+  FLOAT,
+  DOUBLE,
+  CHAR
+};
 
 /**
- * @brief The two sorts of data present in the GADGET IC files: compulsory to start a run or optional.
+ * @brief The two sorts of data present in the GADGET IC files: compulsory to
+ * start a run or optional.
  *
  */
-enum DATA_IMPORTANCE{COMPULSORY=1, OPTIONAL=0};
-
-
-
+enum DATA_IMPORTANCE {
+  COMPULSORY = 1,
+  OPTIONAL = 0
+};
 
 hid_t hdf5Type(enum DATA_TYPE type);
 size_t sizeOfType(enum DATA_TYPE type);
 
 void readAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data);
 
-void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data, int num);
+void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data,
+                    int num);
 
 void writeAttribute_d(hid_t grp, char* name, double data);
 void writeAttribute_f(hid_t grp, char* name, float data);
@@ -58,9 +72,9 @@ void writeAttribute_s(hid_t grp, char* name, char* str);
 void createXMFfile();
 FILE* prepareXMFfile();
 void writeXMFfooter(FILE* xmfFile);
-void writeXMFheader(FILE* xmfFile, long long N, char* hdfFileName,  float time);
-void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N, int dim, enum DATA_TYPE type);
-
+void writeXMFheader(FILE* xmfFile, long long N, char* hdfFileName, float time);
+void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N,
+                  int dim, enum DATA_TYPE type);
 
 /**
  * @brief Writes the current model of SPH to the file
@@ -74,5 +88,6 @@ void writeSPHflavour(hid_t h_file);
  */
 void writeUnitSystem(hid_t h_file, struct UnitSystem* us);
 
-
 #endif
+
+#endif /* SWIFT_COMMON_IO_H */
diff --git a/src/const.h b/src/const.h
index e600f7f50b4ce4d2f6943860bef02e702b5022d6..ccccf6fa89884328efb33fcb018e0b17228fceff 100644
--- a/src/const.h
+++ b/src/const.h
@@ -2,63 +2,74 @@
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (ptcedro.gonnet@durham.ac.uk)
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
-
+#ifndef SWIFT_CONST_H
+#define SWIFT_CONST_H
 
 /* Hydrodynamical constants. */
-#define const_hydro_gamma          (5.0f/3.0f)
+#define const_hydro_gamma (5.0f / 3.0f)
 
 /* SPH Viscosity constants. */
-#define const_viscosity_alpha      0.8f            /* Used in the legacy gadget-2 SPH mode only */
-#define const_viscosity_alpha_min  0.1f            /* Values taken from (Price,2004), not used in legacy gadget mode */
-#define const_viscosity_alpha_max  2.0f            /* Values taken from (Price,2004), not used in legacy gadget mode */
-#define const_viscosity_length     0.1f            /* Values taken from (Price,2004), not used in legacy gadget mode */
+#define const_viscosity_alpha \
+  0.8f /* Used in the legacy gadget-2 SPH mode only */
+#define const_viscosity_alpha_min \
+  0.1f /* Values taken from (Price,2004), not used in legacy gadget mode */
+#define const_viscosity_alpha_max \
+  2.0f /* Values taken from (Price,2004), not used in legacy gadget mode */
+#define const_viscosity_length \
+  0.1f /* Values taken from (Price,2004), not used in legacy gadget mode */
 
 /* SPH Thermal conductivity constants. */
-#define const_conductivity_alpha    1.f            /* Value taken from (Price,2008), not used in legacy gadget mode */
+#define const_conductivity_alpha \
+  1.f /* Value taken from (Price,2008), not used in legacy gadget mode */
 
 /* Time integration constants. */
-#define const_cfl               0.3f
-#define const_ln_max_h_change   0.231111721f    /* Particle can't change volume by more than a factor of 2=1.26^3 over one time step */
-#define const_max_u_change      0.1f    
+#define const_cfl 0.3f
+#define const_ln_max_h_change                                           \
+  0.231111721f /* Particle can't change volume by more than a factor of \
+                  2=1.26^3 over one time step */
+#define const_max_u_change 0.1f
 
 /* Neighbour search constants. */
-#define const_eta_kernel        1.2349f         /* Corresponds to 48 ngbs with the cubic spline kernel */
-#define const_delta_nwneigh     1.f
+#define const_eta_kernel \
+  1.2349f /* Corresponds to 48 ngbs with the cubic spline kernel */
+#define const_delta_nwneigh 1.f
 #define CUBIC_SPLINE_KERNEL
 
 /* Gravity stuff. */
-#define const_theta_max         0.57735f        /* Opening criteria, which is the ratio of the
-                                                   cell distance over the cell width. */
+#define const_theta_max                                   \
+  0.57735f /* Opening criteria, which is the ratio of the \
+              cell distance over the cell width. */
 // #define const_G                 6.67384e-8f     /* Gravitational constant. */
-#define const_G                 6.672e-8f     /* Gravitational constant. */
-#define const_epsilon           0.0014f         /* Gravity blending distance. */
-#define const_iepsilon          714.285714286f  /* Inverse gravity blending distance. */
-#define const_iepsilon2         (const_iepsilon*const_iepsilon)
-#define const_iepsilon3         (const_iepsilon2*const_iepsilon)
-#define const_iepsilon4         (const_iepsilon2*const_iepsilon2)
-#define const_iepsilon5         (const_iepsilon3*const_iepsilon2)
-#define const_iepsilon6         (const_iepsilon3*const_iepsilon3)
+#define const_G 6.672e-8f             /* Gravitational constant. */
+#define const_epsilon 0.0014f         /* Gravity blending distance. */
+#define const_iepsilon 714.285714286f /* Inverse gravity blending distance. */
+#define const_iepsilon2 (const_iepsilon* const_iepsilon)
+#define const_iepsilon3 (const_iepsilon2* const_iepsilon)
+#define const_iepsilon4 (const_iepsilon2* const_iepsilon2)
+#define const_iepsilon5 (const_iepsilon3* const_iepsilon2)
+#define const_iepsilon6 (const_iepsilon3* const_iepsilon3)
 
 /* SPH variant to use */
 #define LEGACY_GADGET2_SPH
 
-
 /* System of units */
-#define const_unit_length_in_cgs       1 /* 3.08567810e16  /\* 1Mpc *\/ */
-#define const_unit_mass_in_cgs         1 /* 1.9891e33      /\* 1 M_sun *\/ */
-#define const_unit_velocity_in_cgs     1 /* 1e5            /\* km s^-1 *\/ */
+#define const_unit_length_in_cgs 1   /* 3.08567810e16  /\* 1Mpc *\/ */
+#define const_unit_mass_in_cgs 1     /* 1.9891e33      /\* 1 M_sun *\/ */
+#define const_unit_velocity_in_cgs 1 /* 1e5            /\* km s^-1 *\/ */
+
+#endif /* SWIFT_CONST_H */
diff --git a/src/cycle.h b/src/cycle.h
index 16f57e7e1ef942d2736f4328be9117b2deab6d6e..1278c83e8b43324662bdeb0de75eec08faf4fd82 100644
--- a/src/cycle.h
+++ b/src/cycle.h
@@ -23,7 +23,6 @@
  *
  */
 
-
 /* machine-dependent cycle counters code. Needs to be inlined. */
 
 /***************************************************************************/
@@ -52,25 +51,28 @@
    defined according to whether the corresponding function/type/header
    is available on your system.  The necessary macros are most
    conveniently defined if you are using GNU autoconf, via the tests:
-   
+
    dnl ---------------------------------------------------------------------
 
    AC_C_INLINE
    AC_HEADER_TIME
    AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
 
-   AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H
+   AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if
+hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H
 #include <sys/time.h>
 #endif])
 
-   AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time])
+   AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime
+mach_absolute_time])
 
    dnl Cray UNICOS _rtc() (real-time clock) intrinsic
    AC_MSG_CHECKING([for _rtc intrinsic])
    rtc_ok=yes
    AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H
 #include <intrinsics.h>
-#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no])
+#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc()
+intrinsic.])], [rtc_ok=no])
    AC_MSG_RESULT($rtc_ok)
 
    dnl ---------------------------------------------------------------------
@@ -79,24 +81,25 @@
 /***************************************************************************/
 
 #if TIME_WITH_SYS_TIME
-# include <sys/time.h>
-# include <time.h>
+#include <sys/time.h>
+#include <time.h>
 #else
-# if HAVE_SYS_TIME_H
-#  include <sys/time.h>
-# else
-#  include <time.h>
-# endif
+#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#else
+#include <time.h>
+#endif
 #endif
 
-#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \
-{									  \
-     return (double)t1 - (double)t0;					  \
-}
+#define INLINE_ELAPSED(INL)                       \
+  static INL double elapsed(ticks t1, ticks t0) { \
+    return (double)t1 - (double)t0;               \
+  }
 
 /*----------------------------------------------------------------*/
 /* Solaris */
-#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER)
+#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && \
+    !defined(HAVE_TICK_COUNTER)
 typedef hrtime_t ticks;
 
 #define getticks gethrtime
@@ -108,22 +111,22 @@ INLINE_ELAPSED(inline)
 
 /*----------------------------------------------------------------*/
 /* AIX v. 4+ routines to read the real-time clock or time-base register */
-#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER)
+#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && \
+    !defined(HAVE_TICK_COUNTER)
 typedef timebasestruct_t ticks;
 
-static __inline ticks getticks(void)
-{
-     ticks t;
-     read_real_time(&t, TIMEBASE_SZ);
-     return t;
+static __inline ticks getticks(void) {
+  ticks t;
+  read_real_time(&t, TIMEBASE_SZ);
+  return t;
 }
 
 static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */
 {
-     time_base_to_time(&t1, TIMEBASE_SZ);
-     time_base_to_time(&t0, TIMEBASE_SZ);
-     return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + 
-	     ((double)t1.tb_low - (double)t0.tb_low));
+  time_base_to_time(&t1, TIMEBASE_SZ);
+  time_base_to_time(&t0, TIMEBASE_SZ);
+  return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 +
+          ((double)t1.tb_low - (double)t0.tb_low));
 }
 
 #define HAVE_TICK_COUNTER
@@ -133,20 +136,23 @@ static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */
 /*
  * PowerPC ``cycle'' counter using the time base register.
  */
-#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__))))  && !defined(HAVE_TICK_COUNTER)
+#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || \
+       (defined(__MWERKS__) && defined(macintosh)))) ||                     \
+     (defined(__IBM_GCC_ASM) &&                                             \
+      (defined(__powerpc__) || defined(__ppc__)))) &&                       \
+    !defined(HAVE_TICK_COUNTER)
 typedef unsigned long long ticks;
 
-static __inline__ ticks getticks(void)
-{
-     unsigned int tbl, tbu0, tbu1;
+static __inline__ ticks getticks(void) {
+  unsigned int tbl, tbu0, tbu1;
 
-     do {
-	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
-	  __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
-	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
-     } while (tbu0 != tbu1);
+  do {
+    __asm__ __volatile__("mftbu %0" : "=r"(tbu0));
+    __asm__ __volatile__("mftb %0" : "=r"(tbl));
+    __asm__ __volatile__("mftbu %0" : "=r"(tbu1));
+  } while (tbu0 != tbu1);
 
-     return (((unsigned long long)tbu0) << 32) | tbl;
+  return (((unsigned long long)tbu0) << 32) | tbl;
 }
 
 INLINE_ELAPSED(__inline__)
@@ -156,7 +162,8 @@ INLINE_ELAPSED(__inline__)
 
 /* MacOS/Mach (Darwin) time-base register interface (unlike UpTime,
    from Carbon, requires no additional libraries to be linked). */
-#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER)
+#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && \
+    !defined(HAVE_TICK_COUNTER)
 #include <mach/mach_time.h>
 typedef uint64_t ticks;
 #define getticks mach_absolute_time
@@ -166,31 +173,31 @@ INLINE_ELAPSED(__inline__)
 
 /*----------------------------------------------------------------*/
 /*
- * Pentium cycle counter 
+ * Pentium cycle counter
  */
-#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__)  && !defined(HAVE_TICK_COUNTER)
+#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__) && \
+    !defined(HAVE_TICK_COUNTER)
 typedef unsigned long long ticks;
 
 #ifndef INLINE
-# if __GNUC__ && !__GNUC_STDC_INLINE__
-#  define INLINE extern inline
-# else
-#  define INLINE inline
-# endif
+#if __GNUC__ && !__GNUC_STDC_INLINE__
+#define INLINE extern inline
+#else
+#define INLINE inline
 #endif
-INLINE static ticks getticks(void)
-{
-     ticks ret;
+#endif
+INLINE static ticks getticks(void) {
+  ticks ret;
 
-     __asm__ __volatile__("rdtsc": "=A" (ret));
-     /* no input, nothing else clobbered */
-     return ret;
+  __asm__ __volatile__("rdtsc" : "=A"(ret));
+  /* no input, nothing else clobbered */
+  return ret;
 }
 
 INLINE_ELAPSED(__inline__)
 
 #define HAVE_TICK_COUNTER
-#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */
 #endif
 
 /* Visual C++ -- thanks to Morten Nissov for his help with this */
@@ -199,46 +206,43 @@ INLINE_ELAPSED(__inline__)
 typedef LARGE_INTEGER ticks;
 #define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */
 
-static __inline ticks getticks(void)
-{
-     ticks retval;
-
-     __asm {
-	  RDTSC
-	  mov retval.HighPart, edx
-	  mov retval.LowPart, eax
-     }
-     return retval;
+static __inline ticks getticks(void) {
+  ticks retval;
+  /* RDTSC yields EDX:EAX; "__asm" separates instructions on one line. */
+  __asm {
+    RDTSC
+    mov retval.HighPart, edx __asm mov retval.LowPart, eax
+  }
+  return retval;
 }
 
-static __inline double elapsed(ticks t1, ticks t0)
-{  
-     return (double)t1.QuadPart - (double)t0.QuadPart;
-}  
+static __inline double elapsed(ticks t1, ticks t0) {
+  return (double)t1.QuadPart - (double)t0.QuadPart;
+}
 
 #define HAVE_TICK_COUNTER
-#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */
 #endif
 
 /*----------------------------------------------------------------*/
 /*
  * X86-64 cycle counter
  */
-#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__)  && !defined(HAVE_TICK_COUNTER)
+#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && \
+    defined(__x86_64__) && !defined(HAVE_TICK_COUNTER)
 typedef unsigned long long ticks;
 
 #ifndef INLINE
-# if __GNUC__ && !__GNUC_STDC_INLINE__
-#  define INLINE extern inline
-# else
-#  define INLINE inline
-# endif
+#if __GNUC__ && !__GNUC_STDC_INLINE__
+#define INLINE extern inline
+#else
+#define INLINE inline
 #endif
-INLINE static ticks getticks(void)
-{
-     unsigned a, d; 
-     asm volatile("rdtsc" : "=a" (a), "=d" (d)); 
-     return ((ticks)a) | (((ticks)d) << 32); 
+#endif
+INLINE static ticks getticks(void) {
+  unsigned a, d;
+  asm volatile("rdtsc" : "=a"(a), "=d"(d));
+  return ((ticks)a) | (((ticks)d) << 32);
 }
 
 INLINE_ELAPSED(__inline__)
@@ -249,18 +253,18 @@ INLINE_ELAPSED(__inline__)
 /* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori.
    NOTE: this code will fail to link unless you use the -Masmkeyword compiler
    option (grrr). */
-#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) 
+#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER)
 typedef unsigned long long ticks;
-static ticks getticks(void)
-{
-    asm(" rdtsc; shl    $0x20,%rdx; mov    %eax,%eax; or     %rdx,%rax;    ");
+static ticks getticks(void) {
+  asm(" rdtsc; shl    $0x20,%rdx; mov    %eax,%eax; or     %rdx,%rax;    ");
 }
 INLINE_ELAPSED(__inline__)
 #define HAVE_TICK_COUNTER
 #endif
 
 /* Visual C++, courtesy of Dirk Michaelis */
-#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER)
+#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && \
+    !defined(HAVE_TICK_COUNTER)
 
 #include <intrin.h>
 #pragma intrinsic(__rdtsc)
@@ -277,17 +281,15 @@ INLINE_ELAPSED(__inline)
  */
 
 /* intel's icc/ecc compiler */
-#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && \
+    !defined(HAVE_TICK_COUNTER)
 typedef unsigned long ticks;
 #include <ia64intrin.h>
 
-static __inline__ ticks getticks(void)
-{
-     return __getReg(_IA64_REG_AR_ITC);
-}
- 
+static __inline__ ticks getticks(void) { return __getReg(_IA64_REG_AR_ITC); }
+
 INLINE_ELAPSED(__inline__)
- 
+
 #define HAVE_TICK_COUNTER
 #endif
 
@@ -295,12 +297,11 @@ INLINE_ELAPSED(__inline__)
 #if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
 typedef unsigned long ticks;
 
-static __inline__ ticks getticks(void)
-{
-     ticks ret;
+static __inline__ ticks getticks(void) {
+  ticks ret;
 
-     __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret));
-     return ret;
+  __asm__ __volatile__("mov %0=ar.itc" : "=r"(ret));
+  return ret;
 }
 
 INLINE_ELAPSED(__inline__)
@@ -313,12 +314,11 @@ INLINE_ELAPSED(__inline__)
 #include <machine/sys/inline.h>
 typedef unsigned long ticks;
 
-static inline ticks getticks(void)
-{
-     ticks ret;
+static inline ticks getticks(void) {
+  ticks ret;
 
-     ret = _Asm_mov_from_ar (_AREG_ITC);
-     return ret;
+  ret = _Asm_mov_from_ar(_AREG_ITC);
+  return ret;
 }
 
 INLINE_ELAPSED(inline)
@@ -330,17 +330,17 @@ INLINE_ELAPSED(inline)
 #if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER)
 typedef unsigned __int64 ticks;
 
-#  ifdef __cplusplus
+#ifdef __cplusplus
 extern "C"
-#  endif
-ticks __getReg(int whichReg);
+#endif
+    ticks
+        __getReg(int whichReg);
 #pragma intrinsic(__getReg)
 
-static __inline ticks getticks(void)
-{
-     volatile ticks temp;
-     temp = __getReg(3116);
-     return temp;
+static __inline ticks getticks(void) {
+  volatile ticks temp;
+  temp = __getReg(3116);
+  return temp;
 }
 
 INLINE_ELAPSED(inline)
@@ -350,29 +350,27 @@ INLINE_ELAPSED(inline)
 
 /*----------------------------------------------------------------*/
 /*
- * PA-RISC cycle counter 
+ * PA-RISC cycle counter
  */
 #if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER)
 typedef unsigned long ticks;
 
-#  ifdef __GNUC__
-static __inline__ ticks getticks(void)
-{
-     ticks ret;
+#ifdef __GNUC__
+static __inline__ ticks getticks(void) {
+  ticks ret;
 
-     __asm__ __volatile__("mfctl 16, %0": "=r" (ret));
-     /* no input, nothing else clobbered */
-     return ret;
+  __asm__ __volatile__("mfctl 16, %0" : "=r"(ret));
+  /* no input, nothing else clobbered */
+  return ret;
 }
-#  else
-#  include <machine/inline.h>
-static inline unsigned long getticks(void)
-{
-     register ticks ret;
-     _MFCTL(16, ret);
-     return ret;
+#else
+#include <machine/inline.h>
+static inline unsigned long getticks(void) {
+  register ticks ret;
+  _MFCTL(16, ret);
+  return ret;
 }
-#  endif
+#endif
 
 INLINE_ELAPSED(inline)
 
@@ -384,11 +382,10 @@ INLINE_ELAPSED(inline)
 #if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER)
 typedef unsigned long long ticks;
 
-static __inline__ ticks getticks(void)
-{
-     ticks cycles;
-     __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc");
-     return cycles;
+static __inline__ ticks getticks(void) {
+  ticks cycles;
+  __asm__("stck 0(%0)" : : "a"(&(cycles)) : "memory", "cc");
+  return cycles;
 }
 
 INLINE_ELAPSED(__inline__)
@@ -398,16 +395,15 @@ INLINE_ELAPSED(__inline__)
 /*----------------------------------------------------------------*/
 #if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER)
 /*
- * The 32-bit cycle counter on alpha overflows pretty quickly, 
+ * The 32-bit cycle counter on alpha overflows pretty quickly,
  * unfortunately.  A 1GHz machine overflows in 4 seconds.
  */
 typedef unsigned int ticks;
 
-static __inline__ ticks getticks(void)
-{
-     unsigned long cc;
-     __asm__ __volatile__ ("rpcc %0" : "=r"(cc));
-     return (cc & 0xFFFFFFFF);
+static __inline__ ticks getticks(void) {
+  unsigned long cc;
+  __asm__ __volatile__("rpcc %0" : "=r"(cc));
+  return (cc & 0xFFFFFFFF);
 }
 
 INLINE_ELAPSED(__inline__)
@@ -419,11 +415,10 @@ INLINE_ELAPSED(__inline__)
 #if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER)
 typedef unsigned long ticks;
 
-static __inline__ ticks getticks(void)
-{
-     ticks ret;
-     __asm__ __volatile__("rd %%tick, %0" : "=r" (ret));
-     return ret;
+static __inline__ ticks getticks(void) {
+  ticks ret;
+  __asm__ __volatile__("rd %%tick, %0" : "=r"(ret));
+  return ret;
 }
 
 INLINE_ELAPSED(__inline__)
@@ -432,15 +427,15 @@ INLINE_ELAPSED(__inline__)
 #endif
 
 /*----------------------------------------------------------------*/
-#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER)
-#  include <c_asm.h>
+#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && \
+    defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER)
+#include <c_asm.h>
 typedef unsigned int ticks;
 
-static __inline ticks getticks(void)
-{
-     unsigned long cc;
-     cc = asm("rpcc %v0");
-     return (cc & 0xFFFFFFFF);
+static __inline ticks getticks(void) {
+  unsigned long cc;
+  cc = asm("rpcc %v0");
+  return (cc & 0xFFFFFFFF);
 }
 
 INLINE_ELAPSED(__inline)
@@ -449,20 +444,19 @@ INLINE_ELAPSED(__inline)
 #endif
 /*----------------------------------------------------------------*/
 /* SGI/Irix */
-#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER)
+#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && \
+    !defined(HAVE_TICK_COUNTER)
 typedef struct timespec ticks;
 
-static inline ticks getticks(void)
-{
-     struct timespec t;
-     clock_gettime(CLOCK_SGI_CYCLE, &t);
-     return t;
+static inline ticks getticks(void) {
+  struct timespec t;
+  clock_gettime(CLOCK_SGI_CYCLE, &t);
+  return t;
 }
 
-static inline double elapsed(ticks t1, ticks t0)
-{
-     return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 +
-	  ((double)t1.tv_nsec - (double)t0.tv_nsec);
+static inline double elapsed(ticks t1, ticks t0) {
+  return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 +
+         ((double)t1.tv_nsec - (double)t0.tv_nsec);
 }
 #define HAVE_TICK_COUNTER
 #endif
@@ -471,7 +465,7 @@ static inline double elapsed(ticks t1, ticks t0)
 /* Cray UNICOS _rtc() intrinsic function */
 #if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER)
 #ifdef HAVE_INTRINSICS_H
-#  include <intrinsics.h>
+#include <intrinsics.h>
 #endif
 
 typedef long long ticks;
@@ -493,25 +487,23 @@ INLINE_ELAPSED(inline)
 
 typedef uint64_t ticks;
 
-static inline ticks getticks(void)
-{
+static inline ticks getticks(void) {
   static uint64_t* addr = 0;
 
-  if (addr == 0)
-  {
+  if (addr == 0) {
     uint32_t rq_addr = 0x10030000;
     int fd;
     int pgsize;
 
     pgsize = getpagesize();
-    fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0);
+    fd = open("/dev/mem", O_RDONLY | O_SYNC, 0);
     if (fd < 0) {
       perror("open");
       return NULL;
     }
     addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr);
     close(fd);
-    if (addr == (uint64_t *)-1) {
+    if (addr == (uint64_t*)-1) {
       perror("mmap");
       return NULL;
     }
@@ -525,4 +517,3 @@ INLINE_ELAPSED(inline)
 #define HAVE_TICK_COUNTER
 #endif
 #endif /* HAVE_MIPS_ZBUS_TIMER */
-
diff --git a/src/debug.c b/src/debug.c
index 75c726bfced9f9b5f1b09aa276b92fbc06ae3882..0ebbd44ae03ffc6ddfabab78e577335f0b9bbe5a 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -2,125 +2,111 @@
  * This file is part of SWIFT.
  * Coypright (c) 2013 Matthieu Schaller (matthieu.schaller@durham.ac.uk),
  *                    Pedro Gonnet (pedro.gonnet@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
-
 #include <stdio.h>
 
-#include "const.h"
-#include "part.h"
+/* This object's header. */
+#include "debug.h"
 
+/**
+ * @brief Dump the information pertaining to the given cell.
+ */
+
+void print_cell(struct cell *c) {
+  printf(
+      "## Cell 0x%0zx: loc=[%.3e,%.3e,%.3e], h=[%.3e,%.3e,%.3e], depth=%i, "
+      "split=%i, maxdepth=%i.\n",
+      (size_t)c, c->loc[0], c->loc[1], c->loc[2], c->h[0], c->h[1], c->h[2],
+      c->depth, c->split, c->maxdepth);
+}
 
 /**
- * @brief Looks for the particle with the given id and prints its information to the standard output.
- * 
+ * @brief Looks for the particle with the given id and prints its information to
+ * the standard output.
+ *
  * @param parts The array of particles.
  * @param id The id too look for.
  * @param N The size of the array of particles.
  *
  * (Should be used for debugging only as it runs in O(N).)
  */
- 
-void printParticle ( struct part *parts , long long int id, int N ) {
-
-    int i, found = 0;
-
-    /* Look for the particle. */
-    for ( i = 0 ; i < N ; i++ )
-        if ( parts[i].id == id ) {
-            printf("## Particle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n",
-                i,
-                parts[i].id,
-                parts[i].x[0], parts[i].x[1], parts[i].x[2],
-                parts[i].v[0], parts[i].v[1], parts[i].v[2],
-                parts[i].a[0], parts[i].a[1], parts[i].a[2],
-                parts[i].h,
-                parts[i].force.h_dt,
-                parts[i].density.wcount,
-                parts[i].mass,
-                parts[i].rho, parts[i].rho_dh,
-                parts[i].density.div_v,
-                parts[i].u,
-                parts[i].force.u_dt,
-                parts[i].force.balsara,
-                parts[i].force.POrho2,
-                parts[i].force.v_sig,
-                parts[i].dt
-                );
-            found = 1;
-            }
-        
-    if ( !found )
-        printf("## Particles[???] id=%lld not found\n", id);
-    
+
+void printParticle(struct part *parts, long long int id, int N) {
+
+  int i, found = 0;
+
+  /* Look for the particle. */
+  for (i = 0; i < N; i++)
+    if (parts[i].id == id) {
+      printf(
+          "## Particle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], "
+          "v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, "
+          "wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, "
+          "dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n",
+          i, parts[i].id, parts[i].x[0], parts[i].x[1], parts[i].x[2],
+          parts[i].v[0], parts[i].v[1], parts[i].v[2], parts[i].a[0],
+          parts[i].a[1], parts[i].a[2], parts[i].h, parts[i].force.h_dt,
+          parts[i].density.wcount, parts[i].mass, parts[i].rho, parts[i].rho_dh,
+          parts[i].density.div_v, parts[i].u, parts[i].force.u_dt,
+          parts[i].force.balsara, parts[i].force.POrho2, parts[i].force.v_sig,
+          parts[i].dt);
+      found = 1;
     }
 
+  if (!found) printf("## Particles[???] id=%lld not found\n", id);
+}
+
+void printgParticle(struct gpart *parts, long long int id, int N) {
+
+  int i, found = 0;
 
-void printgParticle ( struct gpart *parts , long long int id, int N ) {
-
-    int i, found = 0;
-
-    /* Look for the particle. */
-    for ( i = 0 ; i < N ; i++ )
-        if ( parts[i].id == -id || ( parts[i].id > 0 && parts[i].part->id == id ) ) {
-            printf("## gParticle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], m=%.3e, dt=%.3e\n",
-                i,
-                (parts[i].id < 0) ? -parts[i].id : parts[i].part->id ,
-                parts[i].x[0], parts[i].x[1], parts[i].x[2],
-                parts[i].v[0], parts[i].v[1], parts[i].v[2],
-                parts[i].a[0], parts[i].a[1], parts[i].a[2],
-                parts[i].mass,
-                parts[i].dt
-                );
-            found = 1;
-            }
-        
-    if ( !found )
-        printf("## Particles[???] id=%lld not found\n", id);
-    
+  /* Look for the particle. */
+  for (i = 0; i < N; i++)
+    if (parts[i].id == -id || (parts[i].id > 0 && parts[i].part->id == id)) {
+      printf(
+          "## gParticle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], "
+          "v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], m=%.3e, dt=%.3e\n",
+          i, (parts[i].id < 0) ? -parts[i].id : parts[i].part->id,
+          parts[i].x[0], parts[i].x[1], parts[i].x[2], parts[i].v[0],
+          parts[i].v[1], parts[i].v[2], parts[i].a[0], parts[i].a[1],
+          parts[i].a[2], parts[i].mass, parts[i].dt);
+      found = 1;
     }
 
+  if (!found) printf("## Particles[???] id=%lld not found\n", id);
+}
 
 /**
  * @brief Prints the details of a given particle to stdout
- * 
+ *
  * @param p The particle to print
- * 
+ *
  */
- 
-void printParticle_single ( struct part *p ) {
-
-    printf("## Particle: id=%lld, x=[%e,%e,%e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n",
-        p->id,
-        p->x[0], p->x[1], p->x[2],
-        p->v[0], p->v[1], p->v[2],
-        p->a[0], p->a[1], p->a[2],
-        p->h,
-        p->force.h_dt,
-        p->density.wcount,
-        p->mass,
-        p->rho, p->rho_dh,
-        p->density.div_v,
-        p->u,
-        p->force.u_dt,
-        p->force.balsara,
-        p->force.POrho2,
-        p->force.v_sig,
-        p->dt
-        );
-    }
 
+void printParticle_single(struct part *p) {
+
+  printf(
+      "## Particle: id=%lld, x=[%e,%e,%e], v=[%.3e,%.3e,%.3e], "
+      "a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, "
+      "rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, "
+      "v_sig=%.3e, dt=%.3e\n",
+      p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2], p->a[0],
+      p->a[1], p->a[2], p->h, p->force.h_dt, p->density.wcount, p->mass, p->rho,
+      p->rho_dh, p->density.div_v, p->u, p->force.u_dt, p->force.balsara,
+      p->force.POrho2, p->force.v_sig, p->dt);
+}
diff --git a/src/debug.h b/src/debug.h
index 5db731a857ef32792c4f0a377eb97e475ab6b782..42269fc267e6d5721990b992c92c515a5764f6a9 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -1,25 +1,31 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_DEBUG_H
+#define SWIFT_DEBUG_H
 
+/* Includes. */
+#include "cell.h"
+#include "part.h"
 
-
-
+void print_cell(struct cell *c);
 void printParticle(struct part *parts, long long int i, int N);
 void printgParticle(struct gpart *parts, long long int i, int N);
-void printParticle_single ( struct part *p );
+void printParticle_single(struct part *p);
+
+#endif /* SWIFT_DEBUG_H */
diff --git a/src/engine.c b/src/engine.c
index 36b6215772f2220788aed71eeef7f89c94e0cdb6..88d9547b83a10327a6a7183f13f5bb499da43948 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -1,66 +1,53 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
 
 /* Some standard headers. */
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-#include <pthread.h>
-#include <math.h>
 #include <float.h>
 #include <limits.h>
 #include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
-    #include <mpi.h>
-
+#include <mpi.h>
 /* METIS headers only used when MPI is also available. */
-    #ifdef HAVE_METIS
-        #include <metis.h>
-    #endif
+#ifdef HAVE_METIS
+#include <metis.h>
+#endif
 #endif
 
+/* This object's header. */
+#include "engine.h"
+
 /* Local headers. */
-#include "const.h"
-#include "cycle.h"
 #include "atomic.h"
-#include "timers.h"
-#include "const.h"
-#include "vector.h"
-#include "lock.h"
-#include "task.h"
-#include "part.h"
-#include "debug.h"
-#include "space.h"
-#include "multipole.h"
 #include "cell.h"
-#include "queue.h"
-#include "scheduler.h"
-#include "engine.h"
-#include "runner.h"
-#include "proxy.h"
+#include "cycle.h"
+#include "debug.h"
 #include "error.h"
+#include "timers.h"
 
 #ifdef LEGACY_GADGET2_SPH
 #include "runner_iact_legacy.h"
@@ -68,14 +55,85 @@
 #include "runner_iact.h"
 #endif
 
-
 /* Convert cell location to ID. */
-#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) )
-
+#define cell_getid(cdim, i, j, k) \
+  ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
 
 /** The rank of the engine as a global variable (for messages). */
 int engine_rank;
 
+/**
+ * @brief Check if a single particle is OK: non-NULL, non-zero mass and h,
+ *        and not sitting exactly at the origin.
+ * @return Zero if all checks passed, non-zero otherwise.
+ */
+int engine_check_part(struct part *p) {
+  if (p == NULL || p->mass == 0.0f || p->h == 0.0f) {
+    message("Bad particle data.");
+    /* Only print the particle if there is one: p may be NULL here. */
+    if (p != NULL) printParticle_single(p);
+    return 1;
+  } else if (p->x[0] == 0.0 && p->x[1] == 0.0 && p->x[2] == 0.0) {
+    message("Bad particle location.");
+    printParticle_single(p);
+    return 1;
+  }
+  return 0;
+}
+
+/**
+ * @brief Check if a cell's data is reasonable, also check if its particles
+ *        are OK. Nothing is returned: each failed check calls error().
+ *
+ * @param c The #cell to check.
+ * @param data Not used by this function.
+ */
+void engine_check_cell(struct cell *c, void *data) {
+  /* Check the cell data. */
+  if (c->count == 0) {
+    print_cell(c);
+    error("Empty cell.");
+  }
+
+  /* Check the particles. */
+  for (int k = 0; k < c->count; k++) {
+    if (engine_check_part(&c->parts[k])) {
+      print_cell(c);
+      error("Bad particle in cell.");
+    }
+  }
+
+  /* Check that the progeny, if any, contain all the particles. */
+  if (c->split) {
+    int count = 0;
+    for (int k = 0; k < 8; k++) {
+      if (c->progeny[k] != NULL) {
+        count += c->progeny[k]->count;
+      }
+    }
+    if (count != c->count) {
+      print_cell(c);
+      error("Progeny cell counts don't add up.");
+    }
+  }
+}
+
+/**
+ * @brief Runs a series of checks to make sure we have no bad particles.
+ * @param e The #engine whose space (e->s) is checked.
+ */
+void engine_check(struct engine *e) {
+  /* Check all particles directly. */
+  struct space *s = e->s;
+  for (int k = 0; k < s->nr_parts; k++) {
+    if (engine_check_part(&s->parts[k])) {
+      error("Bad particle s->parts[%i], aborting.", k);
+    }
+  }
+
+  /* Check each cell in the space. */
+  space_map_cells_post(s, 1, &engine_check_cell, NULL);
+}
 
 /**
  * @brief Link a density/force task to a cell.
@@ -86,16 +144,14 @@ int engine_rank;
  *
  * @return The new #link pointer.
  */
- 
-struct link *engine_addlink( struct engine *e , struct link *l , struct task *t ) {
-
-    struct link *res = &e->links[ atomic_inc( &e->nr_links ) ];
-    res->next = l;
-    res->t = t;
-    return res;
 
-    }
+struct link *engine_addlink(struct engine *e, struct link *l, struct task *t) {
 
+  struct link *res = &e->links[atomic_inc(&e->nr_links)];
+  res->next = l;
+  res->t = t;
+  return res;
+}
 
 /**
  * @brief Generate the ghost and kick tasks for a hierarchy of cells.
@@ -104,46 +160,44 @@ struct link *engine_addlink( struct engine *e , struct link *l , struct task *t
  * @param c The #cell.
  * @param super The super #cell.
  */
- 
-void engine_mkghosts ( struct engine *e , struct cell *c , struct cell *super ) {
-
-    int k;
-    struct scheduler *s = &e->sched;
-
-    /* Am I the super-cell? */
-    if ( super == NULL && c->nr_tasks > 0 ) {
-    
-        /* Remember me. */
-        super = c;
-        
-        /* Local tasks only... */
-        if ( c->nodeID == e->nodeID ) {
-        
-            /* Generate the ghost task. */
-            c->ghost = scheduler_addtask( s , task_type_ghost , task_subtype_none , 0 , 0 , c , NULL , 0 );
-
-            /* Add the kick2 task. */
-            c->kick2 = scheduler_addtask( s , task_type_kick2 , task_subtype_none , 0 , 0 , c , NULL , 0 );
-
-            /* Add the kick1 task if needed. */
-            if ( !(e->policy & engine_policy_fixdt) )
-                c->kick1 = scheduler_addtask( s , task_type_kick1 , task_subtype_none , 0 , 0 , c , NULL , 0 );
-                
-            }
-            
-        }
-        
-    /* Set the super-cell. */
-    c->super = super;
-        
-    /* Recurse. */
-    if ( c->split )
-        for ( k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL )
-                engine_mkghosts( e , c->progeny[k] , super );
-    
+
+void engine_mkghosts(struct engine *e, struct cell *c, struct cell *super) {
+
+  int k;
+  struct scheduler *s = &e->sched;
+
+  /* Am I the super-cell? */
+  if (super == NULL && c->nr_tasks > 0) {
+
+    /* Remember me. */
+    super = c;
+
+    /* Local tasks only... */
+    if (c->nodeID == e->nodeID) {
+
+      /* Generate the ghost task. */
+      c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, 0,
+                                   c, NULL, 0);
+
+      /* Add the kick2 task. */
+      c->kick2 = scheduler_addtask(s, task_type_kick2, task_subtype_none, 0, 0,
+                                   c, NULL, 0);
+
+      /* Add the kick1 task if needed. */
+      if (!(e->policy & engine_policy_fixdt))
+        c->kick1 = scheduler_addtask(s, task_type_kick1, task_subtype_none, 0,
+                                     0, c, NULL, 0);
     }
+  }
 
+  /* Set the super-cell. */
+  c->super = super;
+
+  /* Recurse. */
+  if (c->split)
+    for (k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) engine_mkghosts(e, c->progeny[k], super);
+}
 
 /**
  * @brief Redistribute the particles amongst the nodes accorind
@@ -151,432 +205,465 @@ void engine_mkghosts ( struct engine *e , struct cell *c , struct cell *super )
  *
  * @param e The #engine.
  */
- 
-void engine_redistribute ( struct engine *e ) {
+
+void engine_redistribute(struct engine *e) {
 
 #ifdef WITH_MPI
 
-    int i, j, k, cid;
-    int nr_nodes = e->nr_nodes, nodeID = e->nodeID;
-    struct space *s = e->s;
-    int my_cells = 0;
-    int *cdim = s->cdim;
-    struct cell *cells = s->cells;
-    int nr_cells = s->nr_cells;
-
-    /* Start by sorting the particles according to their nodes and
-       getting the counts. */
-    int *counts, *dest;
-    struct part *parts = s->parts;
-    double ih[3], dim[3];
-    ih[0] = s->ih[0]; ih[1] = s->ih[1]; ih[2] = s->ih[2];
-    dim[0] = s->dim[0]; dim[1] = s->dim[1]; dim[2] = s->dim[2];
-    if ( ( counts = (int *)malloc( sizeof(int) * nr_nodes * nr_nodes ) ) == NULL ||
-         ( dest = (int *)malloc( sizeof(int) * s->nr_parts ) ) == NULL )
-        error( "Failed to allocate count and dest buffers." );
-    bzero( counts , sizeof(int) * nr_nodes * nr_nodes );
-    for ( k = 0 ; k < s->nr_parts ; k++ ) {
-        for ( j = 0 ; j < 3 ; j++ ) {
-            if ( parts[k].x[j] < 0.0 ) parts[k].x[j] += dim[j];
-            else if ( parts[k].x[j] >= dim[j] ) parts[k].x[j] -= dim[j];
-            }
-        cid = cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] );
-        dest[k] = cells[ cid ].nodeID;
-        counts[ nodeID*nr_nodes + dest[k] ] += 1;
-        }
-    parts_sort( s->parts , s->xparts , dest , s->nr_parts , 0 , nr_nodes-1 );
-    
-    /* Get all the counts from all the nodes. */
-    if ( MPI_Allreduce( MPI_IN_PLACE , counts , nr_nodes * nr_nodes , MPI_INT , MPI_SUM , MPI_COMM_WORLD ) != MPI_SUCCESS )
-        error( "Failed to allreduce particle transfer counts." );
-
-    /* Get the new number of parts for this node, be generous in allocating. */
-    int nr_parts = 0;
-    for ( k = 0 ; k < nr_nodes ; k++ )
-        nr_parts += counts[ k*nr_nodes + nodeID ];
-    struct part *parts_new;
-    struct xpart *xparts_new, *xparts = s->xparts;
-    if ( posix_memalign( (void **)&parts_new , part_align , sizeof(struct part) * nr_parts * 1.2 ) != 0 ||
-         posix_memalign( (void **)&xparts_new , part_align , sizeof(struct xpart) * nr_parts * 1.2 ) != 0 )
-        error( "Failed to allocate new part data." );
-        
-    /* Emit the sends and recvs for the particle data. */
-    MPI_Request *reqs;
-    if ( ( reqs = (MPI_Request *)malloc( sizeof(MPI_Request) * 4 * nr_nodes ) ) == NULL )
-        error( "Failed to allocate MPI request list." );
-    for ( k = 0 ; k < 4*nr_nodes ; k++ )
-        reqs[k] = MPI_REQUEST_NULL;
-    for ( i = 0 , j = 0 , k = 0 ; k < nr_nodes ; k++ ) {
-        if ( k == nodeID && counts[ nodeID*nr_nodes + k ] > 0 ) {
-            memcpy( &parts_new[j] , &parts[i] , sizeof(struct part) * counts[ k*nr_nodes + nodeID ] );
-            memcpy( &xparts_new[j] , &xparts[i] , sizeof(struct xpart) * counts[ k*nr_nodes + nodeID ] );
-            i += counts[ nodeID*nr_nodes + k ];
-            j += counts[ k*nr_nodes + nodeID ];
-            }
-        if ( k != nodeID && counts[ nodeID*nr_nodes + k ] > 0 ) {
-            if ( MPI_Isend( &parts[i] , sizeof(struct part) * counts[ nodeID*nr_nodes + k ] , MPI_BYTE , k , 2*(nodeID*nr_nodes + k) + 0 , MPI_COMM_WORLD , &reqs[4*k] ) != MPI_SUCCESS )
-                error( "Failed to isend parts to node %i." , k );
-            if ( MPI_Isend( &xparts[i] , sizeof(struct xpart) * counts[ nodeID*nr_nodes + k ] , MPI_BYTE , k , 2*(nodeID*nr_nodes + k) + 1 , MPI_COMM_WORLD , &reqs[4*k+1] ) != MPI_SUCCESS )
-                error( "Failed to isend xparts to node %i." , k );
-            i += counts[ nodeID*nr_nodes + k ];
-            }
-        if ( k != nodeID && counts[ k*nr_nodes + nodeID ] > 0 ) {
-            if ( MPI_Irecv( &parts_new[j] , sizeof(struct part) * counts[ k*nr_nodes + nodeID ] , MPI_BYTE , k , 2*(k*nr_nodes + nodeID) + 0 , MPI_COMM_WORLD , &reqs[4*k+2] ) != MPI_SUCCESS )
-                error( "Failed to emit irecv of parts from node %i." , k );
-            if ( MPI_Irecv( &xparts_new[j] , sizeof(struct xpart) * counts[ k*nr_nodes + nodeID ] , MPI_BYTE , k , 2*(k*nr_nodes + nodeID) + 1 , MPI_COMM_WORLD , &reqs[4*k+3] ) != MPI_SUCCESS )
-                error( "Failed to emit irecv of parts from node %i." , k );
-            j += counts[ k*nr_nodes + nodeID ];
-            }
-        }
-        
-    /* Wait for all the sends and recvs to tumble in. */
-    MPI_Status stats[4*nr_nodes];
-    int res;
-    if ( ( res = MPI_Waitall( 4*nr_nodes , reqs , stats ) ) != MPI_SUCCESS ) {
-        for ( k = 0 ; k < 4*nr_nodes ; k++ ) {
-        char buff[ MPI_MAX_ERROR_STRING ];
-        int res;
-        MPI_Error_string( stats[k].MPI_ERROR , buff , &res );
-        message( "request %i has error '%s'." , k , buff );
-        }
-    message( "counts is [ %i %i %i %i ]." , counts[0] , counts[1] , counts[2] , counts[3] );
-        error( "Failed during waitall for part data." );
-        }
+  int i, j, k, cid;
+  int nr_nodes = e->nr_nodes, nodeID = e->nodeID;
+  struct space *s = e->s;
+  int my_cells = 0;
+  int *cdim = s->cdim;
+  struct cell *cells = s->cells;
+  int nr_cells = s->nr_cells;
+
+  /* Start by sorting the particles according to their nodes and
+     getting the counts. */
+  int *counts, *dest;
+  struct part *parts = s->parts;
+  double ih[3], dim[3];
+  ih[0] = s->ih[0];
+  ih[1] = s->ih[1];
+  ih[2] = s->ih[2];
+  dim[0] = s->dim[0];
+  dim[1] = s->dim[1];
+  dim[2] = s->dim[2];
+  if ((counts = (int *)malloc(sizeof(int) *nr_nodes *nr_nodes)) == NULL ||
+      (dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
+    error("Failed to allocate count and dest buffers.");
+  bzero(counts, sizeof(int) * nr_nodes * nr_nodes);
+  for (k = 0; k < s->nr_parts; k++) {
+    for (j = 0; j < 3; j++) {
+      if (parts[k].x[j] < 0.0)
+        parts[k].x[j] += dim[j];
+      else if (parts[k].x[j] >= dim[j])
+        parts[k].x[j] -= dim[j];
+    }
+    cid = cell_getid(cdim, parts[k].x[0] * ih[0], parts[k].x[1] * ih[1],
+                     parts[k].x[2] * ih[2]);
+    dest[k] = cells[cid].nodeID;
+    counts[nodeID * nr_nodes + dest[k]] += 1;
+  }
+  parts_sort(s->parts, s->xparts, dest, s->nr_parts, 0, nr_nodes - 1);
+
+  /* Get all the counts from all the nodes. */
+  if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM,
+                    MPI_COMM_WORLD) != MPI_SUCCESS)
+    error("Failed to allreduce particle transfer counts.");
+
+  /* Get the new number of parts for this node, be generous in allocating. */
+  int nr_parts = 0;
+  for (k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID];
+  struct part *parts_new;
+  struct xpart *xparts_new, *xparts = s->xparts;
+  if (posix_memalign((void **)&parts_new, part_align,
+                     sizeof(struct part) * nr_parts * 1.2) != 0 ||
+      posix_memalign((void **)&xparts_new, part_align,
+                     sizeof(struct xpart) * nr_parts * 1.2) != 0)
+    error("Failed to allocate new part data.");
+
+  /* Emit the sends and recvs for the particle data. */
+  MPI_Request *reqs;
+  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 4 * nr_nodes)) ==
+      NULL)
+    error("Failed to allocate MPI request list.");
+  for (k = 0; k < 4 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;
+  for (i = 0, j = 0, k = 0; k < nr_nodes; k++) {
+    if (k == nodeID && counts[nodeID * nr_nodes + k] > 0) {
+      memcpy(&parts_new[j], &parts[i],
+             sizeof(struct part) * counts[k * nr_nodes + nodeID]);
+      memcpy(&xparts_new[j], &xparts[i],
+             sizeof(struct xpart) * counts[k * nr_nodes + nodeID]);
+      i += counts[nodeID * nr_nodes + k];
+      j += counts[k * nr_nodes + nodeID];
+    }
+    if (k != nodeID && counts[nodeID * nr_nodes + k] > 0) {
+      if (MPI_Isend(&parts[i],
+                    sizeof(struct part) * counts[nodeID * nr_nodes + k],
+                    MPI_BYTE, k, 2 * (nodeID * nr_nodes + k) + 0,
+                    MPI_COMM_WORLD, &reqs[4 * k]) != MPI_SUCCESS)
+        error("Failed to isend parts to node %i.", k);
+      if (MPI_Isend(&xparts[i],
+                    sizeof(struct xpart) * counts[nodeID * nr_nodes + k],
+                    MPI_BYTE, k, 2 * (nodeID * nr_nodes + k) + 1,
+                    MPI_COMM_WORLD, &reqs[4 * k + 1]) != MPI_SUCCESS)
+        error("Failed to isend xparts to node %i.", k);
+      i += counts[nodeID * nr_nodes + k];
+    }
+    if (k != nodeID && counts[k * nr_nodes + nodeID] > 0) {
+      if (MPI_Irecv(&parts_new[j],
+                    sizeof(struct part) * counts[k * nr_nodes + nodeID],
+                    MPI_BYTE, k, 2 * (k * nr_nodes + nodeID) + 0,
+                    MPI_COMM_WORLD, &reqs[4 * k + 2]) != MPI_SUCCESS)
+        error("Failed to emit irecv of parts from node %i.", k);
+      if (MPI_Irecv(&xparts_new[j],
+                    sizeof(struct xpart) * counts[k * nr_nodes + nodeID],
+                    MPI_BYTE, k, 2 * (k * nr_nodes + nodeID) + 1,
+                    MPI_COMM_WORLD, &reqs[4 * k + 3]) != MPI_SUCCESS)
+        error("Failed to emit irecv of parts from node %i.", k);
+      j += counts[k * nr_nodes + nodeID];
+    }
+  }
+
+  /* Wait for all the sends and recvs to tumble in. */
+  MPI_Status stats[4 * nr_nodes];
+  int res;
+  if ((res = MPI_Waitall(4 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
+    for (k = 0; k < 4 * nr_nodes; k++) {
+      char buff[MPI_MAX_ERROR_STRING];
+      int res;
+      MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
+      message("request %i has error '%s'.", k, buff);
+    }
+    error("Failed during waitall for part data.");
+  }
+
+  /* Verify that all parts are in the right place. */
+  /* for ( k = 0 ; k < nr_parts ; k++ ) {
+      cid = cell_getid( cdim , parts_new[k].x[0]*ih[0] , parts_new[k].x[1]*ih[1]
+     , parts_new[k].x[2]*ih[2] );
+      if ( cells[ cid ].nodeID != nodeID )
+          error( "Received particle (%i) that does not belong here (nodeID=%i)."
+     , k , cells[ cid ].nodeID );
+      } */
+
+  /* Set the new part data, free the old. */
+  free(parts);
+  free(xparts);
+  s->parts = parts_new;
+  s->xparts = xparts_new;
+  s->nr_parts = nr_parts;
+  s->size_parts = 1.2 * nr_parts;
+
+  /* Be verbose about what just happened. */
+  for (k = 0; k < nr_cells; k++)
+    if (cells[k].nodeID == nodeID) my_cells += 1;
+  message("node %i now has %i parts in %i cells.", nodeID, nr_parts, my_cells);
+
+  /* Clean up other stuff. */
+  free(reqs);
+  free(counts);
+  free(dest);
 
-    /* Verify that all parts are in the right place. */
-    /* for ( k = 0 ; k < nr_parts ; k++ ) {
-        cid = cell_getid( cdim , parts_new[k].x[0]*ih[0] , parts_new[k].x[1]*ih[1] , parts_new[k].x[2]*ih[2] );
-        if ( cells[ cid ].nodeID != nodeID )
-            error( "Received particle (%i) that does not belong here (nodeID=%i)." , k , cells[ cid ].nodeID );
-        } */
-        
-    /* Set the new part data, free the old. */
-    free( parts );
-    free( xparts );
-    s->parts = parts_new;
-    s->xparts = xparts_new;
-    s->nr_parts = nr_parts;
-    s->size_parts = 1.2*nr_parts;
-    
-    /* Be verbose about what just happened. */
-    for ( k = 0 ; k < nr_cells ; k++ )
-        if ( cells[k].nodeID == nodeID )
-            my_cells += 1;
-    message( "node %i now has %i parts in %i cells." , nodeID , nr_parts , my_cells );
-    
-    /* Clean up other stuff. */
-    free( reqs );
-    free( counts );
-    free( dest );
-        
 #else
-    error( "SWIFT was not compiled with MPI and METIS support." );
+  error("SWIFT was not compiled with MPI and METIS support.");
 #endif
-
-    }
-
+}
 
 /**
  * @brief Repartition the cells amongst the nodes.
  *
  * @param e The #engine.
  */
- 
-void engine_repartition ( struct engine *e ) {
+
+void engine_repartition(struct engine *e) {
 
 #if defined(WITH_MPI) && defined(HAVE_METIS)
 
-    int i, j, k, l, cid, cjd, ii, jj, kk, res, w;
-    idx_t *inds, *nodeIDs;
-    idx_t *weights_v, *weights_e;
-    struct space *s = e->s;
-    int nr_cells = s->nr_cells, my_cells = 0;
-    struct cell *cells = s->cells;
-    int ind[3], *cdim = s->cdim;
-    struct task *t, *tasks = e->sched.tasks;
-    struct cell *ci, *cj;
-    int nr_nodes = e->nr_nodes, nodeID = e->nodeID;
-    float wscale = 1.0, vscale = 1e-3, wscale_buff;
-    idx_t wtot = 0;
-    const idx_t wmax = 1e9 / e->nr_nodes;
-    
-    /* Clear the repartition flag. */
-    e->forcerepart = 0;
-    
-    /* Allocate the inds and weights. */
-    if ( ( inds = (idx_t *)malloc( sizeof(idx_t) * 26*nr_cells ) ) == NULL ||
-         ( weights_v = (idx_t *)malloc( sizeof(idx_t) * nr_cells ) ) == NULL ||
-         ( weights_e = (idx_t *)malloc( sizeof(idx_t) * 26*nr_cells ) ) == NULL ||
-         ( nodeIDs = (idx_t *)malloc( sizeof(idx_t) * nr_cells ) ) == NULL )
-        error( "Failed to allocate inds and weights arrays." );
-        
-    /* Fill the inds array. */
-    for ( cid = 0 ; cid < nr_cells ; cid++ ) {
-        ind[0] = cells[cid].loc[0] / s->cells[cid].h[0] + 0.5;
-        ind[1] = cells[cid].loc[1] / s->cells[cid].h[1] + 0.5;
-        ind[2] = cells[cid].loc[2] / s->cells[cid].h[2] + 0.5;
-        l = 0;
-        for ( i = -1 ; i <= 1 ; i++ ) {
-            ii = ind[0] + i;
-            if ( ii < 0 ) ii += cdim[0];
-            else if ( ii >= cdim[0] ) ii -= cdim[0];
-            for ( j = -1 ; j <= 1 ; j++ ) {
-                jj = ind[1] + j;
-                if ( jj < 0 ) jj += cdim[1];
-                else if ( jj >= cdim[1] ) jj -= cdim[1];
-                for ( k = -1 ; k <= 1 ; k++ ) {
-                    kk = ind[2] + k;
-                    if ( kk < 0 ) kk += cdim[2];
-                    else if ( kk >= cdim[2] ) kk -= cdim[2];
-                    if ( i || j || k ) {
-                        inds[ cid*26 + l ] = cell_getid( cdim , ii , jj , kk );
-                        l += 1;
-                        }
-                    }
-                }
-            }
-        }
-        
-    /* Init the weights arrays. */
-    bzero( weights_e , sizeof(idx_t) * 26*nr_cells );
-    bzero( weights_v , sizeof(idx_t) * nr_cells );
-    
-    /* Loop over the tasks... */
-    for ( j = 0 ; j < e->sched.nr_tasks ; j++ ) {
-    
-        /* Get a pointer to the kth task. */
-        t = &tasks[j];
-        
-        /* Skip un-interesting tasks. */
-        if ( t->type != task_type_self &&
-             t->type != task_type_pair &&
-             t->type != task_type_sub &&
-             t->type != task_type_ghost &&
-             t->type != task_type_kick1 &&
-             t->type != task_type_kick2 )
-            continue;
-            
-        /* Get the task weight. */
-        w = ( t->toc - t->tic ) * wscale;
-        if ( w < 0 )
-            error( "Bad task weight (%i)." , w );
-            
-        /* Do we need to re-scale? */
-        wtot += w;
-        if (wtot > wmax) {
-          wscale /= 2;
-          wtot /= 2;
-          for (k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5;
-          for (k = 0; k < nr_cells; k++) weights_v[k] *= 0.5;
+  int i, j, k, l, cid, cjd, ii, jj, kk, res;
+  idx_t *inds, *nodeIDs;
+  idx_t *weights_v, *weights_e;
+  struct space *s = e->s;
+  int nr_cells = s->nr_cells, my_cells = 0;
+  struct cell *cells = s->cells;
+  int ind[3], *cdim = s->cdim;
+  struct task *t, *tasks = e->sched.tasks;
+  struct cell *ci, *cj;
+  int nr_nodes = e->nr_nodes, nodeID = e->nodeID;
+  float wscale = 1e-3, vscale = 1e-3, wscale_buff;
+  idx_t wtot = 0;
+  const idx_t wmax = 1e9 / e->nr_nodes;
+
+  /* Clear the repartition flag. */
+  e->forcerepart = 0;
+
+  /* Allocate the inds and weights. */
+  if ((inds = (idx_t *)malloc(sizeof(idx_t) * 26 *nr_cells)) == NULL ||
+      (weights_v = (idx_t *)malloc(sizeof(idx_t) *nr_cells)) == NULL ||
+      (weights_e = (idx_t *)malloc(sizeof(idx_t) * 26 *nr_cells)) == NULL ||
+      (nodeIDs = (idx_t *)malloc(sizeof(idx_t) * nr_cells)) == NULL)
+    error("Failed to allocate inds and weights arrays.");
+
+  /* Fill the inds array. */
+  for (cid = 0; cid < nr_cells; cid++) {
+    ind[0] = cells[cid].loc[0] / s->cells[cid].h[0] + 0.5;
+    ind[1] = cells[cid].loc[1] / s->cells[cid].h[1] + 0.5;
+    ind[2] = cells[cid].loc[2] / s->cells[cid].h[2] + 0.5;
+    l = 0;
+    for (i = -1; i <= 1; i++) {
+      ii = ind[0] + i;
+      if (ii < 0)
+        ii += cdim[0];
+      else if (ii >= cdim[0])
+        ii -= cdim[0];
+      for (j = -1; j <= 1; j++) {
+        jj = ind[1] + j;
+        if (jj < 0)
+          jj += cdim[1];
+        else if (jj >= cdim[1])
+          jj -= cdim[1];
+        for (k = -1; k <= 1; k++) {
+          kk = ind[2] + k;
+          if (kk < 0)
+            kk += cdim[2];
+          else if (kk >= cdim[2])
+            kk -= cdim[2];
+          if (i || j || k) {
+            inds[cid * 26 + l] = cell_getid(cdim, ii, jj, kk);
+            l += 1;
+          }
         }
-        
-        /* Get the top-level cells involved. */
-        for ( ci = t->ci ; ci->parent != NULL ; ci = ci->parent );
-        if ( t->cj != NULL )
-            for ( cj = t->cj ; cj->parent != NULL ; cj = cj->parent );
-        else
-            cj = NULL;
-            
-        /* Get the cell IDs. */
-        cid = ci - cells;
-            
-        /* Different weights for different tasks. */
-        if ( t->type == task_type_ghost ||
-             t->type == task_type_kick1 || 
-             t->type == task_type_kick2 ) {
-             
-            /* Particle updates add only to vertex weight. */
-            weights_v[cid] += w;
-            
-            }
-        
-        /* Self interaction? */     
-        else if ( ( t->type == task_type_self && ci->nodeID == nodeID ) ||
-             ( t->type == task_type_sub && cj == NULL && ci->nodeID == nodeID ) ) {
-        
-            /* Self interactions add only to vertex weight. */
-            weights_v[cid] += w;
-            
-            }
-            
-        /* Pair? */
-        else if ( t->type == task_type_pair ||
-                  ( t->type == task_type_sub && cj != NULL ) ) {
-                  
-            /* In-cell pair? */
-            if ( ci == cj ) {
-            
-                /* Add weight to vertex for ci. */
-                weights_v[cid] += w;
-            
-                }
-                
-            /* Distinct cells with local ci? */
-            else if ( ci->nodeID == nodeID ) {
-            
-                /* Index of the jth cell. */
-                cjd = cj - cells;
-                
-                /* Add half of weight to each cell. */
-                if ( ci->nodeID == nodeID )
-                    weights_v[cid] += 0.5 * w;
-                if ( cj->nodeID == nodeID )
-                    weights_v[cjd] += 0.5 * w;
-                    
-                /* Add Weight to edge. */
-                for ( k = 26*cid ; inds[k] != cjd ; k++ );
-                weights_e[ k ] += w;
-                for ( k = 26*cjd ; inds[k] != cid ; k++ );
-                weights_e[ k ] += w;
-            
-                }
-                  
-            }
-    
-        }
-        
-    /* Get the minimum scaling and re-scale if necessary. */
-    if ( ( res = MPI_Allreduce( &wscale , &wscale_buff , 1 , MPI_FLOAT , MPI_MIN , MPI_COMM_WORLD ) ) != MPI_SUCCESS ) {
-        char buff[ MPI_MAX_ERROR_STRING ];
-        MPI_Error_string( res , buff , &i );
-        error( "Failed to allreduce the weight scales (%s)." , buff );
+      }
+    }
+  }
+
+  /* Init the weights arrays. */
+  bzero(weights_e, sizeof(idx_t) * 26 * nr_cells);
+  bzero(weights_v, sizeof(idx_t) * nr_cells);
+
+  /* Loop over the tasks... */
+  for (j = 0; j < e->sched.nr_tasks; j++) {
+
+    /* Get a pointer to the kth task. */
+    t = &tasks[j];
+
+    /* Skip un-interesting tasks. */
+    if (t->type != task_type_self && t->type != task_type_pair &&
+        t->type != task_type_sub && t->type != task_type_ghost &&
+        t->type != task_type_kick1 && t->type != task_type_kick2)
+      continue;
+
+    /* Get the task weight. */
+    idx_t w = (t->toc - t->tic) * wscale;
+    if (w < 0) error("Bad task weight (%i).", w);
+
+    /* Do we need to re-scale? */
+    wtot += w;
+    while (wtot > wmax) {
+      wscale /= 2;
+      wtot /= 2;
+      w /= 2;
+      for (k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5;
+      for (k = 0; k < nr_cells; k++) weights_v[k] *= 0.5;
+    }
+
+    /* Get the top-level cells involved. */
+    for (ci = t->ci; ci->parent != NULL; ci = ci->parent)
+      ;
+    if (t->cj != NULL)
+      for (cj = t->cj; cj->parent != NULL; cj = cj->parent)
+        ;
+    else
+      cj = NULL;
+
+    /* Get the cell IDs. */
+    cid = ci - cells;
+
+    /* Different weights for different tasks. */
+    if (t->type == task_type_ghost || t->type == task_type_kick1 ||
+        t->type == task_type_kick2) {
+
+      /* Particle updates add only to vertex weight. */
+      weights_v[cid] += w;
+
     }
-    if (wscale_buff != wscale) {
-      float scale = wscale_buff / wscale;
-      for (k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale;
-      for (k = 0; k < nr_cells; k++) weights_v[k] *= scale;
+
+    /* Self interaction? */
+    else if ((t->type == task_type_self && ci->nodeID == nodeID) ||
+             (t->type == task_type_sub && cj == NULL && ci->nodeID == nodeID)) {
+
+      /* Self interactions add only to vertex weight. */
+      weights_v[cid] += w;
+
+    }
+
+    /* Pair? */
+    else if (t->type == task_type_pair ||
+             (t->type == task_type_sub && cj != NULL)) {
+
+      /* In-cell pair? */
+      if (ci == cj) {
+
+        /* Add weight to vertex for ci. */
+        weights_v[cid] += w;
+
+      }
+
+      /* Distinct cells with local ci? */
+      else if (ci->nodeID == nodeID) {
+
+        /* Index of the jth cell. */
+        cjd = cj - cells;
+
+        /* Add half of weight to each cell. */
+        if (ci->nodeID == nodeID) weights_v[cid] += 0.5 * w;
+        if (cj->nodeID == nodeID) weights_v[cjd] += 0.5 * w;
+
+        /* Add Weight to edge. */
+        for (k = 26 * cid; inds[k] != cjd; k++)
+          ;
+        weights_e[k] += w;
+        for (k = 26 * cjd; inds[k] != cid; k++)
+          ;
+        weights_e[k] += w;
+      }
     }
-        
-    /* Merge the weights arrays accross all nodes. */
-#if IDXTYPEWIDTH==32
-    if ( ( res = MPI_Reduce( ( nodeID == 0 ) ? MPI_IN_PLACE : weights_v , weights_v , nr_cells , MPI_INT , MPI_SUM , 0 , MPI_COMM_WORLD ) ) != MPI_SUCCESS ) {
+  }
+
+  /* Get the minimum scaling and re-scale if necessary. */
+  if ((res = MPI_Allreduce(&wscale, &wscale_buff, 1, MPI_FLOAT, MPI_MIN,
+                           MPI_COMM_WORLD)) != MPI_SUCCESS) {
+    char buff[MPI_MAX_ERROR_STRING];
+    MPI_Error_string(res, buff, &i);
+    error("Failed to allreduce the weight scales (%s).", buff);
+  }
+  if (wscale_buff != wscale) {
+    float scale = wscale_buff / wscale;
+    for (k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale;
+    for (k = 0; k < nr_cells; k++) weights_v[k] *= scale;
+  }
+
+/* Merge the weights arrays across all nodes. */
+#if IDXTYPEWIDTH == 32
+  if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_v, weights_v,
+                        nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) !=
+      MPI_SUCCESS) {
 #else
-    if ( ( res = MPI_Reduce( ( nodeID == 0 ) ? MPI_IN_PLACE : weights_v , weights_v , nr_cells , MPI_LONG_LONG_INT , MPI_SUM , 0 , MPI_COMM_WORLD ) ) != MPI_SUCCESS ) {
+  if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_v, weights_v,
+                        nr_cells, MPI_LONG_LONG_INT, MPI_SUM, 0,
+                        MPI_COMM_WORLD)) != MPI_SUCCESS) {
 #endif
-        char buff[ MPI_MAX_ERROR_STRING ];
-        MPI_Error_string( res , buff , &i );
-        error( "Failed to allreduce vertex weights (%s)." , buff );
-        }
-#if IDXTYPEWIDTH==32
-    if ( MPI_Reduce( ( nodeID == 0 ) ? MPI_IN_PLACE : weights_e , weights_e , 26*nr_cells , MPI_INT , MPI_SUM , 0 , MPI_COMM_WORLD ) != MPI_SUCCESS )
+    char buff[MPI_MAX_ERROR_STRING];
+    MPI_Error_string(res, buff, &i);
+    error("Failed to allreduce vertex weights (%s).", buff);
+  }
+#if IDXTYPEWIDTH == 32
+  if (MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e,
+                 26 * nr_cells, MPI_INT, MPI_SUM, 0,
+                 MPI_COMM_WORLD) != MPI_SUCCESS)
 #else
-    if ( MPI_Reduce( ( nodeID == 0 ) ? MPI_IN_PLACE : weights_e , weights_e , 26*nr_cells , MPI_LONG_LONG_INT , MPI_SUM , 0 , MPI_COMM_WORLD ) != MPI_SUCCESS )
+  if (MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e,
+                 26 * nr_cells, MPI_LONG_LONG_INT, MPI_SUM, 0,
+                 MPI_COMM_WORLD) != MPI_SUCCESS)
 #endif
-       error( "Failed to allreduce edge weights." );
-        
-    /* As of here, only one node needs to compute the partition. */
-    if ( nodeID == 0 ) {
-    
-        /* Check that the edge weights are fully symmetric. */
-        /* for ( cid = 0 ; cid < nr_cells ; cid++ )
-            for ( k = 0 ; k < 26 ; k++ ) {
-                cjd = inds[ cid*26 + k ];
-                for ( j = 26*cjd ; inds[j] != cid ; j++ );
-                if ( weights_e[ cid*26+k ] != weights_e[ j ] )
-                    error( "Unsymmetric edge weights detected (%i vs %i)." , weights_e[ cid*26+k ] , weights_e[ j ] );
-                } */
-        /* int w_min = weights_e[0], w_max = weights_e[0], w_tot = weights_e[0];
-        for ( k = 1 ; k < 26*nr_cells ; k++ ) {
-            w_tot += weights_e[k];
-            if ( weights_e[k] < w_min )
-                w_min = weights_e[k];
-            else if ( weights_e[k] > w_max )
-                w_max = weights_e[k];
-            }
-        message( "edge weights in [ %i , %i ], tot=%i." , w_min , w_max , w_tot );
-        w_min = weights_e[0], w_max = weights_e[0]; w_tot = weights_v[0];
-        for ( k = 1 ; k < nr_cells ; k++ ) {
-            w_tot += weights_v[k];
-            if ( weights_v[k] < w_min )
-                w_min = weights_v[k];
-            else if ( weights_v[k] > w_max )
-                w_max = weights_v[k];
-            }
-        message( "vertex weights in [ %i , %i ], tot=%i." , w_min , w_max , w_tot ); */
-                
-        /* Make sure there are no zero weights. */
-        for ( k = 0 ; k < 26*nr_cells ; k++ )
-            if ( weights_e[k] == 0 )
-                weights_e[k] = 1;
-        for ( k = 0 ; k < nr_cells ; k++ )
-            if ( ( weights_v[k] *= vscale ) == 0 )
-                weights_v[k] = 1;
-    
-        /* Allocate and fill the connection array. */
-        idx_t *offsets;
-        if ( ( offsets = (idx_t *)malloc( sizeof(idx_t) * (nr_cells + 1) ) ) == NULL )
-            error( "Failed to allocate offsets buffer." );
-        offsets[0] = 0;
-        for ( k = 0 ; k < nr_cells ; k++ )
-            offsets[k+1] = offsets[k] + 26;
-            
-        /* Set the METIS options. */
-        idx_t options[METIS_NOPTIONS];
-        METIS_SetDefaultOptions( options );
-        options[ METIS_OPTION_OBJTYPE ] = METIS_OBJTYPE_CUT;
-        options[ METIS_OPTION_NUMBERING ] = 0;
-        options[ METIS_OPTION_CONTIG ] = 1;
-        options[ METIS_OPTION_NCUTS ] = 10;
-        options[ METIS_OPTION_NITER ] = 20;
-        // options[ METIS_OPTION_UFACTOR ] = 1;
-        
-        /* Set the initial partition, although this is probably ignored. */
-        for ( k = 0 ; k < nr_cells ; k++ )
-            nodeIDs[k] = cells[k].nodeID;
-            
-        /* Call METIS. */
-        idx_t one = 1, idx_nr_cells = nr_cells, idx_nr_nodes = nr_nodes;
-        idx_t objval;
-        if ( METIS_PartGraphRecursive( &idx_nr_cells , &one , offsets , inds , weights_v , NULL , weights_e , &idx_nr_nodes , NULL , NULL , options , &objval , nodeIDs ) != METIS_OK )
-            error( "Call to METIS_PartGraphKway failed." );
-
-        /* Dump the 3d array of cell IDs. */
-        /* printf( "engine_repartition: nodeIDs = reshape( [" );
-        for ( i = 0 ; i < cdim[0]*cdim[1]*cdim[2] ; i++ )
-            printf( "%i " , (int)nodeIDs[ i ] );
-        printf("] ,%i,%i,%i);\n",cdim[0],cdim[1],cdim[2]); */
-    
+    error("Failed to allreduce edge weights.");
+
+  /* As of here, only one node needs to compute the partition. */
+  if (nodeID == 0) {
+
+    /* Check that the edge weights are fully symmetric. */
+    /* for ( cid = 0 ; cid < nr_cells ; cid++ )
+        for ( k = 0 ; k < 26 ; k++ ) {
+            cjd = inds[ cid*26 + k ];
+            for ( j = 26*cjd ; inds[j] != cid ; j++ );
+            if ( weights_e[ cid*26+k ] != weights_e[ j ] )
+                error( "Unsymmetric edge weights detected (%i vs %i)." ,
+       weights_e[ cid*26+k ] , weights_e[ j ] );
+            } */
+    /* int w_min = weights_e[0], w_max = weights_e[0], w_tot = weights_e[0];
+    for ( k = 1 ; k < 26*nr_cells ; k++ ) {
+        w_tot += weights_e[k];
+        if ( weights_e[k] < w_min )
+            w_min = weights_e[k];
+        else if ( weights_e[k] > w_max )
+            w_max = weights_e[k];
         }
-        
-    /* Broadcast the result of the partition. */
-    if ( MPI_Bcast( nodeIDs , nr_cells , MPI_INT , 0 , MPI_COMM_WORLD ) != MPI_SUCCESS )
-        error( "Failed to bcast the node IDs." );
-        
-    /* Set the cell nodeIDs and clear any non-local parts. */
-    for ( k = 0 ; k < nr_cells ; k++ ) {
-        cells[k].nodeID = nodeIDs[k];
-        if ( nodeIDs[k] == nodeID )
-            my_cells += 1;
+    message( "edge weights in [ %i , %i ], tot=%i." , w_min , w_max , w_tot );
+    w_min = weights_e[0], w_max = weights_e[0]; w_tot = weights_v[0];
+    for ( k = 1 ; k < nr_cells ; k++ ) {
+        w_tot += weights_v[k];
+        if ( weights_v[k] < w_min )
+            w_min = weights_v[k];
+        else if ( weights_v[k] > w_max )
+            w_max = weights_v[k];
         }
-        
-    /* Clean up. */
-    free( inds );
-    free( weights_v );
-    free( weights_e );
-    free( nodeIDs );
-        
-    /* Now comes the tricky part: Exchange particles between all nodes.
-       This is done in two steps, first allreducing a matrix of 
-       how many particles go from where to where, then re-allocating
-       the parts array, and emiting the sends and receives.
-       Finally, the space, tasks, and proxies need to be rebuilt. */
-       
-    /* Redistribute the particles between the nodes. */
-    engine_redistribute( e );
-        
-    /* Make the proxies. */
-    engine_makeproxies( e );
-        
-    /* Tell the engine it should re-build whenever possible */
-    e->forcerebuild = 1;
-    
+    message( "vertex weights in [ %i , %i ], tot=%i." , w_min , w_max , w_tot );
+    */
+
+    /* Make sure there are no zero weights. */
+    for (k = 0; k < 26 * nr_cells; k++)
+      if (weights_e[k] == 0) weights_e[k] = 1;
+    for (k = 0; k < nr_cells; k++)
+      if ((weights_v[k] *= vscale) == 0) weights_v[k] = 1;
+
+    /* Allocate and fill the connection array. */
+    idx_t *offsets;
+    if ((offsets = (idx_t *)malloc(sizeof(idx_t) * (nr_cells + 1))) == NULL)
+      error("Failed to allocate offsets buffer.");
+    offsets[0] = 0;
+    for (k = 0; k < nr_cells; k++) offsets[k + 1] = offsets[k] + 26;
+
+    /* Set the METIS options. +1 to keep the GCC sanitizer happy. */
+    idx_t options[METIS_NOPTIONS + 1];
+    METIS_SetDefaultOptions(options);
+    options[METIS_OPTION_OBJTYPE] = METIS_OBJTYPE_CUT;
+    options[METIS_OPTION_NUMBERING] = 0;
+    options[METIS_OPTION_CONTIG] = 1;
+    options[METIS_OPTION_NCUTS] = 10;
+    options[METIS_OPTION_NITER] = 20;
+    // options[ METIS_OPTION_UFACTOR ] = 1;
+
+    /* Set the initial partition, although this is probably ignored. */
+    for (k = 0; k < nr_cells; k++) nodeIDs[k] = cells[k].nodeID;
+
+    /* Call METIS. */
+    idx_t one = 1, idx_nr_cells = nr_cells, idx_nr_nodes = nr_nodes;
+    idx_t objval;
+    if (METIS_PartGraphRecursive(&idx_nr_cells, &one, offsets, inds, weights_v,
+                                 NULL, weights_e, &idx_nr_nodes, NULL, NULL,
+                                 options, &objval, nodeIDs) != METIS_OK)
+      error("Call to METIS_PartGraphKway failed.");
+
+    /* Dump the 3d array of cell IDs. */
+    /* printf( "engine_repartition: nodeIDs = reshape( [" );
+    for ( i = 0 ; i < cdim[0]*cdim[1]*cdim[2] ; i++ )
+        printf( "%i " , (int)nodeIDs[ i ] );
+    printf("] ,%i,%i,%i);\n",cdim[0],cdim[1],cdim[2]); */
+  }
+
+/* Broadcast the result of the partition. */
+#if IDXTYPEWIDTH == 32
+  if (MPI_Bcast(nodeIDs, nr_cells, MPI_INT, 0, MPI_COMM_WORLD) != MPI_SUCCESS)
+    error("Failed to bcast the node IDs.");
 #else
-    error( "SWIFT was not compiled with MPI and METIS support." );
+  if (MPI_Bcast(nodeIDs, nr_cells, MPI_LONG_LONG_INT, 0, MPI_COMM_WORLD) !=
+      MPI_SUCCESS)
+    error("Failed to bcast the node IDs.");
 #endif
 
-    }
-    
-    
+  /* Set the cell nodeIDs and clear any non-local parts. */
+  for (k = 0; k < nr_cells; k++) {
+    cells[k].nodeID = nodeIDs[k];
+    if (nodeIDs[k] == nodeID) my_cells += 1;
+  }
+
+  /* Clean up. */
+  free(inds);
+  free(weights_v);
+  free(weights_e);
+  free(nodeIDs);
+
+  /* Now comes the tricky part: Exchange particles between all nodes.
+     This is done in two steps, first allreducing a matrix of
+     how many particles go from where to where, then re-allocating
+     the parts array, and emitting the sends and receives.
+     Finally, the space, tasks, and proxies need to be rebuilt. */
+
+  /* Redistribute the particles between the nodes. */
+  engine_redistribute(e);
+
+  /* Make the proxies. */
+  engine_makeproxies(e);
+
+  /* Tell the engine it should re-build whenever possible */
+  e->forcerebuild = 1;
+
+#else
+  error("SWIFT was not compiled with MPI and METIS support.");
+#endif
+}
+
 /**
  * @brief Add up/down gravity tasks to a cell hierarchy.
  *
@@ -585,21 +672,20 @@ void engine_repartition ( struct engine *e ) {
  * @param up The upward gravity #task.
  * @param down The downward gravity #task.
  */
- 
-void engine_addtasks_grav ( struct engine *e , struct cell *c , struct task *up , struct task *down ) {
-
-    /* Link the tasks to this cell. */
-    c->grav_up = up;
-    c->grav_down = down;
-    
-    /* Recurse? */
-    if ( c->split )
-        for ( int k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL )
-                engine_addtasks_grav( e , c->progeny[k] , up , down );
 
-    }
+void engine_addtasks_grav(struct engine *e, struct cell *c, struct task *up,
+                          struct task *down) {
 
+  /* Store the given up/down gravity tasks on this cell. */
+  c->grav_up = up;
+  c->grav_down = down;
+
+  /* If the cell is split, link every non-NULL progeny to the same tasks. */
+  if (c->split)
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL)
+        engine_addtasks_grav(e, c->progeny[k], up, down);
+}
 
 /**
  * @brief Add send tasks to a hierarchy of cells.
@@ -609,44 +695,45 @@ void engine_addtasks_grav ( struct engine *e , struct cell *c , struct task *up
  * @param cj The receiving #cell
  */
 
-void engine_addtasks_send ( struct engine *e , struct cell *ci , struct cell *cj ) {
+void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj) {
 
-    int k;
-    struct link *l = NULL;
-    struct scheduler *s = &e->sched;
+  int k;
+  struct link *l = NULL;
+  struct scheduler *s = &e->sched;
 
-    /* Check if any of the density tasks are for the target node. */
-    for ( l = ci->density ; l != NULL ; l = l->next )
-        if ( l->t->ci->nodeID == cj->nodeID ||
-             ( l->t->cj != NULL && l->t->cj->nodeID == cj->nodeID ) )
-            break;
+  /* Find the first density task involving a cell on cj's node, if any. */
+  for (l = ci->density; l != NULL; l = l->next)
+    if (l->t->ci->nodeID == cj->nodeID ||
+        (l->t->cj != NULL && l->t->cj->nodeID == cj->nodeID))
+      break;
 
-    /* If so, attach send tasks. */
-    if ( l != NULL ) {
+  /* If one was found (l is non-NULL), attach send tasks at this level. */
+  if (l != NULL) {
 
-        /* Create the tasks. */
-        struct task *t_xv = scheduler_addtask( &e->sched , task_type_send , task_subtype_none , 2*ci->tag , 0 , ci , cj , 0 );
-        struct task *t_rho = scheduler_addtask( &e->sched , task_type_send , task_subtype_none , 2*ci->tag + 1 , 0 , ci , cj , 0 );
+    /* Create the xv and rho sends, passing 2*ci->tag and 2*ci->tag+1. */
+    struct task *t_xv =
+        scheduler_addtask(&e->sched, task_type_send, task_subtype_none,
+                          2 * ci->tag, 0, ci, cj, 0);
+    struct task *t_rho =
+        scheduler_addtask(&e->sched, task_type_send, task_subtype_none,
+                          2 * ci->tag + 1, 0, ci, cj, 0);
 
-        /* The send_rho task depends on the cell's ghost task. */
-        scheduler_addunlock( s , ci->super->ghost , t_rho );
+    /* The send_rho task depends on the super-cell's ghost task. */
+    scheduler_addunlock(s, ci->super->ghost, t_rho);
 
-        /* The send_rho task should unlock the super-cell's kick2 task. */
-        scheduler_addunlock( s , t_rho , ci->super->kick2 );
+    /* The send_rho task should unlock the super-cell's kick2 task. */
+    scheduler_addunlock(s, t_rho, ci->super->kick2);
 
-        /* The send_xv task should unlock the super-cell's ghost task. */
-        scheduler_addunlock( s , t_xv , ci->super->ghost );
+    /* The send_xv task should unlock the super-cell's ghost task. */
+    scheduler_addunlock(s, t_xv, ci->super->ghost);
 
-        }
-        
-    /* Recurse? */
-    else if ( ci->split )
-        for ( k = 0 ; k < 8 ; k++ )
-            if ( ci->progeny[k] != NULL )
-                engine_addtasks_send( e , ci->progeny[k] , cj );
-
-    }
+  }
 
+  /* Otherwise descend: a split cell retries with each existing progeny. */
+  else if (ci->split)
+    for (k = 0; k < 8; k++)
+      if (ci->progeny[k] != NULL) engine_addtasks_send(e, ci->progeny[k], cj);
+}
 
 /**
  * @brief Add recv tasks to a hierarchy of cells.
@@ -657,796 +744,799 @@ void engine_addtasks_send ( struct engine *e , struct cell *ci , struct cell *cj
  * @param t_rho The recv_rho #task, if it has already been created.
  */
 
-void engine_addtasks_recv ( struct engine *e , struct cell *c , struct task *t_xv , struct task *t_rho ) {
-
-    int k;
-    struct scheduler *s = &e->sched;
-
-    /* Do we need to construct a recv task? */
-    if ( t_xv == NULL && c->nr_density > 0 ) {
-    
-        /* Create the tasks. */
-        t_xv = c->recv_xv = scheduler_addtask( &e->sched , task_type_recv , task_subtype_none , 2*c->tag , 0 , c , NULL , 0 );
-        t_rho = c->recv_rho = scheduler_addtask( &e->sched , task_type_recv , task_subtype_none , 2*c->tag + 1 , 0 , c , NULL , 0 );
-        
-        }
-        
-    /* Add dependencies. */
-    for ( struct link *l = c->density ; l != NULL ; l = l->next ) {
-        scheduler_addunlock( s , t_xv , l->t );
-        scheduler_addunlock( s , l->t , t_rho );
-        }
-    for ( struct link *l = c->force ; l != NULL ; l = l->next )
-        scheduler_addunlock( s , t_rho , l->t );
-    if ( c->sorts != NULL )
-        scheduler_addunlock( s , t_xv , c->sorts );
-    
-    /* Recurse? */
-    if ( c->split )
-        for ( k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL )
-                engine_addtasks_recv( e , c->progeny[k] , t_xv , t_rho );
-
-    }
-
+void engine_addtasks_recv(struct engine *e, struct cell *c, struct task *t_xv,
+                          struct task *t_rho) {
+
+  int k;
+  struct scheduler *s = &e->sched;
+
+  /* No recv task passed in by the caller and c->nr_density > 0? */
+  if (t_xv == NULL && c->nr_density > 0) {
+
+    /* Create and store the recv pair, passing 2*c->tag and 2*c->tag+1. */
+    t_xv = c->recv_xv =
+        scheduler_addtask(&e->sched, task_type_recv, task_subtype_none,
+                          2 * c->tag, 0, c, NULL, 0);
+    t_rho = c->recv_rho =
+        scheduler_addtask(&e->sched, task_type_recv, task_subtype_none,
+                          2 * c->tag + 1, 0, c, NULL, 0);
+  }
+  /* Chain recv_xv -> density -> recv_rho -> force; recv_xv also unlocks sorts.
+     NOTE(review): t_xv/t_rho may still be NULL here -- confirm this is OK. */
+  for (struct link *l = c->density; l != NULL; l = l->next) {
+    scheduler_addunlock(s, t_xv, l->t);
+    scheduler_addunlock(s, l->t, t_rho);
+  }
+  for (struct link *l = c->force; l != NULL; l = l->next)
+    scheduler_addunlock(s, t_rho, l->t);
+  if (c->sorts != NULL) scheduler_addunlock(s, t_xv, c->sorts);
+
+  /* Pass the (possibly new) tasks down to every existing progeny. */
+  if (c->split)
+    for (k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL)
+        engine_addtasks_recv(e, c->progeny[k], t_xv, t_rho);
+}
 
 /**
  * @brief Exchange cell structures with other nodes.
  *
  * @param e The #engine.
  */
- 
-void engine_exchange_cells ( struct engine *e ) {
 
-#ifdef WITH_MPI
+void engine_exchange_cells(struct engine *e) {
 
-    int j, k, pid, count = 0;
-    struct pcell *pcells;
-    struct space *s = e->s;
-    struct cell *cells = s->cells;
-    int nr_cells = s->nr_cells;
-    int nr_proxies = e->nr_proxies;
-    int offset[ nr_cells ];
-    MPI_Request reqs_in[ engine_maxproxies ];
-    MPI_Request reqs_out[ engine_maxproxies ];
-    MPI_Status status;
-    struct part *parts = &s->parts[ s->nr_parts ];
-    
-    /* Run through the cells and get the size of the ones that will be sent off. */
-    for ( k = 0 ; k < nr_cells ; k++ ) {
-        offset[k] = count;
-        if ( cells[k].sendto )
-            count += ( cells[k].pcell_size = cell_getsize( &cells[k] ) );
-        }
-        
-    /* Allocate the pcells. */
-    if ( ( pcells = (struct pcell *)malloc( sizeof(struct pcell) * count ) ) == NULL )
-        error( "Failed to allocate pcell buffer." );
-        
-    /* Pack the cells. */
-    cell_next_tag = 0;
-    for ( k = 0 ; k < nr_cells ; k++ )
-        if ( cells[k].sendto ) {
-            cell_pack( &cells[k] , &pcells[ offset[k] ] );
-            cells[k].pcell = &pcells[ offset[k] ];
-            }
+#ifdef WITH_MPI
 
-    /* Launch the proxies. */
-    for ( k = 0 ; k < nr_proxies ; k++ ) {
-        proxy_cells_exch1( &e->proxies[k] );
-        reqs_in[k] = e->proxies[k].req_cells_count_in;
-        reqs_out[k] = e->proxies[k].req_cells_count_out;
-        }
-        
-    /* Wait for each count to come in and start the recv. */
-    for ( k = 0 ; k < nr_proxies ; k++ ) {
-        if ( MPI_Waitany( nr_proxies , reqs_in , &pid , &status ) != MPI_SUCCESS ||
-             pid == MPI_UNDEFINED )
-            error( "MPI_Waitany failed." );
-        // message( "request from proxy %i has arrived." , pid );
-        proxy_cells_exch2( &e->proxies[pid] );
-        }
-        
-    /* Wait for all the sends to have finnished too. */
-    if ( MPI_Waitall( nr_proxies , reqs_out , MPI_STATUSES_IGNORE ) != MPI_SUCCESS )
-        error( "MPI_Waitall on sends failed." );
-        
-    /* Set the requests for the cells. */
-    for ( k = 0 ; k < nr_proxies ; k++ ) {
-        reqs_in[k] = e->proxies[k].req_cells_in;
-        reqs_out[k] = e->proxies[k].req_cells_out;
-        }
-    
-    /* Wait for each pcell array to come in from the proxies. */
-    for ( k = 0 ; k < nr_proxies ; k++ ) {
-        if ( MPI_Waitany( nr_proxies , reqs_in , &pid , &status ) != MPI_SUCCESS ||
-             pid == MPI_UNDEFINED )
-            error( "MPI_Waitany failed." );
-        // message( "cell data from proxy %i has arrived." , pid );
-        for ( count = 0 , j = 0 ; j < e->proxies[pid].nr_cells_in ; j++ )
-            count += cell_unpack( &e->proxies[pid].pcells_in[count] , e->proxies[pid].cells_in[j] , e->s );
-        }
-        
-    /* Wait for all the sends to have finnished too. */
-    if ( MPI_Waitall( nr_proxies , reqs_out , MPI_STATUSES_IGNORE ) != MPI_SUCCESS )
-        error( "MPI_Waitall on sends failed." );
-        
-    /* Count the number of particles we need to import and re-allocate
-       the buffer if needed. */
-    for ( count = 0 , k = 0 ; k < nr_proxies ; k++ )
-        for ( j = 0 ; j < e->proxies[k].nr_cells_in ; j++ )
-            count += e->proxies[k].cells_in[j]->count;
-    if ( count > s->size_parts_foreign ) {
-        if ( s->parts_foreign != NULL )
-            free( s->parts_foreign );
-        s->size_parts_foreign = 1.1 * count;
-        if ( posix_memalign( (void **)&s->parts_foreign , part_align , sizeof(struct part) * s->size_parts_foreign ) != 0 )
-            error( "Failed to allocate foreign part data." );
-        }
-        
-    /* Unpack the cells and link to the particle data. */
-    parts = s->parts_foreign;
-    for ( k = 0 ; k < nr_proxies ; k++ ) {
-        for ( count = 0 , j = 0 ; j < e->proxies[k].nr_cells_in ; j++ ) {
-            count += cell_link( e->proxies[k].cells_in[j] , parts );
-            parts = &parts[ e->proxies[k].cells_in[j]->count ];
-            }
-        }
-    s->nr_parts_foreign = parts - s->parts_foreign;
-        
-    /* Is the parts buffer large enough? */
-    if ( s->nr_parts_foreign > s->size_parts_foreign )
-        error( "Foreign parts buffer too small." );
-        
-    /* Free the pcell buffer. */
-    free( pcells );
-    
-#else
-    error( "SWIFT was not compiled with MPI support." );
-#endif
+  int j, k, pid, count = 0;
+  struct pcell *pcells;
+  struct space *s = e->s;
+  struct cell *cells = s->cells;
+  int nr_cells = s->nr_cells;
+  int nr_proxies = e->nr_proxies;
+  int offset[nr_cells];
+  MPI_Request reqs_in[engine_maxproxies];
+  MPI_Request reqs_out[engine_maxproxies];
+  MPI_Status status;
+  struct part *parts = &s->parts[s->nr_parts]; /* reassigned before use */
+
+  /* Run through the cells and get the size of the ones that will be sent
+     off; offset[k] records the running start index of cell k's pcells. */
+  for (k = 0; k < nr_cells; k++) {
+    offset[k] = count;
+    if (cells[k].sendto)
+      count += (cells[k].pcell_size = cell_getsize(&cells[k]));
+  }
+
+  /* Allocate one buffer holding the pcells of all outgoing cells. */
+  if ((pcells = (struct pcell *)malloc(sizeof(struct pcell) * count)) == NULL)
+    error("Failed to allocate pcell buffer.");
+
+  /* Reset cell_next_tag and pack each outgoing cell into its pcell slot. */
+  cell_next_tag = 0;
+  for (k = 0; k < nr_cells; k++)
+    if (cells[k].sendto) {
+      cell_pack(&cells[k], &pcells[offset[k]]);
+      cells[k].pcell = &pcells[offset[k]];
+    }
 
+  /* Launch the proxies and collect their count requests. */
+  for (k = 0; k < nr_proxies; k++) {
+    proxy_cells_exch1(&e->proxies[k]);
+    reqs_in[k] = e->proxies[k].req_cells_count_in;
+    reqs_out[k] = e->proxies[k].req_cells_count_out;
+  }
+
+  /* Wait for each count to come in and start the recv. */
+  for (k = 0; k < nr_proxies; k++) {
+    if (MPI_Waitany(nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
+        pid == MPI_UNDEFINED)
+      error("MPI_Waitany failed.");
+    // message( "request from proxy %i has arrived." , pid );
+    proxy_cells_exch2(&e->proxies[pid]);
+  }
+
+  /* Wait for all the sends to have finished too. */
+  if (MPI_Waitall(nr_proxies, reqs_out, MPI_STATUSES_IGNORE) != MPI_SUCCESS)
+    error("MPI_Waitall on sends failed.");
+
+  /* Switch the request arrays over to the pcell data requests. */
+  for (k = 0; k < nr_proxies; k++) {
+    reqs_in[k] = e->proxies[k].req_cells_in;
+    reqs_out[k] = e->proxies[k].req_cells_out;
+  }
+
+  /* As each proxy's pcell array arrives, unpack it into that proxy's cells. */
+  for (k = 0; k < nr_proxies; k++) {
+    if (MPI_Waitany(nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
+        pid == MPI_UNDEFINED)
+      error("MPI_Waitany failed.");
+    // message( "cell data from proxy %i has arrived." , pid );
+    for (count = 0, j = 0; j < e->proxies[pid].nr_cells_in; j++)
+      count += cell_unpack(&e->proxies[pid].pcells_in[count],
+                           e->proxies[pid].cells_in[j], e->s);
+  }
+
+  /* Wait for all the sends to have finished too. */
+  if (MPI_Waitall(nr_proxies, reqs_out, MPI_STATUSES_IGNORE) != MPI_SUCCESS)
+    error("MPI_Waitall on sends failed.");
+
+  /* Count the number of particles we need to import and re-allocate
+     the buffer (with 10% headroom) if it is too small. */
+  for (count = 0, k = 0; k < nr_proxies; k++)
+    for (j = 0; j < e->proxies[k].nr_cells_in; j++)
+      count += e->proxies[k].cells_in[j]->count;
+  if (count > s->size_parts_foreign) {
+    if (s->parts_foreign != NULL) free(s->parts_foreign);
+    s->size_parts_foreign = 1.1 * count;
+    if (posix_memalign((void **)&s->parts_foreign, part_align,
+                       sizeof(struct part) * s->size_parts_foreign) != 0)
+      error("Failed to allocate foreign part data.");
+  }
+
+  /* Link the received cells to consecutive slices of the foreign buffer. */
+  parts = s->parts_foreign;
+  for (k = 0; k < nr_proxies; k++) {
+    for (count = 0, j = 0; j < e->proxies[k].nr_cells_in; j++) {
+      count += cell_link(e->proxies[k].cells_in[j], parts);
+      parts = &parts[e->proxies[k].cells_in[j]->count];
     }
+  }
+  s->nr_parts_foreign = parts - s->parts_foreign;
+
+  /* Is the parts buffer large enough? */
+  if (s->nr_parts_foreign > s->size_parts_foreign)
+    error("Foreign parts buffer too small.");
 
+  /* Free the pcell buffer. */
+  free(pcells);
+
+#else
+  error("SWIFT was not compiled with MPI support.");
+#endif
+}
 
 /**
  * @brief Exchange straying parts with other nodes.
  *
  * @param e The #engine.
- * @param offset The index in the parts array as of which the foreign parts reside.
+ * @param offset The index in the parts array as of which the foreign parts
+ * reside.
  * @param ind The ID of the foreign #cell.
  * @param N The number of stray parts.
  *
  * @return The number of arrived parts copied to parts and xparts.
  */
- 
-int engine_exchange_strays ( struct engine *e , int offset , int *ind , int N ) {
+
+int engine_exchange_strays(struct engine *e, int offset, int *ind, int N) {
 
 #ifdef WITH_MPI
 
-    int k, pid, count = 0, nr_in = 0, nr_out = 0;
-    MPI_Request reqs_in[ 2*engine_maxproxies ];
-    MPI_Request reqs_out[ 2*engine_maxproxies ];
-    MPI_Status status;
-    struct proxy *p;
-    struct space *s = e->s;
-
-    /* Re-set the proxies. */
-    for ( k = 0 ; k < e->nr_proxies ; k++ )
-        e->proxies[k].nr_parts_out = 0;
-    
-    /* Put the parts into the corresponding proxies. */
-    for ( k = 0 ; k < N ; k++ ) {
-        pid = e->proxy_ind[ e->s->cells[ ind[k] ].nodeID ];
-        if ( pid < 0 )
-            error( "Do not have a proxy for the requested nodeID." );
-        proxy_parts_load( &e->proxies[pid] , &s->parts[offset + k] , &s->xparts[offset + k] , 1 );
-        }
-    
-    /* Launch the proxies. */
-    for ( k = 0 ; k < e->nr_proxies ; k++ ) {
-        proxy_parts_exch1( &e->proxies[k] );
-        reqs_in[k] = e->proxies[k].req_parts_count_in;
-        reqs_out[k] = e->proxies[k].req_parts_count_out;
-        }
-        
-    /* Wait for each count to come in and start the recv. */
-    for ( k = 0 ; k < e->nr_proxies ; k++ ) {
-        if ( MPI_Waitany( e->nr_proxies , reqs_in , &pid , &status ) != MPI_SUCCESS ||
-             pid == MPI_UNDEFINED )
-            error( "MPI_Waitany failed." );
-        // message( "request from proxy %i has arrived." , pid );
-        proxy_parts_exch2( &e->proxies[pid] );
-        }
-        
-    /* Wait for all the sends to have finnished too. */
-    if ( MPI_Waitall( e->nr_proxies , reqs_out , MPI_STATUSES_IGNORE ) != MPI_SUCCESS )
-        error( "MPI_Waitall on sends failed." );
-        
-    /* Count the total number of incomming particles and make sure we have
-       enough space to accommodate them. */
-    int count_in = 0;
-    for ( k = 0 ; k < e->nr_proxies ; k++ )
-      count_in += e->proxies[k].nr_parts_in;
-    message("sent out %i particles, got %i back.", N, count_in);
-    if ( offset + count_in > s->size_parts ) {
-      s->size_parts = (offset + count_in) * 1.05;
-      struct part *parts_new;
-      struct xpart *xparts_new;
-      if ( posix_memalign( (void **)&parts_new , part_align , sizeof(struct part) * s->size_parts ) != 0 ||
-           posix_memalign( (void **)&xparts_new , part_align , sizeof(struct xpart) * s->size_parts ) != 0 )
-          error( "Failed to allocate new part data." );
-      memcpy( parts_new , s->parts , sizeof(struct part) * offset );
-      memcpy( xparts_new , s->xparts , sizeof(struct xpart) * offset );
-      free( s->parts );
-      free( s->xparts );
-      s->parts = parts_new;
-      s->xparts = xparts_new;
+  int k, pid, count = 0, nr_in = 0, nr_out = 0;
+  MPI_Request reqs_in[2 * engine_maxproxies];
+  MPI_Request reqs_out[2 * engine_maxproxies];
+  MPI_Status status;
+  struct proxy *p;
+  struct space *s = e->s;
+
+  /* Re-set the proxies. */
+  for (k = 0; k < e->nr_proxies; k++) e->proxies[k].nr_parts_out = 0;
+
+  /* Put the parts into the corresponding proxies. */
+  for (k = 0; k < N; k++) {
+    int node_id = e->s->cells[ind[k]].nodeID;
+    if (node_id < 0 || node_id >= e->nr_nodes)
+      error("Bad node ID %i.", node_id);
+    pid = e->proxy_ind[node_id];
+    if (pid < 0)
+      error(
+          "Do not have a proxy for the requested nodeID %i for part with "
+          "id=%llu, x=[%e,%e,%e].",
+          node_id, s->parts[offset + k].id, s->parts[offset + k].x[0],
+          s->parts[offset + k].x[1], s->parts[offset + k].x[2]);
+    proxy_parts_load(&e->proxies[pid], &s->parts[offset + k],
+                     &s->xparts[offset + k], 1);
+  }
+
+  /* Launch the proxies. */
+  for (k = 0; k < e->nr_proxies; k++) {
+    proxy_parts_exch1(&e->proxies[k]);
+    reqs_in[k] = e->proxies[k].req_parts_count_in;
+    reqs_out[k] = e->proxies[k].req_parts_count_out;
+  }
+
+  /* Wait for each count to come in and start the recv. */
+  for (k = 0; k < e->nr_proxies; k++) {
+    if (MPI_Waitany(e->nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
+        pid == MPI_UNDEFINED)
+      error("MPI_Waitany failed.");
+    // message( "request from proxy %i has arrived." , pid );
+    proxy_parts_exch2(&e->proxies[pid]);
+  }
+
+  /* Wait for all the sends to have finished too. */
+  if (MPI_Waitall(e->nr_proxies, reqs_out, MPI_STATUSES_IGNORE) != MPI_SUCCESS)
+    error("MPI_Waitall on sends failed.");
+
+  /* Count the total number of incoming particles and make sure we have
+     enough space to accommodate them. */
+  int count_in = 0;
+  for (k = 0; k < e->nr_proxies; k++) count_in += e->proxies[k].nr_parts_in;
+  message("sent out %i particles, got %i back.", N, count_in);
+  if (offset + count_in > s->size_parts) {
+    s->size_parts = (offset + count_in) * 1.05;
+    struct part *parts_new;
+    struct xpart *xparts_new;
+    if (posix_memalign((void **)&parts_new, part_align,
+                       sizeof(struct part) * s->size_parts) != 0 ||
+        posix_memalign((void **)&xparts_new, part_align,
+                       sizeof(struct xpart) * s->size_parts) != 0)
+      error("Failed to allocate new part data.");
+    memcpy(parts_new, s->parts, sizeof(struct part) * offset);
+    memcpy(xparts_new, s->xparts, sizeof(struct xpart) * offset);
+    free(s->parts);
+    free(s->xparts);
+    s->parts = parts_new;
+    s->xparts = xparts_new;
+  }
+
+  /* Collect the requests for the particle data from the proxies. */
+  for (k = 0; k < e->nr_proxies; k++) {
+    if (e->proxies[k].nr_parts_in > 0) {
+      reqs_in[2 * k] = e->proxies[k].req_parts_in;
+      reqs_in[2 * k + 1] = e->proxies[k].req_xparts_in;
+      nr_in += 1;
+    } else
+      reqs_in[2 * k] = reqs_in[2 * k + 1] = MPI_REQUEST_NULL;
+    if (e->proxies[k].nr_parts_out > 0) {
+      reqs_out[2 * k] = e->proxies[k].req_parts_out;
+      reqs_out[2 * k + 1] = e->proxies[k].req_xparts_out;
+      nr_out += 1;
+    } else
+      reqs_out[2 * k] = reqs_out[2 * k + 1] = MPI_REQUEST_NULL;
+  }
+
+  /* Wait for each part array to come in and collect the new
+     parts from the proxies. */
+  for (k = 0; k < 2 * (nr_in + nr_out); k++) {
+    int err;
+    if ((err = MPI_Waitany(2 * e->nr_proxies, reqs_in, &pid, &status)) !=
+        MPI_SUCCESS) {
+      char buff[MPI_MAX_ERROR_STRING];
+      int res;
+      MPI_Error_string(err, buff, &res);
+      error("MPI_Waitany failed (%s).", buff);
     }
-        
-    /* Collect the requests for the particle data from the proxies. */
-    for ( k = 0 ; k < e->nr_proxies ; k++ ) {
-        if ( e->proxies[k].nr_parts_in > 0 ) {
-            reqs_in[2*k] = e->proxies[k].req_parts_in;
-            reqs_in[2*k+1] = e->proxies[k].req_xparts_in;
-            nr_in += 1;
-            }
-        else
-            reqs_in[2*k] = reqs_in[2*k+1] = MPI_REQUEST_NULL;
-        if ( e->proxies[k].nr_parts_out > 0 ) {
-            reqs_out[2*k] = e->proxies[k].req_parts_out;
-            reqs_out[2*k+1] = e->proxies[k].req_xparts_out;
-            nr_out += 1;
-            }
-        else
-            reqs_out[2*k] = reqs_out[2*k+1] = MPI_REQUEST_NULL;
-        }
-    
-    /* Wait for each part array to come in and collect the new
-       parts from the proxies. */
-    for ( k = 0 ; k < 2*(nr_in + nr_out) ; k++ ) {
-        int err;
-        if ( ( err = MPI_Waitany( 2*e->nr_proxies , reqs_in , &pid , &status ) ) != MPI_SUCCESS ) {
-            char buff[ MPI_MAX_ERROR_STRING ];
-            int res;
-            MPI_Error_string( err , buff , &res );
-                error( "MPI_Waitany failed (%s)." , buff );
-            }
-        if ( pid == MPI_UNDEFINED )
-            break;
-        // message( "request from proxy %i has arrived." , pid );
-        if ( reqs_in[pid & ~1] == MPI_REQUEST_NULL &&
-             reqs_in[pid | 1 ] == MPI_REQUEST_NULL ) {
-            p = &e->proxies[pid/2];
-            memcpy( &s->parts[offset + count] , p->parts_in , sizeof(struct part) * p->nr_parts_in );
-            memcpy( &s->xparts[offset + count] , p->xparts_in , sizeof(struct xpart) * p->nr_parts_in );
-            count += p->nr_parts_in;
-            /* for ( int k = 0 ; k < p->nr_parts_in ; k++ )
-                message( "received particle %lli, x=[%.3e %.3e %.3e], h=%.3e, from node %i." ,
-                    p->parts_in[k].id , p->parts_in[k].x[0] , p->parts_in[k].x[1] , p->parts_in[k].x[2] ,
-                    p->parts_in[k].h , p->nodeID ); */
-            }
-        }
-    
-    /* Wait for all the sends to have finnished too. */
-    if ( nr_out > 0 )
-        if ( MPI_Waitall( 2*e->nr_proxies , reqs_out , MPI_STATUSES_IGNORE ) != MPI_SUCCESS )
-            error( "MPI_Waitall on sends failed." );
-        
-    /* Return the number of harvested parts. */
-    return count;
-    
-#else
-    error( "SWIFT was not compiled with MPI support." );
-    return 0;
-#endif
-
+    if (pid == MPI_UNDEFINED) break;
+    // message( "request from proxy %i has arrived." , pid );
+    if (reqs_in[pid & ~1] == MPI_REQUEST_NULL &&
+        reqs_in[pid | 1] == MPI_REQUEST_NULL) {
+      p = &e->proxies[pid >> 1];
+      memcpy(&s->parts[offset + count], p->parts_in,
+             sizeof(struct part) * p->nr_parts_in);
+      memcpy(&s->xparts[offset + count], p->xparts_in,
+             sizeof(struct xpart) * p->nr_parts_in);
+      for (int k = offset; k < offset + count; k++)
+        message(
+            "received particle %lli, x=[%.3e %.3e %.3e], h=%.3e, from node %i.",
+            s->parts[k].id, s->parts[k].x[0], s->parts[k].x[1],
+            s->parts[k].x[2], s->parts[k].h, p->nodeID);
+      count += p->nr_parts_in;
     }
+  }
 
+  /* Wait for all the sends to have finished too. */
+  if (nr_out > 0)
+    if (MPI_Waitall(2 * e->nr_proxies, reqs_out, MPI_STATUSES_IGNORE) !=
+        MPI_SUCCESS)
+      error("MPI_Waitall on sends failed.");
+
+  /* Return the number of harvested parts. */
+  return count;
+
+#else
+  error("SWIFT was not compiled with MPI support.");
+  return 0;
+#endif
+}
 
 /**
  * @brief Fill the #space's task list.
  *
  * @param e The #engine we are working with.
  */
- 
-void engine_maketasks ( struct engine *e ) {
-
-    struct space *s = e->s;
-    struct scheduler *sched = &e->sched;
-    struct cell *cells = s->cells;
-    int nr_cells = s->nr_cells;
-    int nodeID = e->nodeID;
-    int i, j, k, ii, jj, kk, iii, jjj, kkk, cid, cjd, sid;
-    int *cdim = s->cdim;
-    struct task *t, *t2;
-    struct cell *ci, *cj;
-
-    /* Re-set the scheduler. */
-    scheduler_reset( sched , s->tot_cells * engine_maxtaskspercell );
-    
-    /* Run through the highest level of cells and add pairs. */
-    for ( i = 0 ; i < cdim[0] ; i++ )
-        for ( j = 0 ; j < cdim[1] ; j++ )
-            for ( k = 0 ; k < cdim[2] ; k++ ) {
-                cid = cell_getid( cdim , i , j , k );
-                if ( cells[cid].count == 0 )
-                    continue;
-                ci = &cells[cid];
-                if ( ci->count == 0 )
-                    continue;
-                if ( ci->nodeID == nodeID )
-                    scheduler_addtask( sched , task_type_self , task_subtype_density , 0 , 0 , ci , NULL , 0 );
-                for ( ii = -1 ; ii < 2 ; ii++ ) {
-                    iii = i + ii;
-                    if ( !s->periodic && ( iii < 0 || iii >= cdim[0] ) )
-                        continue;
-                    iii = ( iii + cdim[0] ) % cdim[0];
-                    for ( jj = -1 ; jj < 2 ; jj++ ) {
-                        jjj = j + jj;
-                        if ( !s->periodic && ( jjj < 0 || jjj >= cdim[1] ) )
-                            continue;
-                        jjj = ( jjj + cdim[1] ) % cdim[1];
-                        for ( kk = -1 ; kk < 2 ; kk++ ) {
-                            kkk = k + kk;
-                            if ( !s->periodic && ( kkk < 0 || kkk >= cdim[2] ) )
-                                continue;
-                            kkk = ( kkk + cdim[2] ) % cdim[2];
-                            cjd = cell_getid( cdim , iii , jjj , kkk );
-                            cj = &cells[cjd];
-                            if ( cid >= cjd || cj->count == 0 || 
-                                 ( ci->nodeID != nodeID && cj->nodeID != nodeID ) )
-                                continue;
-                            sid = sortlistID[ (kk+1) + 3*( (jj+1) + 3*(ii+1) ) ];
-                            scheduler_addtask( sched , task_type_pair , task_subtype_density , sid , 0 , ci , cj , 1 );
-                            }
-                        }
-                    }
-                }
-                
-    /* Add the gravity mm tasks. */
-    for ( i = 0 ; i < nr_cells ; i++ )
-        if ( cells[i].gcount > 0 ) {
-            scheduler_addtask( sched , task_type_grav_mm , task_subtype_none , -1 , 0 , &cells[i] , NULL , 0 );
-            for ( j = i+1 ; j < nr_cells ; j++ )
-                if ( cells[j].gcount > 0 )
-                    scheduler_addtask( sched , task_type_grav_mm , task_subtype_none , -1 , 0 , &cells[i] , &cells[j] , 0 );
-            }
-        
-    /* Split the tasks. */
-    scheduler_splittasks( sched );
-    
-    /* Allocate the list of cell-task links. The maximum number of links
-       is the number of cells (s->tot_cells) times the number of neighbours (27)
-       times the number of interaction types (2, density and force). */
-    if ( e->links != NULL )
-        free( e->links );
-    if ( ( e->links = malloc( sizeof(struct link) * s->tot_cells * 27 * 2 ) ) == NULL )
-        error( "Failed to allocate cell-task links." );
-    e->nr_links = 0;
-    
-    /* Add the gravity up/down tasks at the top-level cells and push them down. */
-    for ( k = 0 ; k < nr_cells ; k++ )
-        if ( cells[k].nodeID == nodeID && cells[k].gcount > 0 ) {
-        
-            /* Create tasks at top level. */
-            struct task *up = scheduler_addtask( sched , task_type_grav_up , task_subtype_none , 0 , 0 , &cells[k] , NULL , 0 );
-            struct task *down = scheduler_addtask( sched , task_type_grav_down , task_subtype_none , 0 , 0 , &cells[k] , NULL , 0 );
-            
-            /* Push tasks down the cell hierarchy. */
-            engine_addtasks_grav( e , &cells[k] , up , down );
-            
-            }
-    
-    /* Count the number of tasks associated with each cell and
-       store the density tasks in each cell, and make each sort
-       depend on the sorts of its progeny. */
-    for ( k = 0 ; k < sched->nr_tasks ; k++ ) {
-        
-        /* Get the current task. */
-        t = &sched->tasks[k];
-        if ( t->skip )
-            continue;
-            
-        /* Link sort tasks together. */
-        if ( t->type == task_type_sort && t->ci->split )
-            for ( j = 0 ; j < 8 ; j++ )
-                if ( t->ci->progeny[j] != NULL && t->ci->progeny[j]->sorts != NULL ) {
-                    t->ci->progeny[j]->sorts->skip = 0;
-                    scheduler_addunlock( sched , t->ci->progeny[j]->sorts , t );
-                    }
-                    
-        /* Link density tasks to cells. */
-        if ( t->type == task_type_self ) {
-            atomic_inc( &t->ci->nr_tasks );
-            if ( t->subtype == task_subtype_density ) {
-                t->ci->density = engine_addlink( e , t->ci->density , t );
-                atomic_inc( &t->ci->nr_density );
-                }
-            }
-        else if ( t->type == task_type_pair ) {
-            atomic_inc( &t->ci->nr_tasks );
-            atomic_inc( &t->cj->nr_tasks );
-            if ( t->subtype == task_subtype_density ) {
-                t->ci->density = engine_addlink( e , t->ci->density , t );
-                atomic_inc( &t->ci->nr_density );
-                t->cj->density = engine_addlink( e , t->cj->density , t );
-                atomic_inc( &t->cj->nr_density );
-                }
-            }
-        else if ( t->type == task_type_sub ) {
-            atomic_inc( &t->ci->nr_tasks );
-            if ( t->cj != NULL )
-                atomic_inc( &t->cj->nr_tasks );
-            if ( t->subtype == task_subtype_density ) {
-                t->ci->density = engine_addlink( e , t->ci->density , t );
-                atomic_inc( &t->ci->nr_density );
-                if ( t->cj != NULL ) {
-                    t->cj->density = engine_addlink( e , t->cj->density , t );
-                    atomic_inc( &t->cj->nr_density );
-                    }
-                }
-            }
-            
-        /* Link gravity multipole tasks to the up/down tasks. */
-        if ( t->type == task_type_grav_mm ||
-             ( t->type == task_type_sub && t->subtype == task_subtype_grav ) ) {
-            atomic_inc( &t->ci->nr_tasks );
-            scheduler_addunlock( sched , t->ci->grav_up , t );
-            scheduler_addunlock( sched , t , t->ci->grav_down );
-            if ( t->cj != NULL && t->ci->grav_up != t->cj->grav_up ) {
-                scheduler_addunlock( sched , t->cj->grav_up , t );
-                scheduler_addunlock( sched , t , t->cj->grav_down );
-                }
+
+void engine_maketasks(struct engine *e) {
+
+  struct space *s = e->s;
+  struct scheduler *sched = &e->sched;
+  struct cell *cells = s->cells;
+  int nr_cells = s->nr_cells;
+  int nodeID = e->nodeID;
+  int i, j, k, ii, jj, kk, iii, jjj, kkk, cid, cjd, sid;
+  int *cdim = s->cdim;
+  struct task *t, *t2;
+  struct cell *ci, *cj;
+
+  /* Re-set the scheduler. */
+  scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell);
+
+  /* Run through the highest level of cells and add pairs. */
+  for (i = 0; i < cdim[0]; i++)
+    for (j = 0; j < cdim[1]; j++)
+      for (k = 0; k < cdim[2]; k++) {
+        cid = cell_getid(cdim, i, j, k);
+        if (cells[cid].count == 0) continue;
+        ci = &cells[cid];
+        if (ci->count == 0) continue;
+        if (ci->nodeID == nodeID)
+          scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0,
+                            ci, NULL, 0);
+        for (ii = -1; ii < 2; ii++) {
+          iii = i + ii;
+          if (!s->periodic && (iii < 0 || iii >= cdim[0])) continue;
+          iii = (iii + cdim[0]) % cdim[0];
+          for (jj = -1; jj < 2; jj++) {
+            jjj = j + jj;
+            if (!s->periodic && (jjj < 0 || jjj >= cdim[1])) continue;
+            jjj = (jjj + cdim[1]) % cdim[1];
+            for (kk = -1; kk < 2; kk++) {
+              kkk = k + kk;
+              if (!s->periodic && (kkk < 0 || kkk >= cdim[2])) continue;
+              kkk = (kkk + cdim[2]) % cdim[2];
+              cjd = cell_getid(cdim, iii, jjj, kkk);
+              cj = &cells[cjd];
+              if (cid >= cjd || cj->count == 0 ||
+                  (ci->nodeID != nodeID && cj->nodeID != nodeID))
+                continue;
+              sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))];
+              scheduler_addtask(sched, task_type_pair, task_subtype_density,
+                                sid, 0, ci, cj, 1);
             }
-            
+          }
         }
-        
-    /* Append a ghost task to each cell, and add kick2 tasks to the
-       super cells. */
-    for ( k = 0 ; k < nr_cells ; k++ )
-        engine_mkghosts( e , &cells[k] , NULL );
-        
-    /* Run through the tasks and make force tasks for each density task.
-       Each force task depends on the cell ghosts and unlocks the kick2 task
-       of its super-cell. */
-    kk = sched->nr_tasks;
-    for ( k = 0 ; k < kk ; k++ ) {
-    
-        /* Get a pointer to the task. */
-        t = &sched->tasks[k];
-        
-        /* Skip? */
-        if ( t->skip )
-            continue;
-            
-        /* Self-interaction? */
-        if ( t->type == task_type_self && t->subtype == task_subtype_density ) {
-            scheduler_addunlock( sched , t , t->ci->super->ghost );
-            t2 = scheduler_addtask( sched , task_type_self , task_subtype_force , 0 , 0 , t->ci , NULL , 0 );
-            scheduler_addunlock( sched , t->ci->super->ghost , t2 );
-            scheduler_addunlock( sched , t2 , t->ci->super->kick2 );
-            t->ci->force = engine_addlink( e , t->ci->force , t2 );
-            atomic_inc( &t->ci->nr_force );
-            }
-            
-        /* Otherwise, pair interaction? */
-        else if ( t->type == task_type_pair && t->subtype == task_subtype_density ) {
-            t2 = scheduler_addtask( sched , task_type_pair , task_subtype_force , 0 , 0 , t->ci , t->cj , 0 );
-            if ( t->ci->nodeID == nodeID ) {
-                scheduler_addunlock( sched , t , t->ci->super->ghost );
-                scheduler_addunlock( sched , t->ci->super->ghost , t2 );
-                scheduler_addunlock( sched , t2 , t->ci->super->kick2 );
-                }
-            if ( t->cj->nodeID == nodeID && t->ci->super != t->cj->super ) {
-                scheduler_addunlock( sched , t , t->cj->super->ghost );
-                scheduler_addunlock( sched , t->cj->super->ghost , t2 );
-                scheduler_addunlock( sched , t2 , t->cj->super->kick2 );
-                }
-            t->ci->force = engine_addlink( e , t->ci->force , t2 );
-            atomic_inc( &t->ci->nr_force );
-            t->cj->force = engine_addlink( e , t->cj->force , t2 );
-            atomic_inc( &t->cj->nr_force );
-            }
-    
-        /* Otherwise, sub interaction? */
-        else if ( t->type == task_type_sub && t->subtype == task_subtype_density ) {
-            t2 = scheduler_addtask( sched , task_type_sub , task_subtype_force , t->flags , 0 , t->ci , t->cj , 0 );
-            if ( t->ci->nodeID == nodeID ) {
-                scheduler_addunlock( sched , t , t->ci->super->ghost );
-                scheduler_addunlock( sched , t->ci->super->ghost , t2 );
-                scheduler_addunlock( sched , t2 , t->ci->super->kick2 );
-                }
-            if ( t->cj != NULL && t->cj->nodeID == nodeID && t->ci->super != t->cj->super ) {
-                scheduler_addunlock( sched , t , t->cj->super->ghost );
-                scheduler_addunlock( sched , t->cj->super->ghost , t2 );
-                scheduler_addunlock( sched , t2 , t->cj->super->kick2 );
-                }
-            t->ci->force = engine_addlink( e , t->ci->force , t2 );
-            atomic_inc( &t->ci->nr_force );
-            if ( t->cj != NULL ) {
-                t->cj->force = engine_addlink( e , t->cj->force , t2 );
-                atomic_inc( &t->cj->nr_force );
-                }
-            }
-            
-        /* Kick2 tasks should rely on the grav_down tasks of their cell. */
-        else if ( t->type == task_type_kick2 && t->ci->grav_down != NULL )
-            scheduler_addunlock( sched , t->ci->grav_down , t );
-            
+      }
+
+  /* Add the gravity mm tasks. */
+  for (i = 0; i < nr_cells; i++)
+    if (cells[i].gcount > 0) {
+      scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, 0,
+                        &cells[i], NULL, 0);
+      for (j = i + 1; j < nr_cells; j++)
+        if (cells[j].gcount > 0)
+          scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, 0,
+                            &cells[i], &cells[j], 0);
+    }
+
+  /* Split the tasks. */
+  scheduler_splittasks(sched);
+
+  /* Allocate the list of cell-task links. The maximum number of links
+     is the number of cells (s->tot_cells) times the number of neighbours (27)
+     times the number of interaction types (2, density and force). */
+  if (e->links != NULL) free(e->links);
+  if ((e->links = malloc(sizeof(struct link) * s->tot_cells * 27 * 2)) == NULL)
+    error("Failed to allocate cell-task links.");
+  e->nr_links = 0;
+
+  /* Add the gravity up/down tasks at the top-level cells and push them down. */
+  for (k = 0; k < nr_cells; k++)
+    if (cells[k].nodeID == nodeID && cells[k].gcount > 0) {
+
+      /* Create tasks at top level. */
+      struct task *up =
+          scheduler_addtask(sched, task_type_grav_up, task_subtype_none, 0, 0,
+                            &cells[k], NULL, 0);
+      struct task *down =
+          scheduler_addtask(sched, task_type_grav_down, task_subtype_none, 0, 0,
+                            &cells[k], NULL, 0);
+
+      /* Push tasks down the cell hierarchy. */
+      engine_addtasks_grav(e, &cells[k], up, down);
+    }
+
+  /* Count the number of tasks associated with each cell and
+     store the density tasks in each cell, and make each sort
+     depend on the sorts of its progeny. */
+  for (k = 0; k < sched->nr_tasks; k++) {
+
+    /* Get the current task. */
+    t = &sched->tasks[k];
+    if (t->skip) continue;
+
+    /* Link sort tasks together. */
+    if (t->type == task_type_sort && t->ci->split)
+      for (j = 0; j < 8; j++)
+        if (t->ci->progeny[j] != NULL && t->ci->progeny[j]->sorts != NULL) {
+          t->ci->progeny[j]->sorts->skip = 0;
+          scheduler_addunlock(sched, t->ci->progeny[j]->sorts, t);
         }
-        
-    /* Add the communication tasks if MPI is being used. */
-    #ifdef WITH_MPI
-        
-        /* Loop over the proxies. */
-        for ( int pid = 0 ; pid < e->nr_proxies ; pid++ ) {
-        
-            /* Get a handle on the proxy. */
-            struct proxy *p = &e->proxies[pid];
-            
-            /* Loop through the proxy's incomming cells and add the
-               recv tasks. */
-            for ( k = 0 ; k < p->nr_cells_in ; k++ )
-                engine_addtasks_recv( e , p->cells_in[k] , NULL , NULL );
-            
-            /* Loop through the proxy's outgoing cells and add the
-               send tasks. */
-            for ( k = 0 ; k < p->nr_cells_out ; k++ )
-                engine_addtasks_send( e , p->cells_out[k] , p->cells_in[0] );
-            
-            }
-        
-    #endif
-        
-    /* Rank the tasks. */
-    scheduler_ranktasks( sched );
-    
-    /* Weight the tasks. */
-    scheduler_reweight( sched );
-            
-    /* Set the tasks age. */
-    e->tasks_age = 0;
-            
+
+    /* Link density tasks to cells. */
+    if (t->type == task_type_self) {
+      atomic_inc(&t->ci->nr_tasks);
+      if (t->subtype == task_subtype_density) {
+        t->ci->density = engine_addlink(e, t->ci->density, t);
+        atomic_inc(&t->ci->nr_density);
+      }
+    } else if (t->type == task_type_pair) {
+      atomic_inc(&t->ci->nr_tasks);
+      atomic_inc(&t->cj->nr_tasks);
+      if (t->subtype == task_subtype_density) {
+        t->ci->density = engine_addlink(e, t->ci->density, t);
+        atomic_inc(&t->ci->nr_density);
+        t->cj->density = engine_addlink(e, t->cj->density, t);
+        atomic_inc(&t->cj->nr_density);
+      }
+    } else if (t->type == task_type_sub) {
+      atomic_inc(&t->ci->nr_tasks);
+      if (t->cj != NULL) atomic_inc(&t->cj->nr_tasks);
+      if (t->subtype == task_subtype_density) {
+        t->ci->density = engine_addlink(e, t->ci->density, t);
+        atomic_inc(&t->ci->nr_density);
+        if (t->cj != NULL) {
+          t->cj->density = engine_addlink(e, t->cj->density, t);
+          atomic_inc(&t->cj->nr_density);
+        }
+      }
+    }
+
+    /* Link gravity multipole tasks to the up/down tasks. */
+    if (t->type == task_type_grav_mm ||
+        (t->type == task_type_sub && t->subtype == task_subtype_grav)) {
+      atomic_inc(&t->ci->nr_tasks);
+      scheduler_addunlock(sched, t->ci->grav_up, t);
+      scheduler_addunlock(sched, t, t->ci->grav_down);
+      if (t->cj != NULL && t->ci->grav_up != t->cj->grav_up) {
+        scheduler_addunlock(sched, t->cj->grav_up, t);
+        scheduler_addunlock(sched, t, t->cj->grav_down);
+      }
+    }
+  }
+
+  /* Append a ghost task to each cell, and add kick2 tasks to the
+     super cells. */
+  for (k = 0; k < nr_cells; k++) engine_mkghosts(e, &cells[k], NULL);
+
+  /* Run through the tasks and make force tasks for each density task.
+     Each force task depends on the cell ghosts and unlocks the kick2 task
+     of its super-cell. */
+  kk = sched->nr_tasks;
+  for (k = 0; k < kk; k++) {
+
+    /* Get a pointer to the task. */
+    t = &sched->tasks[k];
+
+    /* Skip? */
+    if (t->skip) continue;
+
+    /* Self-interaction? */
+    if (t->type == task_type_self && t->subtype == task_subtype_density) {
+      scheduler_addunlock(sched, t, t->ci->super->ghost);
+      t2 = scheduler_addtask(sched, task_type_self, task_subtype_force, 0, 0,
+                             t->ci, NULL, 0);
+      scheduler_addunlock(sched, t->ci->super->ghost, t2);
+      scheduler_addunlock(sched, t2, t->ci->super->kick2);
+      t->ci->force = engine_addlink(e, t->ci->force, t2);
+      atomic_inc(&t->ci->nr_force);
     }
-    
-    
+
+    /* Otherwise, pair interaction? */
+    else if (t->type == task_type_pair && t->subtype == task_subtype_density) {
+      t2 = scheduler_addtask(sched, task_type_pair, task_subtype_force, 0, 0,
+                             t->ci, t->cj, 0);
+      if (t->ci->nodeID == nodeID) {
+        scheduler_addunlock(sched, t, t->ci->super->ghost);
+        scheduler_addunlock(sched, t->ci->super->ghost, t2);
+        scheduler_addunlock(sched, t2, t->ci->super->kick2);
+      }
+      if (t->cj->nodeID == nodeID && t->ci->super != t->cj->super) {
+        scheduler_addunlock(sched, t, t->cj->super->ghost);
+        scheduler_addunlock(sched, t->cj->super->ghost, t2);
+        scheduler_addunlock(sched, t2, t->cj->super->kick2);
+      }
+      t->ci->force = engine_addlink(e, t->ci->force, t2);
+      atomic_inc(&t->ci->nr_force);
+      t->cj->force = engine_addlink(e, t->cj->force, t2);
+      atomic_inc(&t->cj->nr_force);
+    }
+
+    /* Otherwise, sub interaction? */
+    else if (t->type == task_type_sub && t->subtype == task_subtype_density) {
+      t2 = scheduler_addtask(sched, task_type_sub, task_subtype_force, t->flags,
+                             0, t->ci, t->cj, 0);
+      if (t->ci->nodeID == nodeID) {
+        scheduler_addunlock(sched, t, t->ci->super->ghost);
+        scheduler_addunlock(sched, t->ci->super->ghost, t2);
+        scheduler_addunlock(sched, t2, t->ci->super->kick2);
+      }
+      if (t->cj != NULL && t->cj->nodeID == nodeID &&
+          t->ci->super != t->cj->super) {
+        scheduler_addunlock(sched, t, t->cj->super->ghost);
+        scheduler_addunlock(sched, t->cj->super->ghost, t2);
+        scheduler_addunlock(sched, t2, t->cj->super->kick2);
+      }
+      t->ci->force = engine_addlink(e, t->ci->force, t2);
+      atomic_inc(&t->ci->nr_force);
+      if (t->cj != NULL) {
+        t->cj->force = engine_addlink(e, t->cj->force, t2);
+        atomic_inc(&t->cj->nr_force);
+      }
+    }
+
+    /* Kick2 tasks should rely on the grav_down tasks of their cell. */
+    else if (t->type == task_type_kick2 && t->ci->grav_down != NULL)
+      scheduler_addunlock(sched, t->ci->grav_down, t);
+  }
+
+/* Add the communication tasks if MPI is being used. */
+#ifdef WITH_MPI
+
+  /* Loop over the proxies. */
+  for (int pid = 0; pid < e->nr_proxies; pid++) {
+
+    /* Get a handle on the proxy. */
+    struct proxy *p = &e->proxies[pid];
+
+    /* Loop through the proxy's incoming cells and add the
+       recv tasks. */
+    for (k = 0; k < p->nr_cells_in; k++)
+      engine_addtasks_recv(e, p->cells_in[k], NULL, NULL);
+
+    /* Loop through the proxy's outgoing cells and add the
+       send tasks. */
+    for (k = 0; k < p->nr_cells_out; k++)
+      engine_addtasks_send(e, p->cells_out[k], p->cells_in[0]);
+  }
+
+#endif
+
+  /* Rank the tasks. */
+  scheduler_ranktasks(sched);
+
+  /* Weight the tasks. */
+  scheduler_reweight(sched);
+
+  /* Set the tasks age. */
+  e->tasks_age = 0;
+}
 
 /**
  * @brief Mark tasks to be skipped and set the sort flags accordingly.
- * 
+ *
  * @return 1 if the space has to be rebuilt, 0 otherwise.
  */
- 
-int engine_marktasks ( struct engine *e ) {
-
-    struct scheduler *s = &e->sched;
-    int k, nr_tasks = s->nr_tasks, *ind = s->tasks_ind;
-    struct task *t, *tasks = s->tasks;
-    float dt_step = e->dt_step;
-    struct cell *ci, *cj;
-    // ticks tic = getticks();
-    
-    /* Muc less to do here if we're on a fixed time-step. */
-    if ( !( e->policy & engine_policy_multistep ) ) {
-    
-        /* Run through the tasks and mark as skip or not. */
-        for ( k = 0 ; k < nr_tasks ; k++ ) {
-
-            /* Get a handle on the kth task. */
-            t = &tasks[ ind[k] ];
-
-            /* Pair? */
-            if ( t->type == task_type_pair || ( t->type == task_type_sub && t->cj != NULL ) ) {
-
-                /* Local pointers. */
-                ci = t->ci;
-                cj = t->cj;
-
-                /* Too much particle movement? */
-                if ( t->tight &&
-                     ( fmaxf( ci->h_max , cj->h_max ) + ci->dx_max + cj->dx_max > cj->dmin || 
-                       ci->dx_max > space_maxreldx*ci->h_max || cj->dx_max > space_maxreldx*cj->h_max ) )
-                    return 1;
 
-                }
-                
-            /* Sort? */
-            else if ( t->type == task_type_sort ) {
-            
-                /* If all the sorts have been done, make this task implicit. */
-                if ( !( t->flags & (t->flags ^ t->ci->sorted ) ) )
-                    t->implicit = 1;
-            
-                }
+int engine_marktasks(struct engine *e) {
 
-            }
-            
-        }
-    
-    else {
-    
-        /* Run through the tasks and mark as skip or not. */
-        for ( k = 0 ; k < nr_tasks ; k++ ) {
+  struct scheduler *s = &e->sched;
+  int k, nr_tasks = s->nr_tasks, *ind = s->tasks_ind;
+  struct task *t, *tasks = s->tasks;
+  float dt_step = e->dt_step;
+  struct cell *ci, *cj;
+  // ticks tic = getticks();
 
-            /* Get a handle on the kth task. */
-            t = &tasks[ ind[k] ];
+  /* Much less to do here if we're on a fixed time-step. */
+  if (!(e->policy & engine_policy_multistep)) {
 
-            /* Sort-task? Note that due to the task ranking, the sorts
-               will all come before the pairs. */
-            if ( t->type == task_type_sort ) {
+    /* Run through the tasks and mark as skip or not. */
+    for (k = 0; k < nr_tasks; k++) {
 
-                /* Re-set the flags. */
-                t->flags = 0;
-                t->skip = 1;
+      /* Get a handle on the kth task. */
+      t = &tasks[ind[k]];
 
-                }
+      /* Pair? */
+      if (t->type == task_type_pair ||
+          (t->type == task_type_sub && t->cj != NULL)) {
 
-            /* Single-cell task? */
-            else if ( t->type == task_type_self ||
-                      t->type == task_type_ghost ||
-                    ( t->type == task_type_sub && t->cj == NULL ) ) {
+        /* Local pointers. */
+        ci = t->ci;
+        cj = t->cj;
 
-                /* Set this task's skip. */
-                t->skip = ( t->ci->dt_min > dt_step );
+        /* Too much particle movement? */
+        if (t->tight &&
+            (fmaxf(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin ||
+             ci->dx_max > space_maxreldx * ci->h_max ||
+             cj->dx_max > space_maxreldx * cj->h_max))
+          return 1;
 
-                }
+      }
 
-            /* Pair? */
-            else if ( t->type == task_type_pair || ( t->type == task_type_sub && t->cj != NULL ) ) {
-
-                /* Local pointers. */
-                ci = t->ci;
-                cj = t->cj;
-
-                /* Set this task's skip. */
-                t->skip = ( ci->dt_min > dt_step && cj->dt_min > dt_step );
-
-                /* Too much particle movement? */
-                if ( t->tight &&
-                     ( fmaxf( ci->h_max , cj->h_max ) + ci->dx_max + cj->dx_max > cj->dmin || 
-                       ci->dx_max > space_maxreldx*ci->h_max || cj->dx_max > space_maxreldx*cj->h_max ) )
-                    return 1;
-
-                /* Set the sort flags. */
-                if ( !t->skip && t->type == task_type_pair ) {
-                    if ( !( ci->sorted & ( 1 << t->flags ) ) ) {
-                        ci->sorts->flags |= (1 << t->flags);
-                        ci->sorts->skip = 0;
-                        }
-                    if ( !( cj->sorted & ( 1 << t->flags ) ) ) {
-                        cj->sorts->flags |= (1 << t->flags);
-                        cj->sorts->skip = 0;
-                        }
-                    }
-
-                }
+      /* Sort? */
+      else if (t->type == task_type_sort) {
 
-            /* Kick2? */
-            else if ( t->type == task_type_kick2 )
-                t->skip = 0;
+        /* If all the sorts have been done, make this task implicit. */
+        if (!(t->flags & (t->flags ^ t->ci->sorted))) t->implicit = 1;
+      }
+    }
 
-            /* None? */
-            else if ( t->type == task_type_none )
-                t->skip = 1;
+  } else {
 
-            }
-            
+    /* Run through the tasks and mark as skip or not. */
+    for (k = 0; k < nr_tasks; k++) {
+
+      /* Get a handle on the kth task. */
+      t = &tasks[ind[k]];
+
+      /* Sort-task? Note that due to the task ranking, the sorts
+         will all come before the pairs. */
+      if (t->type == task_type_sort) {
+
+        /* Re-set the flags. */
+        t->flags = 0;
+        t->skip = 1;
+
+      }
+
+      /* Single-cell task? */
+      else if (t->type == task_type_self || t->type == task_type_ghost ||
+               (t->type == task_type_sub && t->cj == NULL)) {
+
+        /* Set this task's skip. */
+        t->skip = (t->ci->dt_min > dt_step);
+
+      }
+
+      /* Pair? */
+      else if (t->type == task_type_pair ||
+               (t->type == task_type_sub && t->cj != NULL)) {
+
+        /* Local pointers. */
+        ci = t->ci;
+        cj = t->cj;
+
+        /* Set this task's skip. */
+        t->skip = (ci->dt_min > dt_step && cj->dt_min > dt_step);
+
+        /* Too much particle movement? */
+        if (t->tight &&
+            (fmaxf(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin ||
+             ci->dx_max > space_maxreldx * ci->h_max ||
+             cj->dx_max > space_maxreldx * cj->h_max))
+          return 1;
+
+        /* Set the sort flags. */
+        if (!t->skip && t->type == task_type_pair) {
+          if (!(ci->sorted & (1 << t->flags))) {
+            ci->sorts->flags |= (1 << t->flags);
+            ci->sorts->skip = 0;
+          }
+          if (!(cj->sorted & (1 << t->flags))) {
+            cj->sorts->flags |= (1 << t->flags);
+            cj->sorts->skip = 0;
+          }
         }
-        
-    // message( "took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
-    
-    /* All is well... */
-    return 0;
-    
+
+      }
+
+      /* Kick2? */
+      else if (t->type == task_type_kick2)
+        t->skip = 0;
+
+      /* None? */
+      else if (t->type == task_type_none)
+        t->skip = 1;
     }
-    
+  }
+
+  // message( "took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
+
+  /* All is well... */
+  return 0;
+}
 
 /**
  * @brief Rebuild the space and tasks.
  *
  * @param e The #engine.
  */
- 
-void engine_rebuild ( struct engine *e ) {
 
-    int k;
-    struct scheduler *sched = &e->sched;
-    
-    /* Clear the forcerebuild flag, whatever it was. */
-    e->forcerebuild = 0;
+void engine_rebuild(struct engine *e) {
 
-    /* Re-build the space. */
-    // tic = getticks();
-    space_rebuild( e->s , 0.0 );
-    // message( "space_rebuild took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
+  int k;
+  struct scheduler *sched = &e->sched;
 
-    /* If in parallel, exchange the cell structure. */
-    #ifdef WITH_MPI
-        // tic = getticks();
-        engine_exchange_cells( e );
-        // message( "engine_exchange_cells took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
-    #endif
+  /* Clear the forcerebuild flag, whatever it was. */
+  e->forcerebuild = 0;
 
-    /* Re-build the tasks. */
-    // tic = getticks();
-    engine_maketasks( e );
-    // message( "engine_maketasks took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
+  /* Re-build the space. */
+  // tic = getticks();
+  space_rebuild(e->s, 0.0);
+// message( "space_rebuild took %.3f ms." , (double)(getticks() -
+// tic)/CPU_TPS*1000 );
 
-    /* Run through the tasks and mark as skip or not. */
-    // tic = getticks();
-    if ( engine_marktasks( e ) )
-        error( "engine_marktasks failed after space_rebuild." );
-    // message( "engine_marktasks took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
-
-    /* Count and print the number of each task type. */
-    int counts[ task_type_count+1 ];
-    for ( k = 0 ; k <= task_type_count ; k++ )
-        counts[k] = 0;
-    for ( k = 0 ; k < sched->nr_tasks ; k++ )
-        if ( !sched->tasks[k].skip )
-            counts[ (int)sched->tasks[k].type ] += 1;
-        else
-            counts[ task_type_count ] += 1;
-    #ifdef WITH_MPI
-        printf( "[%03i] engine_rebuild: task counts are [ %s=%i" , e->nodeID , taskID_names[0] , counts[0] );
-    #else
-        printf( "engine_rebuild: task counts are [ %s=%i" , taskID_names[0] , counts[0] );
-    #endif
-    for ( k = 1 ; k < task_type_count ; k++ )
-        printf( " %s=%i" , taskID_names[k] , counts[k] );
-    printf( " skipped=%i ]\n" , counts[ task_type_count ] ); fflush(stdout);
-    message( "nr_parts = %i." , e->s->nr_parts );
-    
-    }
+/* If in parallel, exchange the cell structure. */
+#ifdef WITH_MPI
+  // tic = getticks();
+  engine_exchange_cells(e);
+// message( "engine_exchange_cells took %.3f ms." , (double)(getticks() -
+// tic)/CPU_TPS*1000 );
+#endif
 
+  /* Re-build the tasks. */
+  // tic = getticks();
+  engine_maketasks(e);
+  // message( "engine_maketasks took %.3f ms." , (double)(getticks() -
+  // tic)/CPU_TPS*1000 );
+
+  /* Run through the tasks and mark as skip or not. */
+  // tic = getticks();
+  if (engine_marktasks(e))
+    error("engine_marktasks failed after space_rebuild.");
+  // message( "engine_marktasks took %.3f ms." , (double)(getticks() -
+  // tic)/CPU_TPS*1000 );
+
+  /* Count and print the number of each task type. */
+  int counts[task_type_count + 1];
+  for (k = 0; k <= task_type_count; k++) counts[k] = 0;
+  for (k = 0; k < sched->nr_tasks; k++)
+    if (!sched->tasks[k].skip)
+      counts[(int)sched->tasks[k].type] += 1;
+    else
+      counts[task_type_count] += 1;
+#ifdef WITH_MPI
+  printf("[%03i] engine_rebuild: task counts are [ %s=%i", e->nodeID,
+         taskID_names[0], counts[0]);
+#else
+  printf("engine_rebuild: task counts are [ %s=%i", taskID_names[0], counts[0]);
+#endif
+  for (k = 1; k < task_type_count; k++)
+    printf(" %s=%i", taskID_names[k], counts[k]);
+  printf(" skipped=%i ]\n", counts[task_type_count]);
+  fflush(stdout);
+  message("nr_parts = %i.", e->s->nr_parts);
+}
 
 /**
  * @brief Prepare the #engine by re-building the cells and tasks.
  *
  * @param e The #engine to prepare.
  */
- 
-void engine_prepare ( struct engine *e ) {
-    
-    int rebuild;
-    
-    TIMER_TIC
 
-    /* Run through the tasks and mark as skip or not. */
+void engine_prepare(struct engine *e) {
+
+  int rebuild;
+
+  TIMER_TIC
+
+  /* Run through the tasks and mark as skip or not. */
+  // tic = getticks();
+  rebuild = (e->forcerebuild || engine_marktasks(e));
+// message( "space_marktasks took %.3f ms." , (double)(getticks() -
+// tic)/CPU_TPS*1000 );
+
+/* Collect the values of rebuild from all nodes. */
+#ifdef WITH_MPI
+  // tic = getticks();
+  int buff;
+  if (MPI_Allreduce(&rebuild, &buff, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD) !=
+      MPI_SUCCESS)
+    error("Failed to aggreggate the rebuild flag accross nodes.");
+  rebuild = buff;
+// message( "rebuild allreduce took %.3f ms." , (double)(getticks() -
+// tic)/CPU_TPS*1000 );
+#endif
+  e->tic_step = getticks();
+
+  /* Did this not go through? */
+  if (rebuild) {
     // tic = getticks();
-    rebuild = ( e->forcerebuild || engine_marktasks( e ) );
-    // message( "space_marktasks took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
-        
-    /* Collect the values of rebuild from all nodes. */
-    #ifdef WITH_MPI
-        // tic = getticks();
-        int buff;
-        if ( MPI_Allreduce( &rebuild , &buff , 1 , MPI_INT , MPI_MAX , MPI_COMM_WORLD ) != MPI_SUCCESS )
-            error( "Failed to aggreggate the rebuild flag accross nodes." );
-        rebuild = buff;
-    // message( "rebuild allreduce took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
-    #endif
-    e->tic_step = getticks();
-    
-    /* Did this not go through? */
-    if ( rebuild ) {
-        // tic = getticks();
-        engine_rebuild( e );
-        // message( "engine_rebuild took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
-    }
-        
-    /* Re-rank the tasks every now and then. */
-    if ( e->tasks_age % engine_tasksreweight == 1 ) {
-        // tic = getticks();
-        scheduler_reweight( &e->sched );
-        // message( "scheduler_reweight took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
-    }
-    e->tasks_age += 1;
+    engine_rebuild(e);
+    // message( "engine_rebuild took %.3f ms." , (double)(getticks() -
+    // tic)/CPU_TPS*1000 );
+  }
 
-    TIMER_TOC( timer_prepare );
-    
-    }
+  /* Re-rank the tasks every now and then. */
+  if (e->tasks_age % engine_tasksreweight == 1) {
+    // tic = getticks();
+    scheduler_reweight(&e->sched);
+    // message( "scheduler_reweight took %.3f ms." , (double)(getticks() -
+    // tic)/CPU_TPS*1000 );
+  }
+  e->tasks_age += 1;
 
+  TIMER_TOC(timer_prepare);
+}
 
 /**
  * @brief Implements a barrier for the #runner threads.
@@ -1454,104 +1544,107 @@ void engine_prepare ( struct engine *e ) {
  * @param e The #engine.
  * @param tid The thread ID
  */
- 
-void engine_barrier ( struct engine *e , int tid ) {
-
-    /* First, get the barrier mutex. */
-    if ( pthread_mutex_lock( &e->barrier_mutex ) != 0 )
-        error( "Failed to get barrier mutex." );
-        
-    /* This thread is no longer running. */
-    e->barrier_running -= 1;
-        
-    /* If all threads are in, send a signal... */
-    if ( e->barrier_running == 0 )
-        if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 )
-            error( "Failed to broadcast barrier full condition." );
-        
-    /* Wait for the barrier to open. */
-    while ( e->barrier_launch == 0 || tid >= e->barrier_launchcount )
-        if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 )
-            error( "Eror waiting for barrier to close." );
-        
-    /* This thread has been launched. */
-    e->barrier_running += 1;
-    e->barrier_launch -= 1;
-    
-    /* If I'm the last one out, signal the condition again. */
-    if ( e->barrier_launch == 0 )
-        if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 )
-            error( "Failed to broadcast empty barrier condition." );
-            
-    /* Last but not least, release the mutex. */
-    if ( pthread_mutex_unlock( &e->barrier_mutex ) != 0 )
-        error( "Failed to get unlock the barrier mutex." );
 
-    }
-    
-    
+void engine_barrier(struct engine *e, int tid) {
+
+  /* Take the barrier mutex; the counters below are updated under it. */
+  if (pthread_mutex_lock(&e->barrier_mutex) != 0)
+    error("Failed to get barrier mutex.");
+
+  /* This thread has arrived and no longer counts as running. */
+  e->barrier_running -= 1;
+
+  /* Once no thread is running, wake everyone waiting on barrier_cond. */
+  if (e->barrier_running == 0)
+    if (pthread_cond_broadcast(&e->barrier_cond) != 0)
+      error("Failed to broadcast barrier full condition.");
+
+  /* Sleep until launches are pending and tid is below the launch count. */
+  while (e->barrier_launch == 0 || tid >= e->barrier_launchcount)
+    if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0)
+      error("Error waiting for barrier to open.");
+
+  /* This thread has been launched: count it as running, use up a launch. */
+  e->barrier_running += 1;
+  e->barrier_launch -= 1;
+
+  /* If I'm the last one out, signal the condition again. */
+  if (e->barrier_launch == 0)
+    if (pthread_cond_broadcast(&e->barrier_cond) != 0)
+      error("Failed to broadcast empty barrier condition.");
+
+  /* Last but not least, release the mutex. */
+  if (pthread_mutex_unlock(&e->barrier_mutex) != 0)
+    error("Failed to unlock the barrier mutex.");
+}
+
 /**
  * @brief Mapping function to collect the data from the second kick.
  */
 
-void engine_collect_kick2 ( struct cell *c ) {
-
-    int k, updated = 0;
-    float dt_min = FLT_MAX, dt_max = 0.0f;
-    double ekin = 0.0, epot = 0.0;
-    float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f };
-    struct cell *cp;
-    
-    /* If I am a super-cell, return immediately. */
-    if ( c->kick2 != NULL || c->count == 0 )
-        return;
-        
-    /* If this cell is not split, I'm in trouble. */
-    if ( !c->split )
-        error( "Cell has no super-cell." );
-        
-    /* Collect the values from the progeny. */
-    for ( k = 0 ; k < 8 ; k++ )
-        if ( ( cp = c->progeny[k] ) != NULL ) {
-            engine_collect_kick2( cp );
-            dt_min = fminf( dt_min , cp->dt_min );
-            dt_max = fmaxf( dt_max , cp->dt_max );
-            updated += cp->updated;
-            ekin += cp->ekin;
-            epot += cp->epot;
-            mom[0] += cp->mom[0]; mom[1] += cp->mom[1]; mom[2] += cp->mom[2];
-            ang[0] += cp->ang[0]; ang[1] += cp->ang[1]; ang[2] += cp->ang[2];
-            }
-    
-    /* Store the collected values in the cell. */
-    c->dt_min = dt_min;
-    c->dt_max = dt_max;
-    c->updated = updated;
-    c->ekin = ekin;
-    c->epot = epot;
-    c->mom[0] = mom[0]; c->mom[1] = mom[1]; c->mom[2] = mom[2];
-    c->ang[0] = ang[0]; c->ang[1] = ang[1]; c->ang[2] = ang[2];
-        
+void engine_collect_kick2(struct cell *c) {
+
+  int k, updated = 0;
+  float dt_min = FLT_MAX, dt_max = 0.0f;
+  double ekin = 0.0, epot = 0.0;
+  float mom[3] = {0.0f, 0.0f, 0.0f}, ang[3] = {0.0f, 0.0f, 0.0f};
+  struct cell *cp;
+
+  /* Cells with a kick2 task (super-cells) or a zero count are left as-is. */
+  if (c->kick2 != NULL || c->count == 0) return;
+
+  /* No kick2 task and no progeny: there is nowhere to collect from. */
+  if (!c->split) error("Cell has no super-cell.");
+
+  /* Recurse into each progeny, then fold its values into the accumulators. */
+  for (k = 0; k < 8; k++)
+    if ((cp = c->progeny[k]) != NULL) {
+      engine_collect_kick2(cp);
+      dt_min = fminf(dt_min, cp->dt_min);
+      dt_max = fmaxf(dt_max, cp->dt_max);
+      updated += cp->updated;
+      ekin += cp->ekin;
+      epot += cp->epot;
+      mom[0] += cp->mom[0];
+      mom[1] += cp->mom[1];
+      mom[2] += cp->mom[2];
+      ang[0] += cp->ang[0];
+      ang[1] += cp->ang[1];
+      ang[2] += cp->ang[2];
     }
 
+  /* Overwrite this cell's own values with the totals from its progeny. */
+  c->dt_min = dt_min;
+  c->dt_max = dt_max;
+  c->updated = updated;
+  c->ekin = ekin;
+  c->epot = epot;
+  c->mom[0] = mom[0];
+  c->mom[1] = mom[1];
+  c->mom[2] = mom[2];
+  c->ang[0] = ang[0];
+  c->ang[1] = ang[1];
+  c->ang[2] = ang[2];
+}
 
 /**
  * @brief Compute the force on a single particle brute-force.
  */
 
-// void engine_single_density ( double *dim , long long int pid , struct part *__restrict__ parts , int N , int periodic ) {
-// 
+// void engine_single_density ( double *dim , long long int pid , struct part
+// *__restrict__ parts , int N , int periodic ) {
+//
 //     int i, k;
 //     double r2, dx[3];
 //     float fdx[3], ih;
 //     struct part p;
-//     
+//
 //     /* Find "our" part. */
 //     for ( k = 0 ; k < N && parts[k].id != pid ; k++ );
 //     if ( k == N )
 //         error( "Part not found." );
 //     p = parts[k];
-//     
+//
 //     /* Clear accumulators. */
 //     ih = 1.0f / p.h;
 //     p.rho = 0.0f; p.rho_dh = 0.0f;
@@ -1559,7 +1652,7 @@ void engine_collect_kick2 ( struct cell *c ) {
 //     p.density.div_v = 0.0;
 //     for ( k=0 ; k < 3 ; k++)
 //         p.density.curl_v[k] = 0.0;
-//             
+//
 //     /* Loop over all particle pairs (force). */
 //     for ( k = 0 ; k < N ; k++ ) {
 //         if ( parts[k].id == p.id )
@@ -1576,37 +1669,40 @@ void engine_collect_kick2 ( struct cell *c ) {
 //             }
 //         r2 = fdx[0]*fdx[0] + fdx[1]*fdx[1] + fdx[2]*fdx[2];
 //         if ( r2 < p.h*p.h*kernel_gamma2 ) {
-//             runner_iact_nonsym_density( r2 , fdx , p.h , parts[k].h , &p , &parts[k] );
+//             runner_iact_nonsym_density( r2 , fdx , p.h , parts[k].h , &p ,
+// &parts[k] );
 //             }
 //         }
-//         
+//
 //     /* Dump the result. */
 //     p.rho = ih * ih * ih * ( p.rho + p.mass*kernel_root );
 //     p.rho_dh = p.rho_dh * ih * ih * ih * ih;
-//     p.density.wcount = ( p.density.wcount + kernel_root ) * ( 4.0f / 3.0 * M_PI * kernel_gamma3 );
-//     message( "part %lli (h=%e) has wcount=%e, rho=%e, rho_dh=%e." , p.id , p.h , p.density.wcount , p.rho , p.rho_dh );
+//     p.density.wcount = ( p.density.wcount + kernel_root ) * ( 4.0f / 3.0 *
+// M_PI * kernel_gamma3 );
+//     message( "part %lli (h=%e) has wcount=%e, rho=%e, rho_dh=%e." , p.id ,
+// p.h , p.density.wcount , p.rho , p.rho_dh );
 //     fflush(stdout);
-//     
+//
 //     }
 
-
-// void engine_single_force ( double *dim , long long int pid , struct part *__restrict__ parts , int N , int periodic ) {
-// 
+// void engine_single_force ( double *dim , long long int pid , struct part
+// *__restrict__ parts , int N , int periodic ) {
+//
 //     int i, k;
 //     double r2, dx[3];
 //     float fdx[3];
 //     struct part p;
-//     
+//
 //     /* Find "our" part. */
 //     for ( k = 0 ; k < N && parts[k].id != pid ; k++ );
 //     if ( k == N )
 //         error( "Part not found." );
 //     p = parts[k];
-//     
+//
 //     /* Clear accumulators. */
 //     p.a[0] = 0.0f; p.a[1] = 0.0f; p.a[2] = 0.0f;
 //     p.force.u_dt = 0.0f; p.force.h_dt = 0.0f; p.force.v_sig = 0.0f;
-//             
+//
 //     /* Loop over all particle pairs (force). */
 //     for ( k = 0 ; k < N ; k++ ) {
 //     // for ( k = N-1 ; k >= 0 ; k-- ) {
@@ -1623,23 +1719,28 @@ void engine_collect_kick2 ( struct cell *c ) {
 //             fdx[i] = dx[i];
 //             }
 //         r2 = fdx[0]*fdx[0] + fdx[1]*fdx[1] + fdx[2]*fdx[2];
-//         if ( r2 < p.h*p.h*kernel_gamma2 || r2 < parts[k].h*parts[k].h*kernel_gamma2 ) {
+//         if ( r2 < p.h*p.h*kernel_gamma2 || r2 <
+// parts[k].h*parts[k].h*kernel_gamma2 ) {
 //             p.a[0] = 0.0f; p.a[1] = 0.0f; p.a[2] = 0.0f;
 //             p.force.u_dt = 0.0f; p.force.h_dt = 0.0f; p.force.v_sig = 0.0f;
-//             runner_iact_nonsym_force( r2 , fdx , p.h , parts[k].h , &p , &parts[k] );
-//             double dvdr = ( (p.v[0]-parts[k].v[0])*fdx[0] + (p.v[1]-parts[k].v[1])*fdx[1] + (p.v[2]-parts[k].v[2])*fdx[2] ) / sqrt(r2);
-//             message( "part %lli and %lli interact (r=%.3e,dvdr=%.3e) with a=[%.3e,%.3e,%.3e], dudt=%.3e." ,
-//                 p.id , parts[k].id , sqrt(r2) , dvdr , p.a[0] , p.a[1], p.a[2] , p.force.u_dt );
+//             runner_iact_nonsym_force( r2 , fdx , p.h , parts[k].h , &p ,
+// &parts[k] );
+//             double dvdr = ( (p.v[0]-parts[k].v[0])*fdx[0] +
+// (p.v[1]-parts[k].v[1])*fdx[1] + (p.v[2]-parts[k].v[2])*fdx[2] ) / sqrt(r2);
+//             message( "part %lli and %lli interact (r=%.3e,dvdr=%.3e) with
+// a=[%.3e,%.3e,%.3e], dudt=%.3e." ,
+//                 p.id , parts[k].id , sqrt(r2) , dvdr , p.a[0] , p.a[1],
+// p.a[2] , p.force.u_dt );
 //             }
 //         }
-//         
+//
 //     /* Dump the result. */
-//     // message( "part %lli (h=%e) has a=[%.3e,%.3e,%.3e], udt=%e." , p.id , p.h , p.a[0] , p.a[1] , p.a[2] , p.force.u_dt );
+//     // message( "part %lli (h=%e) has a=[%.3e,%.3e,%.3e], udt=%e." , p.id ,
+// p.h , p.a[0] , p.a[1] , p.a[2] , p.force.u_dt );
 //     fflush(stdout);
-//     
+//
 //     }
-    
-    
+
 /**
  * @brief Launch the runners.
  *
@@ -1647,384 +1748,409 @@ void engine_collect_kick2 ( struct cell *c ) {
  * @param nr_runners The number of #runner to let loose.
  * @param mask The task mask to launch.
  */
- 
-void engine_launch ( struct engine *e , int nr_runners , unsigned int mask ) {
-
-    /* Prepare the scheduler. */
-    atomic_inc( &e->sched.waiting );
-
-    /* Cry havoc and let loose the dogs of war. */
-    e->barrier_launch = nr_runners;
-    e->barrier_launchcount = nr_runners;
-    if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 )
-        error( "Failed to broadcast barrier open condition." );
-        
-    /* Load the tasks. */
-    pthread_mutex_unlock( &e->barrier_mutex );
-    scheduler_start( &e->sched , mask );
-    pthread_mutex_lock( &e->barrier_mutex );
-        
-    /* Remove the safeguard. */
-    pthread_mutex_lock( &e->sched.sleep_mutex );
-    atomic_dec( &e->sched.waiting );
-    pthread_cond_broadcast( &e->sched.sleep_cond );
-    pthread_mutex_unlock( &e->sched.sleep_mutex );
-
-    /* Sit back and wait for the runners to come home. */
-    while ( e->barrier_launch || e->barrier_running )
-        if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 )
-            error( "Error while waiting for barrier." );
-            
-    }
-    
-    
-void hassorted ( struct cell *c ) {
-
-    if ( c->sorted )
-        error( "Suprious sorted flags." );
-        
-    if ( c->split )
-        for ( int k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL )
-                hassorted( c->progeny[k] );
-                    
-    }
 
+void engine_launch(struct engine *e, int nr_runners, unsigned int mask) {
+  /* NOTE(review): assumes the caller already holds e->barrier_mutex. */
+  /* Bump sched.waiting as a safeguard; it is dropped again below. */
+  atomic_inc(&e->sched.waiting);
+
+  /* Open the barrier for nr_runners threads and wake the waiting threads. */
+  e->barrier_launch = nr_runners;
+  e->barrier_launchcount = nr_runners;
+  if (pthread_cond_broadcast(&e->barrier_cond) != 0)
+    error("Failed to broadcast barrier open condition.");
+
+  /* Start the scheduler for this mask with the barrier mutex released. */
+  pthread_mutex_unlock(&e->barrier_mutex);
+  scheduler_start(&e->sched, mask);
+  pthread_mutex_lock(&e->barrier_mutex);
+
+  /* Remove the safeguard and wake threads sleeping on sched.sleep_cond. */
+  pthread_mutex_lock(&e->sched.sleep_mutex);
+  atomic_dec(&e->sched.waiting);
+  pthread_cond_broadcast(&e->sched.sleep_cond);
+  pthread_mutex_unlock(&e->sched.sleep_mutex);
+
+  /* Wait until no launches are pending and no thread is running. */
+  while (e->barrier_launch || e->barrier_running)
+    if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0)
+      error("Error while waiting for barrier.");
+}
+
+void hassorted(struct cell *c) {
+  /* Debugging check: report an error if this cell has any sorted bit set. */
+  if (c->sorted) error("Suprious sorted flags.");
+  /* Apply the same check to every existing progeny of a split cell. */
+  if (c->split)
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) hassorted(c->progeny[k]);
+}
 
 /**
  * @brief Let the #engine loose to compute the forces.
  *
  * @param e The #engine.
  */
- 
-void engine_step ( struct engine *e ) {
-
-    int k;
-    float dt = e->dt, dt_step, dt_max = 0.0f, dt_min = FLT_MAX;
-    double epot = 0.0, ekin = 0.0;
-    float mom[3] = { 0.0 , 0.0 , 0.0 };
-    float ang[3] = { 0.0 , 0.0 , 0.0 };
-    int count = 0;
-    struct cell *c;
-    struct space *s = e->s;
-    
-    TIMER_TIC2
-
-    /* Get the maximum dt. */
-    if ( e->policy & engine_policy_multistep ) {
-        dt_step = 2.0f*dt;
-        for ( k = 0 ; k < 32 && (e->step & (1 << k)) == 0 ; k++ )
-            dt_step *= 2;
-        }
-    else
-        dt_step = FLT_MAX;
-        
-    /* Set the maximum dt. */
-    e->dt_step = dt_step;
-    e->s->dt_step = dt_step;
-    // message( "dt_step set to %.3e (dt=%.3e)." , dt_step , e->dt ); fflush(stdout);
-    
-    // printParticle( parts , 432626 );
-    
-    /* First kick. */
-    if ( e->step == 0 || !( e->policy & engine_policy_fixdt ) ) {
-        TIMER_TIC
-        engine_launch( e , ( e->nr_threads > 8 ) ? 8 : e->nr_threads , (1 << task_type_kick1) | (1 << task_type_link) );
-        TIMER_TOC( timer_kick1 );
-        }
-    
-    /* Check if all the kick1 threads have executed. */
-    /* for ( k = 0 ; k < e->sched.nr_tasks ; k++ )
-        if ( e->sched.tasks[k].type == task_type_kick1 &&
-             e->sched.tasks[k].toc == 0 )
-            error( "Not all kick1 tasks completed." ); */
-        
-    // for(k=0; k<10; ++k)
-    //   printParticle(parts, k);
-    // printParticle( e->s->parts , 3392063069037 , e->s->nr_parts );
- 
-    /* Re-distribute the particles amongst the nodes? */
-    if ( e->forcerepart )
-        engine_repartition( e );
-    
-    /* Prepare the space. */
-    engine_prepare( e );
-    
-    // engine_single_density( e->s->dim , 3392063069037 , e->s->parts , e->s->nr_parts , e->s->periodic );
-
-    /* Send off the runners. */
+
+void engine_step(struct engine *e) {
+
+  int k;
+  float dt = e->dt, dt_step, dt_max = 0.0f, dt_min = FLT_MAX;
+  double epot = 0.0, ekin = 0.0;
+  float mom[3] = {0.0, 0.0, 0.0};
+  float ang[3] = {0.0, 0.0, 0.0};
+  int count = 0;
+  struct cell *c;
+  struct space *s = e->s;
+
+  TIMER_TIC2
+
+  if (e->policy & engine_policy_paranoid) {
+    message("Checking system sanity...");
+    engine_check(e);
+  }
+
+  /* Get the maximum dt. */
+  if (e->policy & engine_policy_multistep) {
+    dt_step = 2.0f * dt;
+    for (k = 0; k < 32 && (e->step & (1 << k)) == 0; k++) dt_step *= 2;
+  } else
+    dt_step = FLT_MAX;
+
+  /* Set the maximum dt. */
+  e->dt_step = dt_step;
+  e->s->dt_step = dt_step;
+  // message( "dt_step set to %.3e (dt=%.3e)." , dt_step , e->dt );
+  // fflush(stdout);
+
+  // printParticle( parts , 432626 );
+
+  /* First kick. */
+  if (e->step == 0 || !(e->policy & engine_policy_fixdt)) {
     TIMER_TIC
-    engine_launch( e , e->nr_threads , (1 << task_type_sort) | 
-                                       (1 << task_type_self) |
-                                       (1 << task_type_pair) | 
-                                       (1 << task_type_sub) |
-                                       (1 << task_type_ghost) | 
-                                       (1 << task_type_kick2) |
-                                       (1 << task_type_send) |
-                                       (1 << task_type_recv) |
-                                       (1 << task_type_grav_pp) |
-                                       (1 << task_type_grav_mm) |
-                                       (1 << task_type_grav_up) |
-                                       (1 << task_type_grav_down) |
-                                       (1 << task_type_link) );
-    TIMER_TOC(timer_runners);
-    
-    // engine_single_force( e->s->dim , 8328423931905 , e->s->parts , e->s->nr_parts , e->s->periodic );
-    
-    // for(k=0; k<10; ++k)
-    //   printParticle(parts, k);
-    // printParticle( parts , 432626 );
-    // printParticle( e->s->parts , 3392063069037 , e->s->nr_parts );
-    // printParticle( e->s->parts , 8328423931905 , e->s->nr_parts );
-
-    /* Collect the cell data from the second kick. */
-    for ( k = 0 ; k < s->nr_cells ; k++ )
-        if ( s->cells[k].nodeID == e->nodeID ) {
-            c = &s->cells[k];
-            engine_collect_kick2( c );
-            dt_min = fminf( dt_min , c->dt_min );
-            dt_max = fmaxf( dt_max , c->dt_max );
-            ekin += c->ekin;
-            epot += c->epot;
-            count += c->updated;
-            mom[0] += c->mom[0]; mom[1] += c->mom[1]; mom[2] += c->mom[2];
-            ang[0] += c->ang[0]; ang[1] += c->ang[1]; ang[2] += c->ang[2];
-            }
-        
-    /* Aggregate the data from the different nodes. */
-    #ifdef WITH_MPI
-        double in[3], out[3];
-        out[0] = dt_min;
-        if ( MPI_Allreduce( out , in , 1 , MPI_DOUBLE , MPI_MIN , MPI_COMM_WORLD ) != MPI_SUCCESS )
-            error( "Failed to aggregate dt_min." );
-        dt_min = in[0];
-        out[0] = dt_max;
-        if ( MPI_Allreduce( out , in , 1 , MPI_DOUBLE , MPI_MAX , MPI_COMM_WORLD ) != MPI_SUCCESS )
-            error( "Failed to aggregate dt_max." );
-        dt_max = in[0];
-        out[0] = count; out[1] = ekin; out[2] = epot;
-        if ( MPI_Allreduce( out , in , 3 , MPI_DOUBLE , MPI_SUM , MPI_COMM_WORLD ) != MPI_SUCCESS )
-            error( "Failed to aggregate energies." );
-        count = in[0]; ekin = in[1]; epot = in[2];
-        /* int nr_parts;
-        if ( MPI_Allreduce( &s->nr_parts , &nr_parts , 1 , MPI_INT , MPI_SUM , MPI_COMM_WORLD ) != MPI_SUCCESS )
-            error( "Failed to aggregate particle count." );
-        if ( e->nodeID == 0 )
-            message( "nr_parts=%i." , nr_parts ); */
-    #endif
-    
-    e->dt_min = dt_min;
-    e->dt_max = dt_max;
-    e->count_step = count;
-    e->ekin = ekin;
-    e->epot = epot;
-    // printParticle( e->s->parts , 382557 , e->s->nr_parts );
-    // message( "dt_min/dt_max is %e/%e." , dt_min , dt_max ); fflush(stdout);
-    // message( "etot is %e (ekin=%e, epot=%e)." , ekin+epot , ekin , epot ); fflush(stdout);
-    // message( "total momentum is [ %e , %e , %e ]." , mom[0] , mom[1] , mom[2] ); fflush(stdout);
-    // message( "total angular momentum is [ %e , %e , %e ]." , ang[0] , ang[1] , ang[2] ); fflush(stdout);
-    // message( "updated %i parts (dt_step=%.3e)." , count , dt_step ); fflush(stdout);
-        
-    /* Increase the step. */
-    e->step += 1;
-
-    /* Does the time step need adjusting? */
-    if ( e->policy & engine_policy_fixdt ) {
+    engine_launch(e, (e->nr_threads > 8) ? 8 : e->nr_threads,
+                  (1 << task_type_kick1) | (1 << task_type_link));
+    TIMER_TOC(timer_kick1);
+  }
+
+  /* Check if all the kick1 threads have executed. */
+  /* for ( k = 0 ; k < e->sched.nr_tasks ; k++ )
+      if ( e->sched.tasks[k].type == task_type_kick1 &&
+           e->sched.tasks[k].toc == 0 )
+          error( "Not all kick1 tasks completed." ); */
+
+  // for(k=0; k<10; ++k)
+  //   printParticle(parts, k);
+  // printParticle( e->s->parts , 3392063069037 , e->s->nr_parts );
+
+  if (e->policy & engine_policy_paranoid) {
+    message("Checking system sanity...");
+    engine_check(e);
+  }
+
+  /* Re-distribute the particles amongst the nodes? */
+  if (e->forcerepart) engine_repartition(e);
+
+  if (e->policy & engine_policy_paranoid) {
+    message("Checking system sanity...");
+    engine_check(e);
+  }
+
+  /* Prepare the space. */
+  engine_prepare(e);
+
+  // engine_single_density( e->s->dim , 3392063069037 , e->s->parts ,
+  // e->s->nr_parts , e->s->periodic );
+
+  /* Send off the runners. */
+  TIMER_TIC
+  engine_launch(e, e->nr_threads,
+                (1 << task_type_sort) | (1 << task_type_self) |
+                    (1 << task_type_pair) | (1 << task_type_sub) |
+                    (1 << task_type_ghost) | (1 << task_type_kick2) |
+                    (1 << task_type_send) | (1 << task_type_recv) |
+                    (1 << task_type_grav_pp) | (1 << task_type_grav_mm) |
+                    (1 << task_type_grav_up) | (1 << task_type_grav_down) |
+                    (1 << task_type_link));
+
+  if (e->policy & engine_policy_paranoid) {
+    message("Checking system sanity...");
+    engine_check(e);
+  }
+
+  TIMER_TOC(timer_runners);
+
+  // engine_single_force( e->s->dim , 8328423931905 , e->s->parts ,
+  // e->s->nr_parts , e->s->periodic );
+
+  // for(k=0; k<10; ++k)
+  //   printParticle(parts, k);
+  // printParticle( parts , 432626 );
+  // printParticle( e->s->parts , 3392063069037 , e->s->nr_parts );
+  // printParticle( e->s->parts , 8328423931905 , e->s->nr_parts );
+
+  /* Collect the cell data from the second kick. */
+  for (k = 0; k < s->nr_cells; k++)
+    if (s->cells[k].nodeID == e->nodeID) {
+      c = &s->cells[k];
+      engine_collect_kick2(c);
+      dt_min = fminf(dt_min, c->dt_min);
+      dt_max = fmaxf(dt_max, c->dt_max);
+      ekin += c->ekin;
+      epot += c->epot;
+      count += c->updated;
+      mom[0] += c->mom[0];
+      mom[1] += c->mom[1];
+      mom[2] += c->mom[2];
+      ang[0] += c->ang[0];
+      ang[1] += c->ang[1];
+      ang[2] += c->ang[2];
+    }
+
+/* Aggregate the data from the different nodes. */
+#ifdef WITH_MPI
+  double in[3], out[3];
+  out[0] = dt_min;
+  if (MPI_Allreduce(out, in, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD) !=
+      MPI_SUCCESS)
+    error("Failed to aggregate dt_min.");
+  dt_min = in[0];
+  out[0] = dt_max;
+  if (MPI_Allreduce(out, in, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD) !=
+      MPI_SUCCESS)
+    error("Failed to aggregate dt_max.");
+  dt_max = in[0];
+  out[0] = count;
+  out[1] = ekin;
+  out[2] = epot;
+  if (MPI_Allreduce(out, in, 3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD) !=
+      MPI_SUCCESS)
+    error("Failed to aggregate energies.");
+  count = in[0];
+  ekin = in[1];
+  epot = in[2];
+/* int nr_parts;
+if ( MPI_Allreduce( &s->nr_parts , &nr_parts , 1 , MPI_INT , MPI_SUM ,
+MPI_COMM_WORLD ) != MPI_SUCCESS )
+    error( "Failed to aggregate particle count." );
+if ( e->nodeID == 0 )
+    message( "nr_parts=%i." , nr_parts ); */
+#endif
+
+  e->dt_min = dt_min;
+  e->dt_max = dt_max;
+  e->count_step = count;
+  e->ekin = ekin;
+  e->epot = epot;
+  // printParticle( e->s->parts , 382557 , e->s->nr_parts );
+  // message( "dt_min/dt_max is %e/%e." , dt_min , dt_max ); fflush(stdout);
+  // message( "etot is %e (ekin=%e, epot=%e)." , ekin+epot , ekin , epot );
+  // fflush(stdout);
+  // message( "total momentum is [ %e , %e , %e ]." , mom[0] , mom[1] , mom[2]
+  // ); fflush(stdout);
+  // message( "total angular momentum is [ %e , %e , %e ]." , ang[0] , ang[1] ,
+  // ang[2] ); fflush(stdout);
+  // message( "updated %i parts (dt_step=%.3e)." , count , dt_step );
+  // fflush(stdout);
+
+  /* Increase the step. */
+  e->step += 1;
+
+  /* Does the time step need adjusting? */
+  if (e->policy & engine_policy_fixdt) {
+    dt = e->dt_orig;
+  } else {
+    if (dt == 0) {
+      e->nullstep += 1;
+      if (e->dt_orig > 0.0) {
         dt = e->dt_orig;
-        }
-    else {
-        if ( dt == 0 ) {
-            e->nullstep += 1;
-            if ( e->dt_orig > 0.0 ) {
-                dt = e->dt_orig;
-                while ( dt_min < dt )
-                    dt *= 0.5;
-                while ( dt_min > 2*dt )
-                    dt *= 2.0;
-                }
-            else
-                dt = dt_min;
-            for ( k = 0 ; k < s->nr_parts ; k++ ) {
-                /* struct part *p = &s->parts[k];
-                struct xpart *xp = &s->xparts[k];
-                float dt_curr = dt;
-                for ( int j = (int)( p->dt / dt ) ; j > 1 ; j >>= 1 )
-                    dt_curr *= 2.0f; 
-                xp->dt_curr = dt_curr; */
-                s->parts[k].dt = dt;
-                s->xparts[k].dt_curr = dt;
-                }
-            // message( "dt_min=%.3e, adjusting time step to dt=%e." , dt_min , e->dt );
-            }
-        else {
-            while ( dt_min < dt ) {
-                dt *= 0.5;
-                e->step *= 2;
-                e->nullstep *= 2;
-                // message( "dt_min dropped below time step, adjusting to dt=%e." , e->dt );
-                }
-            while ( dt_min > 2*dt && (e->step & 1) == 0 ) {
-                dt *= 2.0;
-                e->step /= 2;
-                e->nullstep /= 2;
-                // message( "dt_min is larger than twice the time step, adjusting to dt=%e." , e->dt );
-                }
-            }
-        } 
-    e->dt = dt;
-    
-    /* Set the system time. */
-    e->time = dt * (e->step - e->nullstep);
-        
-    TIMER_TOC2(timer_step);
-    
+        while (dt_min < dt) dt *= 0.5;
+        while (dt_min > 2 * dt) dt *= 2.0;
+      } else
+        dt = dt_min;
+      for (k = 0; k < s->nr_parts; k++) {
+        /* struct part *p = &s->parts[k];
+        struct xpart *xp = &s->xparts[k];
+        float dt_curr = dt;
+        for ( int j = (int)( p->dt / dt ) ; j > 1 ; j >>= 1 )
+            dt_curr *= 2.0f;
+        xp->dt_curr = dt_curr; */
+        s->parts[k].dt = dt;
+        s->xparts[k].dt_curr = dt;
+      }
+      // message( "dt_min=%.3e, adjusting time step to dt=%e." , dt_min , e->dt
+      // );
+    } else {
+      while (dt_min < dt) {
+        dt *= 0.5;
+        e->step *= 2;
+        e->nullstep *= 2;
+        // message( "dt_min dropped below time step, adjusting to dt=%e." ,
+        // e->dt );
+      }
+      while (dt_min > 2 * dt && (e->step & 1) == 0) {
+        dt *= 2.0;
+        e->step /= 2;
+        e->nullstep /= 2;
+        // message( "dt_min is larger than twice the time step, adjusting to
+        // dt=%e." , e->dt );
+      }
     }
-    
-    
+  }
+  e->dt = dt;
+
+  /* Set the system time. */
+  e->time = dt * (e->step - e->nullstep);
+
+  TIMER_TOC2(timer_step);
+}
+
 /**
  * @brief Create and fill the proxies.
  *
  * @param e The #engine.
  */
- 
-void engine_makeproxies ( struct engine *e ) {
-
-    int i, j, k, ii, jj, kk;
-    int cid, cjd, pid, ind[3], *cdim = e->s->cdim;
-    struct space *s = e->s;
-    struct cell *cells = s->cells;
-    struct proxy *proxies = e->proxies;
-    
-    /* Prepare the proxies and the proxy index. */
-    if ( e->proxy_ind == NULL )
-        if ( ( e->proxy_ind = (int *)malloc( sizeof(int) * e->nr_nodes ) ) == NULL )
-            error( "Failed to allocate proxy index." );
-    for ( k = 0 ; k < e->nr_nodes ; k++ )
-        e->proxy_ind[k] = -1;
-    e->nr_proxies = 0;
-    
-    /* The following loop is super-clunky, but it's necessary
-       to ensure that the order of the send and recv cells in
-       the proxies is identical for all nodes! */
-    
-    /* Loop over each cell in the space. */
-    for ( ind[0] = 0 ; ind[0] < cdim[0] ; ind[0]++ )
-        for ( ind[1] = 0 ; ind[1] < cdim[1] ; ind[1]++ )
-            for ( ind[2] = 0 ; ind[2] < cdim[2] ; ind[2]++ ) {
-            
-                /* Get the cell ID. */
-                cid = cell_getid( cdim , ind[0] , ind[1] , ind[2] );
-                
-                /* Loop over all its neighbours (periodic). */
-                for ( i = -1 ; i <= 1 ; i++ ) {
-                    ii = ind[0] + i;
-                    if ( ii >= cdim[0] )
-                        ii -= cdim[0];
-                    else if ( ii < 0 )
-                        ii += cdim[0];
-                    for ( j = -1 ; j <= 1 ; j++ ) {
-                        jj = ind[1] + j;
-                        if ( jj >= cdim[1] )
-                            jj -= cdim[1];
-                        else if ( jj < 0 )
-                            jj += cdim[1];
-                        for ( k = -1 ; k <= 1 ; k++ ) {
-                            kk = ind[2] + k;
-                            if ( kk >= cdim[2] )
-                                kk -= cdim[2];
-                            else if ( kk < 0 )
-                                kk += cdim[2];
-                            
-                            /* Get the cell ID. */
-                            cjd = cell_getid( cdim , ii , jj , kk );
-                            
-                            /* Add to proxies? */
-                            if ( cells[cid].nodeID == e->nodeID && cells[cjd].nodeID != e->nodeID ) {
-                                pid = e->proxy_ind[ cells[cjd].nodeID ];
-                                if ( pid < 0 ) {
-                                    if ( e->nr_proxies == engine_maxproxies )
-                                        error( "Maximum number of proxies exceeded." );
-                                    proxy_init( &proxies[ e->nr_proxies ] , e->nodeID , cells[cjd].nodeID );
-                                    e->proxy_ind[ cells[cjd].nodeID ] = e->nr_proxies;
-                                    pid = e->nr_proxies;
-                                    e->nr_proxies += 1;
-                                    }
-                                proxy_addcell_in( &proxies[pid] , &cells[cjd] );
-                                proxy_addcell_out( &proxies[pid] , &cells[cid] );
-                                cells[cid].sendto |= ( 1ULL << pid );
-                                }
-                                
-                            if ( cells[cjd].nodeID == e->nodeID && cells[cid].nodeID != e->nodeID ) {
-                                pid = e->proxy_ind[ cells[cid].nodeID ];
-                                if ( pid < 0 ) {
-                                    if ( e->nr_proxies == engine_maxproxies )
-                                        error( "Maximum number of proxies exceeded." );
-                                    proxy_init( &proxies[ e->nr_proxies ] , e->nodeID , cells[cid].nodeID );
-                                    e->proxy_ind[ cells[cid].nodeID ] = e->nr_proxies;
-                                    pid = e->nr_proxies;
-                                    e->nr_proxies += 1;
-                                    }
-                                proxy_addcell_in( &proxies[pid] , &cells[cid] );
-                                proxy_addcell_out( &proxies[pid] , &cells[cjd] );
-                                cells[cjd].sendto |= ( 1ULL << pid );
-                                }
-                            }
-                        }
-                    }
+
+void engine_makeproxies(struct engine *e) {
+  /* Builds one proxy per foreign node owning a neighbour of a local cell. */
+  int i, j, k, ii, jj, kk;
+  int cid, cjd, pid, ind[3], *cdim = e->s->cdim;
+  struct space *s = e->s;
+  struct cell *cells = s->cells;
+  struct proxy *proxies = e->proxies;
+
+  /* (Re)set the proxy index: one slot per node, -1 means no proxy yet. */
+  if (e->proxy_ind == NULL)
+    if ((e->proxy_ind = (int *)malloc(sizeof(int) * e->nr_nodes)) == NULL)
+      error("Failed to allocate proxy index.");
+  for (k = 0; k < e->nr_nodes; k++) e->proxy_ind[k] = -1;
+  e->nr_proxies = 0;
+
+  /* The following loop is super-clunky, but it's necessary
+     to ensure that the order of the send and recv cells in
+     the proxies is identical for all nodes! */
+
+  /* Loop over each cell (cid) of the cdim grid. */
+  for (ind[0] = 0; ind[0] < cdim[0]; ind[0]++)
+    for (ind[1] = 0; ind[1] < cdim[1]; ind[1]++)
+      for (ind[2] = 0; ind[2] < cdim[2]; ind[2]++) {
+
+        /* Get the ID of this cell. */
+        cid = cell_getid(cdim, ind[0], ind[1], ind[2]);
+
+        /* Loop over its 27 neighbours (incl. itself), periodic wrap. */
+        for (i = -1; i <= 1; i++) {
+          ii = ind[0] + i;
+          if (ii >= cdim[0])
+            ii -= cdim[0];
+          else if (ii < 0)
+            ii += cdim[0];
+          for (j = -1; j <= 1; j++) {
+            jj = ind[1] + j;
+            if (jj >= cdim[1])
+              jj -= cdim[1];
+            else if (jj < 0)
+              jj += cdim[1];
+            for (k = -1; k <= 1; k++) {
+              kk = ind[2] + k;
+              if (kk >= cdim[2])
+                kk -= cdim[2];
+              else if (kk < 0)
+                kk += cdim[2];
+
+              /* Get the neighbour's cell ID. */
+              cjd = cell_getid(cdim, ii, jj, kk);
+
+              /* cid local, cjd foreign: cjd goes in, cid goes out. */
+              if (cells[cid].nodeID == e->nodeID &&
+                  cells[cjd].nodeID != e->nodeID) {
+                pid = e->proxy_ind[cells[cjd].nodeID];
+                if (pid < 0) {
+                  if (e->nr_proxies == engine_maxproxies)
+                    error("Maximum number of proxies exceeded.");
+                  proxy_init(&proxies[e->nr_proxies], e->nodeID,
+                             cells[cjd].nodeID);
+                  e->proxy_ind[cells[cjd].nodeID] = e->nr_proxies;
+                  pid = e->nr_proxies;
+                  e->nr_proxies += 1;
                 }
-        
-    }
-    
-    
-/** 
+                proxy_addcell_in(&proxies[pid], &cells[cjd]);
+                proxy_addcell_out(&proxies[pid], &cells[cid]);
+                cells[cid].sendto |= (1ULL << pid);
+              }
+              /* Mirror case, cjd local and cid foreign: cid in, cjd out. */
+              if (cells[cjd].nodeID == e->nodeID &&
+                  cells[cid].nodeID != e->nodeID) {
+                pid = e->proxy_ind[cells[cid].nodeID];
+                if (pid < 0) {
+                  if (e->nr_proxies == engine_maxproxies)
+                    error("Maximum number of proxies exceeded.");
+                  proxy_init(&proxies[e->nr_proxies], e->nodeID,
+                             cells[cid].nodeID);
+                  e->proxy_ind[cells[cid].nodeID] = e->nr_proxies;
+                  pid = e->nr_proxies;
+                  e->nr_proxies += 1;
+                }
+                proxy_addcell_in(&proxies[pid], &cells[cid]);
+                proxy_addcell_out(&proxies[pid], &cells[cjd]);
+                cells[cjd].sendto |= (1ULL << pid);
+              }
+            }
+          }
+        }
+      }
+}
+
+/**
  * @brief Split the underlying space according to the given grid.
  *
  * @param e The #engine.
  * @param grid The grid.
  */
- 
-void engine_split ( struct engine *e , int *grid ) {
-
-    int j, k;
-    int ind[3];
-    struct space *s = e->s;
-    struct cell *c;
-    
-    /* If we've got the wrong number of nodes, fail. */
-    if ( e->nr_nodes != grid[0]*grid[1]*grid[2] )
-        error( "Grid size does not match number of nodes." );
-        
-    /* Run through the cells and set their nodeID. */
-    // message("s->dim = [%e,%e,%e]", s->dim[0], s->dim[1], s->dim[2]);
-    for ( k = 0 ; k < s->nr_cells ; k++ ) {
-        c = &s->cells[k];
-        for ( j = 0 ; j < 3 ; j++ )
-            ind[j] = c->loc[j] / s->dim[j] * grid[j];
-        c->nodeID = ind[0] + grid[0]*( ind[1] + grid[1]*ind[2] );
-	// message("cell at [%e,%e,%e]: ind = [%i,%i,%i], nodeID = %i", c->loc[0], c->loc[1], c->loc[2], ind[0], ind[1], ind[2], c->nodeID);
-        }
-        
-    /* Make the proxies. */
-    engine_makeproxies( e );
-        
-    /* Re-allocate the local parts. */
-    message("Re-allocating parts array from %i to %i.", s->size_parts, (int)(s->nr_parts * 1.2));
-    s->size_parts = s->nr_parts * 1.2;
-    struct part *parts_new;
-    struct xpart *xparts_new;
-    if ( posix_memalign( (void **)&parts_new , part_align , sizeof(struct part) * s->size_parts ) != 0 ||
-         posix_memalign( (void **)&xparts_new , part_align , sizeof(struct xpart) * s->size_parts ) != 0 )
-        error( "Failed to allocate new part data." );
-    memcpy( parts_new , s->parts , sizeof(struct part) * s->nr_parts );
-    memcpy( xparts_new , s->xparts , sizeof(struct xpart) * s->nr_parts );
-    free( s->parts );
-    free( s->xparts );
-    s->parts = parts_new;
-    s->xparts = xparts_new;
 
-    }
-    
-    
+void engine_split(struct engine *e, int *grid) {
+
+  struct space *s = e->s;
+  struct part *parts_new;
+  struct xpart *xparts_new;
+  int cid, dim;
+  int ind[3];
+
+  /* The grid must provide exactly one slot per node. */
+  if (grid[0] * grid[1] * grid[2] != e->nr_nodes)
+    error("Grid size does not match number of nodes.");
+
+  /* Assign every cell to the node that owns its grid slot. */
+  for (cid = 0; cid < s->nr_cells; cid++) {
+    struct cell *c = &s->cells[cid];
+    for (dim = 0; dim < 3; dim++)
+      ind[dim] = c->loc[dim] / s->dim[dim] * grid[dim];
+    c->nodeID = ind[0] + grid[0] * (ind[1] + grid[1] * ind[2]);
+  }
+
+  /* With the owners known, set up the proxies. */
+  engine_makeproxies(e);
+
+  /* Size the new part arrays at 1.2 times the current number of parts. */
+  const double size_new = s->nr_parts * 1.2;
+  message("Re-allocating parts array from %i to %i.", s->size_parts,
+          (int)size_new);
+  s->size_parts = size_new;
+  if (posix_memalign((void **)&parts_new, part_align,
+                     sizeof(struct part) * s->size_parts) != 0 ||
+      posix_memalign((void **)&xparts_new, part_align,
+                     sizeof(struct xpart) * s->size_parts) != 0)
+    error("Failed to allocate new part data.");
+
+  /* Copy the existing parts over and swap in the new buffers. */
+  memcpy(parts_new, s->parts, sizeof(struct part) * s->nr_parts);
+  memcpy(xparts_new, s->xparts, sizeof(struct xpart) * s->nr_parts);
+  free(s->parts);
+  free(s->xparts);
+  s->parts = parts_new;
+  s->xparts = xparts_new;
+}
+
 /**
  * @brief init an engine with the given number of threads, queues, and
  *      the given policy.
@@ -2038,152 +2164,148 @@ void engine_split ( struct engine *e , int *grid ) {
  * @param nodeID The MPI rank of this node
  * @param policy The queueing policy to use.
  */
- 
-void engine_init ( struct engine *e , struct space *s , float dt , int nr_threads , int nr_queues , int nr_nodes , int nodeID , int policy ) {
-
-    int k;
-    float dt_min = dt;
-    #if defined(HAVE_SETAFFINITY)
-        int nr_cores = sysconf( _SC_NPROCESSORS_ONLN );
-        int i, j, cpuid[ nr_cores ];
-        cpu_set_t cpuset;
-        if ( policy & engine_policy_cputight ) {
-            for ( k = 0 ; k < nr_cores ; k++ )
-                cpuid[k] = k;
-            }
-        else {
-            /*  Get next highest power of 2. */
-            int maxint = 1;
-            while ( maxint < nr_cores )
-                maxint *= 2;
-
-            cpuid[0] = 0;
-            k = 1;
-            for ( i = 1 ; i < maxint ; i *= 2 )
-                for ( j = maxint / i / 2 ; j < maxint ; j += maxint / i )
-                    if ( j < nr_cores && j != 0 )
-                        cpuid[k++] = j;
-            #ifdef WITHMPI
-                printf( "engine_init: cpu map is [ " );
-            #else
-                printf( "[%03i] engine_init: cpu map is [ " , nodeID );
-            #endif
-            for ( i = 0 ; i < nr_cores ; i++ )
-                printf( "%i " , cpuid[i] );
-            printf( "].\n" );
-            }
-    #endif
-    
-    /* Store the values. */
-    e->s = s;
-    e->nr_threads = nr_threads;
-    e->policy = policy;
-    e->step = 0;
-    e->nullstep = 0;
-    e->time = 0.0;
-    e->nr_nodes = nr_nodes;
-    e->nodeID = nodeID;
-    e->proxy_ind = NULL;
+
+void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
+                 int nr_queues, int nr_nodes, int nodeID, int policy) {
+
+  int k;
+  float dt_min = dt;
+#if defined(HAVE_SETAFFINITY)
+  int nr_cores = sysconf(_SC_NPROCESSORS_ONLN);
+  int i, j, cpuid[nr_cores];
+  cpu_set_t cpuset;
+  if (policy & engine_policy_cputight) {
+    for (k = 0; k < nr_cores; k++) cpuid[k] = k;
+  } else {
+    /* Get the smallest power of two that is >= nr_cores. */
+    int maxint = 1;
+    while (maxint < nr_cores) maxint *= 2;
+    /* Core 0 first, then 1/2, 1/4, 3/4, 1/8, ... of maxint (if < nr_cores). */
+    cpuid[0] = 0;
+    k = 1;
+    for (i = 1; i < maxint; i *= 2)
+      for (j = maxint / i / 2; j < maxint; j += maxint / i)
+        if (j < nr_cores && j != 0) cpuid[k++] = j;
+#ifdef WITH_MPI
+    printf("[%03i] engine_init: cpu map is [ ", nodeID);
+#else
+    printf("engine_init: cpu map is [ ");
+#endif
+    for (i = 0; i < nr_cores; i++) printf("%i ", cpuid[i]);
+    printf("].\n");
+  }
+#endif
+
+  /* Store the values. */
+  e->s = s;
+  e->nr_threads = nr_threads;
+  e->policy = policy;
+  e->step = 0;
+  e->nullstep = 0;
+  e->time = 0.0;
+  e->nr_nodes = nr_nodes;
+  e->nodeID = nodeID;
+  e->proxy_ind = NULL;
+  e->nr_proxies = 0;
+  e->forcerebuild = 1;
+  e->forcerepart = 0;
+  e->links = NULL;
+  e->nr_links = 0;
+  engine_rank = nodeID;
+
+  /* Make the space link back to the engine. */
+  s->e = e;
+
+  /* Are we running on more than one node? */
+  if (nr_nodes > 1) {
+#ifndef WITH_MPI
+    error("SWIFT was not compiled with MPI support.");
+#else
+    e->policy |= engine_policy_mpi;
+    if ((e->proxies = (struct proxy *)malloc(sizeof(struct proxy) *
+                                             engine_maxproxies)) == NULL)
+      error("Failed to allocate memory for proxies.");
+    bzero(e->proxies, sizeof(struct proxy) * engine_maxproxies);
     e->nr_proxies = 0;
-    e->forcerebuild = 1;
-    e->forcerepart = 0;
-    e->links = NULL;
-    e->nr_links = 0;
-    engine_rank = nodeID;
-    
-    /* Make the space link back to the engine. */
-    s->e = e;
-    
-    /* Are we doing stuff in parallel? */
-    if ( nr_nodes > 1 ) {
-        #ifndef HAVE_MPI
-            error( "SWIFT was not compiled with MPI support." );
-        #else
-            e->policy |= engine_policy_mpi;
-            if ( ( e->proxies = (struct proxy *)malloc( sizeof(struct proxy) * engine_maxproxies ) ) == NULL )
-                error( "Failed to allocate memory for proxies." );
-            bzero( e->proxies , sizeof(struct proxy) * engine_maxproxies );
-            e->nr_proxies = 0;
-        #endif
-        }
-    
-    /* First of all, init the barrier and lock it. */
-    if ( pthread_mutex_init( &e->barrier_mutex , NULL ) != 0 )
-        error( "Failed to initialize barrier mutex." );
-    if ( pthread_cond_init( &e->barrier_cond , NULL ) != 0 )
-        error( "Failed to initialize barrier condition variable." );
-    if ( pthread_mutex_lock( &e->barrier_mutex ) != 0 )
-        error( "Failed to lock barrier mutex." );
-    e->barrier_running = 0;
-    e->barrier_launch = 0;
-    e->barrier_launchcount = 0;
-    
-    /* Run through the parts and get the minimum time step. */
-    e->dt_orig = dt;
-    for ( k = 0 ; k < s->nr_parts ; k++ )
-        if ( s->parts[k].dt < dt_min )
-            dt_min = s->parts[k].dt;
-    if ( dt_min == 0.0f )
-        dt = 0.0f;
-    else
-        while ( dt > dt_min )
-            dt *= 0.5f;
-    e->dt = dt;
-    
-    /* Init the scheduler. */
-    scheduler_init( &e->sched , e->s , nr_queues , scheduler_flag_steal , e->nodeID );
-    s->nr_queues = nr_queues;
-        
-    /* Append a kick1 task to each cell. */
-    scheduler_reset( &e->sched , s->tot_cells );
-    for ( k = 0 ; k < s->nr_cells ; k++ )
-        s->cells[k].kick1 = scheduler_addtask( &e->sched , task_type_kick1 , task_subtype_none , 0 , 0 , &s->cells[k] , NULL , 0 );
-    scheduler_ranktasks( &e->sched );
-    
-    /* Allocate and init the threads. */
-    if ( ( e->runners = (struct runner *)malloc( sizeof(struct runner) * nr_threads ) ) == NULL )
-        error( "Failed to allocate threads array." );
-    for ( k = 0 ; k < nr_threads ; k++ ) {
-        e->runners[k].id = k;
-        e->runners[k].e = e;
-        e->barrier_running += 1;
-        if ( pthread_create( &e->runners[k].thread , NULL , &runner_main , &e->runners[k] ) != 0 )
-            error( "Failed to create runner thread." );
-        if ( e->policy & engine_policy_setaffinity ) {
-            #if defined(HAVE_SETAFFINITY)
-
-                /* Set a reasonable queue ID. */
-                e->runners[k].cpuid = cpuid[ k % nr_cores ];
-                if ( nr_queues < nr_threads )
-                    e->runners[k].qid = cpuid[ k % nr_cores ] * nr_queues / nr_cores;
-                else
-                    e->runners[k].qid = k;
-
-                /* Set the cpu mask to zero | e->id. */
-                CPU_ZERO( &cpuset );
-                CPU_SET( cpuid[ k % nr_cores ] , &cpuset );
-
-                /* Apply this mask to the runner's pthread. */
-                if ( pthread_setaffinity_np( e->runners[k].thread , sizeof(cpu_set_t) , &cpuset ) != 0 )
-                    error( "Failed to set thread affinity." );
-
-            #else
-                error( "SWIFT was not compiled with affinity enabled." );
-            #endif
-            }
-        else {
-            e->runners[k].cpuid = k;
-            e->runners[k].qid = k * nr_queues / nr_threads;
-            }
-        // message( "runner %i on cpuid=%i with qid=%i." , e->runners[k].id , e->runners[k].cpuid , e->runners[k].qid );
-        }
-        
-    /* Wait for the runner threads to be in place. */
-    while ( e->barrier_running || e->barrier_launch )
-        if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 )
-            error( "Error while waiting for runner threads to get in place." );
-    
+#endif
+  }
+
+  /* Init the barrier and lock it; the wait at the end releases the lock. */
+  if (pthread_mutex_init(&e->barrier_mutex, NULL) != 0)
+    error("Failed to initialize barrier mutex.");
+  if (pthread_cond_init(&e->barrier_cond, NULL) != 0)
+    error("Failed to initialize barrier condition variable.");
+  if (pthread_mutex_lock(&e->barrier_mutex) != 0)
+    error("Failed to lock barrier mutex.");
+  e->barrier_running = 0;
+  e->barrier_launch = 0;
+  e->barrier_launchcount = 0;
+
+  /* Get the minimum part time step and halve dt until it is <= that. */
+  e->dt_orig = dt;
+  for (k = 0; k < s->nr_parts; k++)
+    if (s->parts[k].dt < dt_min) dt_min = s->parts[k].dt;
+  if (dt_min == 0.0f)
+    dt = 0.0f;
+  else
+    while (dt > dt_min) dt *= 0.5f;
+  e->dt = dt;
+
+  /* Init the scheduler. */
+  scheduler_init(&e->sched, e->s, nr_queues, scheduler_flag_steal, e->nodeID);
+  s->nr_queues = nr_queues;
+
+  /* Append a kick1 task to each cell. */
+  scheduler_reset(&e->sched, s->tot_cells);
+  for (k = 0; k < s->nr_cells; k++)
+    s->cells[k].kick1 =
+        scheduler_addtask(&e->sched, task_type_kick1, task_subtype_none, 0, 0,
+                          &s->cells[k], NULL, 0);
+  scheduler_ranktasks(&e->sched);
+
+  /* Allocate the runners and start one thread per runner. */
+  if ((e->runners =
+           (struct runner *)malloc(sizeof(struct runner) * nr_threads)) == NULL)
+    error("Failed to allocate threads array.");
+  for (k = 0; k < nr_threads; k++) {
+    e->runners[k].id = k;
+    e->runners[k].e = e;
+    e->barrier_running += 1;
+    if (pthread_create(&e->runners[k].thread, NULL, &runner_main,
+                       &e->runners[k]) != 0)
+      error("Failed to create runner thread.");
+    if (e->policy & engine_policy_setaffinity) {
+#if defined(HAVE_SETAFFINITY)
+
+      /* Set a reasonable queue ID. */
+      e->runners[k].cpuid = cpuid[k % nr_cores];
+      if (nr_queues < nr_threads)
+        e->runners[k].qid = cpuid[k % nr_cores] * nr_queues / nr_cores;
+      else
+        e->runners[k].qid = k;
+
+      /* Build a cpu mask containing only this runner's core. */
+      CPU_ZERO(&cpuset);
+      CPU_SET(cpuid[k % nr_cores], &cpuset);
+
+      /* Apply this mask to the runner's pthread. */
+      if (pthread_setaffinity_np(e->runners[k].thread, sizeof(cpu_set_t),
+                                 &cpuset) != 0)
+        error("Failed to set thread affinity.");
+
+#else
+      error("SWIFT was not compiled with affinity enabled.");
+#endif
+    } else {
+      e->runners[k].cpuid = k;
+      e->runners[k].qid = k * nr_queues / nr_threads;
     }
-    
-    
-    
+    // message( "runner %i on cpuid=%i with qid=%i." , e->runners[k].id ,
+    // e->runners[k].cpuid , e->runners[k].qid );
+  }
+
+  /* Wait for the runner threads to be in place. */
+  while (e->barrier_running || e->barrier_launch)
+    if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0)
+      error("Error while waiting for runner threads to get in place.");
+}
diff --git a/src/engine.h b/src/engine.h
index ba525ccc3ec4c20ef3d86361587be93be119f39b..caa286e7d3c518c0aba84fd9da1b6ff9ef6a78f4 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -1,135 +1,144 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_ENGINE_H
+#define SWIFT_ENGINE_H
 
+/* Some standard headers. */
+#include <pthread.h>
 
+/* Includes. */
+#include "lock.h"
+#include "proxy.h"
+#include "runner.h"
+#include "scheduler.h"
+#include "space.h"
+#include "task.h"
 
 /* Some constants. */
-#define engine_policy_none          0
-#define engine_policy_rand          1
-#define engine_policy_steal         2
-#define engine_policy_keep          4
-#define engine_policy_block         8
-#define engine_policy_fixdt         16
-#define engine_policy_multistep     32
-#define engine_policy_cputight      64
-#define engine_policy_mpi           128
-#define engine_policy_setaffinity   256
-
-#define engine_queue_scale          1.2
-#define engine_maxtaskspercell      128
-#define engine_maxproxies           64
-#define engine_tasksreweight        10
+#define engine_policy_none 0
+#define engine_policy_rand 1
+#define engine_policy_steal 2
+#define engine_policy_keep 4
+#define engine_policy_block 8
+#define engine_policy_fixdt 16
+#define engine_policy_multistep 32
+#define engine_policy_cputight 64
+#define engine_policy_mpi 128
+#define engine_policy_setaffinity 256
+#define engine_policy_paranoid 512
 
+#define engine_queue_scale 1.2
+#define engine_maxtaskspercell 128
+#define engine_maxproxies 64
+#define engine_tasksreweight 10
 
 /* The rank of the engine as a global variable (for messages). */
 extern int engine_rank;
 
-
 /* Mini struct to link cells to density/force tasks. */
 struct link {
 
-    /* The task pointer. */
-    struct task *t;
-    
-    /* The next pointer. */
-    struct link *next;
-    
-    };
+  /* The task pointer. */
+  struct task *t;
 
+  /* The next pointer. */
+  struct link *next;
+};
 
 /* Data structure for the engine. */
 struct engine {
 
-    /* Number of threads on which to run. */
-    int nr_threads;
-    
-    /* The space with which the runner is associated. */
-    struct space *s;
-    
-    /* The runner's threads. */
-    struct runner *runners;
-    
-    /* The running policy. */
-    int policy;
-    
-    /* The task scheduler. */
-    struct scheduler sched;
-    
-    /* The maximum dt to step (current). */
-    float dt_step;
-    
-    /* The minimum dt over all particles in the system. */
-    float dt_min, dt_max;
-    
-    /* The system time step. */
-    float dt, dt_orig;
-    
-    /* The system energies from the previous step. */
-    double ekin, epot;
-    
-    /* The current step number. */
-    int step, nullstep;
-    
-    /* The number of particles updated in the previous step. */
-    int count_step;
-    
-    /* The current system time. */
-    float time;
-    
-    /* Data for the threads' barrier. */
-    pthread_mutex_t barrier_mutex;
-    pthread_cond_t barrier_cond;
-    volatile int barrier_running, barrier_launch, barrier_launchcount;
-    
-    /* ID of the node this engine lives on. */
-    int nr_nodes, nodeID;
-    
-    /* Proxies for the other nodes in this simulation. */
-    struct proxy *proxies;
-    int nr_proxies, *proxy_ind;
-
-    /* Tic at the start of a step. */
-    ticks tic_step;
-    
-    /* Force the engine to rebuild? */
-    int forcerebuild, forcerepart;
-    
-    /* How many steps have we done with the same set of tasks? */
-    int tasks_age;
-    
-    /* Linked list for cell-task association. */
-    struct link *links;
-    int nr_links;
-    
-    };
+  /* Number of threads on which to run. */
+  int nr_threads;
+
+  /* The space with which the engine is associated. */
+  struct space *s;
+
+  /* The engine's runners, one per thread. */
+  struct runner *runners;
+
+  /* The running policy (bitwise OR of engine_policy_* flags). */
+  int policy;
+
+  /* The task scheduler. */
+  struct scheduler sched;
+
+  /* The maximum dt to step (current). */
+  float dt_step;
+
+  /* The minimum and maximum dt over all particles in the system. */
+  float dt_min, dt_max;
+
+  /* The system time step and the dt originally passed to engine_init. */
+  float dt, dt_orig;
+
+  /* The system energies from the previous step. */
+  double ekin, epot;
+
+  /* The current step number and the number of steps taken with dt == 0. */
+  int step, nullstep;
 
+  /* The number of particles updated in the previous step. */
+  int count_step;
+
+  /* The current system time. */
+  float time;
+
+  /* Data for the threads' barrier. */
+  pthread_mutex_t barrier_mutex;
+  pthread_cond_t barrier_cond;
+  volatile int barrier_running, barrier_launch, barrier_launchcount;
+
+  /* Number of nodes and ID of the node this engine lives on. */
+  int nr_nodes, nodeID;
+
+  /* Proxies for the other nodes, and each node's index into them (or -1). */
+  struct proxy *proxies;
+  int nr_proxies, *proxy_ind;
+
+  /* Tic at the start of a step. */
+  ticks tic_step;
+
+  /* Force the engine to rebuild / repartition? */
+  int forcerebuild, forcerepart;
+
+  /* How many steps have we done with the same set of tasks? */
+  int tasks_age;
+
+  /* Linked list for cell-task association. */
+  struct link *links;
+  int nr_links;
+};
 
 /* Function prototypes. */
-void engine_barrier( struct engine *e , int tid );
-void engine_init ( struct engine *e , struct space *s , float dt , int nr_threads , int nr_queues , int nr_nodes , int nodeID , int policy );
-void engine_prepare ( struct engine *e );
-void engine_step ( struct engine *e );
-void engine_maketasks ( struct engine *e );
-void engine_split ( struct engine *e , int *grid );
-int engine_exchange_strays ( struct engine *e , int offset , int *ind , int N );
-void engine_rebuild ( struct engine *e );
-void engine_repartition ( struct engine *e );
-void engine_makeproxies ( struct engine *e );
-void engine_redistribute ( struct engine *e );
-struct link *engine_addlink( struct engine *e , struct link *l , struct task *t );
+void engine_barrier(struct engine *e, int tid);
+void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
+                 int nr_queues, int nr_nodes, int nodeID, int policy);
+void engine_prepare(struct engine *e);
+void engine_step(struct engine *e);
+void engine_maketasks(struct engine *e);
+void engine_split(struct engine *e, int *grid);
+int engine_exchange_strays(struct engine *e, int offset, int *ind, int N);
+void engine_rebuild(struct engine *e);
+void engine_repartition(struct engine *e);
+void engine_makeproxies(struct engine *e);
+void engine_redistribute(struct engine *e);
+struct link *engine_addlink(struct engine *e, struct link *l, struct task *t);
+
+#endif /* SWIFT_ENGINE_H */
diff --git a/src/error.h b/src/error.h
index b41f9a38e237509d9bcecd7c9cdf093487f804cd..e581dcf86ecea9abbc0a116fb041175fd872758c 100644
--- a/src/error.h
+++ b/src/error.h
@@ -2,44 +2,63 @@
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_ERROR_H
+#define SWIFT_ERROR_H
 
+/* Some standard headers. */
 #include <stdio.h>
 
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
 
 /**
  * @brief Error macro. Prints the message given in argument and aborts.
  *
  */
 #ifdef WITH_MPI
-    extern int engine_rank;
-    #define error(s, ...) { fprintf( stderr , "[%03i] %s:%s():%i: " s "\n" , engine_rank , __FILE__ , __FUNCTION__ , __LINE__ , ##__VA_ARGS__ ); MPI_Abort(MPI_COMM_WORLD, -1); }
+extern int engine_rank;
+#define error(s, ...)                                                    \
+  {                                                                      \
+    fprintf(stderr, "[%03i] %s:%s():%i: " s "\n", engine_rank, __FILE__, \
+            __FUNCTION__, __LINE__, ##__VA_ARGS__);                      \
+    MPI_Abort(MPI_COMM_WORLD, -1);                                       \
+  }
 #else
-    #define error(s, ...) { fprintf( stderr , "%s:%s():%i: " s "\n" , __FILE__ , __FUNCTION__ , __LINE__ , ##__VA_ARGS__ ); abort(); }
+#define error(s, ...)                                                        \
+  {                                                                          \
+    fprintf(stderr, "%s:%s():%i: " s "\n", __FILE__, __FUNCTION__, __LINE__, \
+            ##__VA_ARGS__);                                                  \
+    abort();                                                                 \
+  }
 #endif
 
-
 /**
  * @brief Macro to print a localized message with variable arguments.
  *
  */
 #ifdef WITH_MPI
-    extern int engine_rank;
-    #define message(s, ...) printf( "[%03i] %s: " s "\n" , engine_rank , __FUNCTION__ , ##__VA_ARGS__ )
+extern int engine_rank;
+#define message(s, ...) \
+  printf("[%03i] %s: " s "\n", engine_rank, __FUNCTION__, ##__VA_ARGS__)
 #else
-    #define message(s, ...) printf( "%s: " s "\n" , __FUNCTION__ , ##__VA_ARGS__ )
+#define message(s, ...) printf("%s: " s "\n", __FUNCTION__, ##__VA_ARGS__)
 #endif
+
+#endif /* SWIFT_ERROR_H */
diff --git a/src/inline.h b/src/inline.h
index a9b3059fe7570a9b7bb67cc0d4b9f93181c19ccf..06728cb87f5e342b22d4a4a861cbd83ea6af31d9 100644
--- a/src/inline.h
+++ b/src/inline.h
@@ -2,29 +2,33 @@
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_INLINE_H
+#define SWIFT_INLINE_H
 
 /**
- * @brief Defines inline 
+ * @brief Defines inline
  */
 #ifndef INLINE
-# if __GNUC__ && !__GNUC_STDC_INLINE__
-#  define INLINE extern inline
-# else
-#  define INLINE inline
-# endif
+#if __GNUC__ && !__GNUC_STDC_INLINE__
+#define INLINE extern inline
+#else
+#define INLINE inline
 #endif
+#endif
+
+#endif /* SWIFT_INLINE_H */
diff --git a/src/kernel.h b/src/kernel.h
index c012739f300aeb5aeedd4b56798b00b2d7ed5cc9..0fc232597e1e9917d17f068407acc85b37659d42 100644
--- a/src/kernel.h
+++ b/src/kernel.h
@@ -2,24 +2,28 @@
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
-#ifndef KERNEL_H
-#define KERNEL_H 
+#ifndef SWIFT_KERNEL_H
+#define SWIFT_KERNEL_H
 
+/* Includes. */
+#include "const.h"
+#include "inline.h"
+#include "vector.h"
 
 /**
  * @file kernel.h
@@ -27,185 +31,194 @@
  *        as well as the blending function used for gravity.
  */
 
-#include "vector.h"
-
-/* Gravity kernel stuff ----------------------------------------------------------------------------------------------- */
+/* Gravity kernel stuff
+ * -----------------------------------------------------------------------------------------------
+ */
 
 /* The gravity kernel is defined as a degree 6 polynomial in the distance
    r. The resulting value should be post-multiplied with r^-3, resulting
    in a polynomial with terms ranging from r^-3 to r^3, which are
    sufficient to model both the direct potential as well as the splines
    near the origin. */
-   
+
 /* Coefficients for the gravity kernel. */
 #define kernel_grav_degree 6
 #define kernel_grav_ivals 2
-#define kernel_grav_scale (2*const_iepsilon)
-static float kernel_grav_coeffs[ (kernel_grav_degree+1) * (kernel_grav_ivals+1) ] =
-    { 32.0f*const_iepsilon6 , -192.0f/5.0f*const_iepsilon5 , 0.0f , 32.0f/3.0f*const_iepsilon3 , 0.0f , 0.0f , 0.0f ,
-      -32.0f/3.0f*const_iepsilon6 , 192.0f/5.0f*const_iepsilon5 , -48.0f*const_iepsilon4 , 64.0f/3.0f*const_iepsilon3 , 0.0f , 0.0f , -1.0f/15.0f ,
-      0.0f , 0.0f , 0.0f , 0.0f , 0.0f , 0.0f , 1.0f };
-
+#define kernel_grav_scale (2 * const_iepsilon)
+static float kernel_grav_coeffs
+    [(kernel_grav_degree + 1) * (kernel_grav_ivals + 1)] = {
+        32.0f * const_iepsilon6,         -192.0f / 5.0f * const_iepsilon5,
+        0.0f,                            32.0f / 3.0f * const_iepsilon3,
+        0.0f,                            0.0f,
+        0.0f,                            -32.0f / 3.0f * const_iepsilon6,
+        192.0f / 5.0f * const_iepsilon5, -48.0f * const_iepsilon4,
+        64.0f / 3.0f * const_iepsilon3,  0.0f,
+        0.0f,                            -1.0f / 15.0f,
+        0.0f,                            0.0f,
+        0.0f,                            0.0f,
+        0.0f,                            0.0f,
+        1.0f};
 
 /**
  * @brief Computes the gravity cubic spline for a given distance x.
  */
 
-__attribute__ ((always_inline)) INLINE static void kernel_grav_eval ( float x , float *W ) {
-    int ind = fmin( x*kernel_grav_scale , kernel_grav_ivals );
-    float *coeffs = &kernel_grav_coeffs[ ind*(kernel_grav_degree + 1) ];
-    float w = coeffs[0]*x + coeffs[1];
-    for ( int k = 2 ; k <= kernel_grav_degree ; k++ )
-        w = x*w + coeffs[k];
-    *W = w;
-    }
-
+__attribute__((always_inline)) INLINE static void kernel_grav_eval(float x,
+                                                                   float *W) {
+  int ind = fmin(x * kernel_grav_scale, kernel_grav_ivals);
+  float *coeffs = &kernel_grav_coeffs[ind * (kernel_grav_degree + 1)];
+  float w = coeffs[0] * x + coeffs[1];
+  for (int k = 2; k <= kernel_grav_degree; k++) w = x * w + coeffs[k];
+  *W = w;
+}
 
 #ifdef VECTORIZE
 
 /**
- * @brief Computes the gravity cubic spline for a given distance x (Vectorized version).
+ * @brief Computes the gravity cubic spline for a given distance x (Vectorized
+ * version).
  */
 
-__attribute__ ((always_inline)) INLINE static void kernel_grav_eval_vec ( vector *x , vector *w ) {
-    
-    vector ind, c[kernel_grav_degree+1];
-    int j, k;
-    
-    /* Load x and get the interval id. */
-    ind.m = vec_ftoi( vec_fmin( x->v*vec_set1( kernel_grav_scale ) , vec_set1( (float)kernel_grav_ivals ) ) );
-    
-    /* load the coefficients. */
-    for ( k = 0 ; k < VEC_SIZE ; k++ )
-        for ( j = 0 ; j < kernel_grav_degree+1 ; j++ )
-            c[j].f[k] = kernel_grav_coeffs[ ind.i[k]*(kernel_grav_degree + 1) + j ];
-
-    /* Init the iteration for Horner's scheme. */
-    w->v = ( c[0].v * x->v ) + c[1].v;
-    
-    /* And we're off! */
-    for ( int k = 2 ; k <= kernel_grav_degree ; k++ )
-        w->v = ( x->v * w->v ) + c[k].v;
-        
-    }
-    
-    
-#endif
+__attribute__((always_inline))
+    INLINE static void kernel_grav_eval_vec(vector *x, vector *w) {
+
+  vector ind, c[kernel_grav_degree + 1];
+  int j, k;
+
+  /* Load x and get the interval id. */
+  ind.m = vec_ftoi(vec_fmin(x->v * vec_set1(kernel_grav_scale),
+                            vec_set1((float)kernel_grav_ivals)));
 
+  /* load the coefficients. */
+  for (k = 0; k < VEC_SIZE; k++)
+    for (j = 0; j < kernel_grav_degree + 1; j++)
+      c[j].f[k] = kernel_grav_coeffs[ind.i[k] * (kernel_grav_degree + 1) + j];
 
-/* Blending function stuff -------------------------------------------------------------------------------------------- */
+  /* Init the iteration for Horner's scheme. */
+  w->v = (c[0].v * x->v) + c[1].v;
+
+  /* And we're off! */
+  for (int k = 2; k <= kernel_grav_degree; k++) w->v = (x->v * w->v) + c[k].v;
+}
+
+#endif
+
+/* Blending function stuff
+ * --------------------------------------------------------------------------------------------
+ */
 
 /* Coefficients for the blending function. */
 #define blender_degree 3
 #define blender_ivals 3
 #define blender_scale 4.0f
-static float blender_coeffs[ (blender_degree+1) * (blender_ivals+1) ] =
-    { 0.0f , 0.0f , 0.0f , 1.0f ,
-      -32.0f , 24.0f , -6.0f , 1.5f , 
-      -32.0f , 72.0f , -54.0f , 13.5f ,
-      0.0f , 0.0f , 0.0f , 0.0f };
-      
-      
+static float blender_coeffs[(blender_degree + 1) * (blender_ivals + 1)] = {
+    0.0f,   0.0f,  0.0f,   1.0f,  -32.0f, 24.0f, -6.0f, 1.5f,
+    -32.0f, 72.0f, -54.0f, 13.5f, 0.0f,   0.0f,  0.0f,  0.0f};
+
 /**
  * @brief Computes the cubic spline blender for a given distance x.
  */
 
-__attribute__ ((always_inline)) INLINE static void blender_eval ( float x , float *W ) {
-    int ind = fmin( x*blender_scale , blender_ivals );
-    float *coeffs = &blender_coeffs[ ind*(blender_degree + 1) ];
-    float w = coeffs[0]*x + coeffs[1];
-    for ( int k = 2 ; k <= blender_degree ; k++ )
-        w = x*w + coeffs[k];
-    *W = w;
-    }
-
+__attribute__((always_inline)) INLINE static void blender_eval(float x,
+                                                               float *W) {
+  int ind = fmin(x * blender_scale, blender_ivals);
+  float *coeffs = &blender_coeffs[ind * (blender_degree + 1)];
+  float w = coeffs[0] * x + coeffs[1];
+  for (int k = 2; k <= blender_degree; k++) w = x * w + coeffs[k];
+  *W = w;
+}
 
 /**
- * @brief Computes the cubic spline blender and its derivative for a given distance x.
+ * @brief Computes the cubic spline blender and its derivative for a given
+ * distance x.
  */
 
-__attribute__ ((always_inline)) INLINE static void blender_deval ( float x , float *W , float *dW_dx ) {
-    int ind = fminf( x*blender_scale , blender_ivals );
-    float *coeffs = &blender_coeffs[ ind*(blender_degree + 1) ];
-    float w = coeffs[0]*x + coeffs[1];
-    float dw_dx = coeffs[0];
-    for ( int k = 2 ; k <= blender_degree ; k++ ) {
-        dw_dx = dw_dx*x + w;
-        w = x*w + coeffs[k];
-        }
-    *W = w;
-    *dW_dx = dw_dx;
-    }
-
+__attribute__((always_inline)) INLINE static void blender_deval(float x,
+                                                                float *W,
+                                                                float *dW_dx) {
+  int ind = fminf(x * blender_scale, blender_ivals);
+  float *coeffs = &blender_coeffs[ind * (blender_degree + 1)];
+  float w = coeffs[0] * x + coeffs[1];
+  float dw_dx = coeffs[0];
+  for (int k = 2; k <= blender_degree; k++) {
+    dw_dx = dw_dx * x + w;
+    w = x * w + coeffs[k];
+  }
+  *W = w;
+  *dW_dx = dw_dx;
+}
 
 #ifdef VECTORIZE
 
 /**
- * @brief Computes the cubic spline blender and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<2.
+ * @brief Computes the cubic spline blender for a given distance x
+ * (Vectorized version). Gives a sensible answer only if x<2.
  */
 
-__attribute__ ((always_inline)) INLINE static void blender_eval_vec ( vector *x , vector *w ) {
-    
-    vector ind, c[blender_degree+1];
-    int j, k;
-    
-    /* Load x and get the interval id. */
-    ind.m = vec_ftoi( vec_fmin( x->v*vec_set1( blender_scale ) , vec_set1( (float)blender_ivals ) ) );
-    
-    /* load the coefficients. */
-    for ( k = 0 ; k < VEC_SIZE ; k++ )
-        for ( j = 0 ; j < blender_degree+1 ; j++ )
-            c[j].f[k] = blender_coeffs[ ind.i[k]*(blender_degree + 1) + j ];
-
-    /* Init the iteration for Horner's scheme. */
-    w->v = ( c[0].v * x->v ) + c[1].v;
-    
-    /* And we're off! */
-    for ( int k = 2 ; k <= blender_degree ; k++ )
-        w->v = ( x->v * w->v ) + c[k].v;
-        
-    }
-    
-    
+__attribute__((always_inline)) INLINE static void blender_eval_vec(vector *x,
+                                                                   vector *w) {
+
+  vector ind, c[blender_degree + 1];
+  int j, k;
+
+  /* Load x and get the interval id. */
+  ind.m = vec_ftoi(
+      vec_fmin(x->v * vec_set1(blender_scale), vec_set1((float)blender_ivals)));
+
+  /* load the coefficients. */
+  for (k = 0; k < VEC_SIZE; k++)
+    for (j = 0; j < blender_degree + 1; j++)
+      c[j].f[k] = blender_coeffs[ind.i[k] * (blender_degree + 1) + j];
+
+  /* Init the iteration for Horner's scheme. */
+  w->v = (c[0].v * x->v) + c[1].v;
+
+  /* And we're off! */
+  for (int k = 2; k <= blender_degree; k++) w->v = (x->v * w->v) + c[k].v;
+}
+
 /**
- * @brief Computes the cubic spline blender and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<2.
+ * @brief Computes the cubic spline blender and its derivative for a given
+ * distance x (Vectorized version). Gives a sensible answer only if x<2.
  */
 
-__attribute__ ((always_inline)) INLINE static void blender_deval_vec ( vector *x , vector *w , vector *dw_dx ) {
-    
-    vector ind, c[blender_degree+1];
-    int j, k;
-    
-    /* Load x and get the interval id. */
-    ind.m = vec_ftoi( vec_fmin( x->v*vec_set1( blender_scale ) , vec_set1( (float)blender_ivals ) ) );
-    
-    /* load the coefficients. */
-    for ( k = 0 ; k < VEC_SIZE ; k++ )
-        for ( j = 0 ; j < blender_degree+1 ; j++ )
-            c[j].f[k] = blender_coeffs[ ind.i[k]*(blender_degree + 1) + j ];
-
-    /* Init the iteration for Horner's scheme. */
-    w->v = ( c[0].v * x->v ) + c[1].v;
-    dw_dx->v = c[0].v;
-    
-    /* And we're off! */
-    for ( int k = 2 ; k <= blender_degree ; k++ ) {
-        dw_dx->v = ( dw_dx->v * x->v ) + w->v;
-        w->v = ( x->v * w->v ) + c[k].v;
-        }
-        
-    }
-    
-#endif
+__attribute__((always_inline))
+    INLINE static void blender_deval_vec(vector *x, vector *w, vector *dw_dx) {
 
+  vector ind, c[blender_degree + 1];
+  int j, k;
 
-/* -------------------------------------------------------------------------------------------------------------------- */
+  /* Load x and get the interval id. */
+  ind.m = vec_ftoi(
+      vec_fmin(x->v * vec_set1(blender_scale), vec_set1((float)blender_ivals)));
+
+  /* load the coefficients. */
+  for (k = 0; k < VEC_SIZE; k++)
+    for (j = 0; j < blender_degree + 1; j++)
+      c[j].f[k] = blender_coeffs[ind.i[k] * (blender_degree + 1) + j];
+
+  /* Init the iteration for Horner's scheme. */
+  w->v = (c[0].v * x->v) + c[1].v;
+  dw_dx->v = c[0].v;
+
+  /* And we're off! */
+  for (int k = 2; k <= blender_degree; k++) {
+    dw_dx->v = (dw_dx->v * x->v) + w->v;
+    w->v = (x->v * w->v) + c[k].v;
+  }
+}
+
+#endif
+
+/* --------------------------------------------------------------------------------------------------------------------
+ */
 
 #if defined(CUBIC_SPLINE_KERNEL)
 
-/* -------------------------------------------------------------------------------------------------------------------- */
+/* --------------------------------------------------------------------------------------------------------------------
+ */
 
-/* Coefficients for the kernel. */ 
+/* Coefficients for the kernel. */
 #define kernel_name "Cubic spline"
 #define kernel_degree 3
 #define kernel_ivals 2
@@ -213,89 +226,94 @@ __attribute__ ((always_inline)) INLINE static void blender_deval_vec ( vector *x
 #define kernel_gamma2 4.0f
 #define kernel_gamma3 8.0f
 #define kernel_igamma 0.5f
-#define kernel_nwneigh ( 4.0/3.0*M_PI*const_eta_kernel*const_eta_kernel*const_eta_kernel*6.0858f ) 
-static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] __attribute__ ((aligned (16))) =
-    { 3.0/4.0*M_1_PI , -3.0/2.0*M_1_PI , 0.0 , M_1_PI , 
-      -0.25*M_1_PI , 3.0/2.0*M_1_PI , -3.0*M_1_PI , M_2_PI , 
-      0.0 , 0.0 , 0.0 , 0.0 };
-#define kernel_root ( kernel_coeffs[ kernel_degree ] )
-#define kernel_wroot ( 4.0/3.0*M_PI*kernel_coeffs[ kernel_degree ] )
-
-      
+#define kernel_nwneigh                                                      \
+  (4.0 / 3.0 * M_PI *const_eta_kernel *const_eta_kernel *const_eta_kernel * \
+   6.0858f)
+static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
+    __attribute__((aligned(16))) = {
+        3.0 / 4.0 * M_1_PI, -3.0 / 2.0 * M_1_PI, 0.0,           M_1_PI,
+        -0.25 * M_1_PI,     3.0 / 2.0 * M_1_PI,  -3.0 * M_1_PI, M_2_PI,
+        0.0,                0.0,                 0.0,           0.0};
+#define kernel_root (kernel_coeffs[kernel_degree])
+#define kernel_wroot (4.0 / 3.0 * M_PI *kernel_coeffs[kernel_degree])
+
 /**
- * @brief Computes the cubic spline kernel and its derivative for a given distance x. Gives a sensible answer only if x<2.
+ * @brief Computes the cubic spline kernel and its derivative for a given
+ * distance x. Gives a sensible answer only if x<2.
  */
 
-__attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , float *W , float *dW_dx ) {
-    int ind = fminf( x , kernel_ivals );
-    float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ];
-    float w = coeffs[0]*x + coeffs[1];
-    float dw_dx = coeffs[0];
-    for ( int k = 2 ; k <= kernel_degree ; k++ ) {
-        dw_dx = dw_dx*x + w;
-        w = x*w + coeffs[k];
-        }
-    *W = w;
-    *dW_dx = dw_dx;
-    }
-
+__attribute__((always_inline)) INLINE static void kernel_deval(float x,
+                                                               float *W,
+                                                               float *dW_dx) {
+  int ind = fminf(x, kernel_ivals);
+  float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+  float w = coeffs[0] * x + coeffs[1];
+  float dw_dx = coeffs[0];
+  for (int k = 2; k <= kernel_degree; k++) {
+    dw_dx = dw_dx * x + w;
+    w = x * w + coeffs[k];
+  }
+  *W = w;
+  *dW_dx = dw_dx;
+}
 
 #ifdef VECTORIZE
 
 /**
- * @brief Computes the cubic spline kernel and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<2.
+ * @brief Computes the cubic spline kernel and its derivative for a given
+ * distance x (Vectorized version). Gives a sensible answer only if x<2.
  */
 
-__attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x , vector *w , vector *dw_dx ) {
-    
-    vector ind, c[kernel_degree+1];
-    int j, k;
-    
-    /* Load x and get the interval id. */
-    ind.m = vec_ftoi( vec_fmin( x->v , vec_set1( (float)kernel_ivals ) ) );
-    
-    /* load the coefficients. */
-    for ( k = 0 ; k < VEC_SIZE ; k++ )
-        for ( j = 0 ; j < kernel_degree+1 ; j++ )
-            c[j].f[k] = kernel_coeffs[ ind.i[k]*(kernel_degree + 1) + j ];
-
-    /* Init the iteration for Horner's scheme. */
-    w->v = ( c[0].v * x->v ) + c[1].v;
-    dw_dx->v = c[0].v;
-    
-    /* And we're off! */
-    for ( int k = 2 ; k <= kernel_degree ; k++ ) {
-        dw_dx->v = ( dw_dx->v * x->v ) + w->v;
-        w->v = ( x->v * w->v ) + c[k].v;
-        }
-        
-    }
-    
-#endif
+__attribute__((always_inline))
+    INLINE static void kernel_deval_vec(vector *x, vector *w, vector *dw_dx) {
 
+  vector ind, c[kernel_degree + 1];
+  int j, k;
 
-/**
- * @brief Computes the cubic spline kernel for a given distance x. Gives a sensible answer only if x<2.
- */
+  /* Load x and get the interval id. */
+  ind.m = vec_ftoi(vec_fmin(x->v, vec_set1((float)kernel_ivals)));
 
-__attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float *W ) {
-    int ind = fmin( x , kernel_ivals );
-    float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ];
-    float w = coeffs[0]*x + coeffs[1];
-    for ( int k = 2 ; k <= kernel_degree ; k++ )
-        w = x*w + coeffs[k];
-    *W = w;
-    }
+  /* load the coefficients. */
+  for (k = 0; k < VEC_SIZE; k++)
+    for (j = 0; j < kernel_degree + 1; j++)
+      c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
 
+  /* Init the iteration for Horner's scheme. */
+  w->v = (c[0].v * x->v) + c[1].v;
+  dw_dx->v = c[0].v;
 
+  /* And we're off! */
+  for (int k = 2; k <= kernel_degree; k++) {
+    dw_dx->v = (dw_dx->v * x->v) + w->v;
+    w->v = (x->v * w->v) + c[k].v;
+  }
+}
 
-/* -------------------------------------------------------------------------------------------------------------------- */
+#endif
+
+/**
+ * @brief Computes the cubic spline kernel for a given distance x. Gives a
+ * sensible answer only if x<2.
+ */
+
+__attribute__((always_inline)) INLINE static void kernel_eval(float x,
+                                                              float *W) {
+  int ind = fmin(x, kernel_ivals);
+  float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+  float w = coeffs[0] * x + coeffs[1];
+  for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k];
+  *W = w;
+}
+
+/* --------------------------------------------------------------------------------------------------------------------
+ */
 
 #elif defined(QUARTIC_SPLINE_KERNEL)
 
-/* -------------------------------------------------------------------------------------------------------------------- */
+/* --------------------------------------------------------------------------------------------------------------------
+ */
 
-/* Coefficients for the kernel. */ 
+/* Coefficients for the kernel. */
 #define kernel_name "Quartic spline"
 #define kernel_degree 4
 #define kernel_ivals 3
@@ -303,186 +321,198 @@ __attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float
 #define kernel_gamma2 6.25f
 #define kernel_gamma3 15.625f
 #define kernel_igamma 0.4f
-#define kernel_nwneigh ( 4.0/3.0*M_PI*const_eta_kernel*const_eta_kernel*const_eta_kernel*8.2293f )
-static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] __attribute__ ((aligned (16))) =
-  { 3.0/10.0*M_1_PI , 0.0  , -3.0/4.0*M_1_PI , 0.0 , 23.0/32.0*M_1_PI , 
-    -1.0/5.0*M_1_PI , M_1_PI , -3.0/2.0*M_1_PI , 0.25*M_1_PI , 11.0/16.0*M_1_PI ,
-    1.0/20.0*M_1_PI , -0.5*M_1_PI , 15.0/8.0*M_1_PI , -25.0/8.0*M_1_PI , 125.0/64.0*M_1_PI ,
-    0.0 , 0.0 , 0.0 , 0.0 , 0.0 };
-#define kernel_root ( kernel_coeffs[ kernel_degree ] )
-#define kernel_wroot ( 4.0/3.0*M_PI*kernel_coeffs[ kernel_degree ] )
-      
-      
+#define kernel_nwneigh                                                      \
+  (4.0 / 3.0 * M_PI *const_eta_kernel *const_eta_kernel *const_eta_kernel * \
+   8.2293f)
+static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
+    __attribute__((aligned(16))) = {
+        3.0 / 10.0 * M_1_PI,  0.0,                  -3.0 / 4.0 * M_1_PI,
+        0.0,                  23.0 / 32.0 * M_1_PI, -1.0 / 5.0 * M_1_PI,
+        M_1_PI,               -3.0 / 2.0 * M_1_PI,  0.25 * M_1_PI,
+        11.0 / 16.0 * M_1_PI, 1.0 / 20.0 * M_1_PI,  -0.5 * M_1_PI,
+        15.0 / 8.0 * M_1_PI,  -25.0 / 8.0 * M_1_PI, 125.0 / 64.0 * M_1_PI,
+        0.0,                  0.0,                  0.0,
+        0.0,                  0.0};
+#define kernel_root (kernel_coeffs[kernel_degree])
+#define kernel_wroot (4.0 / 3.0 * M_PI *kernel_coeffs[kernel_degree])
+
 /**
- * @brief Computes the quartic spline kernel and its derivative for a given distance x. Gives a sensible answer only if x<2.5
+ * @brief Computes the quartic spline kernel and its derivative for a given
+ * distance x. Gives a sensible answer only if x<2.5
  */
 
-__attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , float *W , float *dW_dx ) {
-    int ind = fminf( x + 0.5, kernel_ivals);
-    float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ];
-    float w = coeffs[0]*x + coeffs[1];
-    float dw_dx = coeffs[0];
-    for ( int k = 2 ; k <= kernel_degree ; k++ ) {
-        dw_dx = dw_dx*x + w;
-        w = x*w + coeffs[k];
-        }
-    *W = w;
-    *dW_dx = dw_dx;
-    }
-
+__attribute__((always_inline)) INLINE static void kernel_deval(float x,
+                                                               float *W,
+                                                               float *dW_dx) {
+  int ind = fminf(x + 0.5, kernel_ivals);
+  float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+  float w = coeffs[0] * x + coeffs[1];
+  float dw_dx = coeffs[0];
+  for (int k = 2; k <= kernel_degree; k++) {
+    dw_dx = dw_dx * x + w;
+    w = x * w + coeffs[k];
+  }
+  *W = w;
+  *dW_dx = dw_dx;
+}
 
 #ifdef VECTORIZE
 
 /**
- * @brief Computes the quartic spline kernel and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<2.5
+ * @brief Computes the quartic spline kernel and its derivative for a given
+ * distance x (Vectorized version). Gives a sensible answer only if x<2.5
  */
 
-__attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x , vector *w , vector *dw_dx ) {
-    
-    vector ind, c[kernel_degree+1];
-    int j, k;
-    
-    /* Load x and get the interval id. */
-    ind.m = vec_ftoi( vec_fmin( x->v + 0.5f, vec_set1( (float)kernel_ivals ) ) );
-    
-    /* load the coefficients. */
-    for ( k = 0 ; k < VEC_SIZE ; k++ )
-        for ( j = 0 ; j < kernel_degree+1 ; j++ )
-            c[j].f[k] = kernel_coeffs[ ind.i[k]*(kernel_degree + 1) + j ];
-
-    /* Init the iteration for Horner's scheme. */
-    w->v = ( c[0].v * x->v ) + c[1].v;
-    dw_dx->v = c[0].v;
-    
-    /* And we're off! */
-    for ( int k = 2 ; k <= kernel_degree ; k++ ) {
-        dw_dx->v = ( dw_dx->v * x->v ) + w->v;
-        w->v = ( x->v * w->v ) + c[k].v;
-        }
-        
-    }
-    
-#endif
+__attribute__((always_inline))
+    INLINE static void kernel_deval_vec(vector *x, vector *w, vector *dw_dx) {
 
-/**
- * @brief Computes the quartic spline kernel for a given distance x. Gives a sensible answer only if x<2.5
- */
+  vector ind, c[kernel_degree + 1];
+  int j, k;
 
-__attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float *W ) {
-    int ind = fmin( x + 0.5f, kernel_ivals );
-    float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ];
-    float w = coeffs[0]*x + coeffs[1];
-    for ( int k = 2 ; k <= kernel_degree ; k++ )
-        w = x*w + coeffs[k];
-    *W = w;
-    }
+  /* Load x and get the interval id. */
+  ind.m = vec_ftoi(vec_fmin(x->v + 0.5f, vec_set1((float)kernel_ivals)));
 
+  /* load the coefficients. */
+  for (k = 0; k < VEC_SIZE; k++)
+    for (j = 0; j < kernel_degree + 1; j++)
+      c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
 
+  /* Init the iteration for Horner's scheme. */
+  w->v = (c[0].v * x->v) + c[1].v;
+  dw_dx->v = c[0].v;
 
+  /* And we're off! */
+  for (int k = 2; k <= kernel_degree; k++) {
+    dw_dx->v = (dw_dx->v * x->v) + w->v;
+    w->v = (x->v * w->v) + c[k].v;
+  }
+}
 
+#endif
 
+/**
+ * @brief Computes the quartic spline kernel for a given distance x. Gives a
+ * sensible answer only if x<2.5
+ */
 
+__attribute__((always_inline)) INLINE static void kernel_eval(float x,
+                                                              float *W) {
+  int ind = fmin(x + 0.5f, kernel_ivals);
+  float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+  float w = coeffs[0] * x + coeffs[1];
+  for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k];
+  *W = w;
+}
 
-/* -------------------------------------------------------------------------------------------------------------------- */
+/* --------------------------------------------------------------------------------------------------------------------
+ */
 
 #elif defined(QUINTIC_SPLINE_KERNEL)
 
-/* -------------------------------------------------------------------------------------------------------------------- */
+/* --------------------------------------------------------------------------------------------------------------------
+ */
 
-/* Coefficients for the kernel. */ 
+/* Coefficients for the kernel. */
 #define kernel_name "Quintic spline"
 #define kernel_degree 5
 #define kernel_ivals 3
 #define kernel_gamma 3.f
 #define kernel_gamma2 9.f
 #define kernel_gamma3 27.f
-#define kernel_igamma 1.0f/3.0f
-#define kernel_nwneigh ( 4.0/3.0*M_PI*const_eta_kernel*const_eta_kernel*const_eta_kernel*10.5868f )
-static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] __attribute__ ((aligned (16))) =
-{ -1.0/12.0*M_1_PI  ,  1.0/4.0*M_1_PI ,  0.0            , -1.0/2.0*M_1_PI ,  0.0             , 11.0/20.0*M_1_PI,
-  1.0/24.0*M_1_PI  , -3.0/8.0*M_1_PI ,  5.0/4.0*M_1_PI , -7.0/4.0*M_1_PI ,   5.0/8.0*M_1_PI , 17.0/40.0*M_1_PI ,
-  -1.0/120.0*M_1_PI ,  1.0/8.0*M_1_PI , -3.0/4.0*M_1_PI ,  9.0/4.0*M_1_PI , -27.0/8.0*M_1_PI , 81.0/40.0*M_1_PI,
-  0.0              , 0.0             , 0.0             , 0.0             ,   0.0            , 0.0};
-#define kernel_root ( kernel_coeffs[ kernel_degree ] )
-#define kernel_wroot ( 4.0/3.0*M_PI*kernel_coeffs[ kernel_degree ] )
-      
-      
+#define kernel_igamma (1.0f / 3.0f) /* Inverse of kernel_gamma. */
+#define kernel_nwneigh                                                      \
+  (4.0 / 3.0 * M_PI *const_eta_kernel *const_eta_kernel *const_eta_kernel * \
+   10.5868f)
+static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
+    __attribute__((aligned(16))) = {
+        -1.0 / 12.0 * M_1_PI,  1.0 / 4.0 * M_1_PI,   0.0,
+        -1.0 / 2.0 * M_1_PI,   0.0,                  11.0 / 20.0 * M_1_PI,
+        1.0 / 24.0 * M_1_PI,   -3.0 / 8.0 * M_1_PI,  5.0 / 4.0 * M_1_PI,
+        -7.0 / 4.0 * M_1_PI,   5.0 / 8.0 * M_1_PI,   17.0 / 40.0 * M_1_PI,
+        -1.0 / 120.0 * M_1_PI, 1.0 / 8.0 * M_1_PI,   -3.0 / 4.0 * M_1_PI,
+        9.0 / 4.0 * M_1_PI,    -27.0 / 8.0 * M_1_PI, 81.0 / 40.0 * M_1_PI,
+        0.0,                   0.0,                  0.0,
+        0.0,                   0.0,                  0.0};
+#define kernel_root (kernel_coeffs[kernel_degree])
+#define kernel_wroot (4.0 / 3.0 * M_PI *kernel_coeffs[kernel_degree])
+
 /**
- * @brief Computes the quintic spline kernel and its derivative for a given distance x. Gives a sensible answer only if x<3.
+ * @brief Computes the quintic spline kernel and its derivative for a given
+ * distance x. Gives a sensible answer only if x<3.
  */
 
-__attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , float *W , float *dW_dx ) {
-    int ind = fminf( x, kernel_ivals);
-    float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ];
-    float w = coeffs[0]*x + coeffs[1];
-    float dw_dx = coeffs[0];
-    for ( int k = 2 ; k <= kernel_degree ; k++ ) {
-        dw_dx = dw_dx*x + w;
-        w = x*w + coeffs[k];
-        }
-    *W = w;
-    *dW_dx = dw_dx;
-    }
-
+__attribute__((always_inline)) INLINE static void kernel_deval(float x,
+                                                               float *W,
+                                                               float *dW_dx) {
+  int ind = fminf(x, kernel_ivals);
+  float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+  float w = coeffs[0] * x + coeffs[1];
+  float dw_dx = coeffs[0];
+  for (int k = 2; k <= kernel_degree; k++) {
+    dw_dx = dw_dx * x + w;
+    w = x * w + coeffs[k];
+  }
+  *W = w;
+  *dW_dx = dw_dx;
+}
 
 #ifdef VECTORIZE
 
 /**
- * @brief Computes the quintic spline kernel and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<3.
+ * @brief Computes the quintic spline kernel and its derivative for a given
+ * distance x (Vectorized version). Gives a sensible answer only if x<3.
  */
 
-__attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x , vector *w , vector *dw_dx ) {
-    
-    vector ind, c[kernel_degree+1];
-    int j, k;
-    
-    /* Load x and get the interval id. */
-    ind.m = vec_ftoi( vec_fmin( x->v, vec_set1( (float)kernel_ivals ) ) );
-    
-    /* load the coefficients. */
-    for ( k = 0 ; k < VEC_SIZE ; k++ )
-        for ( j = 0 ; j < kernel_degree+1 ; j++ )
-            c[j].f[k] = kernel_coeffs[ ind.i[k]*(kernel_degree + 1) + j ];
-
-    /* Init the iteration for Horner's scheme. */
-    w->v = ( c[0].v * x->v ) + c[1].v;
-    dw_dx->v = c[0].v;
-    
-    /* And we're off! */
-    for ( int k = 2 ; k <= kernel_degree ; k++ ) {
-        dw_dx->v = ( dw_dx->v * x->v ) + w->v;
-        w->v = ( x->v * w->v ) + c[k].v;
-        }
-        
-    }
-    
-#endif
+__attribute__((always_inline))
+    INLINE static void kernel_deval_vec(vector *x, vector *w, vector *dw_dx) {
 
-/**
- * @brief Computes the quintic spline kernel for a given distance x. Gives a sensible answer only if x<3.
- */
+  vector ind, c[kernel_degree + 1];
+  int j, k;
 
-__attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float *W ) {
-    int ind = fmin( x, kernel_ivals );
-    float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ];
-    float w = coeffs[0]*x + coeffs[1];
-    for ( int k = 2 ; k <= kernel_degree ; k++ )
-        w = x*w + coeffs[k];
-    *W = w;
-    }
+  /* Load x and get the interval id. */
+  ind.m = vec_ftoi(vec_fmin(x->v, vec_set1((float)kernel_ivals)));
 
+  /* load the coefficients. */
+  for (k = 0; k < VEC_SIZE; k++)
+    for (j = 0; j < kernel_degree + 1; j++)
+      c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
 
+  /* Init the iteration for Horner's scheme. */
+  w->v = (c[0].v * x->v) + c[1].v;
+  dw_dx->v = c[0].v;
 
+  /* And we're off! */
+  for (int k = 2; k <= kernel_degree; k++) {
+    dw_dx->v = (dw_dx->v * x->v) + w->v;
+    w->v = (x->v * w->v) + c[k].v;
+  }
+}
+
+#endif
 
+/**
+ * @brief Computes the quintic spline kernel for a given distance x. Gives a
+ * sensible answer only if x<3.
+ */
 
+__attribute__((always_inline)) INLINE static void kernel_eval(float x,
+                                                              float *W) {
+  int ind = fminf(x, kernel_ivals); /* interval id, clamped to the zero row */
+  float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+  float w = coeffs[0] * x + coeffs[1]; /* first Horner step */
+  for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k];
+  *W = w;
+}
 
-/* -------------------------------------------------------------------------------------------------------------------- */
+/* --------------------------------------------------------------------------------------------------------------------
+ */
 
 #elif defined(WENDLAND_C2_KERNEL)
 
-/* -------------------------------------------------------------------------------------------------------------------- */
+/* --------------------------------------------------------------------------------------------------------------------
+ */
 
-/* Coefficients for the kernel. */ 
+/* Coefficients for the kernel. */
 #define kernel_name "Wendland C2"
 #define kernel_degree 5
 #define kernel_ivals 1
@@ -490,92 +520,93 @@ __attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float
 #define kernel_gamma2 1.f
 #define kernel_gamma3 1.f
 #define kernel_igamma 1.f
-#define kernel_nwneigh ( 4.0/3.0*M_PI*const_eta_kernel*const_eta_kernel*const_eta_kernel*7.261825f )
-static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] __attribute__ ((aligned (16))) =
-{  4.0f             , -15.0f           , 20.0f            , -10.0f           , 0.0f            , 1.0f,
-  0.0f             , 0.0f             , 0.0f             , 0.0f            , 0.0f            , 0.0f};
-#define kernel_root ( kernel_coeffs[ kernel_degree ] )
-#define kernel_wroot ( 4.0/3.0*M_PI*kernel_coeffs[ kernel_degree ] )
-      
-      
+#define kernel_nwneigh                                                      \
+  (4.0 / 3.0 * M_PI *const_eta_kernel *const_eta_kernel *const_eta_kernel * \
+   7.261825f)
+static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
+    __attribute__((aligned(16))) = {4.0f, -15.0f, 20.0f, -10.0f, 0.0f, 1.0f,
+                                    0.0f, 0.0f,   0.0f,  0.0f,   0.0f, 0.0f};
+#define kernel_root (kernel_coeffs[kernel_degree])
+#define kernel_wroot (4.0 / 3.0 * M_PI *kernel_coeffs[kernel_degree])
+
 /**
- * @brief Computes the quintic spline kernel and its derivative for a given distance x. Gives a sensible answer only if x<1.
+ * @brief Computes the Wendland C2 kernel and its derivative for a given
+ * distance x. Gives a sensible answer only if x<1.
  */
 
-__attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , float *W , float *dW_dx ) {
-    int ind = fminf( x, kernel_ivals);
-    float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ];
-    float w = coeffs[0]*x + coeffs[1];
-    float dw_dx = coeffs[0];
-    for ( int k = 2 ; k <= kernel_degree ; k++ ) {
-        dw_dx = dw_dx*x + w;
-        w = x*w + coeffs[k];
-        }
-    *W = w;
-    *dW_dx = dw_dx;
-    }
-
+__attribute__((always_inline)) INLINE static void kernel_deval(float x,
+                                                               float *W,
+                                                               float *dW_dx) {
+  int ind = fminf(x, kernel_ivals);
+  float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+  float w = coeffs[0] * x + coeffs[1];
+  float dw_dx = coeffs[0];
+  for (int k = 2; k <= kernel_degree; k++) {
+    dw_dx = dw_dx * x + w;
+    w = x * w + coeffs[k];
+  }
+  *W = w;
+  *dW_dx = dw_dx;
+}
 
 #ifdef VECTORIZE
 
 /**
- * @brief Computes the Wendland C2 kernel and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<1.
+ * @brief Computes the Wendland C2 kernel and its derivative for a given
+ * distance x (Vectorized version). Gives a sensible answer only if x<1.
  */
 
-__attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x , vector *w , vector *dw_dx ) {
-    
-    vector ind, c[kernel_degree+1];
-    int j, k;
-    
-    /* Load x and get the interval id. */
-    ind.m = vec_ftoi( vec_fmin( x->v, vec_set1( (float)kernel_ivals ) ) );
-    
-    /* load the coefficients. */
-    for ( k = 0 ; k < VEC_SIZE ; k++ )
-        for ( j = 0 ; j < kernel_degree+1 ; j++ )
-            c[j].f[k] = kernel_coeffs[ ind.i[k]*(kernel_degree + 1) + j ];
-
-    /* Init the iteration for Horner's scheme. */
-    w->v = ( c[0].v * x->v ) + c[1].v;
-    dw_dx->v = c[0].v;
-    
-    /* And we're off! */
-    for ( int k = 2 ; k <= kernel_degree ; k++ ) {
-        dw_dx->v = ( dw_dx->v * x->v ) + w->v;
-        w->v = ( x->v * w->v ) + c[k].v;
-        }
-        
-    }
-    
-#endif
+__attribute__((always_inline))
+    INLINE static void kernel_deval_vec(vector *x, vector *w, vector *dw_dx) {
 
-/**
- * @brief Computes the Wendland C2 kernel for a given distance x. Gives a sensible answer only if x<1.
- */
+  vector ind, c[kernel_degree + 1];
+  int j, k;
 
-__attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float *W ) {
-    int ind = fmin( x, kernel_ivals );
-    float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ];
-    float w = coeffs[0]*x + coeffs[1];
-    for ( int k = 2 ; k <= kernel_degree ; k++ )
-        w = x*w + coeffs[k];
-    *W = w;
-    }
+  /* Load x and get the interval id. */
+  ind.m = vec_ftoi(vec_fmin(x->v, vec_set1((float)kernel_ivals)));
 
+  /* load the coefficients. */
+  for (k = 0; k < VEC_SIZE; k++)
+    for (j = 0; j < kernel_degree + 1; j++)
+      c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
 
+  /* Init the iteration for Horner's scheme. */
+  w->v = (c[0].v * x->v) + c[1].v;
+  dw_dx->v = c[0].v;
 
+  /* And we're off! */
+  for (int k = 2; k <= kernel_degree; k++) {
+    dw_dx->v = (dw_dx->v * x->v) + w->v;
+    w->v = (x->v * w->v) + c[k].v;
+  }
+}
 
+#endif
 
+/**
+ * @brief Computes the Wendland C2 kernel for a given distance x. Gives a
+ * sensible answer only if x<1.
+ */
 
+__attribute__((always_inline)) INLINE static void kernel_eval(float x,
+                                                              float *W) {
+  int ind = fminf(x, kernel_ivals); /* interval id, clamped to the zero row */
+  float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+  float w = coeffs[0] * x + coeffs[1]; /* first Horner step */
+  for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k];
+  *W = w;
+}
 
-/* -------------------------------------------------------------------------------------------------------------------- */
+/* --------------------------------------------------------------------------------------------------------------------
+ */
 
 #else
 
-/* -------------------------------------------------------------------------------------------------------------------- */
+/* --------------------------------------------------------------------------------------------------------------------
+ */
 
 #error "A kernel function must be chosen in const.h !!"
 
-#endif // Kernel choice
+#endif  // Kernel choice
 
-#endif //KERNEL_H
+#endif  // SWIFT_KERNEL_H
diff --git a/src/lock.h b/src/lock.h
index 3e3affb1b08e320770687217b4a572631f7413c1..19a4e74bf82d3b6bb8e305388ca42929cc9d719e 100644
--- a/src/lock.h
+++ b/src/lock.h
@@ -1,54 +1,61 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_LOCK_H
+#define SWIFT_LOCK_H
 
+/* Some standard headers. */
+#include <pthread.h>
 
-
+/* Includes. */
 #include "inline.h"
-    
+
 #ifdef PTHREAD_SPINLOCK
-    #include <pthread.h>
-    #define lock_type pthread_spinlock_t
-    #define lock_init( l ) ( pthread_spin_init( l , PTHREAD_PROCESS_PRIVATE ) != 0 )
-    #define lock_destroy( l ) ( pthread_spin_destroy( l ) != 0 )
-    #define lock_lock( l ) ( pthread_spin_lock( l ) != 0 )
-    #define lock_trylock( l ) ( pthread_spin_lock( l ) != 0 )
-    #define lock_unlock( l ) ( pthread_spin_unlock( l ) != 0 )
-    #define lock_unlock_blind( l ) pthread_spin_unlock( l )
+#include <pthread.h>
+#define lock_type pthread_spinlock_t
+#define lock_init(l) (pthread_spin_init(l, PTHREAD_PROCESS_PRIVATE) != 0)
+#define lock_destroy(l) (pthread_spin_destroy(l) != 0)
+#define lock_lock(l) (pthread_spin_lock(l) != 0)
+#define lock_trylock(l) (pthread_spin_trylock(l) != 0) /* non-blocking */
+#define lock_unlock(l) (pthread_spin_unlock(l) != 0)
+#define lock_unlock_blind(l) pthread_spin_unlock(l)
 #elif defined(PTHREAD_LOCK)
-    #include <pthread.h>
-    #define lock_type pthread_mutex_t
-    #define lock_init( l ) ( pthread_mutex_init( l , NULL ) != 0 )
-    #define lock_destroy( l ) ( pthread_mutex_destroy( l ) != 0 )
-    #define lock_lock( l ) ( pthread_mutex_lock( l ) != 0 )
-    #define lock_trylock( l ) ( pthread_mutex_trylock( l ) != 0 )
-    #define lock_unlock( l ) ( pthread_mutex_unlock( l ) != 0 )
-    #define lock_unlock_blind( l ) pthread_mutex_unlock( l )
+#include <pthread.h>
+#define lock_type pthread_mutex_t
+#define lock_init(l) (pthread_mutex_init(l, NULL) != 0)
+#define lock_destroy(l) (pthread_mutex_destroy(l) != 0)
+#define lock_lock(l) (pthread_mutex_lock(l) != 0)
+#define lock_trylock(l) (pthread_mutex_trylock(l) != 0)
+#define lock_unlock(l) (pthread_mutex_unlock(l) != 0)
+#define lock_unlock_blind(l) pthread_mutex_unlock(l)
 #else
-    #define lock_type volatile int
-    #define lock_init( l ) ( *(l) = 0 )
-    #define lock_destroy( l ) 0
-    INLINE static int lock_lock ( volatile int *l ) {
-        while ( __sync_val_compare_and_swap( l , 0 , 1 ) != 0 );
-            // while( *l );
-        return 0;
-        }
-    #define lock_trylock( l ) ( ( *(l) ) ? 1 : __sync_val_compare_and_swap( l , 0 , 1 ) )
-    #define lock_unlock( l ) ( __sync_val_compare_and_swap( l , 1 , 0 ) != 1 )
-    #define lock_unlock_blind( l ) __sync_val_compare_and_swap( l , 1 , 0 )
+#define lock_type volatile int
+#define lock_init(l) (*(l) = 0)
+#define lock_destroy(l) 0
+INLINE static int lock_lock(volatile int *l) {
+  while (__sync_val_compare_and_swap(l, 0, 1) != 0)
+    ;
+  // while( *l );
+  return 0;
+}
+#define lock_trylock(l) ((*(l)) ? 1 : __sync_val_compare_and_swap(l, 0, 1))
+#define lock_unlock(l) (__sync_val_compare_and_swap(l, 1, 0) != 1)
+#define lock_unlock_blind(l) __sync_val_compare_and_swap(l, 1, 0)
 #endif
+
+#endif /* SWIFT_LOCK_H */
diff --git a/src/multipole.c b/src/multipole.c
index 38337230404338eae3faa6bd14a5d3f3a313e971..439e9cd5f0218bddf28d228de6eb3bb14a2d6735 100644
--- a/src/multipole.c
+++ b/src/multipole.c
@@ -1,49 +1,40 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
 
 /* Some standard headers. */
+#include <float.h>
+#include <limits.h>
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <float.h>
-#include <limits.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
-    #include <mpi.h>
+#include <mpi.h>
 #endif
 
-/* Local headers. */
-#include "error.h"
-#include "const.h"
-#include "cycle.h"
-#include "atomic.h"
-#include "lock.h"
-#include "space.h"
-#include "part.h"
+/* This object's header. */
 #include "multipole.h"
-#include "cell.h"
-
 
 /**
  * @brief Merge two multipoles.
@@ -51,26 +42,23 @@
  * @param ma The #multipole which will contain the merged result.
  * @param mb The other #multipole.
  */
- 
-void multipole_merge ( struct multipole *ma , struct multipole *mb ) {
-
-    #if multipole_order == 1
-    
-        /* Correct the position. */
-        float mma = ma->coeffs[0], mmb = mb->coeffs[0];
-        float w = 1.0f / ( mma + mmb );
-        for ( int k = 0 ; k < 3 ; k++ )
-            ma->x[k] = ( ma->x[k]*mma + mb->x[k]*mmb ) * w;
-            
-        /* Add the particle to the moments. */
-        ma->coeffs[0] = mma + mmb;
-    
-    #else
-        #error( "Multipoles of order %i not yet implemented." , multipole_order )
-    #endif
-
-    }
 
+void multipole_merge(struct multipole *ma, struct multipole *mb) {
+
+#if multipole_order == 1
+
+  /* Correct the position. */
+  float mma = ma->coeffs[0], mmb = mb->coeffs[0];
+  float w = 1.0f / (mma + mmb);
+  for (int k = 0; k < 3; k++) ma->x[k] = (ma->x[k] * mma + mb->x[k] * mmb) * w;
+
+  /* Add the particle to the moments. */
+  ma->coeffs[0] = mma + mmb;
+
+#else
+#error( "Multipoles of order %i not yet implemented." , multipole_order )
+#endif
+}
 
 /**
  * @brief Add a particle to the given multipole.
@@ -78,26 +66,23 @@ void multipole_merge ( struct multipole *ma , struct multipole *mb ) {
  * @param m The #multipole.
  * @param p The #gpart.
  */
- 
-void multipole_addpart ( struct multipole *m , struct gpart *p ) {
-    
-    #if multipole_order == 1
-
-        /* Correct the position. */
-        float mm = m->coeffs[0], mp = p->mass;
-        float w = 1.0f / ( mm + mp );
-        for ( int k = 0 ; k < 3 ; k++ )
-            m->x[k] = ( m->x[k]*mm + p->x[k]*mp ) * w;
-            
-        /* Add the particle to the moments. */
-        m->coeffs[0] = mm + mp;
-        
-    #else
-        #error( "Multipoles of order %i not yet implemented." , multipole_order )
-    #endif
-
-    }
 
+void multipole_addpart(struct multipole *m, struct gpart *p) {
+
+#if multipole_order == 1
+
+  /* Correct the position. */
+  float mm = m->coeffs[0], mp = p->mass;
+  float w = 1.0f / (mm + mp);
+  for (int k = 0; k < 3; k++) m->x[k] = (m->x[k] * mm + p->x[k] * mp) * w;
+
+  /* Add the particle to the moments. */
+  m->coeffs[0] = mm + mp;
+
+#else
+#error( "Multipoles of order %i not yet implemented." , multipole_order )
+#endif
+}
 
 /**
  * @brief Add a group of particles to the given multipole.
@@ -106,37 +91,34 @@ void multipole_addpart ( struct multipole *m , struct gpart *p ) {
  * @param p The #gpart array.
  * @param N Number of parts to add.
  */
- 
-void multipole_addparts ( struct multipole *m , struct gpart *p , int N ) {
-    
-    #if multipole_order == 1
-    
-        /* Get the combined mass and positions. */
-        double xp[3] = { 0.0 , 0.0 , 0.0 };
-        float mp = 0.0f, w;
-        for ( int k = 0 ; k < N ; k++ ) {
-            w = p[k].mass;
-            mp += w;
-            xp[0] += p[k].x[0] * w;
-            xp[1] += p[k].x[1] * w;
-            xp[2] += p[k].x[2] * w;
-            }
-
-        /* Correct the position. */
-        float mm = m->coeffs[0];
-        w = 1.0f / ( mm + mp );
-        for ( int k = 0 ; k < 3 ; k++ )
-            m->x[k] = ( m->x[k]*mm + xp[k] ) * w;
-            
-        /* Add the particle to the moments. */
-        m->coeffs[0] = mm + mp;
-        
-    #else
-        #error( "Multipoles of order %i not yet implemented." , multipole_order )
-    #endif
-
-    }
 
+void multipole_addparts(struct multipole *m, struct gpart *p, int N) {
+
+#if multipole_order == 1
+
+  /* Get the combined mass and positions. */
+  double xp[3] = {0.0, 0.0, 0.0};
+  float mp = 0.0f, w;
+  for (int k = 0; k < N; k++) {
+    w = p[k].mass;
+    mp += w;
+    xp[0] += p[k].x[0] * w;
+    xp[1] += p[k].x[1] * w;
+    xp[2] += p[k].x[2] * w;
+  }
+
+  /* Correct the position. */
+  float mm = m->coeffs[0];
+  w = 1.0f / (mm + mp);
+  for (int k = 0; k < 3; k++) m->x[k] = (m->x[k] * mm + xp[k]) * w;
+
+  /* Add the particle to the moments. */
+  m->coeffs[0] = mm + mp;
+
+#else
+#error( "Multipoles of order %i not yet implemented." , multipole_order )
+#endif
+}
 
 /**
  * @brief Init a multipole from a set of particles.
@@ -145,46 +127,43 @@ void multipole_addparts ( struct multipole *m , struct gpart *p , int N ) {
  * @param parts The #gpart.
  * @param N The number of particles.
  */
- 
-void multipole_init ( struct multipole *m , struct gpart *parts , int N ) {
-    
-    #if multipole_order == 1
-
-        float mass = 0.0f, w;
-        double x[3] = { 0.0 , 0.0 , 0.0 };
-        int k;
-        
-        /* Collect the particle data. */
-        for ( k = 0 ; k < N ; k++ ) {
-            w = parts[k].mass;
-            mass += w;
-            x[0] += parts[k].x[0] * w;
-            x[1] += parts[k].x[1] * w;
-            x[2] += parts[k].x[2] * w;
-            }
-            
-        /* Store the data on the multipole. */
-        m->coeffs[0] = mass;
-        m->x[0] = x[0] / mass;
-        m->x[1] = x[1] / mass;
-        m->x[2] = x[2] / mass;
-        
-    #else
-        #error( "Multipoles of order %i not yet implemented." , multipole_order )
-    #endif
-
-    }
 
+void multipole_init(struct multipole *m, struct gpart *parts, int N) {
+
+#if multipole_order == 1
+
+  float mass = 0.0f, w;
+  double x[3] = {0.0, 0.0, 0.0};
+  int k;
+
+  /* Collect the particle data. */
+  for (k = 0; k < N; k++) {
+    w = parts[k].mass;
+    mass += w;
+    x[0] += parts[k].x[0] * w;
+    x[1] += parts[k].x[1] * w;
+    x[2] += parts[k].x[2] * w;
+  }
+
+  /* Store the data on the multipole. */
+  m->coeffs[0] = mass;
+  m->x[0] = x[0] / mass;
+  m->x[1] = x[1] / mass;
+  m->x[2] = x[2] / mass;
+
+#else
+#error( "Multipoles of order %i not yet implemented." , multipole_order )
+#endif
+}
 
 /**
  * @brief Reset the data of a #multipole.
  *
  * @param m The #multipole.
  */
- 
-void multipole_reset ( struct multipole *m ) {
 
-    /* Just bzero the struct. */
-    bzero( m , sizeof(struct multipole) );
-    
-    }
+void multipole_reset(struct multipole *m) {
+
+  /* Zero every byte of the struct (position, acceleration, coefficients). */
+  memset(m, 0, sizeof(struct multipole));
+}
diff --git a/src/multipole.h b/src/multipole.h
index e3372e245637dbaec6e078ac7294902f6a22a9fe..ffa5d713f507b85f6ae9216cfe81d4fc49316345 100644
--- a/src/multipole.h
+++ b/src/multipole.h
@@ -1,53 +1,58 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_MULTIPOLE_H
+#define SWIFT_MULTIPOLE_H
 
-/* Some constants. */
-#define multipole_order                 1
+/* Some standard headers. */
+#include <math.h>
+
+/* Includes. */
+#include "const.h"
+#include "inline.h"
+#include "kernel.h"
+#include "part.h"
 
+/* Some constants. */
+#define multipole_order 1
 
 /* Multipole struct. */
 struct multipole {
 
-    /* Multipole location. */
-    double x[3];
-    
-    /* Acceleration on this multipole. */
-    float a[3];
-    
-    /* Multipole coefficients. */
-    float coeffs[ multipole_order*multipole_order ];
-    
-    };
-    
-    
-/* Multipole function prototypes. */
-static void multipole_iact_mm ( struct multipole *ma , struct multipole *mb , double *shift );
-void multipole_merge ( struct multipole *ma , struct multipole *mb );
-void multipole_addpart ( struct multipole *m , struct gpart *p );
-void multipole_addparts ( struct multipole *m , struct gpart *p , int N );
-void multipole_init ( struct multipole *m , struct gpart *parts , int N );
-void multipole_reset ( struct multipole *m );
+  /* Multipole location. */
+  double x[3];
 
+  /* Acceleration on this multipole. */
+  float a[3];
+
+  /* Multipole coefficients. */
+  float coeffs[multipole_order * multipole_order];
+};
+
+/* Multipole function prototypes. */
+static void multipole_iact_mm(struct multipole *ma, struct multipole *mb,
+                              double *shift);
+void multipole_merge(struct multipole *ma, struct multipole *mb);
+void multipole_addpart(struct multipole *m, struct gpart *p);
+void multipole_addparts(struct multipole *m, struct gpart *p, int N);
+void multipole_init(struct multipole *m, struct gpart *parts, int N);
+void multipole_reset(struct multipole *m);
 
-#include <math.h>
-#include "kernel.h"
- 
 /**
  * @brief Compute the pairwise interaction between two multipoles.
  *
@@ -55,41 +60,40 @@ void multipole_reset ( struct multipole *m );
  * @param mb The second #multipole.
  * @param shift The periodicity correction.
  */
- 
-__attribute__ ((always_inline)) INLINE static void multipole_iact_mm ( struct multipole *ma , struct multipole *mb , double *shift ) {
-
-    float dx[3], ir, r, r2 = 0.0f, acc;
-    int k;
-    
-    /* Compute the multipole distance. */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        dx[k] = ma->x[k] - mb->x[k] - shift[k];
-        r2 += dx[k]*dx[k];
-        }
-        
-    /* Compute the normalized distance vector. */
-    ir = 1.0f / sqrtf( r2 );
-    r = r2 * ir;
-        
-    /* Evaluate the gravity kernel. */
-    kernel_grav_eval( r , &acc );
-    
-    /* Scale the acceleration. */
-    acc *= const_G * ir * ir * ir;
-    
-    /* Compute the forces on both multipoles. */
-    #if multipole_order == 1
-        float mma = ma->coeffs[0], mmb = mb->coeffs[0];
-        for ( k = 0 ; k < 3 ; k++ ) {
-            ma->a[k] -= dx[k] * acc * mmb;
-            mb->a[k] += dx[k] * acc * mma;
-            }
-    #else
-        #error( "Multipoles of order %i not yet implemented." , multipole_order )
-    #endif
-
-    }
 
+__attribute__((always_inline)) INLINE static void multipole_iact_mm(
+    struct multipole *ma, struct multipole *mb, double *shift) {
+
+  float dx[3], ir, r, r2 = 0.0f, acc;
+  int k;
+
+  /* Compute the multipole distance. */
+  for (k = 0; k < 3; k++) {
+    dx[k] = ma->x[k] - mb->x[k] - shift[k];
+    r2 += dx[k] * dx[k];
+  }
+
+  /* Compute the normalized distance vector. */
+  ir = 1.0f / sqrtf(r2);
+  r = r2 * ir;
+
+  /* Evaluate the gravity kernel. */
+  kernel_grav_eval(r, &acc);
+
+  /* Scale the acceleration. */
+  acc *= const_G * ir * ir * ir;
+
+/* Compute the forces on both multipoles. */
+#if multipole_order == 1
+  float mma = ma->coeffs[0], mmb = mb->coeffs[0];
+  for (k = 0; k < 3; k++) {
+    ma->a[k] -= dx[k] * acc * mmb;
+    mb->a[k] += dx[k] * acc * mma;
+  }
+#else
+#error( "Multipoles of order %i not yet implemented." , multipole_order )
+#endif
+}
 
 /**
  * @brief Compute the interaction of a multipole on a particle.
@@ -98,36 +102,35 @@ __attribute__ ((always_inline)) INLINE static void multipole_iact_mm ( struct mu
  * @param p The #gpart.
  * @param shift The periodicity correction.
  */
- 
-__attribute__ ((always_inline)) INLINE static void multipole_iact_mp ( struct multipole *m , struct gpart *p , double *shift ) {
-
-    float dx[3], ir, r, r2 = 0.0f, acc;
-    int k;
-    
-    /* Compute the multipole distance. */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        dx[k] = m->x[k] - p->x[k] - shift[k];
-        r2 += dx[k]*dx[k];
-        }
-        
-    /* Compute the normalized distance vector. */
-    ir = 1.0f / sqrtf( r2 );
-    r = r2 * ir;
-        
-    /* Evaluate the gravity kernel. */
-    kernel_grav_eval( r , &acc );
-    
-    /* Scale the acceleration. */
-    acc *= const_G * ir * ir * ir * m->coeffs[0];
-    
-    /* Compute the forces on both multipoles. */
-    #if multipole_order == 1
-        for ( k = 0 ; k < 3 ; k++ )
-            p->a[k] += dx[k] * acc;
-    #else
-        #error( "Multipoles of order %i not yet implemented." , multipole_order )
-    #endif
-
-    }
 
+__attribute__((always_inline)) INLINE static void multipole_iact_mp(
+    struct multipole *m, struct gpart *p, double *shift) {
+
+  float dx[3], ir, r, r2 = 0.0f, acc;
+  int k;
+
+  /* Compute the multipole distance. */
+  for (k = 0; k < 3; k++) {
+    dx[k] = m->x[k] - p->x[k] - shift[k];
+    r2 += dx[k] * dx[k];
+  }
+
+  /* Compute the normalized distance vector. */
+  ir = 1.0f / sqrtf(r2);
+  r = r2 * ir;
+
+  /* Evaluate the gravity kernel. */
+  kernel_grav_eval(r, &acc);
+
+  /* Scale the acceleration. */
+  acc *= const_G * ir * ir * ir * m->coeffs[0];
+
+/* Compute the forces on both multipoles. */
+#if multipole_order == 1
+  for (k = 0; k < 3; k++) p->a[k] += dx[k] * acc;
+#else
+#error( "Multipoles of order %i not yet implemented." , multipole_order )
+#endif
+}
 
+#endif /* SWIFT_MULTIPOLE_H */
diff --git a/src/parallel_io.c b/src/parallel_io.c
index 7e2f5e244aa3b280872dc25e1fd14ce0348bbd3b..ccd35fe06ef4bed6bd183c6a28648d19c184b4f4 100644
--- a/src/parallel_io.c
+++ b/src/parallel_io.c
@@ -2,20 +2,20 @@
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
@@ -26,27 +26,21 @@
 /* Tell hdf5 that we intend to use shared-memory parallel stuff. */
 #define H5_HAVE_PARALLEL
 
-
 /* Some standard headers. */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stddef.h>
 #include <hdf5.h>
 #include <math.h>
 #include <mpi.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
-#include "const.h"
-#include "cycle.h"
-#include "lock.h"
-#include "task.h"
-#include "part.h"
-#include "space.h"
-#include "scheduler.h"
-#include "engine.h"
-#include "error.h"
-#include "kernel.h"
+/* This object's header. */
+#include "parallel_io.h"
+
+/* Local includes. */
 #include "common_io.h"
+#include "error.h"
 
 /**
  * @brief Reads a data array from a given HDF5 group.
@@ -56,83 +50,79 @@
  * @param type The #DATA_TYPE of the attribute.
  * @param N The number of particles.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
- * @param part_c A (char*) pointer on the first occurence of the field of interest in the parts array
- * @param importance If COMPULSORY, the data must be present in the IC file. If OPTIONAL, the array will be zeroed when the data is not present.
+ * @param part_c A (char*) pointer to the first occurrence of the field of
+ * interest in the parts array
+ * @param importance If COMPULSORY, the data must be present in the IC file. If
+ * OPTIONAL, the array will be zeroed when the data is not present.
  *
- * @todo A better version using HDF5 hyperslabs to read the file directly into the part array 
+ * @todo A better version using HDF5 hyperslabs to read the file directly into
+ * the part array
  * will be written once the strucutres have been stabilized.
- *  
+ *
  * Calls #error() if an error occurs.
  */
-void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim, long long N_total, long long offset, char* part_c, enum DATA_IMPORTANCE importance)
-{
-  hid_t h_data=0, h_err=0, h_type=0, h_memspace=0, h_filespace=0, h_plist_id=0;
+void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
+                      int dim, long long N_total, long long offset,
+                      char* part_c, enum DATA_IMPORTANCE importance) {
+  hid_t h_data = 0, h_err = 0, h_type = 0, h_memspace = 0, h_filespace = 0,
+        h_plist_id = 0;
   hsize_t shape[2], offsets[2];
-  htri_t exist=0;
+  htri_t exist = 0;
   void* temp;
-  int i=0, rank=0;
+  int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
   const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
 
-
   /* Check whether the dataspace exists or not */
   exist = H5Lexists(grp, name, 0);
-  if(exist < 0)
-    {
-      error( "Error while checking the existence of data set '%s'." , name );
-    }
-  else if(exist == 0)
-    {
-      if(importance == COMPULSORY)
-	{
-	  error( "Compulsory data set '%s' not present in the file." , name );
-	}
-      else
-	{	  
-	  for(i=0; i<N; ++i)
-	    memset(part_c+i*partSize, 0, copySize);
-	  return;
-	}
+  if (exist < 0) {
+    error("Error while checking the existence of data set '%s'.", name);
+  } else if (exist == 0) {
+    if (importance == COMPULSORY) {
+      error("Compulsory data set '%s' not present in the file.", name);
+    } else {
+      for (i = 0; i < N; ++i) memset(part_c + i * partSize, 0, copySize);
+      return;
     }
+  }
 
-  /* message( "Reading %s '%s' array...", importance == COMPULSORY ? "compulsory": "optional  ", name); */
+  /* message( "Reading %s '%s' array...", importance == COMPULSORY ?
+   * "compulsory": "optional  ", name); */
 
   /* Open data space in file */
   h_data = H5Dopen2(grp, name, H5P_DEFAULT);
-  if(h_data < 0)
-    error( "Error while opening data space '%s'." , name );
+  if (h_data < 0) error("Error while opening data space '%s'.", name);
 
   /* Check data type */
   h_type = H5Dget_type(h_data);
-  if(h_type < 0)
-    error("Unable to retrieve data type from the file");
-  if(!H5Tequal(h_type, hdf5Type(type)))
+  if (h_type < 0) error("Unable to retrieve data type from the file");
+  if (!H5Tequal(h_type, hdf5Type(type)))
     error("Non-matching types between the code and the file");
-  
+
   /* Allocate temporary buffer */
   temp = malloc(N * dim * sizeOfType(type));
-  if(temp == NULL)
-    error("Unable to allocate memory for temporary buffer");
+  if (temp == NULL) error("Unable to allocate memory for temporary buffer");
 
   /* Prepare information for hyperslab */
-  if(dim > 1)
-    {
-      rank = 2;
-      shape[0] = N; shape[1] = dim;
-      offsets[0] = offset; offsets[1] = 0;
-    }
-  else
-    {
-      rank = 1;
-      shape[0] = N; shape[1] = 0;
-      offsets[0] = offset; offsets[1] = 0;
-    }
+  if (dim > 1) {
+    rank = 2;
+    shape[0] = N;
+    shape[1] = dim;
+    offsets[0] = offset;
+    offsets[1] = 0;
+  } else {
+    rank = 1;
+    shape[0] = N;
+    shape[1] = 0;
+    offsets[0] = offset;
+    offsets[1] = 0;
+  }
 
   /* Create data space in memory */
   h_memspace = H5Screate_simple(rank, shape, NULL);
- 
+
   /* Select hyperslab in file */
   h_filespace = H5Dget_space(h_data);
   H5Sselect_hyperslab(h_filespace, H5S_SELECT_SET, offsets, NULL, shape, NULL);
@@ -144,17 +134,17 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim
   /* Read HDF5 dataspace in temporary buffer */
   /* Dirty version that happens to work for vectors but should be improved */
   /* Using HDF5 dataspaces would be better */
-  h_err = H5Dread(h_data, hdf5Type(type), h_memspace, h_filespace, h_plist_id, temp);
-  if(h_err < 0)
-    {
-      error( "Error while reading data array '%s'." , name );
-    }
+  h_err = H5Dread(h_data, hdf5Type(type), h_memspace, h_filespace, h_plist_id,
+                  temp);
+  if (h_err < 0) {
+    error("Error while reading data array '%s'.", name);
+  }
 
   /* Copy temporary buffer to particle data */
   temp_c = temp;
-  for(i=0; i<N; ++i)
-    memcpy(part_c+i*partSize, &temp_c[i*copySize], copySize);
-  
+  for (i = 0; i < N; ++i)
+    memcpy(part_c + i * partSize, &temp_c[i * copySize], copySize);
+
   /* Free and close everything */
   free(temp);
   H5Pclose(h_plist_id);
@@ -179,7 +169,10 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim
  * @param importance Is the data compulsory or not
  *
  */
-#define readArray(grp, name, type, N, dim, part, N_total, offset, field, importance) readArrayBackEnd(grp, name, type, N, dim, N_total, offset, (char*)(&(part[0]).field), importance)
+#define readArray(grp, name, type, N, dim, part, N_total, offset, field, \
+                  importance)                                            \
+  readArrayBackEnd(grp, name, type, N, dim, N_total, offset,             \
+                   (char*)(&(part[0]).field), importance)
 
 /**
  * @brief Reads an HDF5 initial condition file (GADGET-3 type) in parallel
@@ -200,12 +193,15 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim
  * Calls #error() if an error occurs.
  *
  */
-void read_ic_parallel ( char* fileName, double dim[3], struct part **parts,  int* N, int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info)
-{
-  hid_t h_file=0, h_grp=0;
-  double boxSize[3]={0.0,-1.0,-1.0};         /* GADGET has only cubic boxes (in cosmological mode) */
-  int numParticles[6]={0};   /* GADGET has 6 particle types. We only keep the type 0*/
-  int numParticles_highWord[6]={0};
+void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
+                      int* N, int* periodic, int mpi_rank, int mpi_size,
+                      MPI_Comm comm, MPI_Info info) {
+  hid_t h_file = 0, h_grp = 0;
+  double boxSize[3] = {
+      0.0, -1.0, -1.0}; /* GADGET has only cubic boxes (in cosmological mode) */
+  int numParticles[6] = {
+      0}; /* GADGET has 6 particle types. We only keep the type 0*/
+  int numParticles_highWord[6] = {0};
   long long offset = 0;
   long long N_total = 0;
 
@@ -214,38 +210,36 @@ void read_ic_parallel ( char* fileName, double dim[3], struct part **parts,  int
   hid_t h_plist_id = H5Pcreate(H5P_FILE_ACCESS);
   H5Pset_fapl_mpio(h_plist_id, comm, info);
   h_file = H5Fopen(fileName, H5F_ACC_RDONLY, h_plist_id);
-  if(h_file < 0)
-    {
-      error( "Error while opening file '%s'." , fileName );
-    }
+  if (h_file < 0) {
+    error("Error while opening file '%s'.", fileName);
+  }
 
   /* Open header to read simulation properties */
   /* message("Reading runtime parameters..."); */
   h_grp = H5Gopen1(h_file, "/RuntimePars");
-  if(h_grp < 0)
-    error("Error while opening runtime parameters\n");
+  if (h_grp < 0) error("Error while opening runtime parameters\n");
 
   /* Read the relevant information */
   readAttribute(h_grp, "PeriodicBoundariesOn", INT, periodic);
 
   /* Close runtime parameters */
   H5Gclose(h_grp);
-  
+
   /* Open header to read simulation properties */
   /* message("Reading file header..."); */
   h_grp = H5Gopen1(h_file, "/Header");
-  if(h_grp < 0)
-    error("Error while opening file header\n");
-    
+  if (h_grp < 0) error("Error while opening file header\n");
+
   /* Read the relevant information and print status */
   readAttribute(h_grp, "BoxSize", DOUBLE, boxSize);
   readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
   readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord);
 
-  N_total = ((long long) numParticles[0]) + ((long long) numParticles_highWord[0] << 32);   
+  N_total = ((long long)numParticles[0]) +
+            ((long long)numParticles_highWord[0] << 32);
   dim[0] = boxSize[0];
-  dim[1] = ( boxSize[1] < 0 ) ? boxSize[0] : boxSize[1];
-  dim[2] = ( boxSize[2] < 0 ) ? boxSize[0] : boxSize[2];
+  dim[1] = (boxSize[1] < 0) ? boxSize[0] : boxSize[1];
+  dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2];
 
   /* Divide the particles among the tasks. */
   offset = mpi_rank * N_total / mpi_size;
@@ -258,28 +252,37 @@ void read_ic_parallel ( char* fileName, double dim[3], struct part **parts,  int
   H5Gclose(h_grp);
 
   /* Allocate memory to store particles */
-  if(posix_memalign( (void*)parts , part_align , *N * sizeof(struct part)) != 0)
+  if (posix_memalign((void*)parts, part_align, *N * sizeof(struct part)) != 0)
     error("Error while allocating memory for particles");
-  bzero( *parts , *N * sizeof(struct part) );
+  bzero(*parts, *N * sizeof(struct part));
+
+  /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) /
+   * (1024.*1024.)); */
 
-  /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) / (1024.*1024.)); */
-		  
   /* Open SPH particles group */
   /* message("Reading particle arrays..."); */
   h_grp = H5Gopen1(h_file, "/PartType0");
-  if(h_grp < 0)
-    error( "Error while opening particle group.\n");
+  if (h_grp < 0) error("Error while opening particle group.\n");
 
   /* Read arrays */
-  readArray(h_grp, "Coordinates", DOUBLE, *N, 3, *parts, N_total, offset, x, COMPULSORY);
-  readArray(h_grp, "Velocities", FLOAT, *N, 3, *parts, N_total, offset, v, COMPULSORY);
-  readArray(h_grp, "Masses", FLOAT, *N, 1, *parts, N_total, offset, mass, COMPULSORY);
-  readArray(h_grp, "SmoothingLength", FLOAT, *N, 1, *parts, N_total, offset, h, COMPULSORY);
-  readArray(h_grp, "InternalEnergy", FLOAT, *N, 1, *parts, N_total, offset, u, COMPULSORY);
-  readArray(h_grp, "ParticleIDs", ULONGLONG, *N, 1, *parts, N_total, offset, id, COMPULSORY);
-  readArray(h_grp, "TimeStep", FLOAT, *N, 1, *parts, N_total, offset, dt, OPTIONAL);
-  readArray(h_grp, "Acceleration", FLOAT, *N, 3, *parts, N_total, offset, a, OPTIONAL);
-  readArray(h_grp, "Density", FLOAT, *N, 1, *parts, N_total, offset, rho, OPTIONAL );
+  readArray(h_grp, "Coordinates", DOUBLE, *N, 3, *parts, N_total, offset, x,
+            COMPULSORY);
+  readArray(h_grp, "Velocities", FLOAT, *N, 3, *parts, N_total, offset, v,
+            COMPULSORY);
+  readArray(h_grp, "Masses", FLOAT, *N, 1, *parts, N_total, offset, mass,
+            COMPULSORY);
+  readArray(h_grp, "SmoothingLength", FLOAT, *N, 1, *parts, N_total, offset, h,
+            COMPULSORY);
+  readArray(h_grp, "InternalEnergy", FLOAT, *N, 1, *parts, N_total, offset, u,
+            COMPULSORY);
+  readArray(h_grp, "ParticleIDs", ULONGLONG, *N, 1, *parts, N_total, offset, id,
+            COMPULSORY);
+  readArray(h_grp, "TimeStep", FLOAT, *N, 1, *parts, N_total, offset, dt,
+            OPTIONAL);
+  readArray(h_grp, "Acceleration", FLOAT, *N, 3, *parts, N_total, offset, a,
+            OPTIONAL);
+  readArray(h_grp, "Density", FLOAT, *N, 1, *parts, N_total, offset, rho,
+            OPTIONAL);
 
   /* Close particle group */
   H5Gclose(h_grp);
@@ -293,12 +296,10 @@ void read_ic_parallel ( char* fileName, double dim[3], struct part **parts,  int
   /* message("Done Reading particles..."); */
 }
 
-
 /*-----------------------------------------------------------------------------
  * Routines writing an output file
  *-----------------------------------------------------------------------------*/
 
-
 /**
  * @brief Writes a data array in given HDF5 group.
  *
@@ -311,89 +312,91 @@ void read_ic_parallel ( char* fileName, double dim[3], struct part **parts,  int
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param N_total Total number of particles across all cores
  * @param offset Offset in the array where this mpi task starts writing
- * @param part_c A (char*) pointer on the first occurence of the field of interest in the parts array
+ * @param part_c A (char*) pointer to the first occurrence of the field of
+ * interest in the parts array
  * @param us The UnitSystem currently in use
  * @param convFactor The UnitConversionFactor for this array
  *
- * @todo A better version using HDF5 hyperslabs to write the file directly from the part array
+ * @todo A better version using HDF5 hyperslabs to write the file directly from
+ * the part array
  * will be written once the strucutres have been stabilized.
  *
  * Calls #error() if an error occurs.
  */
-void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enum DATA_TYPE type, int N, int dim, long long N_total, int mpi_rank, long long offset, char* part_c, struct UnitSystem* us, enum UnitConversionFactor convFactor)
-{
-  hid_t h_data=0, h_err=0, h_memspace=0, h_filespace=0, h_plist_id=0;
+void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
+                       enum DATA_TYPE type, int N, int dim, long long N_total,
+                       int mpi_rank, long long offset, char* part_c,
+                       struct UnitSystem* us,
+                       enum UnitConversionFactor convFactor) {
+  hid_t h_data = 0, h_err = 0, h_memspace = 0, h_filespace = 0, h_plist_id = 0;
   hsize_t shape[2], shape_total[2], offsets[2];
   void* temp = 0;
-  int i=0, rank=0;
+  int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
   const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
   char buffer[150];
 
-
   /* message("Writing '%s' array...", name); */
 
   /* Allocate temporary buffer */
   temp = malloc(N * dim * sizeOfType(type));
-  if(temp == NULL)
-    error("Unable to allocate memory for temporary buffer");
+  if (temp == NULL) error("Unable to allocate memory for temporary buffer");
 
   /* Copy particle data to temporary buffer */
   temp_c = temp;
-  for(i=0; i<N; ++i)
-    memcpy(&temp_c[i*copySize], part_c+i*partSize, copySize);
+  for (i = 0; i < N; ++i)
+    memcpy(&temp_c[i * copySize], part_c + i * partSize, copySize);
 
   /* Create data space */
   h_memspace = H5Screate(H5S_SIMPLE);
-  if(h_memspace < 0)
-    {
-      error( "Error while creating data space (memory) for field '%s'." , name );
-    }
+  if (h_memspace < 0) {
+    error("Error while creating data space (memory) for field '%s'.", name);
+  }
 
   h_filespace = H5Screate(H5S_SIMPLE);
-  if(h_filespace < 0)
-    {
-      error( "Error while creating data space (file) for field '%s'." , name );
-    }
-  
-  if(dim > 1)
-    {
-      rank = 2;
-      shape[0] = N; shape[1] = dim;
-      shape_total[0] = N_total; shape_total[1] = dim;
-      offsets[0] = offset; offsets[1] = 0;
-    }
-  else
-    {
-      rank = 1;
-      shape[0] = N; shape[1] = 0;
-      shape_total[0] = N_total; shape_total[1] = 0;
-      offsets[0] = offset; offsets[1] = 0;
-    }
-  
+  if (h_filespace < 0) {
+    error("Error while creating data space (file) for field '%s'.", name);
+  }
+
+  if (dim > 1) {
+    rank = 2;
+    shape[0] = N;
+    shape[1] = dim;
+    shape_total[0] = N_total;
+    shape_total[1] = dim;
+    offsets[0] = offset;
+    offsets[1] = 0;
+  } else {
+    rank = 1;
+    shape[0] = N;
+    shape[1] = 0;
+    shape_total[0] = N_total;
+    shape_total[1] = 0;
+    offsets[0] = offset;
+    offsets[1] = 0;
+  }
+
   /* Change shape of memory data space */
   h_err = H5Sset_extent_simple(h_memspace, rank, shape, NULL);
-  if(h_err < 0)
-    {
-      error( "Error while changing data space (memory) shape for field '%s'." , name );
-    }
+  if (h_err < 0) {
+    error("Error while changing data space (memory) shape for field '%s'.",
+          name);
+  }
 
   /* Change shape of file data space */
   h_err = H5Sset_extent_simple(h_filespace, rank, shape_total, NULL);
-  if(h_err < 0)
-    {
-      error( "Error while changing data space (file) shape for field '%s'." , name );
-    }
-  
+  if (h_err < 0) {
+    error("Error while changing data space (file) shape for field '%s'.", name);
+  }
+
   /* Create dataset */
   h_data = H5Dcreate1(grp, name, hdf5Type(type), h_filespace, H5P_DEFAULT);
-  if(h_data < 0)
-    {
-      error( "Error while creating dataset '%s'." , name );
-    }
-  
+  if (h_data < 0) {
+    error("Error while creating dataset '%s'.", name);
+  }
+
   H5Sclose(h_filespace);
   h_filespace = H5Dget_space(h_data);
   H5Sselect_hyperslab(h_filespace, H5S_SELECT_SET, offsets, NULL, shape, NULL);
@@ -403,23 +406,23 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu
   H5Pset_dxpl_mpio(h_plist_id, H5FD_MPIO_COLLECTIVE);
 
   /* Write temporary buffer to HDF5 dataspace */
-  h_err = H5Dwrite(h_data, hdf5Type(type), h_memspace, h_filespace, h_plist_id, temp);
-  if(h_err < 0)
-    {
-      error( "Error while writing data array '%s'." , name );
-    }
+  h_err = H5Dwrite(h_data, hdf5Type(type), h_memspace, h_filespace, h_plist_id,
+                   temp);
+  if (h_err < 0) {
+    error("Error while writing data array '%s'.", name);
+  }
 
   /* Write XMF description for this data set */
-  if(mpi_rank == 0)
-    writeXMFline(xmfFile, fileName, name, N_total, dim, type);
+  if (mpi_rank == 0) writeXMFline(xmfFile, fileName, name, N_total, dim, type);
 
   /* Write unit conversion factors for this data set */
-  conversionString( buffer, us, convFactor );
-  writeAttribute_d( h_data, "CGS conversion factor", conversionFactor( us, convFactor ) );
-  writeAttribute_f( h_data, "h-scale exponant", hFactor( us, convFactor ) );
-  writeAttribute_f( h_data, "a-scale exponant", aFactor( us, convFactor ) );
-  writeAttribute_s( h_data, "Conversion factor", buffer );
-    
+  conversionString(buffer, us, convFactor);
+  writeAttribute_d(h_data, "CGS conversion factor",
+                   conversionFactor(us, convFactor));
+  writeAttribute_f(h_data, "h-scale exponant", hFactor(us, convFactor));
+  writeAttribute_f(h_data, "a-scale exponant", aFactor(us, convFactor));
+  writeAttribute_s(h_data, "Conversion factor", buffer);
+
   /* Free and close everything */
   free(temp);
   H5Dclose(h_data);
@@ -441,13 +444,18 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu
  * @param N_total Total number of particles across all cores
  * @param mpi_rank The MPI task rank calling the function
  * @param offset Offset in the array where this mpi task starts writing
- * @param part A (char*) pointer on the first occurence of the field of interest in the parts array
+ * @param part The parts array; the macro passes a (char*) pointer to the
+ * first occurrence of the field of interest in it
  * @param field The name (code name) of the field to read from.
  * @param us The UnitSystem currently in use
  * @param convFactor The UnitConversionFactor for this array
  *
  */
-#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, mpi_rank, offset, field, us, convFactor) writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, N_total, mpi_rank, offset, (char*)(&(part[0]).field), us, convFactor)
+#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, \
+                   mpi_rank, offset, field, us, convFactor)                   \
+  writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, N_total,      \
+                    mpi_rank, offset, (char*)(&(part[0]).field), us,          \
+                    convFactor)
 
 /**
  * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor
@@ -457,92 +465,90 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu
  *
  * Creates an HDF5 output file and writes the particles contained
  * in the engine. If such a file already exists, it is erased and replaced
- * by the new one. 
+ * by the new one.
  * The companion XMF file is also updated accordingly.
  *
  * Calls #error() if an error occurs.
  *
  */
-void write_output_parallel (struct engine *e, struct UnitSystem* us,  int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info)
-{
-  
-  hid_t h_file=0, h_grp=0;
+void write_output_parallel(struct engine* e, struct UnitSystem* us,
+                           int mpi_rank, int mpi_size, MPI_Comm comm,
+                           MPI_Info info) {
+
+  hid_t h_file = 0, h_grp = 0;
   int N = e->s->nr_parts;
   int periodic = e->s->periodic;
-  unsigned int numParticles[6]={N,0};
-  unsigned int numParticlesHighWord[6]={0};
-  unsigned int flagEntropy[6]={0};
+  unsigned int numParticles[6] = {N, 0};
+  unsigned int numParticlesHighWord[6] = {0};
+  unsigned int flagEntropy[6] = {0};
   long long N_total = 0, offset = 0;
   double offset_d = 0., N_d = 0., N_total_d = 0.;
   int numFiles = 1;
   struct part* parts = e->s->parts;
   FILE* xmfFile = 0;
   static int outputCount = 0;
-  
+
   /* File name */
   char fileName[200];
   sprintf(fileName, "output_%03i.hdf5", outputCount);
 
   /* First time, we need to create the XMF file */
-  if(outputCount == 0 && mpi_rank == 0)
-    createXMFfile();
-  
+  if (outputCount == 0 && mpi_rank == 0) createXMFfile();
+
   /* Prepare the XMF file for the new entry */
-  if(mpi_rank == 0)
-    xmfFile = prepareXMFfile();
+  if (mpi_rank == 0) xmfFile = prepareXMFfile();
 
   /* Open HDF5 file */
   hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS);
   H5Pset_fapl_mpio(plist_id, comm, info);
   h_file = H5Fcreate(fileName, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id);
-  if(h_file < 0)
-    {
-      error( "Error while opening file '%s'." , fileName );
-    }
+  if (h_file < 0) {
+    error("Error while opening file '%s'.", fileName);
+  }
 
   /* Compute offset in the file and total number of particles */
   /* Done using double to allow for up to 2^50=10^15 particles */
   N_d = (double)N;
   MPI_Exscan(&N_d, &offset_d, 1, MPI_DOUBLE, MPI_SUM, comm);
   N_total_d = offset_d + N_d;
-  MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size-1, comm);
-  if(N_total_d > 1.e15)
-    error("Error while computing the offest for parallel output: Simulation has more than 10^15 particles.\n");
-  N_total = (long long) N_total_d;
-  offset = (long long) offset_d;
+  MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size - 1, comm);
+  if (N_total_d > 1.e15)
+    error(
+        "Error while computing the offest for parallel output: Simulation has "
+        "more than 10^15 particles.\n");
+  N_total = (long long)N_total_d;
+  offset = (long long)offset_d;
 
   /* Write the part of the XMF file corresponding to this specific output */
-  if(mpi_rank == 0)
-    writeXMFheader(xmfFile, N_total, fileName, e->time);
+  if (mpi_rank == 0) writeXMFheader(xmfFile, N_total, fileName, e->time);
 
   /* Open header to write simulation properties */
   /* message("Writing runtime parameters..."); */
   h_grp = H5Gcreate1(h_file, "/RuntimePars", 0);
-  if(h_grp < 0)
-    error("Error while creating runtime parameters group\n");
+  if (h_grp < 0) error("Error while creating runtime parameters group\n");
 
   /* Write the relevant information */
   writeAttribute(h_grp, "PeriodicBoundariesOn", INT, &periodic, 1);
 
   /* Close runtime parameters */
   H5Gclose(h_grp);
-  
+
   /* Open header to write simulation properties */
   /* message("Writing file header..."); */
   h_grp = H5Gcreate1(h_file, "/Header", 0);
-  if(h_grp < 0)
-    error("Error while creating file header\n");
-    
+  if (h_grp < 0) error("Error while creating file header\n");
+
   /* Print the relevant information and print status */
   writeAttribute(h_grp, "BoxSize", DOUBLE, e->s->dim, 3);
   writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles, 6);
   writeAttribute(h_grp, "Time", DOUBLE, &e->time, 1);
 
   /* GADGET-2 legacy values */
-  numParticles[0] = (unsigned int) N_total ;
+  numParticles[0] = (unsigned int)N_total;
   writeAttribute(h_grp, "NumPart_Total", UINT, numParticles, 6);
-  numParticlesHighWord[0] = (unsigned int) (N_total >> 32);
-  writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord, 6);
+  numParticlesHighWord[0] = (unsigned int)(N_total >> 32);
+  writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord,
+                 6);
   double MassTable[6] = {0., 0., 0., 0., 0., 0.};
   writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, 6);
   writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy, 6);
@@ -556,30 +562,37 @@ void write_output_parallel (struct engine *e, struct UnitSystem* us,  int mpi_ra
 
   /* Print the system of Units */
   writeUnitSystem(h_file, us);
-		  
+
   /* Create SPH particles group */
   /* message("Writing particle arrays..."); */
   h_grp = H5Gcreate1(h_file, "/PartType0", 0);
-  if(h_grp < 0)
-    error( "Error while creating particle group.\n");
+  if (h_grp < 0) error("Error while creating particle group.\n");
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts, N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts, N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts, N_total, mpi_rank, offset, u, us, UNIT_CONV_ENERGY_PER_UNIT_MASS);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
-  writeArray(h_grp, fileName, xmfFile, "TimeStep", FLOAT, N, 1, parts, N_total, mpi_rank, offset, dt, us, UNIT_CONV_TIME);
-  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts, N_total, mpi_rank, offset, a, us, UNIT_CONV_ACCELERATION);
-  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total, mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
+  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts,
+             N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts,
+             N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total,
+             mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts,
+             N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts,
+             N_total, mpi_rank, offset, u, us, UNIT_CONV_ENERGY_PER_UNIT_MASS);
+  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts,
+             N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, "TimeStep", FLOAT, N, 1, parts, N_total,
+             mpi_rank, offset, dt, us, UNIT_CONV_TIME);
+  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts,
+             N_total, mpi_rank, offset, a, us, UNIT_CONV_ACCELERATION);
+  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total,
+             mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
 
   /* Close particle group */
   H5Gclose(h_grp);
 
   /* Write LXMF file descriptor */
-  if(mpi_rank == 0)
-    writeXMFfooter(xmfFile);
+  if (mpi_rank == 0) writeXMFfooter(xmfFile);
 
   /* message("Done writing particles..."); */
 
@@ -592,7 +605,4 @@ void write_output_parallel (struct engine *e, struct UnitSystem* us,  int mpi_ra
   ++outputCount;
 }
 
-
-#endif  /* HAVE_HDF5 */
-
-
+#endif /* HAVE_HDF5 */
diff --git a/src/parallel_io.h b/src/parallel_io.h
index 78699a5938894ef4577e9a14938e4a16dae2b612..fa46a230ab73e52a3b471dc0b157f5cf0f99ef73 100644
--- a/src/parallel_io.h
+++ b/src/parallel_io.h
@@ -1,28 +1,44 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_PARALLEL_IO_H
+#define SWIFT_PARALLEL_IO_H
 
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+/* Includes. */
+#include "engine.h"
+#include "part.h"
+#include "units.h"
 
 #if defined(HAVE_HDF5) && defined(WITH_MPI) && defined(HAVE_PARALLEL_HDF5)
 
-void read_ic_parallel ( char* fileName, double dim[3], struct part **parts,  int* N, int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info);
+void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
+                      int* N, int* periodic, int mpi_rank, int mpi_size,
+                      MPI_Comm comm, MPI_Info info);
 
-void write_output_parallel ( struct engine* e, struct UnitSystem* us, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info);
+void write_output_parallel(struct engine* e, struct UnitSystem* us,
+                           int mpi_rank, int mpi_size, MPI_Comm comm,
+                           MPI_Info info);
 
 #endif
 
+#endif /* SWIFT_PARALLEL_IO_H */
diff --git a/src/part.h b/src/part.h
index 3e8f5891b15677faeca01b20a2edacd6e97481ab..380c2dedb2d7847c0d0efe937d0b24feb0a736f0 100644
--- a/src/part.h
+++ b/src/part.h
@@ -1,170 +1,167 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
-
+#ifndef SWIFT_PART_H
+#define SWIFT_PART_H
 
 /* Some constants. */
-#define part_maxwait                    3
-#define part_maxunlock                  39
-#define part_dtmax                      10
-#define part_align                      64
-
+#define part_maxwait 3
+#define part_maxunlock 39
+#define part_dtmax 10
+#define part_align 64
 
 /* Extra particle data not needed during the computation. */
 struct xpart {
 
-    /* Old position, at last tree rebuild. */
-    double x_old[3];
-    
-    /* Velocity at the half-step. */
-    float v_hdt[3];
-    
-    /* Entropy at the half-step. */
-    float u_hdt;
-    
-    /* Old density. */
-    float omega;
-    
-    /* particle's current time-step. */
-    float dt_curr;
-    
-    } __attribute__((aligned (32)));
-    
-    
+  /* Old position, at last tree rebuild. */
+  double x_old[3];
+
+  /* Velocity at the half-step. */
+  float v_hdt[3];
+
+  /* Entropy at the half-step. */
+  float u_hdt;
+
+  /* Old density. */
+  float omega;
+
+  /* particle's current time-step. */
+  float dt_curr;
+
+} __attribute__((aligned(32)));
+
 /* Gravity particle. */
 struct gpart {
 
-    /* Particle position. */
-    double x[3];
-    
-    /* Particle velocity. */
-    float v[3];
-    
-    /* Particle acceleration. */
-    float a[3];
-    
-    /* Particle mass. */
-    float mass;
-    
-    /* Particle time step. */
-    float dt;
-    
-    /* Anonymous union for id/part. */
-    union {
-    
-        /* Particle ID. */
-        size_t id;
-
-        /* Pointer to corresponding SPH part. */
-        struct part *part;
-        
-        };
-    
-    } __attribute__((aligned (part_align)));
-    
+  /* Particle position. */
+  double x[3];
+
+  /* Particle velocity. */
+  float v[3];
+
+  /* Particle acceleration. */
+  float a[3];
+
+  /* Particle mass. */
+  float mass;
+
+  /* Particle time step. */
+  float dt;
+
+  /* Anonymous union for id/part. */
+  union {
+
+    /* Particle ID. */
+    size_t id;
+
+    /* Pointer to corresponding SPH part. */
+    struct part *part;
+  };
+
+} __attribute__((aligned(part_align)));
 
 /* Data of a single particle. */
 struct part {
 
-    /* Particle position. */
-    double x[3];
-    
-    /* Particle velocity. */
-    float v[3];
-    
-    /* Particle acceleration. */
-    float a[3];
-    
-    /* Particle cutoff radius. */
-    float h;
-    
-    /* Particle time-step. */
-    float dt;
-    
-    /* Particle internal energy. */
-    float u;
-    
-    /* Particle density. */
-    float rho;
-
-    /* Derivative of the density with respect to this particle's smoothing length. */
-    float rho_dh;
+  /* Particle position. */
+  double x[3];
+
+  /* Particle velocity. */
+  float v[3];
+
+  /* Particle acceleration. */
+  float a[3];
+
+  /* Particle cutoff radius. */
+  float h;
+
+  /* Particle time-step. */
+  float dt;
+
+  /* Particle internal energy. */
+  float u;
+
+  /* Particle density. */
+  float rho;
+
+  /* Derivative of the density with respect to this particle's smoothing length.
+   */
+  float rho_dh;
 
 #ifndef LEGACY_GADGET2_SPH
-    /* Particle viscosity parameter */
-    float alpha;
-#endif    
-
-    /* Store density/force specific stuff. */
-    union {
-    
-        struct {
-        
-            /* Particle velocity divergence. */
-            float div_v;
-
-            /* Derivative of particle number density. */
-            float wcount_dh;
-            
-            /* Particle velocity curl. */
-            float curl_v[3];
-    
-            /* Particle number density. */
-            float wcount;
-    
-            } density;
-            
-        struct {
-        
-            /* Balsara switch */
-            float balsara;
-  
-            /* Aggregate quantities. */
-            float POrho2;
-    
-            /* Change in particle energy over time. */
-            float u_dt;
-
-            /* Change in smoothing length over time. */
-            float h_dt;
-    
-            /* Signal velocity */
-            float v_sig;
-    
-            /* Sound speed */
-            float c;
-            
-            } force;
-            
-        };
-
-    /* Particle pressure. */
-    // float P;
-    
-    /* Particle mass. */
-    float mass;
-    
-    /* Particle ID. */
-    unsigned long long id;
-    
-    /* Associated gravitas. */
-    struct gpart *gpart;
-    
-    } __attribute__((aligned (part_align)));
-    
+  /* Particle viscosity parameter */
+  float alpha;
+#endif
+
+  /* Store density/force specific stuff. */
+  union {
+
+    struct {
+
+      /* Particle velocity divergence. */
+      float div_v;
+
+      /* Derivative of particle number density. */
+      float wcount_dh;
+
+      /* Particle velocity curl. */
+      float curl_v[3];
+
+      /* Particle number density. */
+      float wcount;
+
+    } density;
+
+    struct {
+
+      /* Balsara switch */
+      float balsara;
+
+      /* Aggregate quantities. */
+      float POrho2;
+
+      /* Change in particle energy over time. */
+      float u_dt;
+
+      /* Change in smoothing length over time. */
+      float h_dt;
+
+      /* Signal velocity */
+      float v_sig;
+
+      /* Sound speed */
+      float c;
+
+    } force;
+  };
+
+  /* Particle pressure. */
+  // float P;
+
+  /* Particle mass. */
+  float mass;
+
+  /* Particle ID. */
+  unsigned long long id;
+
+  /* Associated gravitas. */
+  struct gpart *gpart;
+
+} __attribute__((aligned(part_align)));
 
+#endif /* SWIFT_PART_H */
diff --git a/src/proxy.c b/src/proxy.c
index a90ebc68e29566c00136e3189cd9fc024a37cb5e..bafa185cdcaf2100992398657b650a954daceb91 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -1,129 +1,126 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
 
 /* Some standard headers. */
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-#include <pthread.h>
-#include <math.h>
 #include <float.h>
 #include <limits.h>
 #include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
-    #include <mpi.h>
+#include <mpi.h>
 #endif
 
-/* Local headers. */
-#include "const.h"
-#include "cycle.h"
-#include "atomic.h"
-#include "timers.h"
-#include "const.h"
-#include "vector.h"
-#include "lock.h"
-#include "space.h"
-#include "part.h"
-#include "multipole.h"
-#include "cell.h"
-#include "task.h"
-#include "debug.h"
+/* This object's header. */
 #include "proxy.h"
-#include "error.h"
 
+/* Local headers. */
+#include "error.h"
 
 /**
  * @brief Exchange cells with a remote node.
  *
  * @param p The #proxy.
  */
- 
-void proxy_cells_exch1 ( struct proxy *p ) {
+
+void proxy_cells_exch1(struct proxy *p) {
 
 #ifdef WITH_MPI
 
-    int k, ind;
-    
-    /* Get the number of pcells we will need to send. */
-    p->size_pcells_out = 0;
-    for ( k = 0 ; k < p->nr_cells_out ; k++ )
-        p->size_pcells_out += p->cells_out[k]->pcell_size;
-        
-    /* Send the number of pcells. */
-    if ( MPI_Isend( &p->size_pcells_out , 1 , MPI_INT , p->nodeID , p->mynodeID*proxy_tag_shift + proxy_tag_count , MPI_COMM_WORLD , &p->req_cells_count_out ) != MPI_SUCCESS )
-        error( "Failed to isend nr of pcells." );
-    // message( "isent pcell count (%i) from node %i to node %i." , p->size_pcells_out , p->mynodeID , p->nodeID ); fflush(stdout);
-    
-    /* Allocate and fill the pcell buffer. */
-    if ( p->pcells_out != NULL )
-        free( p->pcells_out );
-    if ( ( p->pcells_out = malloc( sizeof(struct pcell) * p->size_pcells_out ) ) == NULL )
-        error( "Failed to allocate pcell_out buffer." );
-    for ( ind = 0 , k = 0 ; k < p->nr_cells_out ; k++ ) {
-        memcpy( &p->pcells_out[ind] , p->cells_out[k]->pcell , sizeof(struct pcell) * p->cells_out[k]->pcell_size );
-        ind += p->cells_out[k]->pcell_size;
-        }
-    
-    /* Send the pcell buffer. */
-    if ( MPI_Isend( p->pcells_out , sizeof(struct pcell)*p->size_pcells_out , MPI_BYTE , p->nodeID , p->mynodeID*proxy_tag_shift + proxy_tag_cells , MPI_COMM_WORLD , &p->req_cells_out ) != MPI_SUCCESS )
-        error( "Failed to pcell_out buffer." );
-    // message( "isent pcells (%i) from node %i to node %i." , p->size_pcells_out , p->mynodeID , p->nodeID ); fflush(stdout);
-
-    /* Receive the number of pcells. */
-    if ( MPI_Irecv( &p->size_pcells_in , 1 , MPI_INT , p->nodeID , p->nodeID*proxy_tag_shift + proxy_tag_count , MPI_COMM_WORLD , &p->req_cells_count_in ) != MPI_SUCCESS )
-        error( "Failed to irecv nr of pcells." );
-    // message( "irecv pcells count on node %i from node %i." , p->mynodeID , p->nodeID ); fflush(stdout);
-    
+  int k, ind;
+
+  /* Get the number of pcells we will need to send. */
+  p->size_pcells_out = 0;
+  for (k = 0; k < p->nr_cells_out; k++)
+    p->size_pcells_out += p->cells_out[k]->pcell_size;
+
+  /* Send the number of pcells. */
+  if (MPI_Isend(&p->size_pcells_out, 1, MPI_INT, p->nodeID,
+                p->mynodeID * proxy_tag_shift + proxy_tag_count, MPI_COMM_WORLD,
+                &p->req_cells_count_out) != MPI_SUCCESS)
+    error("Failed to isend nr of pcells.");
+  // message( "isent pcell count (%i) from node %i to node %i." ,
+  // p->size_pcells_out , p->mynodeID , p->nodeID ); fflush(stdout);
+
+  /* Allocate and fill the pcell buffer. */
+  if (p->pcells_out != NULL) free(p->pcells_out);
+  if ((p->pcells_out = malloc(sizeof(struct pcell) * p->size_pcells_out)) ==
+      NULL)
+    error("Failed to allocate pcell_out buffer.");
+  for (ind = 0, k = 0; k < p->nr_cells_out; k++) {
+    memcpy(&p->pcells_out[ind], p->cells_out[k]->pcell,
+           sizeof(struct pcell) * p->cells_out[k]->pcell_size);
+    ind += p->cells_out[k]->pcell_size;
+  }
+
+  /* Send the pcell buffer. */
+  if (MPI_Isend(p->pcells_out, sizeof(struct pcell) * p->size_pcells_out,
+                MPI_BYTE, p->nodeID,
+                p->mynodeID * proxy_tag_shift + proxy_tag_cells, MPI_COMM_WORLD,
+                &p->req_cells_out) != MPI_SUCCESS)
+    error("Failed to isend pcell_out buffer.");
+  // message( "isent pcells (%i) from node %i to node %i." , p->size_pcells_out
+  // , p->mynodeID , p->nodeID ); fflush(stdout);
+
+  /* Receive the number of pcells. */
+  if (MPI_Irecv(&p->size_pcells_in, 1, MPI_INT, p->nodeID,
+                p->nodeID * proxy_tag_shift + proxy_tag_count, MPI_COMM_WORLD,
+                &p->req_cells_count_in) != MPI_SUCCESS)
+    error("Failed to irecv nr of pcells.");
+// message( "irecv pcells count on node %i from node %i." , p->mynodeID ,
+// p->nodeID ); fflush(stdout);
+
 #else
-    error( "SWIFT was not compiled with MPI support." );
+  error("SWIFT was not compiled with MPI support.");
 #endif
+}
 
-    }
-
-
-void proxy_cells_exch2 ( struct proxy *p ) {
+void proxy_cells_exch2(struct proxy *p) {
 
 #ifdef WITH_MPI
 
-    /* Re-allocate the pcell_in buffer. */
-    if ( p->pcells_in != NULL )
-        free( p->pcells_in );
-    if ( ( p->pcells_in = (struct pcell *)malloc( sizeof(struct pcell) * p->size_pcells_in ) ) == NULL )
-        error( "Failed to allocate pcell_in buffer." );
-        
-    /* Receive the particle buffers. */
-    if ( MPI_Irecv( p->pcells_in , sizeof(struct pcell)*p->size_pcells_in , MPI_BYTE , p->nodeID , p->nodeID*proxy_tag_shift + proxy_tag_cells , MPI_COMM_WORLD , &p->req_cells_in ) != MPI_SUCCESS )
-        error( "Failed to irecv part data." );
-    // message( "irecv pcells (%i) on node %i from node %i." , p->size_pcells_in , p->mynodeID , p->nodeID ); fflush(stdout);
+  /* Re-allocate the pcell_in buffer. */
+  if (p->pcells_in != NULL) free(p->pcells_in);
+  if ((p->pcells_in = (struct pcell *)malloc(sizeof(struct pcell) *
+                                             p->size_pcells_in)) == NULL)
+    error("Failed to allocate pcell_in buffer.");
+
+  /* Receive the pcell buffer. */
+  if (MPI_Irecv(p->pcells_in, sizeof(struct pcell) * p->size_pcells_in,
+                MPI_BYTE, p->nodeID,
+                p->nodeID * proxy_tag_shift + proxy_tag_cells, MPI_COMM_WORLD,
+                &p->req_cells_in) != MPI_SUCCESS)
+    error("Failed to irecv pcell data.");
+// message( "irecv pcells (%i) on node %i from node %i." , p->size_pcells_in ,
+// p->mynodeID , p->nodeID ); fflush(stdout);
 
 #else
-    error( "SWIFT was not compiled with MPI support." );
+  error("SWIFT was not compiled with MPI support.");
 #endif
-
-    }
-
+}
 
 /**
  * @brief Add a cell to the given proxy's input list.
@@ -132,32 +129,29 @@ void proxy_cells_exch2 ( struct proxy *p ) {
  * @param c The #cell.
  */
 
-void proxy_addcell_in ( struct proxy *p , struct cell *c ) {
-
-    int k;
-    struct cell **temp;
-    
-    /* Check if the cell is already registered with the proxy. */
-    for ( k = 0 ; k < p->nr_cells_in ; k++ )
-        if ( p->cells_in[k] == c )
-            return;
-            
-    /* Do we need to grow the number of in cells? */
-    if ( p->nr_cells_in == p->size_cells_in ) {
-        p->size_cells_in *= proxy_buffgrow;
-        if ( ( temp = malloc( sizeof(struct cell *) * p->size_cells_in ) ) == NULL )
-            error( "Failed to allocate ingoing cell list." );
-        memcpy( temp , p->cells_in , sizeof(struct cell *) * p->nr_cells_in );
-        free( p->cells_in );
-        p->cells_in = temp;
-        }
-        
-    /* Add the cell. */
-    p->cells_in[ p->nr_cells_in ] = c;
-    p->nr_cells_in += 1;
-
-    }
+void proxy_addcell_in(struct proxy *p, struct cell *c) {
+
+  int k;
+  struct cell **temp;
 
+  /* Check if the cell is already registered with the proxy. */
+  for (k = 0; k < p->nr_cells_in; k++)
+    if (p->cells_in[k] == c) return;
+
+  /* Do we need to grow the number of in cells? */
+  if (p->nr_cells_in == p->size_cells_in) {
+    p->size_cells_in *= proxy_buffgrow;
+    if ((temp = malloc(sizeof(struct cell *) * p->size_cells_in)) == NULL)
+      error("Failed to allocate ingoing cell list.");
+    memcpy(temp, p->cells_in, sizeof(struct cell *) * p->nr_cells_in);
+    free(p->cells_in);
+    p->cells_in = temp;
+  }
+
+  /* Add the cell. */
+  p->cells_in[p->nr_cells_in] = c;
+  p->nr_cells_in += 1;
+}
 
 /**
  * @brief Add a cell to the given proxy's output list.
@@ -166,101 +160,114 @@ void proxy_addcell_in ( struct proxy *p , struct cell *c ) {
  * @param c The #cell.
  */
 
-void proxy_addcell_out ( struct proxy *p , struct cell *c ) {
-
-    int k;
-    struct cell **temp;
-    
-    /* Check if the cell is already registered with the proxy. */
-    for ( k = 0 ; k < p->nr_cells_out ; k++ )
-        if ( p->cells_out[k] == c )
-            return;
-            
-    /* Do we need to grow the number of out cells? */
-    if ( p->nr_cells_out == p->size_cells_out ) {
-        p->size_cells_out *= proxy_buffgrow;
-        if ( ( temp = malloc( sizeof(struct cell *) * p->size_cells_out ) ) == NULL )
-            error( "Failed to allocate outgoing cell list." );
-        memcpy( temp , p->cells_out , sizeof(struct cell *) * p->nr_cells_out );
-        free( p->cells_out );
-        p->cells_out = temp;
-        }
-        
-    /* Add the cell. */
-    p->cells_out[ p->nr_cells_out ] = c;
-    p->nr_cells_out += 1;
-
-    }
+void proxy_addcell_out(struct proxy *p, struct cell *c) {
+
+  int k;
+  struct cell **temp;
+
+  /* Check if the cell is already registered with the proxy. */
+  for (k = 0; k < p->nr_cells_out; k++)
+    if (p->cells_out[k] == c) return;
 
+  /* Do we need to grow the number of out cells? */
+  if (p->nr_cells_out == p->size_cells_out) {
+    p->size_cells_out *= proxy_buffgrow;
+    if ((temp = malloc(sizeof(struct cell *) * p->size_cells_out)) == NULL)
+      error("Failed to allocate outgoing cell list.");
+    memcpy(temp, p->cells_out, sizeof(struct cell *) * p->nr_cells_out);
+    free(p->cells_out);
+    p->cells_out = temp;
+  }
+
+  /* Add the cell. */
+  p->cells_out[p->nr_cells_out] = c;
+  p->nr_cells_out += 1;
+}
 
 /**
  * @brief Exchange particles with a remote node.
  *
  * @param p The #proxy.
  */
- 
-void proxy_parts_exch1 ( struct proxy *p ) {
+
+void proxy_parts_exch1(struct proxy *p) {
 
 #ifdef WITH_MPI
 
-    /* Send the number of particles. */
-    if ( MPI_Isend( &p->nr_parts_out , 1 , MPI_INT , p->nodeID , p->mynodeID*proxy_tag_shift + proxy_tag_count , MPI_COMM_WORLD , &p->req_parts_count_out ) != MPI_SUCCESS )
-        error( "Failed to isend nr of parts." );
-    // message( "isent particle count (%i) from node %i to node %i." , p->nr_parts_out , p->mynodeID , p->nodeID ); fflush(stdout);
-    
-    /* Send the particle buffers. */
-    if ( p->nr_parts_out > 0 ) {
-        if ( MPI_Isend( p->parts_out , sizeof(struct part)*p->nr_parts_out , MPI_BYTE , p->nodeID , p->mynodeID*proxy_tag_shift + proxy_tag_parts , MPI_COMM_WORLD , &p->req_parts_out ) != MPI_SUCCESS ||
-             MPI_Isend( p->xparts_out , sizeof(struct xpart)*p->nr_parts_out , MPI_BYTE , p->nodeID , p->mynodeID*proxy_tag_shift + proxy_tag_xparts , MPI_COMM_WORLD , &p->req_xparts_out ) != MPI_SUCCESS )
-            error( "Failed to isend part data." );
-        // message( "isent particle data (%i) to node %i." , p->nr_parts_out , p->nodeID ); fflush(stdout);
-        /* for ( int k = 0 ; k < p->nr_parts_out ; k++ )
-            message( "sending particle %lli, x=[%.3e %.3e %.3e], h=%.3e, to node %i." ,
-                p->parts_out[k].id , p->parts_out[k].x[0] , p->parts_out[k].x[1] , p->parts_out[k].x[2] ,
-                p->parts_out[k].h , p->nodeID ); */
-        }
-
-    /* Receive the number of particles. */
-    if ( MPI_Irecv( &p->nr_parts_in , 1 , MPI_INT , p->nodeID , p->nodeID*proxy_tag_shift + proxy_tag_count , MPI_COMM_WORLD , &p->req_parts_count_in ) != MPI_SUCCESS )
-        error( "Failed to irecv nr of parts." );
-    // message( "irecv particle count on node %i from node %i." , p->mynodeID , p->nodeID ); fflush(stdout);
-    
+  /* Send the number of particles. */
+  if (MPI_Isend(&p->nr_parts_out, 1, MPI_INT, p->nodeID,
+                p->mynodeID * proxy_tag_shift + proxy_tag_count, MPI_COMM_WORLD,
+                &p->req_parts_count_out) != MPI_SUCCESS)
+    error("Failed to isend nr of parts.");
+  // message( "isent particle count (%i) from node %i to node %i." ,
+  // p->nr_parts_out , p->mynodeID , p->nodeID ); fflush(stdout);
+
+  /* Send the particle buffers. */
+  if (p->nr_parts_out > 0) {
+    if (MPI_Isend(p->parts_out, sizeof(struct part) * p->nr_parts_out, MPI_BYTE,
+                  p->nodeID, p->mynodeID * proxy_tag_shift + proxy_tag_parts,
+                  MPI_COMM_WORLD, &p->req_parts_out) != MPI_SUCCESS ||
+        MPI_Isend(p->xparts_out, sizeof(struct xpart) * p->nr_parts_out,
+                  MPI_BYTE, p->nodeID,
+                  p->mynodeID * proxy_tag_shift + proxy_tag_xparts,
+                  MPI_COMM_WORLD, &p->req_xparts_out) != MPI_SUCCESS)
+      error("Failed to isend part data.");
+    // message( "isent particle data (%i) to node %i." , p->nr_parts_out ,
+    // p->nodeID ); fflush(stdout);
+    /* for (int k = 0; k < p->nr_parts_out; k++)
+      message("sending particle %llu, x=[%.3e %.3e %.3e], h=%.3e, to node %i.",
+              p->parts_out[k].id, p->parts_out[k].x[0], p->parts_out[k].x[1],
+              p->parts_out[k].x[2], p->parts_out[k].h, p->nodeID); */
+  }
+
+  /* Receive the number of particles. */
+  if (MPI_Irecv(&p->nr_parts_in, 1, MPI_INT, p->nodeID,
+                p->nodeID * proxy_tag_shift + proxy_tag_count, MPI_COMM_WORLD,
+                &p->req_parts_count_in) != MPI_SUCCESS)
+    error("Failed to irecv nr of parts.");
+// message( "irecv particle count on node %i from node %i." , p->mynodeID ,
+// p->nodeID ); fflush(stdout);
+
 #else
-    error( "SWIFT was not compiled with MPI support." );
+  error("SWIFT was not compiled with MPI support.");
 #endif
+}
 
-    }
-
-
-void proxy_parts_exch2 ( struct proxy *p ) {
+void proxy_parts_exch2(struct proxy *p) {
 
 #ifdef WITH_MPI
 
-    /* Is there enough space in the buffer? */
-    if ( p->nr_parts_in > p->size_parts_in ) {
-        do {
-            p->size_parts_in *= proxy_buffgrow;
-            } while ( p->nr_parts_in > p->size_parts_in );
-        free( p->parts_in ); free( p->xparts_in );
-        if ( ( p->parts_in = (struct part *)malloc( sizeof(struct part) * p->size_parts_in ) ) == NULL ||
-             ( p->xparts_in = (struct xpart *)malloc( sizeof(struct xpart) * p->size_parts_in ) ) == NULL )
-            error( "Failed to re-allocate parts_in buffers." );
-        }
-        
-    /* Receive the particle buffers. */
-    if ( p->nr_parts_in > 0 ) {
-        if ( MPI_Irecv( p->parts_in , sizeof(struct part)*p->nr_parts_in , MPI_BYTE , p->nodeID , p->nodeID*proxy_tag_shift + proxy_tag_parts , MPI_COMM_WORLD , &p->req_parts_in ) != MPI_SUCCESS ||
-             MPI_Irecv( p->xparts_in , sizeof(struct xpart)*p->nr_parts_in , MPI_BYTE , p->nodeID , p->nodeID*proxy_tag_shift + proxy_tag_xparts , MPI_COMM_WORLD , &p->req_xparts_in ) != MPI_SUCCESS )
-            error( "Failed to irecv part data." );
-        // message( "irecv particle data (%i) from node %i." , p->nr_parts_in , p->nodeID ); fflush(stdout);
-        }
+  /* Is there enough space in the buffer? */
+  if (p->nr_parts_in > p->size_parts_in) {
+    do {
+      p->size_parts_in *= proxy_buffgrow;
+    } while (p->nr_parts_in > p->size_parts_in);
+    free(p->parts_in);
+    free(p->xparts_in);
+    if ((p->parts_in = (struct part *)malloc(
+             sizeof(struct part) *p->size_parts_in)) == NULL ||
+        (p->xparts_in = (struct xpart *)malloc(sizeof(struct xpart) *
+                                               p->size_parts_in)) == NULL)
+      error("Failed to re-allocate parts_in buffers.");
+  }
+
+  /* Receive the particle buffers. */
+  if (p->nr_parts_in > 0) {
+    if (MPI_Irecv(p->parts_in, sizeof(struct part) * p->nr_parts_in, MPI_BYTE,
+                  p->nodeID, p->nodeID * proxy_tag_shift + proxy_tag_parts,
+                  MPI_COMM_WORLD, &p->req_parts_in) != MPI_SUCCESS ||
+        MPI_Irecv(p->xparts_in, sizeof(struct xpart) * p->nr_parts_in, MPI_BYTE,
+                  p->nodeID, p->nodeID * proxy_tag_shift + proxy_tag_xparts,
+                  MPI_COMM_WORLD, &p->req_xparts_in) != MPI_SUCCESS)
+      error("Failed to irecv part data.");
+    // message( "irecv particle data (%i) from node %i." , p->nr_parts_in ,
+    // p->nodeID ); fflush(stdout);
+  }
 
 #else
-    error( "SWIFT was not compiled with MPI support." );
+  error("SWIFT was not compiled with MPI support.");
 #endif
-
-    }
-
+}
 
 /**
  * @brief Load parts onto a proxy for exchange.
@@ -270,34 +277,37 @@ void proxy_parts_exch2 ( struct proxy *p ) {
  * @param xparts Pointer to an array of #xpart to send.
  * @param N The number of parts.
  */
- 
-void proxy_parts_load ( struct proxy *p , struct part *parts , struct xpart *xparts , int N ) {
-
-    /* Is there enough space in the buffer? */
-    if ( p->nr_parts_out + N > p->size_parts_out ) {
-        do {
-            p->size_parts_out *= proxy_buffgrow;
-            } while ( p->nr_parts_out + N > p->size_parts_out );
-        struct part *tp;
-        struct xpart *txp;
-        if ( ( tp = (struct part *)malloc( sizeof(struct part) * p->size_parts_out ) ) == NULL ||
-             ( txp = (struct xpart *)malloc( sizeof(struct xpart) * p->size_parts_out ) ) == NULL )
-            error( "Failed to re-allocate parts_out buffers." );
-        memcpy( tp , p->parts_out , sizeof(struct part) * p->nr_parts_out );
-        memcpy( txp , p->xparts_out , sizeof(struct xpart) * p->nr_parts_out );
-        free( p->parts_out ); free( p->xparts_out );
-        p->parts_out = tp; p->xparts_out = txp;
-        }
-        
-    /* Copy the parts and xparts data to the buffer. */
-    memcpy( &p->parts_out[ p->nr_parts_out ] , parts , sizeof(struct part) * N );
-    memcpy( &p->xparts_out[ p->nr_parts_out ] , xparts , sizeof(struct xpart) * N );
-    
-    /* Increase the counters. */
-    p->nr_parts_out += N;
-
-    }
 
+void proxy_parts_load(struct proxy *p, struct part *parts, struct xpart *xparts,
+                      int N) {
+
+  /* Is there enough space in the buffer? */
+  if (p->nr_parts_out + N > p->size_parts_out) {
+    do {
+      p->size_parts_out *= proxy_buffgrow;
+    } while (p->nr_parts_out + N > p->size_parts_out);
+    struct part *tp;
+    struct xpart *txp;
+    if ((tp = (struct part *)malloc(sizeof(struct part) *p->size_parts_out)) ==
+            NULL ||
+        (txp = (struct xpart *)malloc(sizeof(struct xpart) *
+                                      p->size_parts_out)) == NULL)
+      error("Failed to re-allocate parts_out buffers.");
+    memcpy(tp, p->parts_out, sizeof(struct part) * p->nr_parts_out);
+    memcpy(txp, p->xparts_out, sizeof(struct xpart) * p->nr_parts_out);
+    free(p->parts_out);
+    free(p->xparts_out);
+    p->parts_out = tp;
+    p->xparts_out = txp;
+  }
+
+  /* Copy the parts and xparts data to the buffer. */
+  memcpy(&p->parts_out[p->nr_parts_out], parts, sizeof(struct part) * N);
+  memcpy(&p->xparts_out[p->nr_parts_out], xparts, sizeof(struct xpart) * N);
+
+  /* Increase the counters. */
+  p->nr_parts_out += N;
+}
 
 /**
  * @brief Initialize the given proxy.
@@ -306,41 +316,46 @@ void proxy_parts_load ( struct proxy *p , struct part *parts , struct xpart *xpa
  * @param mynodeID The node this proxy is running on.
  * @param nodeID The node with which this proxy will communicate.
  */
- 
-void proxy_init ( struct proxy *p , int mynodeID , int nodeID ) {
-
-    /* Set the nodeID. */
-    p->mynodeID = mynodeID;
-    p->nodeID = nodeID;
-    
-    /* Allocate the cell send and receive buffers, if needed. */
-    if ( p->cells_in == NULL ) {
-        p->size_cells_in = proxy_buffinit;
-        if ( ( p->cells_in = (struct cell **)malloc( sizeof(void *) * p->size_cells_in ) ) == NULL )
-            error( "Failed to allocate cells_in buffer." );
-        }
-    p->nr_cells_in = 0;
-    if ( p->cells_out == NULL ) {
-        p->size_cells_out = proxy_buffinit;
-        if ( ( p->cells_out = (struct cell **)malloc( sizeof(void *) * p->size_cells_out ) ) == NULL )
-            error( "Failed to allocate cells_out buffer." );
-        }
-    p->nr_cells_out = 0;
-
-    /* Allocate the part send and receive buffers, if needed. */
-    if ( p->parts_in == NULL ) {
-        p->size_parts_in = proxy_buffinit;
-        if ( ( p->parts_in = (struct part *)malloc( sizeof(struct part) * p->size_parts_in ) ) == NULL ||
-             ( p->xparts_in = (struct xpart *)malloc( sizeof(struct xpart) * p->size_parts_in ) ) == NULL )
-            error( "Failed to allocate parts_in buffers." );
-        }
-    p->nr_parts_in = 0;
-    if ( p->parts_out == NULL ) {
-        p->size_parts_out = proxy_buffinit;
-        if ( ( p->parts_out = (struct part *)malloc( sizeof(struct part) * p->size_parts_out ) ) == NULL ||
-             ( p->xparts_out = (struct xpart *)malloc( sizeof(struct xpart) * p->size_parts_out ) ) == NULL )
-            error( "Failed to allocate parts_out buffers." );
-        }
-    p->nr_parts_out = 0;
-
-    }
+
+void proxy_init(struct proxy *p, int mynodeID, int nodeID) {
+
+  /* Set the nodeID. */
+  p->mynodeID = mynodeID;
+  p->nodeID = nodeID;
+
+  /* Allocate the cell send and receive buffers, if needed. */
+  if (p->cells_in == NULL) {
+    p->size_cells_in = proxy_buffinit;
+    if ((p->cells_in =
+             (struct cell **)malloc(sizeof(void *) * p->size_cells_in)) == NULL)
+      error("Failed to allocate cells_in buffer.");
+  }
+  p->nr_cells_in = 0;
+  if (p->cells_out == NULL) {
+    p->size_cells_out = proxy_buffinit;
+    if ((p->cells_out = (struct cell **)malloc(sizeof(void *) *
+                                               p->size_cells_out)) == NULL)
+      error("Failed to allocate cells_out buffer.");
+  }
+  p->nr_cells_out = 0;
+
+  /* Allocate the part send and receive buffers, if needed. */
+  if (p->parts_in == NULL) {
+    p->size_parts_in = proxy_buffinit;
+    if ((p->parts_in = (struct part *)malloc(
+             sizeof(struct part) *p->size_parts_in)) == NULL ||
+        (p->xparts_in = (struct xpart *)malloc(sizeof(struct xpart) *
+                                               p->size_parts_in)) == NULL)
+      error("Failed to allocate parts_in buffers.");
+  }
+  p->nr_parts_in = 0;
+  if (p->parts_out == NULL) {
+    p->size_parts_out = proxy_buffinit;
+    if ((p->parts_out = (struct part *)malloc(
+             sizeof(struct part) *p->size_parts_out)) == NULL ||
+        (p->xparts_out = (struct xpart *)malloc(sizeof(struct xpart) *
+                                                p->size_parts_out)) == NULL)
+      error("Failed to allocate parts_out buffers.");
+  }
+  p->nr_parts_out = 0;
+}
diff --git a/src/proxy.h b/src/proxy.h
index 4710f5a8909bbab3b85fb71e15ae762a602a9dad..8cb08d0a66095597227b52b317f3808190cdc45f 100644
--- a/src/proxy.h
+++ b/src/proxy.h
@@ -1,76 +1,80 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_PROXY_H
+#define SWIFT_PROXY_H
 
-
+/* Includes. */
+#include "cell.h"
+#include "part.h"
 
 /* Some constants. */
-#define proxy_buffgrow                  1.5
-#define proxy_buffinit                  100
+#define proxy_buffgrow 1.5
+#define proxy_buffinit 100
 
 /* Proxy tag arithmetic. */
-#define proxy_tag_shift                 8
-#define proxy_tag_count                 0
-#define proxy_tag_parts                 1
-#define proxy_tag_xparts                2
-#define proxy_tag_cells                 3
-
+#define proxy_tag_shift 8
+#define proxy_tag_count 0
+#define proxy_tag_parts 1
+#define proxy_tag_xparts 2
+#define proxy_tag_cells 3
 
 /* Data structure for the proxy. */
 struct proxy {
 
-    /* ID of the node this proxy represents. */
-    int mynodeID, nodeID;
-    
-    /* Incomming cells. */
-    struct cell **cells_in;
-    struct pcell *pcells_in;
-    int nr_cells_in, size_cells_in, size_pcells_in;
-    
-    /* Outgoing cells. */
-    struct cell **cells_out;
-    struct pcell *pcells_out;
-    int nr_cells_out, size_cells_out, size_pcells_out;
-    
-    /* The parts and xparts buffers for input and output. */
-    struct part *parts_in, *parts_out;
-    struct xpart *xparts_in, *xparts_out;
-    int size_parts_in, size_parts_out;
-    int nr_parts_in, nr_parts_out;
-    
-    /* MPI request handles. */
-    #ifdef WITH_MPI
-    MPI_Request req_parts_count_out, req_parts_count_in;
-    MPI_Request req_parts_out, req_parts_in;
-    MPI_Request req_xparts_out, req_xparts_in;
-    MPI_Request req_cells_count_out, req_cells_count_in;
-    MPI_Request req_cells_out, req_cells_in;
-    #endif
-    
-    };
+  /* ID of the node this proxy represents. */
+  int mynodeID, nodeID;
+
+  /* Incoming cells. */
+  struct cell **cells_in;
+  struct pcell *pcells_in;
+  int nr_cells_in, size_cells_in, size_pcells_in;
 
+  /* Outgoing cells. */
+  struct cell **cells_out;
+  struct pcell *pcells_out;
+  int nr_cells_out, size_cells_out, size_pcells_out;
+
+  /* The parts and xparts buffers for input and output. */
+  struct part *parts_in, *parts_out;
+  struct xpart *xparts_in, *xparts_out;
+  int size_parts_in, size_parts_out;
+  int nr_parts_in, nr_parts_out;
+
+/* MPI request handles. */
+#ifdef WITH_MPI
+  MPI_Request req_parts_count_out, req_parts_count_in;
+  MPI_Request req_parts_out, req_parts_in;
+  MPI_Request req_xparts_out, req_xparts_in;
+  MPI_Request req_cells_count_out, req_cells_count_in;
+  MPI_Request req_cells_out, req_cells_in;
+#endif
+};
 
 /* Function prototypes. */
-void proxy_init ( struct proxy *p , int mynodeID , int nodeID );
-void proxy_parts_load ( struct proxy *p , struct part *parts , struct xpart *xparts , int N );
-void proxy_parts_exch1 ( struct proxy *p );
-void proxy_parts_exch2 ( struct proxy *p );
-void proxy_addcell_in ( struct proxy *p , struct cell *c );
-void proxy_addcell_out ( struct proxy *p , struct cell *c );
-void proxy_cells_exch1 ( struct proxy *p );
-void proxy_cells_exch2 ( struct proxy *p );
+void proxy_init(struct proxy *p, int mynodeID, int nodeID);
+void proxy_parts_load(struct proxy *p, struct part *parts, struct xpart *xparts,
+                      int N);
+void proxy_parts_exch1(struct proxy *p);
+void proxy_parts_exch2(struct proxy *p);
+void proxy_addcell_in(struct proxy *p, struct cell *c);
+void proxy_addcell_out(struct proxy *p, struct cell *c);
+void proxy_cells_exch1(struct proxy *p);
+void proxy_cells_exch2(struct proxy *p);
+
+#endif /* SWIFT_PROXY_H */
diff --git a/src/queue.c b/src/queue.c
index 497a23d2024cd6c8f29890802d8c00b4ab639621..3fa0096bf0fab8ecc6ec2508d5a7c2529451e54d 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -1,20 +1,20 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
@@ -27,37 +27,25 @@
 
 /* MPI headers. */
 #ifdef WITH_MPI
-    #include <mpi.h>
+#include <mpi.h>
 #endif
 
+/* This object's header. */
+#include "queue.h"
+
 /* Local headers. */
 #include "const.h"
-#include "cycle.h"
-#include "lock.h"
-#include "task.h"
-#include "timers.h"
-#include "space.h"
-#include "part.h"
-#include "multipole.h"
-#include "cell.h"
-#include "queue.h"
 #include "error.h"
-#include "inline.h"
-
-
 
 /* Counter macros. */
 #ifdef COUNTER
-    #define COUNT(c) ( __sync_add_and_fetch( &queue_counter[ c ] , 1 ) )
+#define COUNT(c) (__sync_add_and_fetch(&queue_counter[c], 1))
 #else
-    #define COUNT(c)
+#define COUNT(c)
 #endif
 
-
 /* The counters. */
-int queue_counter[ queue_counter_count ];
-
-        
+int queue_counter[queue_counter_count];
 
 /**
  * @brief Insert a used tasks into the given queue.
@@ -65,82 +53,75 @@ int queue_counter[ queue_counter_count ];
  * @param q The #queue.
  * @param t The #task.
  */
- 
-void queue_insert ( struct queue *q , struct task *t ) {
-
-    int k, *tid;
-    struct task *tasks;
-
-    /* Lock the queue. */
-    if ( lock_lock( &q->lock ) != 0 )
-        error( "Failed to get queue lock." );
-        
-    tid = q->tid;
-    tasks = q->tasks;
-
-    /* Does the queue need to be grown? */
-    if ( q->count == q->size ) {
-        int *temp;
-        q->size *= queue_sizegrow;
-        if ( ( temp = (int *)malloc( sizeof(int) * q->size ) ) == NULL )
-            error( "Failed to allocate new indices." );
-        memcpy( temp , tid , sizeof(int) * q->count );
-        free( tid );
-        q->tid = tid = temp;
-        }
-        
-    /* Drop the task at the end of the queue. */
-    tid[ q->count ] = ( t - tasks );
-    q->count += 1;
-    
-    /* Shuffle up. */
-    for ( k = q->count - 1 ; k > 0 ; k = (k-1)/2 )
-        if ( tasks[ tid[k] ].weight > tasks[ tid[(k-1)/2] ].weight ) {
-            int temp = tid[k];
-            tid[k] = tid[(k-1)/2];
-            tid[(k-1)/2] = temp;
-            }
-        else
-            break;
-            
-    /* Check the queue's consistency. */
-    /* for ( k = 1 ; k < q->count ; k++ )
-        if ( tasks[ tid[(k-1)/2] ].weight < tasks[ tid[k] ].weight )
-            error( "Queue heap is disordered." ); */
-    
-    /* Unlock the queue. */
-    if ( lock_unlock( &q->lock ) != 0 )
-        error( "Failed to unlock queue." );
-    }
 
+void queue_insert(struct queue *q, struct task *t) {
+
+  int k, *tid;
+  struct task *tasks;
+
+  /* Lock the queue. */
+  if (lock_lock(&q->lock) != 0) error("Failed to get queue lock.");
+
+  tid = q->tid;
+  tasks = q->tasks;
+
+  /* Does the queue need to be grown? */
+  if (q->count == q->size) {
+    int *temp;
+    q->size *= queue_sizegrow;
+    if ((temp = (int *)malloc(sizeof(int) * q->size)) == NULL)
+      error("Failed to allocate new indices.");
+    memcpy(temp, tid, sizeof(int) * q->count);
+    free(tid);
+    q->tid = tid = temp;
+  }
+
+  /* Drop the task at the end of the queue. */
+  tid[q->count] = (t - tasks);
+  q->count += 1;
+
+  /* Shuffle up. */
+  for (k = q->count - 1; k > 0; k = (k - 1) / 2)
+    if (tasks[tid[k]].weight > tasks[tid[(k - 1) / 2]].weight) {
+      int temp = tid[k];
+      tid[k] = tid[(k - 1) / 2];
+      tid[(k - 1) / 2] = temp;
+    } else
+      break;
+
+  /* Check the queue's consistency. */
+  /* for ( k = 1 ; k < q->count ; k++ )
+      if ( tasks[ tid[(k-1)/2] ].weight < tasks[ tid[k] ].weight )
+          error( "Queue heap is disordered." ); */
+
+  /* Unlock the queue. */
+  if (lock_unlock(&q->lock) != 0) error("Failed to unlock queue.");
+}
 
-/** 
+/**
  * @brief Initialize the given queue.
  *
  * @param q The #queue.
  * @param tasks List of tasks to which the queue indices refer to.
  */
- 
-void queue_init ( struct queue *q , struct task *tasks ) {
-    
-    /* Allocate the task list if needed. */
-    q->size = queue_sizeinit;
-    if ( ( q->tid = (int *)malloc( sizeof(int) * q->size ) ) == NULL )
-        error( "Failed to allocate queue tids." );
-        
-    /* Set the tasks pointer. */
-    q->tasks = tasks;
-        
-    /* Init counters. */
-    q->count = 0;
-    
-    /* Init the queue lock. */
-    if ( lock_init( &q->lock ) != 0 )
-        error( "Failed to init queue lock." );
 
-    }
-    
-    
+void queue_init(struct queue *q, struct task *tasks) {
+
+  /* Allocate the initial task index list. */
+  q->size = queue_sizeinit;
+  if ((q->tid = (int *)malloc(sizeof(int) * q->size)) == NULL)
+    error("Failed to allocate queue tids.");
+
+  /* Set the tasks pointer. */
+  q->tasks = tasks;
+
+  /* Init counters. */
+  q->count = 0;
+
+  /* Init the queue lock. */
+  if (lock_init(&q->lock) != 0) error("Failed to init queue lock.");
+}
+
 /**
  * @brief Get a task free of dependencies and conflicts.
  *
@@ -148,111 +129,102 @@ void queue_init ( struct queue *q , struct task *tasks ) {
  * @param super The super-cell tat might conflict with the #queue
  * @param blocking Block until access to the queue is granted.
  */
- 
-struct task *queue_gettask ( struct queue *q , struct cell *super , int blocking ) {
-
-    int k, qcount, *qtid, gotcha;
-    lock_type *qlock = &q->lock;
-    struct task *qtasks, *res = NULL;
-    
-    /* If there are no tasks, leave immediately. */
-    if ( q->count == 0 )
-        return NULL;
-
-    /* Grab the task lock. */
-    if ( blocking ) {
-        if ( lock_lock( qlock ) != 0 )
-            error( "Locking the qlock failed.\n" );
-        }
-    else {
-        if ( lock_trylock( qlock ) != 0 )
-            return NULL;
-        }
-
-    /* Set some pointers we will use often. */
-    qtid = q->tid;
-    qtasks = q->tasks;
-    qcount = q->count;
-    gotcha = 0;
-
-    /* Loop over the task IDs looking for tasks with the same super-cell. */
-    if ( super != NULL ) {
-        for ( k = 0 ; k < qcount && k < queue_maxsuper ; k++ ) {
-
-            /* Put a finger on the task. */
-            res = &qtasks[ qtid[k] ];
-
-            /* Try to lock the task and exit if successful. */
-            if ( ( res->ci->super == super || ( res->cj != NULL && res->cj->super == super ) ) &&
-                 task_lock( res ) ) {
-                gotcha = 1;
-                break;
-                }
-
-            } /* loop over the task IDs. */
-        }
-
-    /* Loop over the task IDs again if nothing was found, take anything. */
-    if ( !gotcha ) {
-        for ( k = 0 ; k < qcount ; k++ ) {
-
-            /* Put a finger on the task. */
-            res = &qtasks[ qtid[k] ];
-
-            /* Try to lock the task and exit if successful. */
-            if ( task_lock( res ) )
-                break;
-
-            } /* loop over the task IDs. */
-        }
-
-    /* Did we get a task? */
-    if ( k < qcount ) {
-
-        /* Another one bites the dust. */
-        qcount = q->count -= 1;
-
-        /* Swap this task with the last task and re-heap. */
-        if ( k < qcount ) {
-            qtid[ k ] = qtid[ qcount ];
-            int w = qtasks[ qtid[k] ].weight;
-            while ( k > 0 && w > qtasks[ qtid[(k-1)/2] ].weight ) {
-                int temp = q->tid[k];
-                q->tid[k] = q->tid[(k-1)/2];
-                q->tid[(k-1)/2] = temp;
-                k = (k-1)/2;
-                }
-            int i;
-            while ( ( i = 2*k+1 ) < qcount ) {
-                if ( i+1 < qcount && qtasks[ qtid[i+1] ].weight > qtasks[ qtid[i] ].weight )
-                    i += 1;
-                if ( qtasks[ qtid[i] ].weight > w ) {
-                    int temp = qtid[i];
-                    qtid[i] = qtid[k];
-                    qtid[k] = temp;
-                    k = i;
-                    }
-                else
-                    break;
-                }
-            }
-
-        }
-    else
-        res = NULL;
-
-    /* Check the queue's consistency. */
-    /* for ( k = 1 ; k < q->count ; k++ )
-        if ( qtasks[ qtid[(k-1)/2] ].weight < qtasks[ qtid[k] ].weight )
-            error( "Queue heap is disordered." ); */
-    
-    /* Release the task lock. */
-    if ( lock_unlock( qlock ) != 0 )
-        error( "Unlocking the qlock failed.\n" );
-            
-    /* Take the money and run. */
-    return res;
 
+struct task *queue_gettask(struct queue *q, struct cell *super, int blocking) {
+
+  int k, qcount, *qtid, gotcha;
+  lock_type *qlock = &q->lock;
+  struct task *qtasks, *res = NULL;
+
+  /* If there are no tasks, leave immediately. */
+  if (q->count == 0) return NULL;
+
+  /* Grab the queue lock. */
+  if (blocking) {
+    if (lock_lock(qlock) != 0) error("Locking the qlock failed.\n");
+  } else {
+    if (lock_trylock(qlock) != 0) return NULL;
+  }
+
+  /* Set some pointers we will use often. */
+  qtid = q->tid;
+  qtasks = q->tasks;
+  qcount = q->count;
+  gotcha = 0;
+
+  /* Loop over the task IDs looking for tasks with the same super-cell. */
+  if (super != NULL) {
+    for (k = 0; k < qcount && k < queue_maxsuper; k++) {
+
+      /* Put a finger on the task. */
+      res = &qtasks[qtid[k]];
+
+      /* Try to lock the task and exit if successful. */
+      if ((res->ci->super == super ||
+           (res->cj != NULL && res->cj->super == super)) &&
+          task_lock(res)) {
+        gotcha = 1;
+        break;
+      }
+
+    } /* loop over the task IDs. */
+  }
+
+  /* Loop over the task IDs again if nothing was found, take anything. */
+  if (!gotcha) {
+    for (k = 0; k < qcount; k++) {
+
+      /* Put a finger on the task. */
+      res = &qtasks[qtid[k]];
+
+      /* Try to lock the task and exit if successful. */
+      if (task_lock(res)) break;
+
+    } /* loop over the task IDs. */
+  }
+
+  /* Did we get a task? */
+  if (k < qcount) {
+
+    /* Another one bites the dust. */
+    qcount = q->count -= 1;
+
+    /* Move the last task into the vacated slot and re-heap. */
+    if (k < qcount) {
+      qtid[k] = qtid[qcount];
+      int w = qtasks[qtid[k]].weight;
+      while (k > 0 && w > qtasks[qtid[(k - 1) / 2]].weight) {
+        int temp = q->tid[k];
+        q->tid[k] = q->tid[(k - 1) / 2];
+        q->tid[(k - 1) / 2] = temp;
+        k = (k - 1) / 2;
+      }
+      int i;
+      while ((i = 2 * k + 1) < qcount) {
+        if (i + 1 < qcount &&
+            qtasks[qtid[i + 1]].weight > qtasks[qtid[i]].weight)
+          i += 1;
+        if (qtasks[qtid[i]].weight > w) {
+          int temp = qtid[i];
+          qtid[i] = qtid[k];
+          qtid[k] = temp;
+          k = i;
+        } else
+          break;
+      }
     }
 
+  } else
+    res = NULL;
+
+  /* Check the queue's consistency. */
+  /* for ( k = 1 ; k < q->count ; k++ )
+      if ( qtasks[ qtid[(k-1)/2] ].weight < qtasks[ qtid[k] ].weight )
+          error( "Queue heap is disordered." ); */
+
+  /* Release the queue lock. */
+  if (lock_unlock(qlock) != 0) error("Unlocking the qlock failed.\n");
 
+  /* Take the money and run. */
+  return res;
+}
diff --git a/src/queue.h b/src/queue.h
index 76ae9b6971e456af17f338561eaecb5670172d21..533007684fa41a4a25a10a14c504358926d0fe06 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -1,56 +1,61 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_QUEUE_H
+#define SWIFT_QUEUE_H
 
+/* Includes. */
+#include "cell.h"
+#include "lock.h"
+#include "task.h"
 
 /* Some constants. */
-#define queue_maxsuper           50
-#define queue_sizeinit           100
-#define queue_sizegrow           2
-
+#define queue_maxsuper 50
+#define queue_sizeinit 100
+#define queue_sizegrow 2
 
 /* Counters. */
 enum {
-    queue_counter_swap = 0,
-    queue_counter_count,
-    };
-extern int queue_counter[ queue_counter_count ];
-
+  queue_counter_swap = 0,
+  queue_counter_count,
+};
+extern int queue_counter[queue_counter_count];
 
 /** The queue struct. */
 struct queue {
 
-    /* The lock to access this queue. */
-    lock_type lock;
+  /* The lock to access this queue. */
+  lock_type lock;
+
+  /* Allocated size of the index list and number of queued tasks. */
+  int size, count;
 
-    /* Size, count and next element. */
-    int size, count;
-    
-    /* The actual tasks to which the indices refer. */
-    struct task *tasks;
-    
-    /* The task indices. */
-    int *tid;
+  /* The actual tasks to which the indices refer. */
+  struct task *tasks;
 
-    } __attribute__((aligned (64)));
-    
+  /* The task indices. */
+  int *tid;
+
+} __attribute__((aligned(64)));
 
 /* Function prototypes. */
-struct task *queue_gettask ( struct queue *q , struct cell *super , int blocking );
-void queue_init ( struct queue *q , struct task *tasks );
-void queue_insert ( struct queue *q , struct task *t );
+struct task *queue_gettask(struct queue *q, struct cell *super, int blocking);
+void queue_init(struct queue *q, struct task *tasks);
+void queue_insert(struct queue *q, struct task *t);
+
+#endif /* SWIFT_QUEUE_H */
diff --git a/src/runner.c b/src/runner.c
index 71f425196622a99636118e62f6e4e1fef4f117e3..a3056f414bd352c2a9b857b980e7989dfc28130a 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -1,56 +1,45 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
 
 /* Some standard headers. */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <pthread.h>
-#include <math.h>
 #include <float.h>
 #include <limits.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
-    #include <mpi.h>
+#include <mpi.h>
 #endif
 
+/* This object's header. */
+#include "runner.h"
+
 /* Local headers. */
 #include "const.h"
-#include "cycle.h"
-#include "atomic.h"
-#include "timers.h"
-#include "const.h"
-#include "lock.h"
-#include "task.h"
-#include "part.h"
-#include "space.h"
-#include "multipole.h"
-#include "cell.h"
-#include "queue.h"
-#include "scheduler.h"
 #include "engine.h"
-#include "runner.h"
 #include "error.h"
+#include "scheduler.h"
+#include "space.h"
+#include "task.h"
+#include "timers.h"
 
 /* Include the right variant of the SPH interactions */
 #ifdef LEGACY_GADGET2_SPH
@@ -61,31 +50,28 @@
 #include "runner_iact_grav.h"
 
 /* Convert cell location to ID. */
-#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) )
+#define cell_getid(cdim, i, j, k) \
+  ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
 
 /* The counters. */
-int runner_counter[ runner_counter_count ];
-
-        
-
-const float runner_shift[13*3] = {
-     5.773502691896258e-01 ,  5.773502691896258e-01 ,  5.773502691896258e-01 ,
-     7.071067811865475e-01 ,  7.071067811865475e-01 ,  0.0                   ,
-     5.773502691896258e-01 ,  5.773502691896258e-01 , -5.773502691896258e-01 ,
-     7.071067811865475e-01 ,  0.0                   ,  7.071067811865475e-01 ,
-     1.0                   ,  0.0                   ,  0.0                   ,
-     7.071067811865475e-01 ,  0.0                   , -7.071067811865475e-01 ,
-     5.773502691896258e-01 , -5.773502691896258e-01 ,  5.773502691896258e-01 ,
-     7.071067811865475e-01 , -7.071067811865475e-01 ,  0.0                   ,
-     5.773502691896258e-01 , -5.773502691896258e-01 , -5.773502691896258e-01 ,
-     0.0                   ,  7.071067811865475e-01 ,  7.071067811865475e-01 ,
-     0.0                   ,  1.0                   ,  0.0                   ,
-     0.0                   ,  7.071067811865475e-01 , -7.071067811865475e-01 ,
-     0.0                   ,  0.0                   ,  1.0                   ,
-    };
-const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 ,
-                               0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; 
-
+int runner_counter[runner_counter_count];
+
+const float runner_shift[13 * 3] = {
+    5.773502691896258e-01, 5.773502691896258e-01,  5.773502691896258e-01,
+    7.071067811865475e-01, 7.071067811865475e-01,  0.0,
+    5.773502691896258e-01, 5.773502691896258e-01,  -5.773502691896258e-01,
+    7.071067811865475e-01, 0.0,                    7.071067811865475e-01,
+    1.0,                   0.0,                    0.0,
+    7.071067811865475e-01, 0.0,                    -7.071067811865475e-01,
+    5.773502691896258e-01, -5.773502691896258e-01, 5.773502691896258e-01,
+    7.071067811865475e-01, -7.071067811865475e-01, 0.0,
+    5.773502691896258e-01, -5.773502691896258e-01, -5.773502691896258e-01,
+    0.0,                   7.071067811865475e-01,  7.071067811865475e-01,
+    0.0,                   1.0,                    0.0,
+    0.0,                   7.071067811865475e-01,  -7.071067811865475e-01,
+    0.0,                   0.0,                    1.0, };
+const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
+                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 
 /* Import the density loop functions. */
 #define FUNCTION density
@@ -99,119 +85,85 @@ const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1
 /* Import the gravity loop functions. */
 #include "runner_doiact_grav.h"
 
-
-/**
- * @brief Send a local cell's particle data to another node.
- *
- * @param r The #runner.
- * @param c The #cell.
- * @param nodeID The destination node's ID.
- * @param tag bit to distinguish between xv and rho sends.
- */
- 
-void runner_dosend ( struct runner *r , struct cell *c , int nodeID , int tag ) {
-
-#ifdef WITH_MPI
-
-    MPI_Request req;
-    
-    /* First check if all the density tasks have been run. */
-    if ( tag & 1 )
-        if ( c->parts[0].rho == 0.0 )
-            error( "Attempting to send rhos before ghost task completed." );
-    
-    /* Emit the isend. */
-    if ( MPI_Isend( c->parts , sizeof(struct part) * c->count , MPI_BYTE , nodeID , tag , MPI_COMM_WORLD , &req ) != MPI_SUCCESS )
-        error( "Failed to isend particle data." );
-        
-    message( "sending %i parts with tag=%i from %i to %i." ,
-        c->count , tag , r->e->nodeID , nodeID ); fflush(stdout);
-    
-    /* Free the request handler as we don't care what happens next. */
-    MPI_Request_free( &req );
-
-#else
-    error( "SWIFT was not compiled with MPI support." );
-#endif
-
-    }
-    
-
 /**
  * @brief Sort the entries in ascending order using QuickSort.
  *
  * @param sort The entries
  * @param N The number of entries.
  */
- 
-void runner_dosort_ascending ( struct entry *sort , int N ) {
-
-    struct {
-        short int lo, hi;
-        } qstack[10];
-    int qpos, i, j, lo, hi, imin;
-    struct entry temp;
-    float pivot;
-        
-    /* Sort parts in cell_i in decreasing order with quicksort */
-    qstack[0].lo = 0; qstack[0].hi = N - 1; qpos = 0;
-    while ( qpos >= 0 ) {
-        lo = qstack[qpos].lo; hi = qstack[qpos].hi;
-        qpos -= 1;
-        if ( hi - lo < 15 ) {
-            for ( i = lo ; i < hi ; i++ ) {
-                imin = i;
-                for ( j = i+1 ; j <= hi ; j++ )
-                    if ( sort[j].d < sort[imin].d )
-                        imin = j;
-                if ( imin != i ) {
-                    temp = sort[imin]; sort[imin] = sort[i]; sort[i] = temp;
-                    }
-                }
-            }
-        else {
-            pivot = sort[ ( lo + hi ) / 2 ].d;
-            i = lo; j = hi;
-            while ( i <= j ) {
-                while ( sort[i].d < pivot ) i++;
-                while ( sort[j].d > pivot ) j--;
-                if ( i <= j ) {
-                    if ( i < j ) {
-                        temp = sort[i]; sort[i] = sort[j]; sort[j] = temp;
-                        }
-                    i += 1; j -= 1;
-                    }
-                }
-            if ( j > ( lo + hi ) / 2 ) {
-                if ( lo < j ) {
-                    qpos += 1;
-                    qstack[qpos].lo = lo;
-                    qstack[qpos].hi = j;
-                    }
-                if ( i < hi ) {
-                    qpos += 1;
-                    qstack[qpos].lo = i;
-                    qstack[qpos].hi = hi;
-                    }
-                }
-            else {
-                if ( i < hi ) {
-                    qpos += 1;
-                    qstack[qpos].lo = i;
-                    qstack[qpos].hi = hi;
-                    }
-                if ( lo < j ) {
-                    qpos += 1;
-                    qstack[qpos].lo = lo;
-                    qstack[qpos].hi = j;
-                    }
-                }
-            }
+
+void runner_dosort_ascending(struct entry *sort, int N) {
+
+  struct {
+    short int lo, hi;
+  } qstack[10];
+  int qpos, i, j, lo, hi, imin;
+  struct entry temp;
+  float pivot;
+
+  /* Sort the entries in ascending order of d with quicksort. */
+  qstack[0].lo = 0;
+  qstack[0].hi = N - 1;
+  qpos = 0;
+  while (qpos >= 0) {
+    lo = qstack[qpos].lo;
+    hi = qstack[qpos].hi;
+    qpos -= 1;
+    if (hi - lo < 15) {
+      for (i = lo; i < hi; i++) {
+        imin = i;
+        for (j = i + 1; j <= hi; j++)
+          if (sort[j].d < sort[imin].d) imin = j;
+        if (imin != i) {
+          temp = sort[imin];
+          sort[imin] = sort[i];
+          sort[i] = temp;
+        }
+      }
+    } else {
+      pivot = sort[(lo + hi) / 2].d;
+      i = lo;
+      j = hi;
+      while (i <= j) {
+        while (sort[i].d < pivot) i++;
+        while (sort[j].d > pivot) j--;
+        if (i <= j) {
+          if (i < j) {
+            temp = sort[i];
+            sort[i] = sort[j];
+            sort[j] = temp;
+          }
+          i += 1;
+          j -= 1;
+        }
+      }
+      if (j > (lo + hi) / 2) {
+        if (lo < j) {
+          qpos += 1;
+          qstack[qpos].lo = lo;
+          qstack[qpos].hi = j;
+        }
+        if (i < hi) {
+          qpos += 1;
+          qstack[qpos].lo = i;
+          qstack[qpos].hi = hi;
+        }
+      } else {
+        if (i < hi) {
+          qpos += 1;
+          qstack[qpos].lo = i;
+          qstack[qpos].hi = hi;
         }
-                
+        if (lo < j) {
+          qpos += 1;
+          qstack[qpos].lo = lo;
+          qstack[qpos].hi = j;
+        }
+      }
     }
-    
-    
+  }
+}
+
 /**
  * @brief Sort the particles in the given cell along all cardinal directions.
  *
@@ -221,1147 +173,1183 @@ void runner_dosort_ascending ( struct entry *sort , int N ) {
  * @param clock Flag indicating whether to record the timing or not, needed
  *      for recursive calls.
  */
- 
-void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock ) {
-
-    struct entry *finger;
-    struct entry *fingers[8];
-    struct part *parts = c->parts;
-    struct entry *sort;
-    int j, k, count = c->count;
-    int i, ind, off[8], inds[8], temp_i, missing;
-    // float shift[3];
-    float buff[8], px[3];
-    
-    TIMER_TIC
-    
-    /* Clean-up the flags, i.e. filter out what's already been sorted. */
-    flags &= ~c->sorted;
-    if ( flags == 0 )
-        return;
-    
-    /* start by allocating the entry arrays. */
-    if ( c->sort == NULL || c->sortsize < count ) {
-        if ( c->sort != NULL )
-            free( c->sort );
-        c->sortsize = count * 1.1;
-        if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->sortsize + 1) * 13 ) ) == NULL )
-            error( "Failed to allocate sort memory." );
-        }
-    sort = c->sort;
-        
-    /* Does this cell have any progeny? */
-    if ( c->split ) {
-    
-        /* Fill in the gaps within the progeny. */
-        for ( k = 0 ; k < 8 ; k++ ) {
-            if ( c->progeny[k] == NULL )
-                continue;
-            missing = flags & ~c->progeny[k]->sorted;
-            if ( missing )
-                runner_dosort( r , c->progeny[k] , missing , 0 );
-            }
-    
-        /* Loop over the 13 different sort arrays. */
-        for ( j = 0 ; j < 13 ; j++ ) {
-        
-            /* Has this sort array been flagged? */
-            if ( !( flags & (1 << j) ) )
-                continue;
-                
-            /* Init the particle index offsets. */
-            for ( off[0] = 0 , k = 1 ; k < 8 ; k++ )
-                if ( c->progeny[k-1] != NULL )
-                    off[k] = off[k-1] + c->progeny[k-1]->count;
-                else
-                    off[k] = off[k-1];
-
-            /* Init the entries and indices. */
-            for ( k = 0 ; k < 8 ; k++ ) {
-                inds[k] = k;
-                if ( c->progeny[k] != NULL && c->progeny[k]->count > 0 ) {
-                    fingers[k] = &c->progeny[k]->sort[ j*(c->progeny[k]->count + 1) ];
-                    buff[k] = fingers[k]->d;
-                    off[k] = off[k];
-                    }
-                else
-                    buff[k] = FLT_MAX;
-                }
-
-            /* Sort the buffer. */
-            for ( i = 0 ; i < 7 ; i++ )
-                for ( k = i+1 ; k < 8 ; k++ )
-                    if ( buff[ inds[k] ] < buff[ inds[i] ] ) {
-                        temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i;
-                        }
-
-            /* For each entry in the new sort list. */
-            finger = &sort[ j*(count + 1) ];
-            for ( ind = 0 ; ind < count ; ind++ ) {
-
-                /* Copy the minimum into the new sort array. */
-                finger[ind].d = buff[inds[0]];
-                finger[ind].i = fingers[inds[0]]->i + off[inds[0]];
-
-                /* Update the buffer. */
-                fingers[inds[0]] += 1;
-                buff[inds[0]] = fingers[inds[0]]->d;
-
-                /* Find the smallest entry. */
-                for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) {
-                    temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i;
-                    }
-
-                } /* Merge. */
-            
-            /* Add a sentinel. */
-            sort[ j*(count + 1) + count ].d = FLT_MAX;
-            sort[ j*(count + 1) + count ].i = 0;
-            
-            /* Mark as sorted. */
-            c->sorted |= ( 1 << j );
-            
-            } /* loop over sort arrays. */
-    
-        } /* progeny? */
-        
-    /* Otherwise, just sort. */
-    else {
-    
-        /* Fill the sort array. */
-        for ( k = 0 ; k < count ; k++ ) {
-            px[0] = parts[k].x[0];
-            px[1] = parts[k].x[1];
-            px[2] = parts[k].x[2];
-            for ( j = 0 ; j < 13 ; j++ )
-                if ( flags & (1 << j) ) {
-                    sort[ j*(count + 1) + k].i = k;
-                    sort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ];
-                    }
-            }
-
-        /* Add the sentinel and sort. */
-        for ( j = 0 ; j < 13 ; j++ )
-            if ( flags & (1 << j) ) {
-                sort[ j*(count + 1) + count ].d = FLT_MAX;
-                sort[ j*(count + 1) + count ].i = 0;
-                runner_dosort_ascending( &sort[ j*(count + 1) ] , count );
-                c->sorted |= ( 1 << j );
-                }
-            
+
+void runner_dosort(struct runner *r, struct cell *c, int flags, int clock) {
+
+  struct entry *finger;
+  struct entry *fingers[8];
+  struct part *parts = c->parts;
+  struct entry *sort;
+  int j, k, count = c->count;
+  int i, ind, off[8], inds[8], temp_i, missing;
+  // float shift[3];
+  float buff[8], px[3];
+
+  TIMER_TIC
+
+  /* Clean-up the flags, i.e. filter out what's already been sorted. */
+  flags &= ~c->sorted;
+  if (flags == 0) return;
+
+  /* Start by (re)allocating the entry arrays if missing or too small. */
+  if (c->sort == NULL || c->sortsize < count) {
+    if (c->sort != NULL) free(c->sort);
+    c->sortsize = count * 1.1;
+    if ((c->sort = (struct entry *)malloc(sizeof(struct entry) *
+                                          (c->sortsize + 1) * 13)) == NULL)
+      error("Failed to allocate sort memory.");
+  }
+  sort = c->sort;
+
+  /* Does this cell have any progeny? */
+  if (c->split) {
+
+    /* Fill in the gaps within the progeny. */
+    for (k = 0; k < 8; k++) {
+      if (c->progeny[k] == NULL) continue;
+      missing = flags & ~c->progeny[k]->sorted;
+      if (missing) runner_dosort(r, c->progeny[k], missing, 0);
+    }
+
+    /* Loop over the 13 different sort arrays. */
+    for (j = 0; j < 13; j++) {
+
+      /* Has this sort array been flagged? */
+      if (!(flags & (1 << j))) continue;
+
+      /* Init the particle index offsets. */
+      for (off[0] = 0, k = 1; k < 8; k++)
+        if (c->progeny[k - 1] != NULL)
+          off[k] = off[k - 1] + c->progeny[k - 1]->count;
+        else
+          off[k] = off[k - 1];
+
+      /* Init the entries and indices. */
+      for (k = 0; k < 8; k++) {
+        inds[k] = k;
+        if (c->progeny[k] != NULL && c->progeny[k]->count > 0) {
+          fingers[k] = &c->progeny[k]->sort[j * (c->progeny[k]->count + 1)];
+          buff[k] = fingers[k]->d;
+          off[k] = off[k];
+        } else
+          buff[k] = FLT_MAX;
+      }
+
+      /* Sort the buffer. */
+      for (i = 0; i < 7; i++)
+        for (k = i + 1; k < 8; k++)
+          if (buff[inds[k]] < buff[inds[i]]) {
+            temp_i = inds[i];
+            inds[i] = inds[k];
+            inds[k] = temp_i;
+          }
+
+      /* For each entry in the new sort list. */
+      finger = &sort[j * (count + 1)];
+      for (ind = 0; ind < count; ind++) {
+
+        /* Copy the minimum into the new sort array. */
+        finger[ind].d = buff[inds[0]];
+        finger[ind].i = fingers[inds[0]]->i + off[inds[0]];
+
+        /* Update the buffer. */
+        fingers[inds[0]] += 1;
+        buff[inds[0]] = fingers[inds[0]]->d;
+
+        /* Find the smallest entry. */
+        for (k = 1; k < 8 && buff[inds[k]] < buff[inds[k - 1]]; k++) {
+          temp_i = inds[k - 1];
+          inds[k - 1] = inds[k];
+          inds[k] = temp_i;
         }
-        
-    /* Verify the sorting. */
-    /* for ( j = 0 ; j < 13 ; j++ ) {
-        if ( !( flags & (1 << j) ) )
-            continue;
-        finger = &sort[ j*(count + 1) ];
-        for ( k = 1 ; k < count ; k++ ) {
-            if ( finger[k].d < finger[k-1].d )
-                error( "Sorting failed, ascending array." );
-            if ( finger[k].i >= count )
-                error( "Sorting failed, indices borked." );
-            }
-        } */
-        
-    #ifdef TIMER_VERBOSE
-        message( "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms." ,
-            r->id , count , c->depth ,
-            (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , 
-            ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout);
-    #else
-        if ( clock )
-            TIMER_TOC(timer_dosort);
-    #endif
 
+      } /* Merge. */
+
+      /* Add a sentinel. */
+      sort[j * (count + 1) + count].d = FLT_MAX;
+      sort[j * (count + 1) + count].i = 0;
+
+      /* Mark as sorted. */
+      c->sorted |= (1 << j);
+
+    } /* loop over sort arrays. */
+
+  } /* progeny? */
+
+  /* Otherwise, just sort. */
+  else {
+
+    /* Fill the sort array. */
+    for (k = 0; k < count; k++) {
+      px[0] = parts[k].x[0];
+      px[1] = parts[k].x[1];
+      px[2] = parts[k].x[2];
+      for (j = 0; j < 13; j++)
+        if (flags & (1 << j)) {
+          sort[j * (count + 1) + k].i = k;
+          sort[j * (count + 1) + k].d = px[0] * runner_shift[3 * j + 0] +
+                                        px[1] * runner_shift[3 * j + 1] +
+                                        px[2] * runner_shift[3 * j + 2];
+        }
     }
-    
-    
-void runner_dogsort ( struct runner *r , struct cell *c , int flags , int clock ) {
-
-    struct entry *finger;
-    struct entry *fingers[8];
-    struct gpart *gparts = c->gparts;
-    struct entry *gsort;
-    int j, k, count = c->gcount;
-    int i, ind, off[8], inds[8], temp_i, missing;
-    // float shift[3];
-    float buff[8], px[3];
-    
-    TIMER_TIC
-    
-    /* Clean-up the flags, i.e. filter out what's already been sorted. */
-    flags &= ~c->gsorted;
-    if ( flags == 0 )
-        return;
-    
-    /* start by allocating the entry arrays. */
-    if ( c->gsort == NULL || c->gsortsize < count ) {
-        if ( c->gsort != NULL )
-            free( c->gsort );
-        c->gsortsize = count * 1.1;
-        if ( ( c->gsort = (struct entry *)malloc( sizeof(struct entry) * (c->gsortsize + 1) * 13 ) ) == NULL )
-            error( "Failed to allocate sort memory." );
+
+    /* Add the sentinel and sort. */
+    for (j = 0; j < 13; j++)
+      if (flags & (1 << j)) {
+        sort[j * (count + 1) + count].d = FLT_MAX;
+        sort[j * (count + 1) + count].i = 0;
+        runner_dosort_ascending(&sort[j * (count + 1)], count);
+        c->sorted |= (1 << j);
+      }
+  }
+
+/* Verify the sorting. */
+/* for ( j = 0 ; j < 13 ; j++ ) {
+    if ( !( flags & (1 << j) ) )
+        continue;
+    finger = &sort[ j*(count + 1) ];
+    for ( k = 1 ; k < count ; k++ ) {
+        if ( finger[k].d < finger[k-1].d )
+            error( "Sorting failed, ascending array." );
+        if ( finger[k].i >= count )
+            error( "Sorting failed, indices borked." );
         }
-    gsort = c->gsort;
-        
-    /* Does this cell have any progeny? */
-    if ( c->split ) {
-    
-        /* Fill in the gaps within the progeny. */
-        for ( k = 0 ; k < 8 ; k++ ) {
-            if ( c->progeny[k] == NULL )
-                continue;
-            missing = flags & ~c->progeny[k]->gsorted;
-            if ( missing )
-                runner_dogsort( r , c->progeny[k] , missing , 0 );
-            }
-    
-        /* Loop over the 13 different sort arrays. */
-        for ( j = 0 ; j < 13 ; j++ ) {
-        
-            /* Has this sort array been flagged? */
-            if ( !( flags & (1 << j) ) )
-                continue;
-                
-            /* Init the particle index offsets. */
-            for ( off[0] = 0 , k = 1 ; k < 8 ; k++ )
-                if ( c->progeny[k-1] != NULL )
-                    off[k] = off[k-1] + c->progeny[k-1]->gcount;
-                else
-                    off[k] = off[k-1];
-
-            /* Init the entries and indices. */
-            for ( k = 0 ; k < 8 ; k++ ) {
-                inds[k] = k;
-                if ( c->progeny[k] != NULL && c->progeny[k]->gcount > 0 ) {
-                    fingers[k] = &c->progeny[k]->gsort[ j*(c->progeny[k]->gcount + 1) ];
-                    buff[k] = fingers[k]->d;
-                    off[k] = off[k];
-                    }
-                else
-                    buff[k] = FLT_MAX;
-                }
-
-            /* Sort the buffer. */
-            for ( i = 0 ; i < 7 ; i++ )
-                for ( k = i+1 ; k < 8 ; k++ )
-                    if ( buff[ inds[k] ] < buff[ inds[i] ] ) {
-                        temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i;
-                        }
-
-            /* For each entry in the new sort list. */
-            finger = &gsort[ j*(count + 1) ];
-            for ( ind = 0 ; ind < count ; ind++ ) {
-
-                /* Copy the minimum into the new sort array. */
-                finger[ind].d = buff[inds[0]];
-                finger[ind].i = fingers[inds[0]]->i + off[inds[0]];
-
-                /* Update the buffer. */
-                fingers[inds[0]] += 1;
-                buff[inds[0]] = fingers[inds[0]]->d;
-
-                /* Find the smallest entry. */
-                for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) {
-                    temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i;
-                    }
-
-                } /* Merge. */
-            
-            /* Add a sentinel. */
-            gsort[ j*(count + 1) + count ].d = FLT_MAX;
-            gsort[ j*(count + 1) + count ].i = 0;
-            
-            /* Mark as sorted. */
-            c->gsorted |= ( 1 << j );
-            
-            } /* loop over sort arrays. */
-    
-        } /* progeny? */
-        
-    /* Otherwise, just sort. */
-    else {
-    
-        /* Fill the sort array. */
-        for ( k = 0 ; k < count ; k++ ) {
-            px[0] = gparts[k].x[0];
-            px[1] = gparts[k].x[1];
-            px[2] = gparts[k].x[2];
-            for ( j = 0 ; j < 13 ; j++ )
-                if ( flags & (1 << j) ) {
-                    gsort[ j*(count + 1) + k].i = k;
-                    gsort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ];
-                    }
-            }
-
-        /* Add the sentinel and sort. */
-        for ( j = 0 ; j < 13 ; j++ )
-            if ( flags & (1 << j) ) {
-                gsort[ j*(count + 1) + count ].d = FLT_MAX;
-                gsort[ j*(count + 1) + count ].i = 0;
-                runner_dosort_ascending( &gsort[ j*(count + 1) ] , count );
-                c->gsorted |= ( 1 << j );
-                }
-            
+    } */
+
+#ifdef TIMER_VERBOSE
+  message(
+      "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) "
+      "took %.3f ms.",
+      r->id, count, c->depth, (flags & 0x1000) >> 12, (flags & 0x800) >> 11,
+      (flags & 0x400) >> 10, (flags & 0x200) >> 9, (flags & 0x100) >> 8,
+      (flags & 0x80) >> 7, (flags & 0x40) >> 6, (flags & 0x20) >> 5,
+      (flags & 0x10) >> 4, (flags & 0x8) >> 3, (flags & 0x4) >> 2,
+      (flags & 0x2) >> 1, (flags & 0x1) >> 0,
+      ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000);
+  fflush(stdout);
+#else
+  if (clock) TIMER_TOC(timer_dosort);
+#endif
+}
+
+void runner_dogsort(struct runner *r, struct cell *c, int flags, int clock) {
+
+  struct entry *finger;
+  struct entry *fingers[8];
+  struct gpart *gparts = c->gparts;
+  struct entry *gsort;
+  int j, k, count = c->gcount;
+  int i, ind, off[8], inds[8], temp_i, missing;
+  // float shift[3];
+  float buff[8], px[3];
+
+  TIMER_TIC
+
+  /* Clean-up the flags, i.e. filter out what's already been sorted. */
+  flags &= ~c->gsorted;
+  if (flags == 0) return;
+
+  /* Start by (re)allocating the entry arrays if missing or too small. */
+  if (c->gsort == NULL || c->gsortsize < count) {
+    if (c->gsort != NULL) free(c->gsort);
+    c->gsortsize = count * 1.1;
+    if ((c->gsort = (struct entry *)malloc(sizeof(struct entry) *
+                                           (c->gsortsize + 1) * 13)) == NULL)
+      error("Failed to allocate sort memory.");
+  }
+  gsort = c->gsort;
+
+  /* Does this cell have any progeny? */
+  if (c->split) {
+
+    /* Fill in the gaps within the progeny. */
+    for (k = 0; k < 8; k++) {
+      if (c->progeny[k] == NULL) continue;
+      missing = flags & ~c->progeny[k]->gsorted;
+      if (missing) runner_dogsort(r, c->progeny[k], missing, 0);
+    }
+
+    /* Loop over the 13 different sort arrays. */
+    for (j = 0; j < 13; j++) {
+
+      /* Has this sort array been flagged? */
+      if (!(flags & (1 << j))) continue;
+
+      /* Init the particle index offsets. */
+      for (off[0] = 0, k = 1; k < 8; k++)
+        if (c->progeny[k - 1] != NULL)
+          off[k] = off[k - 1] + c->progeny[k - 1]->gcount;
+        else
+          off[k] = off[k - 1];
+
+      /* Init the entries and indices. */
+      for (k = 0; k < 8; k++) {
+        inds[k] = k;
+        if (c->progeny[k] != NULL && c->progeny[k]->gcount > 0) {
+          fingers[k] = &c->progeny[k]->gsort[j * (c->progeny[k]->gcount + 1)];
+          buff[k] = fingers[k]->d;
+          off[k] = off[k];
+        } else
+          buff[k] = FLT_MAX;
+      }
+
+      /* Sort the buffer. */
+      for (i = 0; i < 7; i++)
+        for (k = i + 1; k < 8; k++)
+          if (buff[inds[k]] < buff[inds[i]]) {
+            temp_i = inds[i];
+            inds[i] = inds[k];
+            inds[k] = temp_i;
+          }
+
+      /* For each entry in the new sort list. */
+      finger = &gsort[j * (count + 1)];
+      for (ind = 0; ind < count; ind++) {
+
+        /* Copy the minimum into the new sort array. */
+        finger[ind].d = buff[inds[0]];
+        finger[ind].i = fingers[inds[0]]->i + off[inds[0]];
+
+        /* Update the buffer. */
+        fingers[inds[0]] += 1;
+        buff[inds[0]] = fingers[inds[0]]->d;
+
+        /* Find the smallest entry. */
+        for (k = 1; k < 8 && buff[inds[k]] < buff[inds[k - 1]]; k++) {
+          temp_i = inds[k - 1];
+          inds[k - 1] = inds[k];
+          inds[k] = temp_i;
         }
-        
-    /* Verify the sorting. */
-    /* for ( j = 0 ; j < 13 ; j++ ) {
-        if ( !( flags & (1 << j) ) )
-            continue;
-        finger = &c->gsort[ j*(count + 1) ];
-        for ( k = 1 ; k < count ; k++ ) {
-            if ( finger[k].d < finger[k-1].d )
-                error( "Sorting failed, ascending array." );
-            if ( finger[k].i < 0 || finger[k].i >= count )
-                error( "Sorting failed, indices borked." );
-            }
-        } */
-        
-    #ifdef TIMER_VERBOSE
-        message( "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms." ,
-            r->id , count , c->depth ,
-            (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , 
-            ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout);
-    #else
-        if ( clock )
-            TIMER_TOC(timer_dosort);
-    #endif
 
+      } /* Merge. */
+
+      /* Add a sentinel. */
+      gsort[j * (count + 1) + count].d = FLT_MAX;
+      gsort[j * (count + 1) + count].i = 0;
+
+      /* Mark as sorted. */
+      c->gsorted |= (1 << j);
+
+    } /* loop over sort arrays. */
+
+  } /* progeny? */
+
+  /* Otherwise, just sort. */
+  else {
+
+    /* Fill the sort array. */
+    for (k = 0; k < count; k++) {
+      px[0] = gparts[k].x[0];
+      px[1] = gparts[k].x[1];
+      px[2] = gparts[k].x[2];
+      for (j = 0; j < 13; j++)
+        if (flags & (1 << j)) {
+          gsort[j * (count + 1) + k].i = k;
+          gsort[j * (count + 1) + k].d = px[0] * runner_shift[3 * j + 0] +
+                                         px[1] * runner_shift[3 * j + 1] +
+                                         px[2] * runner_shift[3 * j + 2];
+        }
     }
-    
-    
+
+    /* Add the sentinel and sort. */
+    for (j = 0; j < 13; j++)
+      if (flags & (1 << j)) {
+        gsort[j * (count + 1) + count].d = FLT_MAX;
+        gsort[j * (count + 1) + count].i = 0;
+        runner_dosort_ascending(&gsort[j * (count + 1)], count);
+        c->gsorted |= (1 << j);
+      }
+  }
+
+/* Verify the sorting. */
+/* for ( j = 0 ; j < 13 ; j++ ) {
+    if ( !( flags & (1 << j) ) )
+        continue;
+    finger = &c->gsort[ j*(count + 1) ];
+    for ( k = 1 ; k < count ; k++ ) {
+        if ( finger[k].d < finger[k-1].d )
+            error( "Sorting failed, ascending array." );
+        if ( finger[k].i < 0 || finger[k].i >= count )
+            error( "Sorting failed, indices borked." );
+        }
+    } */
+
+#ifdef TIMER_VERBOSE
+  message(
+      "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) "
+      "took %.3f ms.",
+      r->id, count, c->depth, (flags & 0x1000) >> 12, (flags & 0x800) >> 11,
+      (flags & 0x400) >> 10, (flags & 0x200) >> 9, (flags & 0x100) >> 8,
+      (flags & 0x80) >> 7, (flags & 0x40) >> 6, (flags & 0x20) >> 5,
+      (flags & 0x10) >> 4, (flags & 0x8) >> 3, (flags & 0x4) >> 2,
+      (flags & 0x2) >> 1, (flags & 0x1) >> 0,
+      ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000);
+  fflush(stdout);
+#else
+  if (clock) TIMER_TOC(timer_dosort);
+#endif
+}
+
 /**
  * @brief Intermediate task between density and force
  *
  * @param r The runner thread.
  * @param c The cell.
  */
- 
-void runner_doghost ( struct runner *r , struct cell *c ) {
-
-    struct part *p, *parts = c->parts;
-    struct cell *finger;
-    int i, k, redo, count = c->count;
-    int *pid;
-    float h, ih, ih2, ih4, h_corr, rho, wcount, rho_dh, wcount_dh, u, fc;
-    float normDiv_v, normCurl_v;
+
+void runner_doghost(struct runner *r, struct cell *c) {
+
+  struct part *p, *parts = c->parts;
+  struct cell *finger;
+  int i, k, redo, count = c->count;
+  int *pid;
+  float h, ih, ih2, ih4, h_corr, rho, wcount, rho_dh, wcount_dh, u, fc;
+  float normDiv_v, normCurl_v;
 #ifndef LEGACY_GADGET2_SPH
-    float alpha_dot, tau, S;   
+  float alpha_dot, tau, S;
 #endif
-    float dt_step = r->e->dt_step;
-    TIMER_TIC
-    
-    /* Recurse? */
-    if ( c->split ) {
-        for ( k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL )
-                runner_doghost( r , c->progeny[k] );
-        return;
+  float dt_step = r->e->dt_step;
+  TIMER_TIC
+
+  /* Recurse? */
+  if (c->split) {
+    for (k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) runner_doghost(r, c->progeny[k]);
+    return;
+  }
+
+  /* Init the IDs that have to be updated. */
+  if ((pid = (int *)alloca(sizeof(int) * count)) == NULL)
+    error("Call to alloca failed.");
+  for (k = 0; k < count; k++) pid[k] = k;
+
+  /* While there are particles that need to be updated... */
+  while (count > 0) {
+
+    /* Reset the redo-count. */
+    redo = 0;
+
+    /* Loop over the parts in this cell. */
+    __builtin_prefetch(&parts[pid[0]], 0, 1);
+    __builtin_prefetch(&parts[pid[0]].rho_dh, 0, 1);
+    __builtin_prefetch(&parts[pid[1]], 0, 1);
+    __builtin_prefetch(&parts[pid[1]].rho_dh, 0, 1);
+    __builtin_prefetch(&parts[pid[2]], 0, 1);
+    __builtin_prefetch(&parts[pid[2]].rho_dh, 0, 1);
+    for (i = 0; i < count; i++) {
+
+      /* Get a direct pointer on the part. */
+      __builtin_prefetch(&parts[pid[i + 3]], 0, 1);
+      __builtin_prefetch(&parts[pid[i + 3]].rho_dh, 0, 1);
+      p = &parts[pid[i]];
+
+      /* Is this part within the timestep? */
+      if (p->dt <= dt_step) {
+
+        /* Some smoothing length multiples. */
+        h = p->h;
+        ih = 1.0f / h;
+        ih2 = ih * ih;
+        ih4 = ih2 * ih2;
+
+        /* Final operation on the density. */
+        p->rho = rho = ih * ih2 * (p->rho + p->mass * kernel_root);
+        p->rho_dh = rho_dh = (p->rho_dh - 3.0f * p->mass * kernel_root) * ih4;
+        wcount = (p->density.wcount + kernel_root) *
+                 (4.0f / 3.0 * M_PI * kernel_gamma3);
+        wcount_dh =
+            p->density.wcount_dh * ih * (4.0f / 3.0 * M_PI * kernel_gamma3);
+
+        /* If no derivative, double the smoothing length. */
+        if (wcount_dh == 0.0f) h_corr = p->h;
+
+        /* Otherwise, compute the smoothing length update (Newton step). */
+        else {
+          h_corr = (kernel_nwneigh - wcount) / wcount_dh;
+
+          /* Truncate to the range [ -p->h/2 , p->h ]. */
+          h_corr = fminf(h_corr, h);
+          h_corr = fmaxf(h_corr, -h / 2.f);
         }
-        
-    /* Init the IDs that have to be updated. */
-    if ( ( pid = (int *)alloca( sizeof(int) * count ) ) == NULL )
-        error( "Call to alloca failed." );
-    for ( k = 0 ; k < count ; k++ )
-        pid[k] = k;
-        
-    /* While there are particles that need to be updated... */
-    while ( count > 0 ) {
-    
-        /* Reset the redo-count. */
-        redo = 0;
-    
-        /* Loop over the parts in this cell. */
-        __builtin_prefetch( &parts[ pid[0] ] , 0 , 1 );
-        __builtin_prefetch( &parts[ pid[0] ].rho_dh , 0 , 1 );
-        __builtin_prefetch( &parts[ pid[1] ] , 0 , 1 );
-        __builtin_prefetch( &parts[ pid[1] ].rho_dh , 0 , 1 );
-        __builtin_prefetch( &parts[ pid[2] ] , 0 , 1 );
-        __builtin_prefetch( &parts[ pid[2] ].rho_dh , 0 , 1 );
-        for ( i = 0 ; i < count ; i++ ) {
-
-            /* Get a direct pointer on the part. */
-            __builtin_prefetch( &parts[ pid[i+3] ] , 0 , 1 );
-            __builtin_prefetch( &parts[ pid[i+3] ].rho_dh , 0 , 1 );
-            p = &parts[ pid[i] ];
-            
-            /* Is this part within the timestep? */
-            if ( p->dt <= dt_step ) {
-            
-	            /* Some smoothing length multiples. */
-	            h = p->h;
-                ih = 1.0f / h;
-                ih2 = ih * ih;
-                ih4 = ih2 * ih2;
-
-		        /* Final operation on the density. */
-                p->rho = rho = ih * ih2 * ( p->rho + p->mass*kernel_root );
-                p->rho_dh = rho_dh = ( p->rho_dh - 3.0f*p->mass*kernel_root ) * ih4;
-                wcount = ( p->density.wcount + kernel_root ) * ( 4.0f / 3.0 * M_PI * kernel_gamma3 );
-                wcount_dh = p->density.wcount_dh * ih * ( 4.0f / 3.0 * M_PI * kernel_gamma3 );
-                    
-                /* If no derivative, double the smoothing length. */
-                if ( wcount_dh == 0.0f )
-                    h_corr = p->h;
-                    
-                /* Otherwise, compute the smoothing length update (Newton step). */
-                else {
-                    h_corr = ( kernel_nwneigh - wcount ) / wcount_dh;
-
-                    /* Truncate to the range [ -p->h/2 , p->h ]. */
-                    h_corr = fminf( h_corr , h );
-                    h_corr = fmaxf( h_corr , -h/2.f );
-                    
-                    }
-                
-                /* Apply the correction to p->h and to the compact part. */
-                p->h += h_corr;
-
-                /* Did we get the right number density? */
-                if ( wcount > kernel_nwneigh + const_delta_nwneigh ||
-                     wcount < kernel_nwneigh - const_delta_nwneigh ) {
-                    // message( "particle %lli (h=%e,depth=%i) has bad wcount=%.3f." , p->id , p->h , c->depth , wcount ); fflush(stdout);
-                    // p->h += ( p->density.wcount + kernel_root - kernel_nwneigh ) / p->density.wcount_dh;
-                    pid[redo] = pid[i];
-                    redo += 1;
-                    p->density.wcount = 0.0;
-                    p->density.wcount_dh = 0.0;
-                    p->rho = 0.0;
-                    p->rho_dh = 0.0;
-		            p->density.div_v = 0.0;
-		            for ( k=0 ; k < 3 ; k++)
-		                p->density.curl_v[k] = 0.0;
-                    continue;
-                    }
-
-                /* Pre-compute some stuff for the balsara switch. */
-		        normDiv_v = fabs( p->density.div_v / rho * ih4 );
-		        normCurl_v = sqrtf( p->density.curl_v[0] * p->density.curl_v[0] + p->density.curl_v[1] * p->density.curl_v[1] + p->density.curl_v[2] * p->density.curl_v[2] ) / rho * ih4;
-                
-                /* As of here, particle force variables will be set. Do _NOT_
-                   try to read any particle density variables! */
-                
-                /* Compute this particle's sound speed. */
-                u = p->u;
-                p->force.c = fc = sqrtf( const_hydro_gamma * ( const_hydro_gamma - 1.0f ) * u );
-
-                /* Compute the P/Omega/rho2. */
-                p->force.POrho2 = u * ( const_hydro_gamma - 1.0f ) / ( rho + h * rho_dh / 3.0f );
-
-		        /* Balsara switch */
-		        p->force.balsara = normDiv_v / ( normDiv_v + normCurl_v + 0.0001f * fc * ih );
-
-                #ifndef LEGACY_GADGET2_SPH
-		            /* Viscosity parameter decay time */
-		            tau = h / ( 2.f * const_viscosity_length * p->force.c );
-
-		            /* Viscosity source term */
-		            S = fmaxf( -normDiv_v, 0.f );
-
-		            /* Compute the particle's viscosity parameter time derivative */
-		            alpha_dot = ( const_viscosity_alpha_min - p->alpha ) / tau + ( const_viscosity_alpha_max - p->alpha ) * S;
-
-		            /* Update particle's viscosity paramter */
-		            p->alpha += alpha_dot * p->dt; 
-                #endif                
-
-                /* Reset the acceleration. */
-                for ( k = 0 ; k < 3 ; k++ )
-                    p->a[k] = 0.0f;
-
-                /* Reset the time derivatives. */
-                p->force.u_dt = 0.0f;
-                p->force.h_dt = 0.0f;
-                p->force.v_sig = 0.0f;
-
-                }
-
-            }
-            
-        /* Re-set the counter for the next loop (potentially). */
-        count = redo;
-        if ( count > 0 ) {
-        
-            // error( "Bad smoothing length, fixing this isn't implemented yet." );
-            
-            /* Climb up the cell hierarchy. */
-            for ( finger = c ; finger != NULL ; finger = finger->parent ) {
-            
-                /* Run through this cell's density interactions. */
-                for ( struct link *l = finger->density ; l != NULL ; l = l->next ) {
-                
-                    /* Self-interaction? */
-                    if ( l->t->type == task_type_self )
-                        runner_doself_subset_density( r , finger , parts , pid , count );
-                        
-                    /* Otherwise, pair interaction? */
-                    else if ( l->t->type == task_type_pair ) {
-                    
-                        /* Left or right? */
-                        if ( l->t->ci == finger )
-                            runner_dopair_subset_density( r , finger , parts , pid , count , l->t->cj );
-                        else
-                            runner_dopair_subset_density( r , finger , parts , pid , count , l->t->ci );
-                        
-                        }
-                
-                    /* Otherwise, sub interaction? */
-                    else if ( l->t->type == task_type_sub ) {
-                    
-                        /* Left or right? */
-                        if ( l->t->ci == finger )
-                            runner_dosub_subset_density( r , finger , parts , pid , count , l->t->cj , -1 , 1 );
-                        else
-                            runner_dosub_subset_density( r , finger , parts , pid , count , l->t->ci , -1 , 1 );
-                        
-                        }
-                
-                    }
-                    
-                }
-        
-            }
-            
+
+        /* Apply the correction to p->h and to the compact part. */
+        p->h += h_corr;
+
+        /* Did we get the right number density? */
+        if (wcount > kernel_nwneigh + const_delta_nwneigh ||
+            wcount < kernel_nwneigh - const_delta_nwneigh) {
+          // message( "particle %lli (h=%e,depth=%i) has bad wcount=%.3f." ,
+          // p->id , p->h , c->depth , wcount ); fflush(stdout);
+          // p->h += ( p->density.wcount + kernel_root - kernel_nwneigh ) /
+          // p->density.wcount_dh;
+          pid[redo] = pid[i];
+          redo += 1;
+          p->density.wcount = 0.0;
+          p->density.wcount_dh = 0.0;
+          p->rho = 0.0;
+          p->rho_dh = 0.0;
+          p->density.div_v = 0.0;
+          for (k = 0; k < 3; k++) p->density.curl_v[k] = 0.0;
+          continue;
         }
 
-    #ifdef TIMER_VERBOSE
-        message( "runner %02i: %i parts at depth %i took %.3f ms." ,
-            r->id , c->count , c->depth ,
-            ((double)TIMER_TOC(timer_doghost)) / CPU_TPS * 1000 ); fflush(stdout);
-    #else
-        TIMER_TOC(timer_doghost);
-    #endif
-    
+        /* Pre-compute some stuff for the balsara switch. */
+        normDiv_v = fabs(p->density.div_v / rho * ih4);
+        normCurl_v = sqrtf(p->density.curl_v[0] * p->density.curl_v[0] +
+                           p->density.curl_v[1] * p->density.curl_v[1] +
+                           p->density.curl_v[2] * p->density.curl_v[2]) /
+                     rho * ih4;
+
+        /* As of here, particle force variables will be set. Do _NOT_
+           try to read any particle density variables! */
+
+        /* Compute this particle's sound speed. */
+        u = p->u;
+        p->force.c = fc =
+            sqrtf(const_hydro_gamma * (const_hydro_gamma - 1.0f) * u);
+
+        /* Compute the P/Omega/rho2. */
+        p->force.POrho2 =
+            u * (const_hydro_gamma - 1.0f) / (rho + h * rho_dh / 3.0f);
+
+        /* Balsara switch */
+        p->force.balsara =
+            normDiv_v / (normDiv_v + normCurl_v + 0.0001f * fc * ih);
+
+#ifndef LEGACY_GADGET2_SPH
+        /* Viscosity parameter decay time */
+        tau = h / (2.f * const_viscosity_length * p->force.c);
+
+        /* Viscosity source term */
+        S = fmaxf(-normDiv_v, 0.f);
+
+        /* Compute the particle's viscosity parameter time derivative */
+        alpha_dot = (const_viscosity_alpha_min - p->alpha) / tau +
+                    (const_viscosity_alpha_max - p->alpha) * S;
+
+        /* Update particle's viscosity parameter */
+        p->alpha += alpha_dot * p->dt;
+#endif
+
+        /* Reset the acceleration. */
+        for (k = 0; k < 3; k++) p->a[k] = 0.0f;
+
+        /* Reset the time derivatives. */
+        p->force.u_dt = 0.0f;
+        p->force.h_dt = 0.0f;
+        p->force.v_sig = 0.0f;
+      }
+    }
+
+    /* Re-set the counter for the next loop (potentially). */
+    count = redo;
+    if (count > 0) {
+
+      // error( "Bad smoothing length, fixing this isn't implemented yet." );
+
+      /* Climb up the cell hierarchy. */
+      for (finger = c; finger != NULL; finger = finger->parent) {
+
+        /* Run through this cell's density interactions. */
+        for (struct link *l = finger->density; l != NULL; l = l->next) {
+
+          /* Self-interaction? */
+          if (l->t->type == task_type_self)
+            runner_doself_subset_density(r, finger, parts, pid, count);
+
+          /* Otherwise, pair interaction? */
+          else if (l->t->type == task_type_pair) {
+
+            /* Left or right? */
+            if (l->t->ci == finger)
+              runner_dopair_subset_density(r, finger, parts, pid, count,
+                                           l->t->cj);
+            else
+              runner_dopair_subset_density(r, finger, parts, pid, count,
+                                           l->t->ci);
+
+          }
+
+          /* Otherwise, sub interaction? */
+          else if (l->t->type == task_type_sub) {
+
+            /* Left or right? */
+            if (l->t->ci == finger)
+              runner_dosub_subset_density(r, finger, parts, pid, count,
+                                          l->t->cj, -1, 1);
+            else
+              runner_dosub_subset_density(r, finger, parts, pid, count,
+                                          l->t->ci, -1, 1);
+          }
+        }
+      }
     }
-    
-    
+  }
+
+#ifdef TIMER_VERBOSE
+  message("runner %02i: %i parts at depth %i took %.3f ms.", r->id, c->count,
+          c->depth, ((double)TIMER_TOC(timer_doghost)) / CPU_TPS * 1000);
+  fflush(stdout);
+#else
+  TIMER_TOC(timer_doghost);
+#endif
+}
+
 /**
  * @brief Compute the second kick of the given cell.
  *
  * @param r The runner thread.
  * @param c The cell.
  */
- 
-void runner_dokick2 ( struct runner *r , struct cell *c ) {
-
-    int j, k, count = 0, nr_parts = c->count;
-    float dt_min = FLT_MAX, dt_max = 0.0f;
-    double ekin = 0.0, epot = 0.0;
-    float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f };
-    float x[3], v_hdt[3], u_hdt, h, pdt, m;
-    float dt_step = r->e->dt_step, dt = r->e->dt, hdt, idt;
-    float dt_cfl, dt_h_change, dt_u_change, dt_new;
-    float h_dt, u_dt;
-    struct part *restrict p, *restrict parts = c->parts;
-    struct xpart *restrict xp, *restrict xparts = c->xparts;
-    
-    TIMER_TIC
-    
-    /* Init idt to avoid compiler stupidity. */
-    idt = ( dt > 0 ) ? 1.0f / dt : 0.0f;
-    hdt = dt / 2;
-    
-    /* Loop over the particles and kick them. */
-    __builtin_prefetch( &parts[0] , 0 , 1 );
-    __builtin_prefetch( &parts[0].rho_dh , 0 , 1 );
-    __builtin_prefetch( &xparts[0] , 0 , 1 );
-    __builtin_prefetch( &parts[1] , 0 , 1 );
-    __builtin_prefetch( &parts[1].rho_dh , 0 , 1 );
-    __builtin_prefetch( &xparts[1] , 0 , 1 );
-    __builtin_prefetch( &parts[2] , 0 , 1 );
-    __builtin_prefetch( &parts[2].rho_dh , 0 , 1 );
-    __builtin_prefetch( &xparts[2] , 0 , 1 );
-    for ( k = 0 ; k < nr_parts ; k++ ) {
-
-        /* Get a handle on the part. */
-        __builtin_prefetch( &parts[k+3] , 0 , 1 );
-        __builtin_prefetch( &parts[k+3].rho_dh , 0 , 1 );
-        __builtin_prefetch( &xparts[k+3] , 0 , 1 );
-        p = &parts[k];
-        xp = &xparts[k];
-
-        /* Get local copies of particle data. */
-        pdt = p->dt;
-        m = p->mass;
-        x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
-        v_hdt[0] = xp->v_hdt[0]; v_hdt[1] = xp->v_hdt[1]; v_hdt[2] = xp->v_hdt[2];
-        u_hdt = xp->u_hdt;
-
-        /* Update the particle's data (if active). */
-        if ( pdt <= dt_step ) {
-            
-            /* Increase the number of particles updated. */
-            count += 1;
-            
-            /* Scale the derivatives as they're freshly computed. */
-            h = p->h;
-            h_dt = p->force.h_dt *= h * 0.333333333f;
-            xp->omega = 1.0f + h * p->rho_dh / p->rho * 0.3333333333f;
-            
-            /* Compute the new time step. */
-            u_dt = p->force.u_dt;
-            dt_cfl = const_cfl * h / p->force.v_sig;
-            dt_h_change = ( h_dt != 0.0f ) ? fabsf( const_ln_max_h_change * h / h_dt ) : FLT_MAX;
-            dt_u_change = ( u_dt != 0.0f ) ? fabsf( const_max_u_change * p->u / u_dt ) : FLT_MAX;
-            dt_new = fminf( dt_cfl , fminf( dt_h_change , dt_u_change ) );
-            if ( pdt == 0.0f )
-                p->dt = pdt = dt_new;
-            else
-                p->dt = pdt = fminf( dt_new , 2.0f*pdt );
-                
-            /* Update positions and energies at the full step. */
-            p->v[0] = v_hdt[0] + hdt * p->a[0];
-            p->v[1] = v_hdt[1] + hdt * p->a[1];
-            p->v[2] = v_hdt[2] + hdt * p->a[2];
-            p->u = u_hdt + hdt * u_dt;
-            
-            /* Set the new particle-specific time step. */
-            if ( dt > 0.0f ) {
-                float dt_curr = dt;
-                j = (int)( pdt * idt );
-                while ( j > 1 ) {
-                    dt_curr *= 2.0f;
-                    j >>= 1;
-                    }
-                xp->dt_curr = dt_curr;
-                }
-            
-            }
-
-        /* Get the smallest/largest dt. */
-        dt_min = fminf( dt_min , pdt );
-        dt_max = fmaxf( dt_max , pdt );
-
-        /* Collect total energy. */
-        ekin += 0.5 * m * ( v_hdt[0]*v_hdt[0] + v_hdt[1]*v_hdt[1] + v_hdt[2]*v_hdt[2] );
-        epot += m * u_hdt;
-
-        /* Collect momentum */
-        mom[0] += m * v_hdt[0];
-        mom[1] += m * v_hdt[1];
-        mom[2] += m * v_hdt[2];
-
-	    /* Collect angular momentum */
-	    ang[0] += m * ( x[1]*v_hdt[2] - x[2]*v_hdt[1] );
-	    ang[1] += m * ( x[2]*v_hdt[0] - x[0]*v_hdt[2] );
-	    ang[2] += m * ( x[0]*v_hdt[1] - x[1]*v_hdt[0] );
-
-	    /* Collect entropic function */
-	    // lent += u * pow( p->rho, 1.f-const_gamma );
 
+void runner_dokick2(struct runner *r, struct cell *c) {
+
+  int j, k, count = 0, nr_parts = c->count;
+  float dt_min = FLT_MAX, dt_max = 0.0f;
+  double ekin = 0.0, epot = 0.0;
+  float mom[3] = {0.0f, 0.0f, 0.0f}, ang[3] = {0.0f, 0.0f, 0.0f};
+  float x[3], v_hdt[3], u_hdt, h, pdt, m;
+  float dt_step = r->e->dt_step, dt = r->e->dt, hdt, idt;
+  float dt_cfl, dt_h_change, dt_u_change, dt_new;
+  float h_dt, u_dt;
+  struct part *restrict p, *restrict parts = c->parts;
+  struct xpart *restrict xp, *restrict xparts = c->xparts;
+
+  TIMER_TIC
+
+  /* Init idt to avoid compiler stupidity. */
+  idt = (dt > 0) ? 1.0f / dt : 0.0f;
+  hdt = dt / 2;
+
+  /* Loop over the particles and kick them. */
+  __builtin_prefetch(&parts[0], 0, 1);
+  __builtin_prefetch(&parts[0].rho_dh, 0, 1);
+  __builtin_prefetch(&xparts[0], 0, 1);
+  __builtin_prefetch(&parts[1], 0, 1);
+  __builtin_prefetch(&parts[1].rho_dh, 0, 1);
+  __builtin_prefetch(&xparts[1], 0, 1);
+  __builtin_prefetch(&parts[2], 0, 1);
+  __builtin_prefetch(&parts[2].rho_dh, 0, 1);
+  __builtin_prefetch(&xparts[2], 0, 1);
+  for (k = 0; k < nr_parts; k++) {
+
+    /* Get a handle on the part. */
+    __builtin_prefetch(&parts[k + 3], 0, 1);
+    __builtin_prefetch(&parts[k + 3].rho_dh, 0, 1);
+    __builtin_prefetch(&xparts[k + 3], 0, 1);
+    p = &parts[k];
+    xp = &xparts[k];
+
+    /* Get local copies of particle data. */
+    pdt = p->dt;
+    m = p->mass;
+    x[0] = p->x[0];
+    x[1] = p->x[1];
+    x[2] = p->x[2];
+    v_hdt[0] = xp->v_hdt[0];
+    v_hdt[1] = xp->v_hdt[1];
+    v_hdt[2] = xp->v_hdt[2];
+    u_hdt = xp->u_hdt;
+
+    /* Update the particle's data (if active). */
+    if (pdt <= dt_step) {
+
+      /* Increase the number of particles updated. */
+      count += 1;
+
+      /* Scale the derivatives as they're freshly computed. */
+      h = p->h;
+      h_dt = p->force.h_dt *= h * 0.333333333f;
+      xp->omega = 1.0f + h * p->rho_dh / p->rho * 0.3333333333f;
+
+      /* Compute the new time step. */
+      u_dt = p->force.u_dt;
+      dt_cfl = const_cfl * h / p->force.v_sig;
+      dt_h_change =
+          (h_dt != 0.0f) ? fabsf(const_ln_max_h_change * h / h_dt) : FLT_MAX;
+      dt_u_change =
+          (u_dt != 0.0f) ? fabsf(const_max_u_change * p->u / u_dt) : FLT_MAX;
+      dt_new = fminf(dt_cfl, fminf(dt_h_change, dt_u_change));
+      if (pdt == 0.0f)
+        p->dt = pdt = dt_new;
+      else
+        p->dt = pdt = fminf(dt_new, 2.0f * pdt);
+
+      /* Update velocities and energies at the full step. */
+      p->v[0] = v_hdt[0] + hdt * p->a[0];
+      p->v[1] = v_hdt[1] + hdt * p->a[1];
+      p->v[2] = v_hdt[2] + hdt * p->a[2];
+      p->u = u_hdt + hdt * u_dt;
+
+      /* Set the new particle-specific time step. */
+      if (dt > 0.0f) {
+        float dt_curr = dt;
+        j = (int)(pdt * idt);
+        while (j > 1) {
+          dt_curr *= 2.0f;
+          j >>= 1;
         }
-
-    #ifdef TIMER_VERBOSE
-        message( "runner %02i: %i parts at depth %i took %.3f ms." ,
-            r->id , c->count , c->depth ,
-            ((double)TIMER_TOC(timer_kick2)) / CPU_TPS * 1000 ); fflush(stdout);
-    #else
-        TIMER_TOC(timer_kick2);
-    #endif
-        
-    /* Store the computed values in the cell. */
-    c->dt_min = dt_min;
-    c->dt_max = dt_max;
-    c->updated = count;
-    c->ekin = ekin;
-    c->epot = epot;
-    c->mom[0] = mom[0]; c->mom[1] = mom[1]; c->mom[2] = mom[2];
-    c->ang[0] = ang[0]; c->ang[1] = ang[1]; c->ang[2] = ang[2];
-        
+        xp->dt_curr = dt_curr;
+      }
     }
 
+    /* Get the smallest/largest dt. */
+    dt_min = fminf(dt_min, pdt);
+    dt_max = fmaxf(dt_max, pdt);
+
+    /* Collect total energy. */
+    ekin += 0.5 * m *
+            (v_hdt[0] * v_hdt[0] + v_hdt[1] * v_hdt[1] + v_hdt[2] * v_hdt[2]);
+    epot += m * u_hdt;
+
+    /* Collect momentum */
+    mom[0] += m * v_hdt[0];
+    mom[1] += m * v_hdt[1];
+    mom[2] += m * v_hdt[2];
+
+    /* Collect angular momentum */
+    ang[0] += m * (x[1] * v_hdt[2] - x[2] * v_hdt[1]);
+    ang[1] += m * (x[2] * v_hdt[0] - x[0] * v_hdt[2]);
+    ang[2] += m * (x[0] * v_hdt[1] - x[1] * v_hdt[0]);
+
+    /* Collect entropic function */
+    // lent += u * pow( p->rho, 1.f-const_gamma );
+  }
+
+#ifdef TIMER_VERBOSE
+  message("runner %02i: %i parts at depth %i took %.3f ms.", r->id, c->count,
+          c->depth, ((double)TIMER_TOC(timer_kick2)) / CPU_TPS * 1000);
+  fflush(stdout);
+#else
+  TIMER_TOC(timer_kick2);
+#endif
+
+  /* Store the computed values in the cell. */
+  c->dt_min = dt_min;
+  c->dt_max = dt_max;
+  c->updated = count;
+  c->ekin = ekin;
+  c->epot = epot;
+  c->mom[0] = mom[0];
+  c->mom[1] = mom[1];
+  c->mom[2] = mom[2];
+  c->ang[0] = ang[0];
+  c->ang[1] = ang[1];
+  c->ang[2] = ang[2];
+}
 
 /**
  * @brief Mapping function to set dt_min and dt_max, do the first
  * kick.
  */
 
-void runner_dokick1 ( struct runner *r , struct cell *c ) {
-
-    int j, k;
-    struct engine *e = r->e;
-    float pdt, dt_step = e->dt_step, dt = e->dt, hdt = dt/2;
-    float dt_min, dt_max, h_max, dx, dx_max;
-    float a[3], v[3], u, u_dt, h, h_dt, w, rho;
-    double x[3], x_old[3];
-    struct part *restrict p, *restrict parts = c->parts;
-    struct xpart *restrict xp, *restrict xparts = c->xparts;
-
-    /* No children? */
-    if ( !c->split ) {
-    
-        /* Init the min/max counters. */
-        dt_min = FLT_MAX;
-        dt_max = 0.0f;
-        h_max = 0.0f;
-        dx_max = 0.0f;
-    
-        /* Loop over parts. */
-        __builtin_prefetch( &parts[0] , 0 , 1 );
-        __builtin_prefetch( &parts[0].rho_dh , 0 , 1 );
-        __builtin_prefetch( &xparts[0] , 0 , 1 );
-        __builtin_prefetch( &parts[1] , 0 , 1 );
-        __builtin_prefetch( &parts[1].rho_dh , 0 , 1 );
-        __builtin_prefetch( &xparts[1] , 0 , 1 );
-        __builtin_prefetch( &parts[2] , 0 , 1 );
-        __builtin_prefetch( &parts[2].rho_dh , 0 , 1 );
-        __builtin_prefetch( &xparts[2] , 0 , 1 );
-        for ( k = 0 ; k < c->count ; k++ ) {
-            
-            /* Get a handle on the kth particle. */
-            __builtin_prefetch( &parts[k+3] , 0 , 1 );
-            __builtin_prefetch( &parts[k+3].rho_dh , 0 , 1 );
-            __builtin_prefetch( &xparts[k+3] , 0 , 1 );
-            p = &parts[k];
-            xp = &xparts[k];
-            
-            /* Load the data locally. */
-            a[0] = p->a[0]; a[1] = p->a[1]; a[2] = p->a[2];
-            v[0] = p->v[0]; v[1] = p->v[1]; v[2] = p->v[2];
-            x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
-            x_old[0] = xp->x_old[0]; x_old[1] = xp->x_old[1]; x_old[2] = xp->x_old[2];
-            h = p->h;
-            u = p->u;
-            h_dt = p->force.h_dt;
-            u_dt = p->force.u_dt;
-            pdt = p->dt;
-            
-            /* Store the min/max dt. */
-            dt_min = fminf( dt_min , pdt );
-            dt_max = fmaxf( dt_max , pdt );
-            
-            /* Update the half-step velocities from the current velocities. */
-            xp->v_hdt[0] = v[0] + hdt * a[0];
-            xp->v_hdt[1] = v[1] + hdt * a[1];
-            xp->v_hdt[2] = v[2] + hdt * a[2];
-            xp->u_hdt = u + hdt * u_dt;
-            
-            /* Move the particles with the velocities at the half-step. */
-            p->x[0] = x[0] += dt * xp->v_hdt[0];
-            p->x[1] = x[1] += dt * xp->v_hdt[1];
-            p->x[2] = x[2] += dt * xp->v_hdt[2];
-            dx = sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) +
-                        (x[1] - x_old[1])*(x[1] - x_old[1]) +
-                        (x[2] - x_old[2])*(x[2] - x_old[2]) );
-            dx_max = fmaxf( dx_max , dx );
-
-            /* Update positions and energies at the half-step. */
-            p->v[0] = v[0] + dt * a[0];
-            p->v[1] = v[1] + dt * a[1];
-            p->v[2] = v[2] + dt * a[2];
-            w = u_dt / u * dt;
-            if ( fabsf( w ) < 0.01f )
-                p->u = u *= 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) );
-            else
-                p->u = u *= expf( w );
-            w = h_dt / h * dt;
-            if ( fabsf( w ) < 0.01f )
-                p->h = h *= 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) );
-            else
-                p->h = h *= expf( w );
-            h_max = fmaxf( h_max , h );
-
-        
-            /* Integrate other values if this particle will not be updated. */
-            /* Init fields for density calculation. */
-            if ( pdt > dt_step ) {
-                float w = -3.0f * h_dt / h * dt;
-                if ( fabsf( w ) < 0.1f )
-                    rho = p->rho *= 1.0f + w*( 1.0f + w*( 0.5f + w*(1.0f/6.0f + 1.0f/24.0f*w ) ) );
-                else
-                    rho = p->rho *= expf( w );
-                p->force.POrho2 = u * ( const_hydro_gamma - 1.0f ) / ( rho * xp->omega );
-                }
-            else {
-                p->density.wcount = 0.0f;
-                p->density.wcount_dh = 0.0f;
-                p->rho = 0.0f;
-                p->rho_dh = 0.0f;
-	            p->density.div_v = 0.0f;
-	            for ( j = 0 ; j < 3 ; ++j)
-	                p->density.curl_v[j] = 0.0f;
-                }
-                
-            }
-            
-        }
-        
-    /* Otherwise, agregate data from children. */
-    else {
-    
-        /* Init with the first non-null child. */
-        dt_min = FLT_MAX;
-        dt_max = 0.0f;
-        h_max = 0.0f;
-        dx_max = 0.0f;
-        
-        /* Loop over the progeny. */
-        for ( k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL ) {
-                if ( c->count < space_subsize )
-                    runner_dokick1( r , c->progeny[k] );
-                dt_min = fminf( dt_min , c->progeny[k]->dt_min );
-                dt_max = fmaxf( dt_max , c->progeny[k]->dt_max );
-                h_max = fmaxf( h_max , c->progeny[k]->h_max );
-                dx_max = fmaxf( dx_max , c->progeny[k]->dx_max );
-                }
-    
-        }
-
-    /* Store the values. */
-    c->dt_min = dt_min;
-    c->dt_max = dt_max;
-    c->h_max = h_max;
-    c->dx_max = dx_max;
-    
+void runner_dokick1(struct runner *r, struct cell *c) {
+
+  int j, k;
+  struct engine *e = r->e;
+  float pdt, dt_step = e->dt_step, dt = e->dt, hdt = dt / 2;
+  float dt_min, dt_max, h_max, dx, dx_max;
+  float a[3], v[3], u, u_dt, h, h_dt, w, rho;
+  double x[3], x_old[3];
+  struct part *restrict p, *restrict parts = c->parts;
+  struct xpart *restrict xp, *restrict xparts = c->xparts;
+
+  /* No children? */
+  if (!c->split) {
+
+    /* Init the min/max counters. */
+    dt_min = FLT_MAX;
+    dt_max = 0.0f;
+    h_max = 0.0f;
+    dx_max = 0.0f;
+
+    /* Loop over parts. */
+    __builtin_prefetch(&parts[0], 0, 1);
+    __builtin_prefetch(&parts[0].rho_dh, 0, 1);
+    __builtin_prefetch(&xparts[0], 0, 1);
+    __builtin_prefetch(&parts[1], 0, 1);
+    __builtin_prefetch(&parts[1].rho_dh, 0, 1);
+    __builtin_prefetch(&xparts[1], 0, 1);
+    __builtin_prefetch(&parts[2], 0, 1);
+    __builtin_prefetch(&parts[2].rho_dh, 0, 1);
+    __builtin_prefetch(&xparts[2], 0, 1);
+    for (k = 0; k < c->count; k++) {
+
+      /* Get a handle on the kth particle. */
+      __builtin_prefetch(&parts[k + 3], 0, 1);
+      __builtin_prefetch(&parts[k + 3].rho_dh, 0, 1);
+      __builtin_prefetch(&xparts[k + 3], 0, 1);
+      p = &parts[k];
+      xp = &xparts[k];
+
+      /* Load the data locally. */
+      a[0] = p->a[0];
+      a[1] = p->a[1];
+      a[2] = p->a[2];
+      v[0] = p->v[0];
+      v[1] = p->v[1];
+      v[2] = p->v[2];
+      x[0] = p->x[0];
+      x[1] = p->x[1];
+      x[2] = p->x[2];
+      x_old[0] = xp->x_old[0];
+      x_old[1] = xp->x_old[1];
+      x_old[2] = xp->x_old[2];
+      h = p->h;
+      u = p->u;
+      h_dt = p->force.h_dt;
+      u_dt = p->force.u_dt;
+      pdt = p->dt;
+
+      /* Store the min/max dt. */
+      dt_min = fminf(dt_min, pdt);
+      dt_max = fmaxf(dt_max, pdt);
+
+      /* Update the half-step velocities from the current velocities. */
+      xp->v_hdt[0] = v[0] + hdt * a[0];
+      xp->v_hdt[1] = v[1] + hdt * a[1];
+      xp->v_hdt[2] = v[2] + hdt * a[2];
+      xp->u_hdt = u + hdt * u_dt;
+
+      /* Move the particles with the velocities at the half-step. */
+      p->x[0] = x[0] += dt * xp->v_hdt[0];
+      p->x[1] = x[1] += dt * xp->v_hdt[1];
+      p->x[2] = x[2] += dt * xp->v_hdt[2];
+      dx = sqrtf((x[0] - x_old[0]) * (x[0] - x_old[0]) +
+                 (x[1] - x_old[1]) * (x[1] - x_old[1]) +
+                 (x[2] - x_old[2]) * (x[2] - x_old[2]));
+      dx_max = fmaxf(dx_max, dx);
+
+      /* Update positions and energies at the half-step. */
+      p->v[0] = v[0] + dt * a[0];
+      p->v[1] = v[1] + dt * a[1];
+      p->v[2] = v[2] + dt * a[2];
+      w = u_dt / u * dt;
+      if (fabsf(w) < 0.01f)
+        p->u = u *=
+            1.0f +
+            w * (1.0f + w * (0.5f + w * (1.0f / 6.0f + 1.0f / 24.0f * w)));
+      else
+        p->u = u *= expf(w);
+      w = h_dt / h * dt;
+      if (fabsf(w) < 0.01f)
+        p->h = h *=
+            1.0f +
+            w * (1.0f + w * (0.5f + w * (1.0f / 6.0f + 1.0f / 24.0f * w)));
+      else
+        p->h = h *= expf(w);
+      h_max = fmaxf(h_max, h);
+
+      /* Integrate other values if this particle will not be updated. */
+      /* Init fields for density calculation. */
+      if (pdt > dt_step) {
+        float w = -3.0f * h_dt / h * dt;
+        if (fabsf(w) < 0.1f)
+          rho = p->rho *=
+              1.0f +
+              w * (1.0f + w * (0.5f + w * (1.0f / 6.0f + 1.0f / 24.0f * w)));
+        else
+          rho = p->rho *= expf(w);
+        p->force.POrho2 = u * (const_hydro_gamma - 1.0f) / (rho * xp->omega);
+      } else {
+        p->density.wcount = 0.0f;
+        p->density.wcount_dh = 0.0f;
+        p->rho = 0.0f;
+        p->rho_dh = 0.0f;
+        p->density.div_v = 0.0f;
+        for (j = 0; j < 3; ++j) p->density.curl_v[j] = 0.0f;
+      }
     }
 
+  }
+
+  /* Otherwise, aggregate data from children. */
+  else {
+
+    /* Init with the first non-null child. */
+    dt_min = FLT_MAX;
+    dt_max = 0.0f;
+    h_max = 0.0f;
+    dx_max = 0.0f;
+
+    /* Loop over the progeny. */
+    for (k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) {
+        if (c->count < space_subsize) runner_dokick1(r, c->progeny[k]);
+        dt_min = fminf(dt_min, c->progeny[k]->dt_min);
+        dt_max = fmaxf(dt_max, c->progeny[k]->dt_max);
+        h_max = fmaxf(h_max, c->progeny[k]->h_max);
+        dx_max = fmaxf(dx_max, c->progeny[k]->dx_max);
+      }
+  }
+
+  /* Store the values. */
+  c->dt_min = dt_min;
+  c->dt_max = dt_max;
+  c->h_max = h_max;
+  c->dx_max = dx_max;
+}
 
 /**
  * @brief Combined second and first kick for fixed dt.
  *
  * @param r The runner thread.
  * @param c The cell.
- * @param timer The timer 
+ * @param timer The timer
  */
- 
-void runner_dokick ( struct runner *r , struct cell *c , int timer ) {
-
-    int k, count = 0, nr_parts = c->count, updated;
-    float dt_min = FLT_MAX, dt_max = 0.0f;
-    float h_max, dx, dx_max;
-    double ekin = 0.0, epot = 0.0;
-    float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f };
-    float x[3], x_old[3], v_hdt[3], a[3], u, u_hdt, h, pdt, m, w;
-    float dt = r->e->dt, hdt = 0.5f*dt;
-    float dt_cfl, dt_h_change, dt_u_change, dt_new;
-    float h_dt, u_dt;
-    struct part *restrict p, *restrict parts = c->parts;
-    struct xpart *restrict xp, *restrict xparts = c->xparts;
-    
-    TIMER_TIC
-    
-    /* No children? */
-    if ( !c->split ) {
-    
-        /* Init the min/max counters. */
-        dt_min = FLT_MAX;
-        dt_max = 0.0f;
-        h_max = 0.0f;
-        dx_max = 0.0f;
-    
-        /* Loop over the particles and kick them. */
-        __builtin_prefetch( &parts[0] , 0 , 1 );
-        __builtin_prefetch( &parts[0].rho_dh , 0 , 1 );
-        __builtin_prefetch( &xparts[0] , 0 , 1 );
-        __builtin_prefetch( &parts[1] , 0 , 1 );
-        __builtin_prefetch( &parts[1].rho_dh , 0 , 1 );
-        __builtin_prefetch( &xparts[1] , 0 , 1 );
-        __builtin_prefetch( &parts[2] , 0 , 1 );
-        __builtin_prefetch( &parts[2].rho_dh , 0 , 1 );
-        __builtin_prefetch( &xparts[2] , 0 , 1 );
-        for ( k = 0 ; k < nr_parts ; k++ ) {
-
-            /* Get a handle on the part. */
-            __builtin_prefetch( &parts[k+3] , 0 , 1 );
-            __builtin_prefetch( &parts[k+3].rho_dh , 0 , 1 );
-            __builtin_prefetch( &xparts[k+3] , 0 , 1 );
-            p = &parts[k];
-            xp = &xparts[k];
-
-            /* Get local copies of particle data. */
-            pdt = p->dt;
-            u_dt = p->force.u_dt;
-            h = p->h;
-            m = p->mass;
-            x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
-            a[0] = p->a[0]; a[1] = p->a[1]; a[2] = p->a[2];
-            x_old[0] = xp->x_old[0]; x_old[1] = xp->x_old[1]; x_old[2] = xp->x_old[2];
-            v_hdt[0] = xp->v_hdt[0]; v_hdt[1] = xp->v_hdt[1]; v_hdt[2] = xp->v_hdt[2];
-            u_hdt = xp->u_hdt;
-
-            /* Scale the derivatives if they're freshly computed. */
-            h_dt = p->force.h_dt *= h * 0.333333333f;
-            count += 1;
-            xp->omega = 1.0f + h * p->rho_dh / p->rho * 0.3333333333f;
-
-            /* Update the particle's time step. */
-            dt_cfl = const_cfl * h / p->force.v_sig;
-            dt_h_change = ( h_dt != 0.0f ) ? fabsf( const_ln_max_h_change * h / h_dt ) : FLT_MAX;
-            dt_u_change = ( u_dt != 0.0f ) ? fabsf( const_max_u_change * p->u / u_dt ) : FLT_MAX;
-            dt_new = fminf( dt_cfl , fminf( dt_h_change , dt_u_change ) );
-            if ( pdt == 0.0f )
-                p->dt = pdt = dt_new;
-            else
-                p->dt = pdt = fminf( dt_new , 2.0f*pdt );
-
-            /* Get the smallest/largest dt. */
-            dt_min = fminf( dt_min , pdt );
-            dt_max = fmaxf( dt_max , pdt );
-
-            /* Step and store the velocity and internal energy. */
-            xp->v_hdt[0] = ( v_hdt[0] += dt * a[0] );
-            xp->v_hdt[1] = ( v_hdt[1] += dt * a[1] );
-            xp->v_hdt[2] = ( v_hdt[2] += dt * a[2] );
-            xp->u_hdt = ( u_hdt += dt * u_dt );
-
-            /* Move the particles with the velocitie at the half-step. */
-            p->x[0] = x[0] += dt * v_hdt[0];
-            p->x[1] = x[1] += dt * v_hdt[1];
-            p->x[2] = x[2] += dt * v_hdt[2];
-            dx = sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) +
-                        (x[1] - x_old[1])*(x[1] - x_old[1]) +
-                        (x[2] - x_old[2])*(x[2] - x_old[2]) );
-            dx_max = fmaxf( dx_max , dx );
-
-            /* Update positions and energies at the next full step. */
-            p->v[0] = v_hdt[0] + hdt * a[0];
-            p->v[1] = v_hdt[1] + hdt * a[1];
-            p->v[2] = v_hdt[2] + hdt * a[2];
-            w = u_dt / u_hdt * hdt;
-            if ( fabsf( w ) < 0.01f )
-                p->u = u = u_hdt * ( 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) ) );
-            else
-                p->u = u = u_hdt * expf( w );
-            w = h_dt / h * dt;
-            if ( fabsf( w ) < 0.01f )
-                p->h = h *= ( 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) ) );
-            else
-                p->h = h *= expf( w );
-            h_max = fmaxf( h_max , h );
-
-            /* Collect momentum */
-            mom[0] += m * v_hdt[0];
-            mom[1] += m * v_hdt[1];
-            mom[2] += m * v_hdt[2];
-
-	        /* Collect angular momentum */
-	        ang[0] += m * ( x[1]*v_hdt[2] - x[2]*v_hdt[1] );
-	        ang[1] += m * ( x[2]*v_hdt[0] - x[0]*v_hdt[2] );
-	        ang[2] += m * ( x[0]*v_hdt[1] - x[1]*v_hdt[0] );
-
-            /* Collect total energy. */
-            ekin += 0.5 * m * ( v_hdt[0]*v_hdt[0] + v_hdt[1]*v_hdt[1] + v_hdt[2]*v_hdt[2] );
-            epot += m * u_hdt;
-
-            /* Init fields for density calculation. */
-            p->density.wcount = 0.0f;
-            p->density.wcount_dh = 0.0f;
-            p->rho = 0.0f;
-            p->rho_dh = 0.0f;
-	        p->density.div_v = 0.0f;
-            p->density.curl_v[0] = 0.0f;
-            p->density.curl_v[1] = 0.0f;
-            p->density.curl_v[2] = 0.0f;
-                
-            }
-            
-        }
-        
-    /* Otherwise, agregate data from children. */
-    else {
-    
-        /* Init with the first non-null child. */
-        dt_min = FLT_MAX;
-        dt_max = 0.0f;
-        h_max = 0.0f;
-        dx_max = 0.0f;
-        updated = 0;
-        ekin = 0.0;
-        epot = 0.0;
-        mom[0] = 0.0f; mom[1] = 0.0f; mom[2] = 0.0f;
-        ang[0] = 0.0f; ang[1] = 0.0f; ang[2] = 0.0f;
-        
-        /* Loop over the progeny. */
-        for ( k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL ) {
-                struct cell *cp = c->progeny[k];
-                runner_dokick( r , cp , 0 );
-                dt_min = fminf( dt_min , cp->dt_min );
-                dt_max = fmaxf( dt_max , cp->dt_max );
-                h_max = fmaxf( h_max , cp->h_max );
-                dx_max = fmaxf( dx_max , cp->dx_max );
-                updated += cp->count;
-                ekin += cp->ekin;
-                epot += cp->epot;
-                mom[0] += cp->mom[0]; mom[1] += cp->mom[1]; mom[2] += cp->mom[2];
-                ang[0] += cp->ang[0]; ang[1] += cp->ang[1]; ang[2] += cp->ang[2];
-                }
-    
-        }
 
-    /* Store the values. */
-    c->dt_min = dt_min;
-    c->dt_max = dt_max;
-    c->h_max = h_max;
-    c->dx_max = dx_max;
-    c->updated = count;
-    c->ekin = ekin;
-    c->epot = epot;
-    c->mom[0] = mom[0]; c->mom[1] = mom[1]; c->mom[2] = mom[2];
-    c->ang[0] = ang[0]; c->ang[1] = ang[1]; c->ang[2] = ang[2];
-    
-    if ( timer ) {
-        #ifdef TIMER_VERBOSE
-            message( "runner %02i: %i parts at depth %i took %.3f ms." ,
-                r->id , c->count , c->depth ,
-                ((double)TIMER_TOC(timer_kick2)) / CPU_TPS * 1000 ); fflush(stdout);
-        #else
-            TIMER_TOC(timer_kick2);
-        #endif
-        }
-        
+void runner_dokick(struct runner *r, struct cell *c, int timer) {
+  /* Kick the parts of c, or recurse into its progeny, and collect totals. */
+  int k, count = 0, nr_parts = c->count;
+  float dt_min = FLT_MAX, dt_max = 0.0f;
+  float h_max, dx, dx_max;
+  double ekin = 0.0, epot = 0.0;
+  float mom[3] = {0.0f, 0.0f, 0.0f}, ang[3] = {0.0f, 0.0f, 0.0f};
+  float x[3], x_old[3], v_hdt[3], a[3], u, u_hdt, h, pdt, m, w;
+  float dt = r->e->dt, hdt = 0.5f * dt;
+  float dt_cfl, dt_h_change, dt_u_change, dt_new;
+  float h_dt, u_dt;
+  struct part *restrict p, *restrict parts = c->parts;
+  struct xpart *restrict xp, *restrict xparts = c->xparts;
+
+  TIMER_TIC
+
+  /* No children? */
+  if (!c->split) {
+
+    /* Init the min/max counters. */
+    dt_min = FLT_MAX;
+    dt_max = 0.0f;
+    h_max = 0.0f;
+    dx_max = 0.0f;
+
+    /* Loop over the particles and kick them. */
+    __builtin_prefetch(&parts[0], 0, 1);
+    __builtin_prefetch(&parts[0].rho_dh, 0, 1);
+    __builtin_prefetch(&xparts[0], 0, 1);
+    __builtin_prefetch(&parts[1], 0, 1);
+    __builtin_prefetch(&parts[1].rho_dh, 0, 1);
+    __builtin_prefetch(&xparts[1], 0, 1);
+    __builtin_prefetch(&parts[2], 0, 1);
+    __builtin_prefetch(&parts[2].rho_dh, 0, 1);
+    __builtin_prefetch(&xparts[2], 0, 1);
+    for (k = 0; k < nr_parts; k++) {
+
+      /* Get a handle on the part. */
+      __builtin_prefetch(&parts[k + 3], 0, 1);
+      __builtin_prefetch(&parts[k + 3].rho_dh, 0, 1);
+      __builtin_prefetch(&xparts[k + 3], 0, 1);
+      p = &parts[k];
+      xp = &xparts[k];
+
+      /* Get local copies of particle data. */
+      pdt = p->dt;
+      u_dt = p->force.u_dt;
+      h = p->h;
+      m = p->mass;
+      x[0] = p->x[0];
+      x[1] = p->x[1];
+      x[2] = p->x[2];
+      a[0] = p->a[0];
+      a[1] = p->a[1];
+      a[2] = p->a[2];
+      x_old[0] = xp->x_old[0];
+      x_old[1] = xp->x_old[1];
+      x_old[2] = xp->x_old[2];
+      v_hdt[0] = xp->v_hdt[0];
+      v_hdt[1] = xp->v_hdt[1];
+      v_hdt[2] = xp->v_hdt[2];
+      u_hdt = xp->u_hdt;
+
+      /* Scale the derivatives if they're freshly computed. */
+      h_dt = p->force.h_dt *= h * 0.333333333f;
+      count += 1;
+      xp->omega = 1.0f + h * p->rho_dh / p->rho * 0.3333333333f;
+
+      /* Update the particle's time step. */
+      dt_cfl = const_cfl * h / p->force.v_sig;
+      dt_h_change =
+          (h_dt != 0.0f) ? fabsf(const_ln_max_h_change * h / h_dt) : FLT_MAX;
+      dt_u_change =
+          (u_dt != 0.0f) ? fabsf(const_max_u_change * p->u / u_dt) : FLT_MAX;
+      dt_new = fminf(dt_cfl, fminf(dt_h_change, dt_u_change));
+      if (pdt == 0.0f)
+        p->dt = pdt = dt_new;
+      else
+        p->dt = pdt = fminf(dt_new, 2.0f * pdt);
+
+      /* Get the smallest/largest dt. */
+      dt_min = fminf(dt_min, pdt);
+      dt_max = fmaxf(dt_max, pdt);
+
+      /* Step and store the velocity and internal energy. */
+      xp->v_hdt[0] = (v_hdt[0] += dt * a[0]);
+      xp->v_hdt[1] = (v_hdt[1] += dt * a[1]);
+      xp->v_hdt[2] = (v_hdt[2] += dt * a[2]);
+      xp->u_hdt = (u_hdt += dt * u_dt);
+
+      /* Move the particles with the velocities at the half-step. */
+      p->x[0] = x[0] += dt * v_hdt[0];
+      p->x[1] = x[1] += dt * v_hdt[1];
+      p->x[2] = x[2] += dt * v_hdt[2];
+      dx = sqrtf((x[0] - x_old[0]) * (x[0] - x_old[0]) +
+                 (x[1] - x_old[1]) * (x[1] - x_old[1]) +
+                 (x[2] - x_old[2]) * (x[2] - x_old[2]));
+      dx_max = fmaxf(dx_max, dx);
+
+      /* Update velocities, internal energy and h at the next full step. */
+      p->v[0] = v_hdt[0] + hdt * a[0];
+      p->v[1] = v_hdt[1] + hdt * a[1];
+      p->v[2] = v_hdt[2] + hdt * a[2];
+      w = u_dt / u_hdt * hdt;
+      if (fabsf(w) < 0.01f)
+        p->u = u =
+            u_hdt *
+            (1.0f +
+             w * (1.0f + w * (0.5f + w * (1.0f / 6.0f + 1.0f / 24.0f * w))));
+      else
+        p->u = u = u_hdt * expf(w);
+      w = h_dt / h * dt;
+      if (fabsf(w) < 0.01f)
+        p->h = h *=
+            (1.0f +
+             w * (1.0f + w * (0.5f + w * (1.0f / 6.0f + 1.0f / 24.0f * w))));
+      else
+        p->h = h *= expf(w);
+      h_max = fmaxf(h_max, h);
+
+      /* Collect momentum */
+      mom[0] += m * v_hdt[0];
+      mom[1] += m * v_hdt[1];
+      mom[2] += m * v_hdt[2];
+
+      /* Collect angular momentum */
+      ang[0] += m * (x[1] * v_hdt[2] - x[2] * v_hdt[1]);
+      ang[1] += m * (x[2] * v_hdt[0] - x[0] * v_hdt[2]);
+      ang[2] += m * (x[0] * v_hdt[1] - x[1] * v_hdt[0]);
+
+      /* Collect total energy. */
+      ekin += 0.5 * m *
+              (v_hdt[0] * v_hdt[0] + v_hdt[1] * v_hdt[1] + v_hdt[2] * v_hdt[2]);
+      epot += m * u_hdt;
+
+      /* Init fields for density calculation. */
+      p->density.wcount = 0.0f;
+      p->density.wcount_dh = 0.0f;
+      p->rho = 0.0f;
+      p->rho_dh = 0.0f;
+      p->density.div_v = 0.0f;
+      p->density.curl_v[0] = 0.0f;
+      p->density.curl_v[1] = 0.0f;
+      p->density.curl_v[2] = 0.0f;
     }
 
+  }
+
+  /* Otherwise, aggregate data from children. */
+  else {
+
+    /* Reset the accumulators before looping over the children. */
+    dt_min = FLT_MAX;
+    dt_max = 0.0f;
+    h_max = 0.0f;
+    dx_max = 0.0f;
+    count = 0;
+    ekin = 0.0;
+    epot = 0.0;
+    mom[0] = 0.0f;
+    mom[1] = 0.0f;
+    mom[2] = 0.0f;
+    ang[0] = 0.0f;
+    ang[1] = 0.0f;
+    ang[2] = 0.0f;
+
+    /* Loop over the progeny. */
+    for (k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) {
+        struct cell *cp = c->progeny[k];
+        runner_dokick(r, cp, 0);
+        dt_min = fminf(dt_min, cp->dt_min);
+        dt_max = fmaxf(dt_max, cp->dt_max);
+        h_max = fmaxf(h_max, cp->h_max);
+        dx_max = fmaxf(dx_max, cp->dx_max);
+        count += cp->updated; /* set by the recursive call above. */
+        ekin += cp->ekin;
+        epot += cp->epot;
+        mom[0] += cp->mom[0];
+        mom[1] += cp->mom[1];
+        mom[2] += cp->mom[2];
+        ang[0] += cp->ang[0];
+        ang[1] += cp->ang[1];
+        ang[2] += cp->ang[2];
+      }
+  }
+
+  /* Store the values. */
+  c->dt_min = dt_min;
+  c->dt_max = dt_max;
+  c->h_max = h_max;
+  c->dx_max = dx_max;
+  c->updated = count; /* parts kicked here, or summed over progeny. */
+  c->ekin = ekin;
+  c->epot = epot;
+  c->mom[0] = mom[0];
+  c->mom[1] = mom[1];
+  c->mom[2] = mom[2];
+  c->ang[0] = ang[0];
+  c->ang[1] = ang[1];
+  c->ang[2] = ang[2];
+  /* Recursive calls pass timer == 0, so only the outer call stops it. */
+  if (timer) {
+#ifdef TIMER_VERBOSE
+    message("runner %02i: %i parts at depth %i took %.3f ms.", r->id, c->count,
+            c->depth, ((double)TIMER_TOC(timer_kick2)) / CPU_TPS * 1000);
+    fflush(stdout);
+#else
+    TIMER_TOC(timer_kick2);
+#endif
+  }
+}
 
 /**
  * @brief The #runner main thread routine.
  *
  * @param data A pointer to this thread's data.
  */
- 
-void *runner_main ( void *data ) {
-
-    struct runner *r = (struct runner *)data;
-    struct engine *e = r->e;
-    struct scheduler *sched = &e->sched;
-    struct task *t = NULL;
-    struct cell *ci, *cj, *super;
-    struct part *parts;
-    int k, nr_parts;
-    
-    /* Main loop. */
-    while ( 1 ) {
-    
-        /* Wait at the barrier. */
-        engine_barrier( e , r->id );
-
-        /* Re-set the pointer to the previous super cell. */
-        super = NULL;
-        
-        /* Loop while there are tasks... */
-        while ( 1 ) {
-        
-            /* If there's no old task, try to get a new one. */
-            if ( t == NULL ) {
-            
-                /* Get the task. */
-                TIMER_TIC
-                t = scheduler_gettask( sched , r->qid , super );
-                TIMER_TOC(timer_gettask);
-                
-                /* Did I get anything? */
-                if ( t == NULL )
-                    break;
-        
-                }
-            
-            /* Get the cells. */
-            ci = t->ci;
-            cj = t->cj;
-            t->rid = r->cpuid;
-            
-            /* Set super to the first cell that I own. */
-            if ( ci->super != NULL && ci->super->owner == r->qid )
-                super = ci->super;
-            else if ( cj != NULL && cj->super != NULL && cj->super->owner == r->qid )
-                super = cj->super;
-            /* else
-                super = NULL; */
-                
-            /* Prefetch? */
-            if ( runner_prefetch &&
-                 t->type != task_type_kick1 && t->type != task_type_kick2 && t->type != task_type_ghost ) {
-                for ( int k = 0 ; k < ci->count ; k++ )
-                    __builtin_prefetch( &ci->parts[k] , 1 , 3 );
-                if ( cj != NULL )
-                    for ( int k = 0 ; k < cj->count ; k++ )
-                        __builtin_prefetch( &cj->parts[k] , 1 , 3 );
-                }
-            
-            /* Different types of tasks... */
-            switch ( t->type ) {
-                case task_type_self:
-                    if ( t->subtype == task_subtype_density )
-                        runner_doself1_density( r , ci );
-                    else if ( t->subtype == task_subtype_force )
-                        runner_doself2_force( r , ci );
-                    else
-                        error( "Unknown task subtype." );
-                    break;
-                case task_type_pair:
-                    if ( t->subtype == task_subtype_density )
-                        runner_dopair1_density( r , ci , cj );
-                    else if ( t->subtype == task_subtype_force )
-                        runner_dopair2_force( r , ci , cj );
-                    else
-                        error( "Unknown task subtype." );
-                    break;
-                case task_type_sort:
-                    runner_dosort( r , ci , t->flags , 1 );
-                    break;
-                case task_type_sub:
-                    if ( t->subtype == task_subtype_density )
-                        runner_dosub1_density( r , ci , cj , t->flags , 1 );
-                    else if ( t->subtype == task_subtype_force )
-                        runner_dosub2_force( r , ci , cj , t->flags , 1 );
-                    else if ( t->subtype == task_subtype_grav )
-                        runner_dosub_grav( r , ci , cj , 1 );
-                    else
-                        error( "Unknown task subtype." );
-                    break;
-                case task_type_ghost:
-                    runner_doghost( r , ci );
-                    break;
-                case task_type_kick1:
-                    runner_dokick1( r , ci );
-                    break;
-                case task_type_kick2:
-                    if ( e->policy & engine_policy_fixdt )
-                        runner_dokick( r , ci , 1 );
-                    else
-                        runner_dokick2( r , ci );
-                    break;
-                case task_type_send:
-                    break;
-                case task_type_recv:
-                    parts = ci->parts;
-                    nr_parts = ci->count;
-                    for ( k = 0 ; k < nr_parts ; k++ )
-                        parts[k].dt = FLT_MAX;
-                    ci->dt_min = ci->dt_max = FLT_MAX;
-                    break;
-                case task_type_grav_pp:
-                    if ( t->cj == NULL )
-                        runner_doself_grav( r , t->ci );
-                    else
-                       runner_dopair_grav( r , t->ci , t->cj );
-                    break;
-                case task_type_grav_mm:
-                    runner_dograv_mm( r , t->ci , t->cj );
-                    break;
-                case task_type_grav_up:
-                    runner_dograv_up( r , t->ci );
-                    break;
-                case task_type_grav_down:
-                    runner_dograv_down( r , t->ci );
-                    break;
-                default:
-                    error( "Unknown task type." );
-                }
-            
-            /* We're done with this task, see if we get a next one. */
-            t = scheduler_done( sched , t );
-                
-            } /* main loop. */
-            
-        }
-        
-    /* Be kind, rewind. */
-    return NULL;
-
-    }
-    
 
+void *runner_main(void *data) {
+  /* Wait at the engine barrier, then execute tasks until none are left. */
+  struct runner *r = (struct runner *)data;
+  struct engine *e = r->e;
+  struct scheduler *sched = &e->sched;
+  struct task *t = NULL;
+  struct cell *ci, *cj, *super;
+  struct part *parts;
+  int k, nr_parts;
+
+  /* Main loop. */
+  while (1) {
+
+    /* Wait at the barrier. */
+    engine_barrier(e, r->id);
+
+    /* Re-set the pointer to the previous super cell. */
+    super = NULL;
+
+    /* Loop while there are tasks... */
+    while (1) {
+
+      /* If there's no old task, try to get a new one. */
+      if (t == NULL) {
+
+        /* Get the task. */
+        TIMER_TIC
+        t = scheduler_gettask(sched, r->qid, super);
+        TIMER_TOC(timer_gettask);
+
+        /* Did I get anything? */
+        if (t == NULL) break;
+      }
+
+      /* Get the cells. */
+      ci = t->ci;
+      cj = t->cj;
+      t->rid = r->cpuid;
+
+      /* Set super to the first cell that I own. */
+      if (ci->super != NULL && ci->super->owner == r->qid)
+        super = ci->super;
+      else if (cj != NULL && cj->super != NULL && cj->super->owner == r->qid)
+        super = cj->super;
+      /* else
+          super = NULL; */
+
+      /* Prefetch? */
+      if (runner_prefetch && t->type != task_type_kick1 &&
+          t->type != task_type_kick2 && t->type != task_type_ghost) {
+        for (int k = 0; k < ci->count; k++)
+          __builtin_prefetch(&ci->parts[k], 1, 3);
+        if (cj != NULL)
+          for (int k = 0; k < cj->count; k++)
+            __builtin_prefetch(&cj->parts[k], 1, 3);
+      }
+
+      /* Different types of tasks... */
+      switch (t->type) {
+        case task_type_self:
+          if (t->subtype == task_subtype_density)
+            runner_doself1_density(r, ci);
+          else if (t->subtype == task_subtype_force)
+            runner_doself2_force(r, ci);
+          else
+            error("Unknown task subtype.");
+          break;
+        case task_type_pair:
+          if (t->subtype == task_subtype_density)
+            runner_dopair1_density(r, ci, cj);
+          else if (t->subtype == task_subtype_force)
+            runner_dopair2_force(r, ci, cj);
+          else
+            error("Unknown task subtype.");
+          break;
+        case task_type_sort:
+          runner_dosort(r, ci, t->flags, 1);
+          break;
+        case task_type_sub:
+          if (t->subtype == task_subtype_density)
+            runner_dosub1_density(r, ci, cj, t->flags, 1);
+          else if (t->subtype == task_subtype_force)
+            runner_dosub2_force(r, ci, cj, t->flags, 1);
+          else if (t->subtype == task_subtype_grav)
+            runner_dosub_grav(r, ci, cj, 1);
+          else
+            error("Unknown task subtype.");
+          break;
+        case task_type_ghost:
+          runner_doghost(r, ci);
+          break;
+        case task_type_kick1:
+          runner_dokick1(r, ci);
+          break;
+        case task_type_kick2:
+          if (e->policy & engine_policy_fixdt)
+            runner_dokick(r, ci, 1);
+          else
+            runner_dokick2(r, ci);
+          break;
+        case task_type_send: /* no work is done here. */
+          break;
+        case task_type_recv: /* set dt of every part in ci to FLT_MAX. */
+          parts = ci->parts;
+          nr_parts = ci->count;
+          for (k = 0; k < nr_parts; k++) parts[k].dt = FLT_MAX;
+          ci->dt_min = ci->dt_max = FLT_MAX;
+          break;
+        case task_type_grav_pp:
+          if (t->cj == NULL)
+            runner_doself_grav(r, t->ci);
+          else
+            runner_dopair_grav(r, t->ci, t->cj);
+          break;
+        case task_type_grav_mm:
+          runner_dograv_mm(r, t->ci, t->cj);
+          break;
+        case task_type_grav_up:
+          runner_dograv_up(r, t->ci);
+          break;
+        case task_type_grav_down:
+          runner_dograv_down(r, t->ci);
+          break;
+        default:
+          error("Unknown task type.");
+      }
+
+      /* We're done with this task, see if we get a next one. */
+      t = scheduler_done(sched, t);
+
+    } /* loop while there are tasks. */
+  }
+
+  /* Be kind, rewind (not reached: the main loop above has no exit). */
+  return NULL;
+}
diff --git a/src/runner.h b/src/runner.h
index 91ac475d7079a5a49dd194668bea567e9528a74c..30e75bd6ad21d45baf328adef23d2b500015ce9b 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -1,87 +1,99 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_RUNNER_H
+#define SWIFT_RUNNER_H
 
+/* Some standard headers. */
+#include <pthread.h>
+
+/* Includes. */
+#include "cell.h"
 #include "inline.h"
 
+/* Forward-declare the engine type to avoid cyclic header dependencies. */
+struct engine;
+
 /* Some constants/flags. */
-#define runner_prefetch                 0
+#define runner_prefetch 0
 
 /* SID stuff. */
 extern const char runner_flip[];
 
-
 /* Counters. */
 enum runner_counters {
-    runner_counter_swap = 0,
-    runner_counter_stall,
-    runner_counter_steal_stall,
-    runner_counter_steal_empty,
-    runner_counter_keep,
-    runner_counter_iact,
-    runner_counter_count,
-    };
-extern int runner_counter[ runner_counter_count ];
-
+  runner_counter_swap = 0,
+  runner_counter_stall,
+  runner_counter_steal_stall,
+  runner_counter_steal_empty,
+  runner_counter_keep,
+  runner_counter_iact,
+  runner_counter_count,
+};
+extern int runner_counter[runner_counter_count];
 
 /* Counter macros. */
 #ifdef COUNTER
-    #define COUNT(c) ( __sync_add_and_fetch( &runner_counter[ c ] , 1 ) )
+#define COUNT(c) (__sync_add_and_fetch(&runner_counter[c], 1))
 #else
-    #define COUNT(c)
+#define COUNT(c)
 #endif
 
-
 /* Histogram functions. */
 #define runner_hist_a 1.0
 #define runner_hist_b 100.0
 #define runner_hist_N 99
-long long int runner_hist_bins[ runner_hist_N ];
-#define runner_hist_hit( x ) __sync_add_and_fetch( &runner_hist_bins[ (int)fmax( 0.0 , fmin( runner_hist_N-1 , ((x) - runner_hist_a) / (runner_hist_b - runner_hist_a) * runner_hist_N ) ) ] , 1 )
-
-
+long long int runner_hist_bins[runner_hist_N];
+#define runner_hist_hit(x)                                                   \
+  __sync_add_and_fetch(                                                      \
+      &runner_hist_bins[(int)fmax(                                           \
+          0.0, fmin(runner_hist_N - 1, ((x) - runner_hist_a) /               \
+                                           (runner_hist_b - runner_hist_a) * \
+                                           runner_hist_N))],                 \
+      1)
 
 /* A struct representing a runner's thread and its data. */
 struct runner {
 
-    /* The id of this thread. */
-    int id;
+  /* The id of this thread. */
+  int id;
 
-    /* The thread which it is running. */
-    pthread_t thread;
-    
-    /* The queue to use to get tasks. */
-    int cpuid, qid;
+  /* The thread which it is running. */
+  pthread_t thread;
 
-    /* The underlying runner. */
-    struct engine *e;
-    
-    };
+  /* The queue to use to get tasks. */
+  int cpuid, qid;
 
+  /* The underlying engine. */
+  struct engine *e;
+};
 
 /* Function prototypes. */
-void runner_doghost ( struct runner *r , struct cell *c );
-void runner_dopair_density ( struct runner *r , struct cell *ci , struct cell *cj );
-void runner_doself_density ( struct runner *r , struct cell *c );
-void runner_dosub_density ( struct runner *r , struct cell *ci , struct cell *cj , int flags );
-void runner_dosort ( struct runner *r , struct cell *c , int flag , int clock );
-void runner_dogsort ( struct runner *r , struct cell *c , int flag , int clock );
-void runner_dokick ( struct runner *r , struct cell *c , int timer );
-void runner_dokick1 ( struct runner *r , struct cell *c );
-void runner_dokick2 ( struct runner *r , struct cell *c );
-void *runner_main ( void *data );
+void runner_doghost(struct runner *r, struct cell *c);
+void runner_dopair_density(struct runner *r, struct cell *ci, struct cell *cj);
+void runner_doself_density(struct runner *r, struct cell *c);
+void runner_dosub_density(struct runner *r, struct cell *ci, struct cell *cj,
+                          int flags);
+void runner_dosort(struct runner *r, struct cell *c, int flag, int clock);
+void runner_dogsort(struct runner *r, struct cell *c, int flag, int clock);
+void runner_dokick(struct runner *r, struct cell *c, int timer);
+void runner_dokick1(struct runner *r, struct cell *c);
+void runner_dokick2(struct runner *r, struct cell *c);
+void *runner_main(void *data);
+
+#endif /* SWIFT_RUNNER_H */
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index 1c28b81f72572ea305814aac9ecabd65f41cbfae..017529cc94021ee9ea38ce543ac8a3c4dea2e1db 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -1,22 +1,25 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
+/* Includes. */
+#include "cell.h"
+#include "part.h"
 
 /* Before including this file, define FUNCTION, which is the
    name of the interaction function. This creates the interaction functions
@@ -24,73 +27,71 @@
    and runner_dosub_FUNCTION calling the pairwise interaction function
    runner_iact_FUNCTION. */
 
-#define PASTE(x,y) x ## _ ## y
+#define PASTE(x, y) x##_##y
 
-#define _DOPAIR1(f) PASTE(runner_dopair1,f)
+#define _DOPAIR1(f) PASTE(runner_dopair1, f)
 #define DOPAIR1 _DOPAIR1(FUNCTION)
 
-#define _DOPAIR2(f) PASTE(runner_dopair2,f)
+#define _DOPAIR2(f) PASTE(runner_dopair2, f)
 #define DOPAIR2 _DOPAIR2(FUNCTION)
 
-#define _DOPAIR_SUBSET(f) PASTE(runner_dopair_subset,f)
+#define _DOPAIR_SUBSET(f) PASTE(runner_dopair_subset, f)
 #define DOPAIR_SUBSET _DOPAIR_SUBSET(FUNCTION)
 
-#define _DOPAIR_SUBSET_NAIVE(f) PASTE(runner_dopair_subset_naive,f)
+#define _DOPAIR_SUBSET_NAIVE(f) PASTE(runner_dopair_subset_naive, f)
 #define DOPAIR_SUBSET_NAIVE _DOPAIR_SUBSET_NAIVE(FUNCTION)
 
-#define _DOPAIR_NAIVE(f) PASTE(runner_dopair_naive,f)
+#define _DOPAIR_NAIVE(f) PASTE(runner_dopair_naive, f)
 #define DOPAIR_NAIVE _DOPAIR_NAIVE(FUNCTION)
 
-#define _DOSELF_NAIVE(f) PASTE(runner_doself_naive,f)
+#define _DOSELF_NAIVE(f) PASTE(runner_doself_naive, f)
 #define DOSELF_NAIVE _DOSELF_NAIVE(FUNCTION)
 
-#define _DOSELF1(f) PASTE(runner_doself1,f)
+#define _DOSELF1(f) PASTE(runner_doself1, f)
 #define DOSELF1 _DOSELF1(FUNCTION)
 
-#define _DOSELF2(f) PASTE(runner_doself2,f)
+#define _DOSELF2(f) PASTE(runner_doself2, f)
 #define DOSELF2 _DOSELF2(FUNCTION)
 
-#define _DOSELF_SUBSET(f) PASTE(runner_doself_subset,f)
+#define _DOSELF_SUBSET(f) PASTE(runner_doself_subset, f)
 #define DOSELF_SUBSET _DOSELF_SUBSET(FUNCTION)
 
-#define _DOSUB1(f) PASTE(runner_dosub1,f)
+#define _DOSUB1(f) PASTE(runner_dosub1, f)
 #define DOSUB1 _DOSUB1(FUNCTION)
 
-#define _DOSUB2(f) PASTE(runner_dosub2,f)
+#define _DOSUB2(f) PASTE(runner_dosub2, f)
 #define DOSUB2 _DOSUB2(FUNCTION)
 
-#define _DOSUB_SUBSET(f) PASTE(runner_dosub_subset,f)
+#define _DOSUB_SUBSET(f) PASTE(runner_dosub_subset, f)
 #define DOSUB_SUBSET _DOSUB_SUBSET(FUNCTION)
 
-#define _IACT_NONSYM(f) PASTE(runner_iact_nonsym,f)
+#define _IACT_NONSYM(f) PASTE(runner_iact_nonsym, f)
 #define IACT_NONSYM _IACT_NONSYM(FUNCTION)
 
-#define _IACT(f) PASTE(runner_iact,f)
+#define _IACT(f) PASTE(runner_iact, f)
 #define IACT _IACT(FUNCTION)
 
-#define _TIMER_DOSELF(f) PASTE(timer_doself,f)
+#define _TIMER_DOSELF(f) PASTE(timer_doself, f)
 #define TIMER_DOSELF _TIMER_DOSELF(FUNCTION)
 
-#define _TIMER_DOPAIR(f) PASTE(timer_dopair,f)
+#define _TIMER_DOPAIR(f) PASTE(timer_dopair, f)
 #define TIMER_DOPAIR _TIMER_DOPAIR(FUNCTION)
 
-#define _TIMER_DOSUB(f) PASTE(timer_dosub,f)
+#define _TIMER_DOSUB(f) PASTE(timer_dosub, f)
 #define TIMER_DOSUB _TIMER_DOSUB(FUNCTION)
 
-#define _TIMER_DOSELF_SUBSET(f) PASTE(timer_doself_subset,f)
+#define _TIMER_DOSELF_SUBSET(f) PASTE(timer_doself_subset, f)
 #define TIMER_DOSELF_SUBSET _TIMER_DOSELF_SUBSET(FUNCTION)
 
-#define _TIMER_DOPAIR_SUBSET(f) PASTE(timer_dopair_subset,f)
+#define _TIMER_DOPAIR_SUBSET(f) PASTE(timer_dopair_subset, f)
 #define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION)
 
-#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec,f)
+#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f)
 #define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION)
 
-#define _IACT_VEC(f) PASTE(runner_iact_vec,f)
+#define _IACT_VEC(f) PASTE(runner_iact_vec, f)
 #define IACT_VEC _IACT_VEC(FUNCTION)
 
-
-
 /**
  * @brief Compute the interactions between a cell pair.
  *
@@ -98,218 +99,218 @@
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
- 
-void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) {
-
-    struct engine *e = r->e;
-    int pid, pjd, k, count_i = ci->count, count_j = cj->count;
-    double shift[3] = { 0.0 , 0.0 , 0.0 };
-    struct part *restrict parts_i = ci->parts, *restrict parts_j = cj->parts;
-    struct part *restrict pi, *restrict pj;
-    double pix[3];
-    float dx[3], hi, hig2, r2;
-    float dt_step = e->dt_step;
-    #ifdef VECTORIZE
-        int icount = 0;
-        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
-    #endif
-    TIMER_TIC
-    
-    /* Anything to do here? */
-    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
-        return;
-    
-    /* Get the relative distance between the pairs, wrapping. */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 )
-            shift[k] = e->s->dim[k];
-        else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 )
-            shift[k] = -e->s->dim[k];
+
+void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
+                  struct cell *restrict cj) {
+
+  struct engine *e = r->e;
+  int pid, pjd, k, count_i = ci->count, count_j = cj->count;
+  double shift[3] = {0.0, 0.0, 0.0};
+  struct part *restrict parts_i = ci->parts, *restrict parts_j = cj->parts;
+  struct part *restrict pi, *restrict pj;
+  double pix[3];
+  float dx[3], hi, hig2, r2;
+  float dt_step = e->dt_step;
+#ifdef VECTORIZE
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+  TIMER_TIC
+
+  /* Anything to do here? */
+  if (ci->dt_min > dt_step && cj->dt_min > dt_step) return;
+
+  /* Get the relative distance between the pairs, wrapping. */
+  for (k = 0; k < 3; k++) {
+    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
+      shift[k] = e->s->dim[k];
+    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
+      shift[k] = -e->s->dim[k];
+  }
+
+  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
+  %i/%i parts and shift = [ %g %g %g ].\n" ,
+      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
+  cj->loc[2] ,
+      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
+  tic = getticks(); */
+
+  /* Loop over the parts in ci. */
+  for (pid = 0; pid < count_i; pid++) {
+
+    /* Get a hold of the ith part in ci. */
+    pi = &parts_i[pid];
+    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    hi = pi->h;
+    hig2 = hi * hi * kernel_gamma2;
+
+    /* Loop over the parts in cj. */
+    for (pjd = 0; pjd < count_j; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      pj = &parts_j[pjd];
+
+      /* Compute the pairwise distance. */
+      r2 = 0.0f;
+      for (k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Hit or miss? */
+      if (r2 < hig2 || r2 < pj->h * pj->h * kernel_gamma2) {
+
+#ifndef VECTORIZE
+
+        IACT(r2, dx, hi, pj->h, pi, pj);
+
+#else
+
+        /* Add this interaction to the queue. */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = dx[0];
+        dxq[3 * icount + 1] = dx[1];
+        dxq[3 * icount + 2] = dx[2];
+        hiq[icount] = hi;
+        hjq[icount] = pj->h;
+        piq[icount] = pi;
+        pjq[icount] = pj;
+        icount += 1;
+
+        /* Flush? */
+        if (icount == VEC_SIZE) {
+          IACT_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
         }
-        
-    /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
-        ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] ,
-        ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-    tic = getticks(); */
-    
-    /* Loop over the parts in ci. */
-    for ( pid = 0 ; pid < count_i ; pid++ ) {
-    
-        /* Get a hold of the ith part in ci. */
-        pi = &parts_i[ pid ];
-        for ( k = 0 ; k < 3 ; k++ )
-            pix[k] = pi->x[k] - shift[k];
-        hi = pi->h;
-        hig2 = hi * hi * kernel_gamma2;
-        
-        /* Loop over the parts in cj. */
-        for ( pjd = 0 ; pjd < count_j ; pjd++ ) {
-        
-            /* Get a pointer to the jth particle. */
-            pj = &parts_j[ pjd ];
-        
-            /* Compute the pairwise distance. */
-            r2 = 0.0f;
-            for ( k = 0 ; k < 3 ; k++ ) {
-                dx[k] = pix[k] - pj->x[k];
-                r2 += dx[k]*dx[k];
-                }
-                
-            /* Hit or miss? */
-            if ( r2 < hig2 || r2 < pj->h*pj->h*kernel_gamma2 ) {
-            
-                #ifndef VECTORIZE
-                        
-                    IACT( r2 , dx , hi , pj->h , pi , pj );
-                
-                #else
-
-                    /* Add this interaction to the queue. */
-                    r2q[icount] = r2;
-                    dxq[3*icount+0] = dx[0];
-                    dxq[3*icount+1] = dx[1];
-                    dxq[3*icount+2] = dx[2];
-                    hiq[icount] = hi;
-                    hjq[icount] = pj->h;
-                    piq[icount] = pi;
-                    pjq[icount] = pj;
-                    icount += 1;
-
-                    /* Flush? */
-                    if ( icount == VEC_SIZE ) {
-                        IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                        icount = 0;
-                        }
-
-                #endif
-                    
-                }
-        
-            } /* loop over the parts in cj. */
-    
-        } /* loop over the parts in ci. */
-        
-    #ifdef VECTORIZE
-    /* Pick up any leftovers. */
-    if ( icount > 0 )
-        for ( k = 0 ; k < icount ; k++ )
-            IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
-    #endif
-        
-    #ifdef TIMER_VERBOSE
-        printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
-    #else
-        TIMER_TOC(TIMER_DOPAIR);
-    #endif
 
+#endif
+      }
+
+    } /* loop over the parts in cj. */
+
+  } /* loop over the parts in ci. */
+
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount > 0)
+    for (k = 0; k < icount; k++)
+      IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
+#endif
+
+#ifdef TIMER_VERBOSE
+  printf(
+      "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) "
+      "took %.3f ms.\n",
+      r->id, count_i, count_j, ci->depth, ci->h_max, cj->h_max,
+      ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000);
+#else
+  TIMER_TOC(TIMER_DOPAIR);
+#endif
+}
+
+void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
+
+  int pid, pjd, k, count = c->count;
+  struct part *restrict parts = c->parts;
+  struct part *restrict pi, *restrict pj;
+  double pix[3] = {0.0, 0.0, 0.0};
+  float dx[3], hi, hig2, r2;
+  float dt_step = r->e->dt_step;
+#ifdef VECTORIZE
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+  TIMER_TIC
+
+  /* Anything to do here? */
+  if (c->dt_min > dt_step) return;
+
+  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
+  %i/%i parts and shift = [ %g %g %g ].\n" ,
+      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
+  cj->loc[2] ,
+      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
+  tic = getticks(); */
+
+  /* Loop over the parts in c. */
+  for (pid = 0; pid < count; pid++) {
+
+    /* Get a hold of the ith part in c. */
+    pi = &parts[pid];
+    pix[0] = pi->x[0];
+    pix[1] = pi->x[1];
+    pix[2] = pi->x[2];
+    hi = pi->h;
+    hig2 = hi * hi * kernel_gamma2;
 
-    }
+    /* Loop over the parts after pid in c, so each pair is visited once. */
+    for (pjd = pid + 1; pjd < count; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      pj = &parts[pjd];
+
+      /* Compute the pairwise distance. */
+      r2 = 0.0f;
+      for (k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Hit or miss? */
+      if (r2 < hig2 || r2 < pj->h * pj->h * kernel_gamma2) {
+
+#ifndef VECTORIZE
+
+        IACT(r2, dx, hi, pj->h, pi, pj);
+
+#else
+
+        /* Add this interaction to the queue. */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = dx[0];
+        dxq[3 * icount + 1] = dx[1];
+        dxq[3 * icount + 2] = dx[2];
+        hiq[icount] = hi;
+        hjq[icount] = pj->h;
+        piq[icount] = pi;
+        pjq[icount] = pj;
+        icount += 1;
+
+        /* Flush? */
+        if (icount == VEC_SIZE) {
+          IACT_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
+        }
 
+#endif
+      }
 
-void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) {
-
-    int pid, pjd, k, count = c->count;
-    struct part *restrict parts = c->parts;
-    struct part *restrict pi, *restrict pj;
-    double pix[3] = {0.0,0.0,0.0};
-    float dx[3], hi, hig2, r2;
-    float dt_step = r->e->dt_step;
-    #ifdef VECTORIZE
-        int icount = 0;
-        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
-    #endif
-    TIMER_TIC
-    
-    /* Anything to do here? */
-    if ( c->dt_min > dt_step )
-        return;
-    
-    /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
-        ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] ,
-        ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-    tic = getticks(); */
-    
-    /* Loop over the parts in ci. */
-    for ( pid = 0 ; pid < count ; pid++ ) {
-    
-        /* Get a hold of the ith part in ci. */
-        pi = &parts[ pid ];
-        pix[0] = pi->x[0];
-        pix[1] = pi->x[1];
-        pix[2] = pi->x[2];
-        hi = pi->h;
-        hig2 = hi * hi * kernel_gamma2;
-        
-        /* Loop over the parts in cj. */
-        for ( pjd = pid+1 ; pjd < count ; pjd++ ) {
-        
-            /* Get a pointer to the jth particle. */
-            pj = &parts[ pjd ];
-        
-            /* Compute the pairwise distance. */
-            r2 = 0.0f;
-            for ( k = 0 ; k < 3 ; k++ ) {
-                dx[k] = pix[k] - pj->x[k];
-                r2 += dx[k]*dx[k];
-                }
-                
-            /* Hit or miss? */
-            if ( r2 < hig2 || r2 < pj->h*pj->h*kernel_gamma2 ) {
-            
-                #ifndef VECTORIZE
-                        
-                    IACT( r2 , dx , hi , pj->h , pi , pj );
-                
-                #else
-
-                    /* Add this interaction to the queue. */
-                    r2q[icount] = r2;
-                    dxq[3*icount+0] = dx[0];
-                    dxq[3*icount+1] = dx[1];
-                    dxq[3*icount+2] = dx[2];
-                    hiq[icount] = hi;
-                    hjq[icount] = pj->h;
-                    piq[icount] = pi;
-                    pjq[icount] = pj;
-                    icount += 1;
-
-                    /* Flush? */
-                    if ( icount == VEC_SIZE ) {
-                        IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                        icount = 0;
-                        }
-
-                #endif
-                    
-                }
-        
-            } /* loop over the parts in cj. */
-    
-        } /* loop over the parts in ci. */
-        
-    #ifdef VECTORIZE
-    /* Pick up any leftovers. */
-    if ( icount > 0 )
-        for ( k = 0 ; k < icount ; k++ )
-            IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
-    #endif
-        
-    #ifdef TIMER_VERBOSE
-        printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 );
-    #else
-        TIMER_TOC(TIMER_DOSELF);
-    #endif
+    } /* loop over the parts in cj. */
 
-    }
+  } /* loop over the parts in ci. */
 
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount > 0)
+    for (k = 0; k < icount; k++)
+      IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
+#endif
+
+#ifdef TIMER_VERBOSE
+  printf("runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n", r->id,
+         count, c->depth, ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000);
+#else
+  TIMER_TOC(TIMER_DOSELF);
+#endif
+}
 
 /**
  * @brief Compute the interactions between a cell pair, but only for the
@@ -322,197 +323,202 @@ void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) {
  * @param count The number of particles in @c ind.
  * @param cj The second #cell.
  */
- 
-void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *restrict parts_i , int *restrict ind , int count , struct cell *restrict cj ) {
-
-    struct engine *e = r->e;
-    int pid, pjd, sid, k, count_j = cj->count, flipped;
-    double shift[3] = { 0.0 , 0.0 , 0.0 };
-    struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
-    double pix[3];
-    float dx[3], hi, hig2, r2, di, dxj;
-    struct entry *sort_j;
-    #ifdef VECTORIZE
-        int icount = 0;
-        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
-    #endif
-    TIMER_TIC
-    
-    /* Get the relative distance between the pairs, wrapping. */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 )
-            shift[k] = e->s->dim[k];
-        else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 )
-            shift[k] = -e->s->dim[k];
+
+void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
+                   struct part *restrict parts_i, int *restrict ind, int count,
+                   struct cell *restrict cj) {
+
+  struct engine *e = r->e;
+  int pid, pjd, sid, k, count_j = cj->count, flipped;
+  double shift[3] = {0.0, 0.0, 0.0};
+  struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
+  double pix[3];
+  float dx[3], hi, hig2, r2, di, dxj;
+  struct entry *sort_j;
+#ifdef VECTORIZE
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+  TIMER_TIC
+
+  /* Get the relative distance between the pairs, wrapping. */
+  for (k = 0; k < 3; k++) {
+    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
+      shift[k] = e->s->dim[k];
+    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
+      shift[k] = -e->s->dim[k];
+  }
+
+  /* Get the sorting index. */
+  for (sid = 0, k = 0; k < 3; k++)
+    sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
+                         ? 0
+                         : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1);
+
+  /* Switch the cells around? */
+  flipped = runner_flip[sid];
+  sid = sortlistID[sid];
+
+  /* Have the cells been sorted? */
+  if (!(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells.");
+
+  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
+  %i/%i parts and shift = [ %g %g %g ].\n" ,
+      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
+  cj->loc[2] ,
+      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
+  tic = getticks(); */
+
+  /* Pick-out the sorted lists. */
+  sort_j = &cj->sort[sid * (cj->count + 1)];
+  dxj = cj->dx_max;
+
+  /* Parts are on the left? */
+  if (!flipped) {
+
+    /* Loop over the parts_i. */
+    for (pid = 0; pid < count; pid++) {
+
+      /* Get a hold of the ith part in ci. */
+      pi = &parts_i[ind[pid]];
+      for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+      hi = pi->h;
+      hig2 = hi * hi * kernel_gamma2;
+      di = hi * kernel_gamma + dxj + pix[0] * runner_shift[3 * sid + 0] +
+           pix[1] * runner_shift[3 * sid + 1] +
+           pix[2] * runner_shift[3 * sid + 2];
+
+      /* Loop over the parts in cj. */
+      for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+
+        /* Get a pointer to the jth particle. */
+        pj = &parts_j[sort_j[pjd].i];
+
+        /* Compute the pairwise distance. */
+        r2 = 0.0f;
+        for (k = 0; k < 3; k++) {
+          dx[k] = pix[k] - pj->x[k];
+          r2 += dx[k] * dx[k];
         }
-        
-    /* Get the sorting index. */
-    for ( sid = 0 , k = 0 ; k < 3 ; k++ )
-        sid = 3*sid + ( (cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1 );
-
-    /* Switch the cells around? */
-    flipped = runner_flip[sid];
-    sid = sortlistID[sid];
-    
-    /* Have the cells been sorted? */
-    if ( !(cj->sorted & (1 << sid) ) )
-        error( "Trying to interact unsorted cells." );
-    
-    /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
-        ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] ,
-        ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-    tic = getticks(); */
-    
-    /* Pick-out the sorted lists. */
-    sort_j = &cj->sort[ sid*(cj->count + 1) ];
-    dxj = cj->dx_max;
-    
-    /* Parts are on the left? */
-    if ( !flipped ) {
-    
-        /* Loop over the parts_i. */
-        for ( pid = 0 ; pid < count ; pid++ ) {
-
-            /* Get a hold of the ith part in ci. */
-            pi = &parts_i[ ind[ pid ] ];
-            for ( k = 0 ; k < 3 ; k++ )
-                pix[k] = pi->x[k] - shift[k];
-            hi = pi->h;
-            hig2 = hi * hi * kernel_gamma2;
-            di = hi*kernel_gamma + dxj + pix[0]*runner_shift[ 3*sid + 0 ] + pix[1]*runner_shift[ 3*sid + 1 ] + pix[2]*runner_shift[ 3*sid + 2 ];
-
-            /* Loop over the parts in cj. */
-            for ( pjd = 0 ; pjd < count_j && sort_j[ pjd ].d < di ; pjd++ ) {
-
-                /* Get a pointer to the jth particle. */
-                pj = &parts_j[ sort_j[ pjd ].i ];
-
-                /* Compute the pairwise distance. */
-                r2 = 0.0f;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = pix[k] - pj->x[k];
-                    r2 += dx[k]*dx[k];
-                    }
-
-                /* Hit or miss? */
-                if ( r2 < hig2 ) {
-
-                    #ifndef VECTORIZE
-                        
-                        IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );
-                    
-                    #else
-                    
-                        /* Add this interaction to the queue. */
-                        r2q[icount] = r2;
-                        dxq[3*icount+0] = dx[0];
-                        dxq[3*icount+1] = dx[1];
-                        dxq[3*icount+2] = dx[2];
-                        hiq[icount] = hi;
-                        hjq[icount] = pj->h;
-                        piq[icount] = pi;
-                        pjq[icount] = pj;
-                        icount += 1;
-                        
-                        /* Flush? */
-                        if ( icount == VEC_SIZE ) {
-                            IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                            icount = 0;
-                            }
-
-                    #endif
-                    
-                    }
-
-                } /* loop over the parts in cj. */
-
-            } /* loop over the parts in ci. */
-            
+
+        /* Hit or miss? */
+        if (r2 < hig2) {
+
+#ifndef VECTORIZE
+
+          IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
+
+#else
+
+          /* Add this interaction to the queue. */
+          r2q[icount] = r2;
+          dxq[3 * icount + 0] = dx[0];
+          dxq[3 * icount + 1] = dx[1];
+          dxq[3 * icount + 2] = dx[2];
+          hiq[icount] = hi;
+          hjq[icount] = pj->h;
+          piq[icount] = pi;
+          pjq[icount] = pj;
+          icount += 1;
+
+          /* Flush? */
+          if (icount == VEC_SIZE) {
+            IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+            icount = 0;
+          }
+
+#endif
         }
-        
-    /* Parts are on the right. */
-    else {
-    
-        /* Loop over the parts_i. */
-        for ( pid = 0 ; pid < count ; pid++ ) {
-
-            /* Get a hold of the ith part in ci. */
-            pi = &parts_i[ ind[ pid ] ];
-            for ( k = 0 ; k < 3 ; k++ )
-                pix[k] = pi->x[k] - shift[k];
-            hi = pi->h;
-            hig2 = hi * hi * kernel_gamma2;
-            di = -hi*kernel_gamma - dxj + pix[0]*runner_shift[ 3*sid + 0 ] + pix[1]*runner_shift[ 3*sid + 1 ] + pix[2]*runner_shift[ 3*sid + 2 ];
-
-            /* Loop over the parts in cj. */
-            for ( pjd = count_j-1 ; pjd >= 0 && di < sort_j[ pjd ].d ; pjd-- ) {
-
-                /* Get a pointer to the jth particle. */
-                pj = &parts_j[ sort_j[ pjd ].i ];
-
-                /* Compute the pairwise distance. */
-                r2 = 0.0f;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = pix[k] - pj->x[k];
-                    r2 += dx[k]*dx[k];
-                    }
-
-                /* Hit or miss? */
-                if ( r2 < hig2 ) {
-
-                    #ifndef VECTORIZE
-                        
-                        IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );
-                    
-                    #else
-                    
-                        /* Add this interaction to the queue. */
-                        r2q[icount] = r2;
-                        dxq[3*icount+0] = dx[0];
-                        dxq[3*icount+1] = dx[1];
-                        dxq[3*icount+2] = dx[2];
-                        hiq[icount] = hi;
-                        hjq[icount] = pj->h;
-                        piq[icount] = pi;
-                        pjq[icount] = pj;
-                        icount += 1;
-                        
-                        /* Flush? */
-                        if ( icount == VEC_SIZE ) {
-                            IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                            icount = 0;
-                            }
-
-                    #endif
-
-                    }
-
-                } /* loop over the parts in cj. */
-
-            } /* loop over the parts in ci. */
-            
+
+      } /* loop over the parts in cj. */
+
+    } /* loop over the parts in ci. */
+
+  }
+
+  /* Parts are on the right. */
+  else {
+
+    /* Loop over the parts_i. */
+    for (pid = 0; pid < count; pid++) {
+
+      /* Get a hold of the ith part in ci. */
+      pi = &parts_i[ind[pid]];
+      for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+      hi = pi->h;
+      hig2 = hi * hi * kernel_gamma2;
+      di = -hi * kernel_gamma - dxj + pix[0] * runner_shift[3 * sid + 0] +
+           pix[1] * runner_shift[3 * sid + 1] +
+           pix[2] * runner_shift[3 * sid + 2];
+
+      /* Loop over the parts in cj. */
+      for (pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) {
+
+        /* Get a pointer to the jth particle. */
+        pj = &parts_j[sort_j[pjd].i];
+
+        /* Compute the pairwise distance. */
+        r2 = 0.0f;
+        for (k = 0; k < 3; k++) {
+          dx[k] = pix[k] - pj->x[k];
+          r2 += dx[k] * dx[k];
         }
-        
-    #ifdef VECTORIZE
-    /* Pick up any leftovers. */
-    if ( icount > 0 )
-        for ( k = 0 ; k < icount ; k++ )
-            IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
-    #endif
-        
-    #ifdef TIMER_VERBOSE
-        printf( "runner_dopair_subset[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
-    #else
-        TIMER_TOC(timer_dopair_subset);
-    #endif
 
+        /* Hit or miss? */
+        if (r2 < hig2) {
 
-    }
+#ifndef VECTORIZE
+
+          IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
+
+#else
 
+          /* Add this interaction to the queue. */
+          r2q[icount] = r2;
+          dxq[3 * icount + 0] = dx[0];
+          dxq[3 * icount + 1] = dx[1];
+          dxq[3 * icount + 2] = dx[2];
+          hiq[icount] = hi;
+          hjq[icount] = pj->h;
+          piq[icount] = pi;
+          pjq[icount] = pj;
+          icount += 1;
+
+          /* Flush? */
+          if (icount == VEC_SIZE) {
+            IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+            icount = 0;
+          }
+
+#endif
+        }
+
+      } /* loop over the parts in cj. */
+
+    } /* loop over the parts in ci. */
+  }
+
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount > 0)
+    for (k = 0; k < icount; k++)
+      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
+#endif
+
+#ifdef TIMER_VERBOSE
+  printf(
+      "runner_dopair_subset[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) "
+      "took %.3f ms.\n",
+      r->id, count, count_j, ci->depth, ci->h_max, cj->h_max,
+      ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000);
+#else
+  TIMER_TOC(timer_dopair_subset);
+#endif
+}
 
 /**
  * @brief Compute the interactions between a cell pair, but only for the
@@ -525,111 +531,114 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
  * @param count The number of particles in @c ind.
  * @param cj The second #cell.
  */
- 
-void DOPAIR_SUBSET_NAIVE ( struct runner *r , struct cell *restrict ci , struct part *restrict parts_i , int *restrict ind , int count , struct cell *restrict cj ) {
-
-    struct engine *e = r->e;
-    int pid, pjd, k, count_j = cj->count;
-    double shift[3] = { 0.0 , 0.0 , 0.0 };
-    struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
-    double pix[3];
-    float dx[3], hi, hig2, r2;
-    #ifdef VECTORIZE
-        int icount = 0;
-        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
-    #endif
-    TIMER_TIC
-    
-    /* Get the relative distance between the pairs, wrapping. */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 )
-            shift[k] = e->s->dim[k];
-        else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 )
-            shift[k] = -e->s->dim[k];
-        }
-        
-    /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
-        ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] ,
-        ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-    tic = getticks(); */
-    
-    /* Loop over the parts_i. */
-    for ( pid = 0 ; pid < count ; pid++ ) {
 
-        /* Get a hold of the ith part in ci. */
-        pi = &parts_i[ ind[ pid ] ];
-        for ( k = 0 ; k < 3 ; k++ )
-            pix[k] = pi->x[k] - shift[k];
-        hi = pi->h;
-        hig2 = hi * hi * kernel_gamma2;
+void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
+                         struct part *restrict parts_i, int *restrict ind,
+                         int count, struct cell *restrict cj) {
+
+  struct engine *e = r->e;
+  int pid, pjd, k, count_j = cj->count;
+  double shift[3] = {0.0, 0.0, 0.0};
+  struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
+  double pix[3];
+  float dx[3], hi, hig2, r2;
+#ifdef VECTORIZE
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+  TIMER_TIC
+
+  /* Get the relative distance between the pairs, wrapping. */
+  for (k = 0; k < 3; k++) {
+    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
+      shift[k] = e->s->dim[k];
+    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
+      shift[k] = -e->s->dim[k];
+  }
+
+  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
+  %i/%i parts and shift = [ %g %g %g ].\n" ,
+      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
+  cj->loc[2] ,
+      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
+  tic = getticks(); */
+
+  /* Loop over the parts_i. */
+  for (pid = 0; pid < count; pid++) {
+
+    /* Get a hold of the ith part in ci. */
+    pi = &parts_i[ind[pid]];
+    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    hi = pi->h;
+    hig2 = hi * hi * kernel_gamma2;
 
-        /* Loop over the parts in cj. */
-        for ( pjd = 0 ; pjd < count_j ; pjd++ ) {
-
-            /* Get a pointer to the jth particle. */
-            pj = &parts_j[ pjd ];
-
-            /* Compute the pairwise distance. */
-            r2 = 0.0f;
-            for ( k = 0 ; k < 3 ; k++ ) {
-                dx[k] = pix[k] - pj->x[k];
-                r2 += dx[k]*dx[k];
-                }
-
-            /* Hit or miss? */
-            if ( r2 < hig2 ) {
-
-                #ifndef VECTORIZE
-
-                    IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );
-
-                #else
-
-                    /* Add this interaction to the queue. */
-                    r2q[icount] = r2;
-                    dxq[3*icount+0] = dx[0];
-                    dxq[3*icount+1] = dx[1];
-                    dxq[3*icount+2] = dx[2];
-                    hiq[icount] = hi;
-                    hjq[icount] = pj->h;
-                    piq[icount] = pi;
-                    pjq[icount] = pj;
-                    icount += 1;
-
-                    /* Flush? */
-                    if ( icount == VEC_SIZE ) {
-                        IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                        icount = 0;
-                        }
-
-                #endif
-
-                }
-
-            } /* loop over the parts in cj. */
+    /* Loop over the parts in cj. */
+    for (pjd = 0; pjd < count_j; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      pj = &parts_j[pjd];
+
+      /* Compute the pairwise distance. */
+      r2 = 0.0f;
+      for (k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Hit or miss? */
+      if (r2 < hig2) {
+
+#ifndef VECTORIZE
+
+        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
+
+#else
+
+        /* Add this interaction to the queue. */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = dx[0];
+        dxq[3 * icount + 1] = dx[1];
+        dxq[3 * icount + 2] = dx[2];
+        hiq[icount] = hi;
+        hjq[icount] = pj->h;
+        piq[icount] = pi;
+        pjq[icount] = pj;
+        icount += 1;
+
+        /* Flush? */
+        if (icount == VEC_SIZE) {
+          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
+        }
 
-        } /* loop over the parts in ci. */
+#endif
+      }
 
-    #ifdef VECTORIZE
-    /* Pick up any leftovers. */
-    if ( icount > 0 )
-        for ( k = 0 ; k < icount ; k++ )
-            IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
-    #endif
-        
-    #ifdef TIMER_VERBOSE
-        printf( "runner_dopair_subset[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
-    #else
-        TIMER_TOC(timer_dopair_subset);
-    #endif
+    } /* loop over the parts in cj. */
 
+  } /* loop over the parts in ci. */
 
-    }
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount > 0)
+    for (k = 0; k < icount; k++)
+      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
+#endif
 
+#ifdef TIMER_VERBOSE
+  printf(
+      "runner_dopair_subset[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) "
+      "took %.3f ms.\n",
+      r->id, count, count_j, ci->depth, ci->h_max, cj->h_max,
+      ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000);
+#else
+  TIMER_TOC(timer_dopair_subset);
+#endif
+}
 
 /**
  * @brief Compute the interactions between a cell pair, but only for the
@@ -641,103 +650,104 @@ void DOPAIR_SUBSET_NAIVE ( struct runner *r , struct cell *restrict ci , struct
  * @param ind The list of indices of particles in @c ci to interact with.
  * @param count The number of particles in @c ind.
  */
- 
-void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *restrict parts , int *restrict ind , int count ) {
-
-    int pid, pjd, k, count_i = ci->count;
-    struct part *restrict parts_j = ci->parts;
-    struct part *restrict pi, *restrict pj;
-    double pix[3] = {0.0,0.0,0.0};
-    float dx[3], hi, hig2, r2;
-    #ifdef VECTORIZE
-        int icount = 0;
-        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
-    #endif
-    TIMER_TIC
-    
-    /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
-        ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] ,
-        ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-    tic = getticks(); */
-    
-    /* Loop over the parts in ci. */
-    for ( pid = 0 ; pid < count ; pid++ ) {
-    
-        /* Get a hold of the ith part in ci. */
-        pi = &parts[ ind[ pid ] ];
-        pix[0] = pi->x[0];
-        pix[1] = pi->x[1];
-        pix[2] = pi->x[2];
-        hi = pi->h;
-        hig2 = hi * hi * kernel_gamma2;
-        
-        /* Loop over the parts in cj. */
-        for ( pjd = 0 ; pjd < count_i ; pjd++ ) {
-        
-            /* Get a pointer to the jth particle. */
-            pj = &parts_j[ pjd ];
-            
-            /* Compute the pairwise distance. */
-            r2 = 0.0f;
-            for ( k = 0 ; k < 3 ; k++ ) {
-                dx[k] = pix[k] - pj->x[k];
-                r2 += dx[k]*dx[k];
-                }
-                
-            /* Hit or miss? */
-            if ( r2 > 0.0f && r2 < hig2 ) {
-            
-                #ifndef VECTORIZE
-
-                    IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );
-
-                #else
-
-                    /* Add this interaction to the queue. */
-                    r2q[icount] = r2;
-                    dxq[3*icount+0] = dx[0];
-                    dxq[3*icount+1] = dx[1];
-                    dxq[3*icount+2] = dx[2];
-                    hiq[icount] = hi;
-                    hjq[icount] = pj->h;
-                    piq[icount] = pi;
-                    pjq[icount] = pj;
-                    icount += 1;
-
-                    /* Flush? */
-                    if ( icount == VEC_SIZE ) {
-                        IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                        icount = 0;
-                        }
-
-                #endif
-            
-                }
-        
-            } /* loop over the parts in cj. */
-    
-        } /* loop over the parts in ci. */
-        
-    #ifdef VECTORIZE
-    /* Pick up any leftovers. */
-    if ( icount > 0 )
-        for ( k = 0 ; k < icount ; k++ )
-            IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
-    #endif
-        
-    #ifdef TIMER_VERBOSE
-        printf( "runner_doself_subset[%02i]: %i/%i parts at depth %i took %.3f ms.\n" , r->id , count , ci->count , ci->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 );
-    #else
-        TIMER_TOC(timer_dopair_subset);
-    #endif
 
+void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
+                   struct part *restrict parts, int *restrict ind, int count) {
+
+  int pid, pjd, k, count_i = ci->count;
+  struct part *restrict parts_j = ci->parts;
+  struct part *restrict pi, *restrict pj;
+  double pix[3] = {0.0, 0.0, 0.0};
+  float dx[3], hi, hig2, r2;
+#ifdef VECTORIZE
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+  TIMER_TIC
+
+  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
+  %i/%i parts and shift = [ %g %g %g ].\n" ,
+      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
+  cj->loc[2] ,
+      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
+  tic = getticks(); */
+
+  /* Loop over the parts selected by ind. */
+  for (pid = 0; pid < count; pid++) {
+
+    /* Get a hold of the ith part in ci. */
+    pi = &parts[ind[pid]];
+    pix[0] = pi->x[0];
+    pix[1] = pi->x[1];
+    pix[2] = pi->x[2];
+    hi = pi->h;
+    hig2 = hi * hi * kernel_gamma2;
 
-    }
+    /* Loop over all the parts in ci. */
+    for (pjd = 0; pjd < count_i; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      pj = &parts_j[pjd];
+
+      /* Compute the pairwise distance. */
+      r2 = 0.0f;
+      for (k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Hit or miss? */
+      if (r2 > 0.0f && r2 < hig2) {
+
+#ifndef VECTORIZE
+
+        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
+
+#else
+
+        /* Add this interaction to the queue. */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = dx[0];
+        dxq[3 * icount + 1] = dx[1];
+        dxq[3 * icount + 2] = dx[2];
+        hiq[icount] = hi;
+        hjq[icount] = pj->h;
+        piq[icount] = pi;
+        pjq[icount] = pj;
+        icount += 1;
+
+        /* Flush? */
+        if (icount == VEC_SIZE) {
+          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
+        }
+
+#endif
+      }
+
+    } /* loop over the parts in cj. */
+
+  } /* loop over the parts in ci. */
 
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount > 0)
+    for (k = 0; k < icount; k++)
+      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
+#endif
+
+#ifdef TIMER_VERBOSE
+  printf("runner_doself_subset[%02i]: %i/%i parts at depth %i took %.3f ms.\n",
+         r->id, count, ci->count, ci->depth,
+         ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000);
+#else
+  TIMER_TOC(timer_dopair_subset);
+#endif
+}
 
 /**
  * @brief Compute the interactions between a cell pair.
@@ -746,605 +756,596 @@ void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
- 
-void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
-
-    struct engine *restrict e = r->e;
-    int pid, pjd, k, sid;
-    double rshift, shift[3] = { 0.0 , 0.0 , 0.0 };
-    struct entry *restrict sort_i, *restrict sort_j;
-    struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
-    double pix[3], pjx[3], di, dj;
-    float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
-    double hi_max, hj_max;
-    double di_max, dj_min;
-    int count_i, count_j;
-    float dt_step = e->dt_step;
-    #ifdef VECTORIZE
-        int icount = 0;
-        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
-    #endif
-    TIMER_TIC
-    
-    /* Anything to do here? */
-    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
-        return;
-        
-    /* Get the sort ID. */
-    sid = space_getsid( e->s , &ci , &cj , shift );
-    
-    /* Have the cells been sorted? */
-    if ( !(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid) ) )
-        error( "Trying to interact unsorted cells." );
-    
-    /* Get the cutoff shift. */
-    for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ )
-        rshift += shift[k]*runner_shift[ 3*sid + k ];
-        
-    /* Pick-out the sorted lists. */
-    sort_i = &ci->sort[ sid*(ci->count + 1) ];
-    sort_j = &cj->sort[ sid*(cj->count + 1) ];
-    
-    /* Get some other useful values. */
-    hi_max = ci->h_max*kernel_gamma - rshift; hj_max = cj->h_max*kernel_gamma;
-    count_i = ci->count; count_j = cj->count;
-    parts_i = ci->parts; parts_j = cj->parts;
-    di_max = sort_i[count_i-1].d - rshift;
-    dj_min = sort_j[0].d;
-    dx_max = ( ci->dx_max + cj->dx_max );
-    
 
-    /* Loop over the parts in ci. */
-    for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min ; pid-- ) {
-    
-        /* Get a hold of the ith part in ci. */
-        pi = &parts_i[ sort_i[ pid ].i ];
-        if ( pi->dt > dt_step )
-            continue;
-        hi = pi->h;
-        di = sort_i[pid].d + hi*kernel_gamma + dx_max - rshift;
-        if ( di < dj_min )
-            continue;
-            
-        hig2 = hi * hi * kernel_gamma2;
-        for ( k = 0 ; k < 3 ; k++ )
-            pix[k] = pi->x[k] - shift[k];
-        
-        /* Loop over the parts in cj. */
-        for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d < di ; pjd++ ) {
-        
-            /* Get a pointer to the jth particle. */
-            pj = &parts_j[ sort_j[pjd].i ];
-        
-            /* Compute the pairwise distance. */
-            r2 = 0.0f;
-            for ( k = 0 ; k < 3 ; k++ ) {
-                dx[k] = pix[k] - pj->x[k];
-                r2 += dx[k]*dx[k];
-                }
-                
-            /* Hit or miss? */
-            if ( r2 < hig2 ) {
-            
-                #ifndef VECTORIZE
-                        
-                    IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );
-                
-                #else
-
-                    /* Add this interaction to the queue. */
-                    r2q[icount] = r2;
-                    dxq[3*icount+0] = dx[0];
-                    dxq[3*icount+1] = dx[1];
-                    dxq[3*icount+2] = dx[2];
-                    hiq[icount] = hi;
-                    hjq[icount] = pj->h;
-                    piq[icount] = pi;
-                    pjq[icount] = pj;
-                    icount += 1;
-
-                    /* Flush? */
-                    if ( icount == VEC_SIZE ) {
-                        IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                        icount = 0;
-                        }
-
-                #endif
-            
-                }
-        
-            } /* loop over the parts in cj. */
-    
-        } /* loop over the parts in ci. */
-        
-    /* printf( "runner_dopair: first half took %.3f ms...\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 );
-    tic = getticks(); */
+void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
+  /* Two passes: ci parts against cj, then cj parts against ci. */
+  struct engine *restrict e = r->e;
+  int pid, pjd, k, sid;
+  double rshift, shift[3] = {0.0, 0.0, 0.0};
+  struct entry *restrict sort_i, *restrict sort_j;
+  struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
+  double pix[3], pjx[3], di, dj;
+  float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
+  double hi_max, hj_max;
+  double di_max, dj_min;
+  int count_i, count_j;
+  float dt_step = e->dt_step;
+#ifdef VECTORIZE
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+  TIMER_TIC
+
+  /* Skip if dt_min of both cells exceeds dt_step (no TIMER_TOC here). */
+  if (ci->dt_min > dt_step && cj->dt_min > dt_step) return;
+
+  /* Get the sort ID. */
+  sid = space_getsid(e->s, &ci, &cj, shift);
+
+  /* Both cells must have the sorted bit for this sid set. */
+  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
+    error("Trying to interact unsorted cells.");
+
+  /* Get the cutoff shift: dot product of shift with runner_shift for sid. */
+  for (rshift = 0.0, k = 0; k < 3; k++)
+    rshift += shift[k] * runner_shift[3 * sid + k];
+
+  /* Pick out the sort lists for this sid (stride of count + 1 entries). */
+  sort_i = &ci->sort[sid * (ci->count + 1)];
+  sort_j = &cj->sort[sid * (cj->count + 1)];
+
+  /* Get some other useful values. */
+  hi_max = ci->h_max * kernel_gamma - rshift;
+  hj_max = cj->h_max * kernel_gamma;
+  count_i = ci->count;
+  count_j = cj->count;
+  parts_i = ci->parts;
+  parts_j = cj->parts;
+  di_max = sort_i[count_i - 1].d - rshift;
+  dj_min = sort_j[0].d;
+  dx_max = (ci->dx_max + cj->dx_max);
+
+  /* Loop over the parts in ci, from the end of its sort list downwards. */
+  for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min;
+       pid--) {
+
+    /* Get a hold of the ith part in ci; parts with dt > dt_step are skipped. */
+    pi = &parts_i[sort_i[pid].i];
+    if (pi->dt > dt_step) continue;
+    hi = pi->h;
+    di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
+    if (di < dj_min) continue;
+
+    hig2 = hi * hi * kernel_gamma2;
+    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
 
     /* Loop over the parts in cj. */
-    for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max ; pjd++ ) {
-    
-        /* Get a hold of the jth part in cj. */
-        pj = &parts_j[ sort_j[ pjd ].i ];
-        if ( pj->dt > dt_step )
-            continue;
+    for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      pj = &parts_j[sort_j[pjd].i];
+
+      /* Compute the pairwise distance. */
+      r2 = 0.0f;
+      for (k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Hit or miss? */
+      if (r2 < hig2) {
+
+#ifndef VECTORIZE
+
+        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
+
+#else
+
+        /* Add this interaction to the queue. */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = dx[0];
+        dxq[3 * icount + 1] = dx[1];
+        dxq[3 * icount + 2] = dx[2];
+        hiq[icount] = hi;
+        hjq[icount] = pj->h;
+        piq[icount] = pi;
+        pjq[icount] = pj;
+        icount += 1;
+
+        /* Flush the queue once it holds VEC_SIZE interactions. */
+        if (icount == VEC_SIZE) {
+          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
+        }
+
+#endif
+      }
+
+    } /* loop over the parts in cj. */
+
+  } /* loop over the parts in ci. */
+
+  /* printf( "runner_dopair: first half took %.3f ms...\n" ,
+  ((double)(getticks() - tic)) / CPU_TPS * 1000 );
+  tic = getticks(); */
+
+  /* Loop over the parts in cj, from the start of its sort list upwards. */
+  for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
+       pjd++) {
+
+    /* Get a hold of the jth part in cj; parts with dt > dt_step are skipped. */
+    pj = &parts_j[sort_j[pjd].i];
+    if (pj->dt > dt_step) continue;
+    hj = pj->h;
+    dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
+    if (dj > di_max) continue;
+
+    for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
+    hjg2 = hj * hj * kernel_gamma2;
+
+    /* Loop over the parts in ci. */
+    for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
+
+      /* Get a pointer to the ith particle. */
+      pi = &parts_i[sort_i[pid].i];
+
+      /* Compute the pairwise distance. */
+      r2 = 0.0f;
+      for (k = 0; k < 3; k++) {
+        dx[k] = pjx[k] - pi->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Hit or miss? */
+      if (r2 < hjg2) {
+
+#ifndef VECTORIZE
+
+        IACT_NONSYM(r2, dx, hj, pi->h, pj, pi);
+
+#else
+
+        /* Add this interaction to the queue (pj takes the "i" slots here). */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = dx[0];
+        dxq[3 * icount + 1] = dx[1];
+        dxq[3 * icount + 2] = dx[2];
+        hiq[icount] = hj;
+        hjq[icount] = pi->h;
+        piq[icount] = pj;
+        pjq[icount] = pi;
+        icount += 1;
+
+        /* Flush the queue once it holds VEC_SIZE interactions. */
+        if (icount == VEC_SIZE) {
+          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
+        }
+
+#endif
+      }
+
+    } /* loop over the parts in ci. */
+
+  } /* loop over the parts in cj. */
+
+#ifdef VECTORIZE
+  /* Pick up any leftovers, one queued interaction at a time. */
+  if (icount > 0)
+    for (k = 0; k < icount; k++)
+      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
+#endif
+
+#ifdef TIMER_VERBOSE
+  printf(
+      "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) "
+      "took %.3f ms.\n",
+      r->id, count_i, count_j, ci->depth, ci->h_max, cj->h_max,
+      fmax(ci->h[0], fmax(ci->h[1], ci->h[2])),
+      ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000);
+#else
+  TIMER_TOC(TIMER_DOPAIR);
+#endif
+}
+
+void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
+
+  struct engine *restrict e = r->e;
+  int pid, pjd, k, sid;
+  double rshift, shift[3] = {0.0, 0.0, 0.0};
+  struct entry *restrict sort_i, *restrict sort_j;
+  struct entry *restrict sortdt_i = NULL, *restrict sortdt_j = NULL;
+  int countdt_i = 0, countdt_j = 0;
+  struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
+  double pix[3], pjx[3], di, dj;
+  float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
+  double hi_max, hj_max;
+  double di_max, dj_min;
+  int count_i, count_j;
+  float dt_step = e->dt_step;
+#ifdef VECTORIZE
+  int icount1 = 0;
+  float r2q1[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq1[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq1[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq1[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE];
+  int icount2 = 0;
+  float r2q2[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq2[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq2[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq2[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
+#endif
+  TIMER_TIC
+
+  /* Anything to do here? */
+  if (ci->dt_min > dt_step && cj->dt_min > dt_step) return;
+
+  /* Get the shift ID. */
+  sid = space_getsid(e->s, &ci, &cj, shift);
+
+  /* Have the cells been sorted? */
+  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
+    error("Trying to interact unsorted cells.");
+
+  /* Get the cutoff shift. */
+  for (rshift = 0.0, k = 0; k < 3; k++)
+    rshift += shift[k] * runner_shift[3 * sid + k];
+
+  /* Pick-out the sorted lists. */
+  sort_i = &ci->sort[sid * (ci->count + 1)];
+  sort_j = &cj->sort[sid * (cj->count + 1)];
+
+  /* Get some other useful values. */
+  hi_max = ci->h_max * kernel_gamma - rshift;
+  hj_max = cj->h_max * kernel_gamma;
+  count_i = ci->count;
+  count_j = cj->count;
+  parts_i = ci->parts;
+  parts_j = cj->parts;
+  di_max = sort_i[count_i - 1].d - rshift;
+  dj_min = sort_j[0].d;
+  dx_max = (ci->dx_max + cj->dx_max);
+
+  /* Collect the number of parts left and right below dt. */
+  if (ci->dt_max <= dt_step) {
+    sortdt_i = sort_i;
+    countdt_i = count_i;
+  } else if (ci->dt_min <= dt_step) {
+    if ((sortdt_i = (struct entry *)alloca(sizeof(struct entry) * count_i)) ==
+        NULL)
+      error("Failed to allocate dt sortlists.");
+    for (k = 0; k < count_i; k++)
+      if (parts_i[sort_i[k].i].dt <= dt_step) {
+        sortdt_i[countdt_i] = sort_i[k];
+        countdt_i += 1;
+      }
+  }
+  if (cj->dt_max <= dt_step) {
+    sortdt_j = sort_j;
+    countdt_j = count_j;
+  } else if (cj->dt_min <= dt_step) {
+    if ((sortdt_j = (struct entry *)alloca(sizeof(struct entry) * count_j)) ==
+        NULL)
+      error("Failed to allocate dt sortlists.");
+    for (k = 0; k < count_j; k++)
+      if (parts_j[sort_j[k].i].dt <= dt_step) {
+        sortdt_j[countdt_j] = sort_j[k];
+        countdt_j += 1;
+      }
+  }
+
+  /* Loop over the parts in ci. */
+  for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min;
+       pid--) {
+
+    /* Get a hold of the ith part in ci. */
+    pi = &parts_i[sort_i[pid].i];
+    hi = pi->h;
+    di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
+    if (di < dj_min) continue;
+
+    hig2 = hi * hi * kernel_gamma2;
+    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+
+    /* Look at valid dt parts only? */
+    if (pi->dt > dt_step) {
+
+      /* Loop over the parts in cj within dt. */
+      for (pjd = 0; pjd < countdt_j && sortdt_j[pjd].d < di; pjd++) {
+
+        /* Get a pointer to the jth particle. */
+        pj = &parts_j[sortdt_j[pjd].i];
         hj = pj->h;
-        dj = sort_j[pjd].d - hj*kernel_gamma - dx_max - rshift;
-        if ( dj > di_max )
-            continue;
-            
-        for ( k = 0 ; k < 3 ; k++ )
-            pjx[k] = pj->x[k] + shift[k];
-        hjg2 = hj * hj * kernel_gamma2;
-        
-        /* Loop over the parts in ci. */
-        for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d > dj ; pid-- ) {
-        
-            /* Get a pointer to the jth particle. */
-            pi = &parts_i[ sort_i[pid].i ];
-            
-            /* Compute the pairwise distance. */
-            r2 = 0.0f;
-            for ( k = 0 ; k < 3 ; k++ ) {
-                dx[k] = pjx[k] - pi->x[k];
-                r2 += dx[k]*dx[k];
-                }
-                
-            /* Hit or miss? */
-            if ( r2 < hjg2 ) {
-            
-                #ifndef VECTORIZE
-                        
-                    IACT_NONSYM( r2 , dx , hj , pi->h , pj , pi );
-                
-                #else
-
-                    /* Add this interaction to the queue. */
-                    r2q[icount] = r2;
-                    dxq[3*icount+0] = dx[0];
-                    dxq[3*icount+1] = dx[1];
-                    dxq[3*icount+2] = dx[2];
-                    hiq[icount] = hj;
-                    hjq[icount] = pi->h;
-                    piq[icount] = pj;
-                    pjq[icount] = pi;
-                    icount += 1;
-
-                    /* Flush? */
-                    if ( icount == VEC_SIZE ) {
-                        IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                        icount = 0;
-                        }
-
-                #endif
-            
-                }
-        
-            } /* loop over the parts in cj. */
-    
-        } /* loop over the parts in ci. */
-
-    #ifdef VECTORIZE
-    /* Pick up any leftovers. */
-    if ( icount > 0 )
-        for ( k = 0 ; k < icount ; k++ )
-            IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
-    #endif
-        
-    #ifdef TIMER_VERBOSE
-        printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 );
-    #else
-        TIMER_TOC(TIMER_DOPAIR);
-    #endif
+
+        /* Compute the pairwise distance. */
+        r2 = 0.0f;
+        for (k = 0; k < 3; k++) {
+          dx[k] = pj->x[k] - pix[k];
+          r2 += dx[k] * dx[k];
+        }
+
+        /* Hit or miss? */
+        if (r2 < hig2) {
+
+#ifndef VECTORIZE
+
+          IACT_NONSYM(r2, dx, hj, hi, pj, pi);
+
+#else
+
+          /* Add this interaction to the queue. */
+          r2q1[icount1] = r2;
+          dxq1[3 * icount1 + 0] = dx[0];
+          dxq1[3 * icount1 + 1] = dx[1];
+          dxq1[3 * icount1 + 2] = dx[2];
+          hiq1[icount1] = hj;
+          hjq1[icount1] = hi;
+          piq1[icount1] = pj;
+          pjq1[icount1] = pi;
+          icount1 += 1;
+
+          /* Flush? */
+          if (icount1 == VEC_SIZE) {
+            IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1);
+            icount1 = 0;
+          }
+
+#endif
+        }
+
+      } /* loop over the parts in cj. */
 
     }
 
+    /* Otherwise, look at all parts. */
+    else {
+
+      /* Loop over the parts in cj. */
+      for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+
+        /* Get a pointer to the jth particle. */
+        pj = &parts_j[sort_j[pjd].i];
+        hj = pj->h;
 
-void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
-
-    struct engine *restrict e = r->e;
-    int pid, pjd, k, sid;
-    double rshift, shift[3] = { 0.0 , 0.0 , 0.0 };
-    struct entry *restrict sort_i, *restrict sort_j;
-    struct entry *restrict sortdt_i = NULL, *restrict sortdt_j = NULL;
-    int countdt_i = 0, countdt_j = 0;
-    struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
-    double pix[3], pjx[3], di, dj;
-    float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
-    double hi_max, hj_max;
-    double di_max, dj_min;
-    int count_i, count_j;
-    float dt_step = e->dt_step;
-    #ifdef VECTORIZE
-        int icount1 = 0;
-        float r2q1[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq1[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq1[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq1[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE];
-        int icount2 = 0;
-        float r2q2[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq2[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq2[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq2[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
-    #endif
-    TIMER_TIC
-    
-    /* Anything to do here? */
-    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
-        return;
-        
-    /* Get the shift ID. */
-    sid = space_getsid( e->s , &ci , &cj , shift );
-    
-    /* Have the cells been sorted? */
-    if ( !(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid) ) )
-        error( "Trying to interact unsorted cells." );
-    
-    /* Get the cutoff shift. */
-    for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ )
-        rshift += shift[k]*runner_shift[ 3*sid + k ];
-        
-    /* Pick-out the sorted lists. */
-    sort_i = &ci->sort[ sid*(ci->count + 1) ];
-    sort_j = &cj->sort[ sid*(cj->count + 1) ];
-    
-    /* Get some other useful values. */
-    hi_max = ci->h_max*kernel_gamma - rshift; hj_max = cj->h_max*kernel_gamma;
-    count_i = ci->count; count_j = cj->count;
-    parts_i = ci->parts; parts_j = cj->parts;
-    di_max = sort_i[count_i-1].d - rshift;
-    dj_min = sort_j[0].d;
-    dx_max = ( ci->dx_max + cj->dx_max );
-    
-    /* Collect the number of parts left and right below dt. */
-    if ( ci->dt_max <= dt_step ) {
-        sortdt_i = sort_i;
-        countdt_i = count_i;
+        /* Compute the pairwise distance. */
+        r2 = 0.0f;
+        for (k = 0; k < 3; k++) {
+          dx[k] = pix[k] - pj->x[k];
+          r2 += dx[k] * dx[k];
         }
-    else if ( ci->dt_min <= dt_step ) {
-        if ( ( sortdt_i = (struct entry *)alloca( sizeof(struct entry) * count_i ) ) == NULL )
-            error( "Failed to allocate dt sortlists." );
-        for ( k = 0 ; k < count_i ; k++ )
-            if ( parts_i[ sort_i[ k ].i ].dt <= dt_step ) {
-                sortdt_i[ countdt_i ] = sort_i[k];
-                countdt_i += 1;
-                }
+
+        /* Hit or miss? */
+        if (r2 < hig2) {
+
+#ifndef VECTORIZE
+
+          /* Does pj need to be updated too? */
+          if (pj->dt <= dt_step)
+            IACT(r2, dx, hi, hj, pi, pj);
+          else
+            IACT_NONSYM(r2, dx, hi, hj, pi, pj);
+
+#else
+
+          /* Does pj need to be updated too? */
+          if (pj->dt <= dt_step) {
+
+            /* Add this interaction to the symmetric queue. */
+            r2q2[icount2] = r2;
+            dxq2[3 * icount2 + 0] = dx[0];
+            dxq2[3 * icount2 + 1] = dx[1];
+            dxq2[3 * icount2 + 2] = dx[2];
+            hiq2[icount2] = hi;
+            hjq2[icount2] = hj;
+            piq2[icount2] = pi;
+            pjq2[icount2] = pj;
+            icount2 += 1;
+
+            /* Flush? */
+            if (icount2 == VEC_SIZE) {
+              IACT_VEC(r2q2, dxq2, hiq2, hjq2, piq2, pjq2);
+              icount2 = 0;
+            }
+
+          } else {
+
+            /* Add this interaction to the non-symmetric queue. */
+            r2q1[icount1] = r2;
+            dxq1[3 * icount1 + 0] = dx[0];
+            dxq1[3 * icount1 + 1] = dx[1];
+            dxq1[3 * icount1 + 2] = dx[2];
+            hiq1[icount1] = hi;
+            hjq1[icount1] = hj;
+            piq1[icount1] = pi;
+            pjq1[icount1] = pj;
+            icount1 += 1;
+
+            /* Flush? */
+            if (icount1 == VEC_SIZE) {
+              IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1);
+              icount1 = 0;
+            }
+          }
+
+#endif
         }
-    if ( cj->dt_max <= dt_step ) {
-        sortdt_j = sort_j;
-        countdt_j = count_j;
+
+      } /* loop over the parts in cj. */
+    }
+
+  } /* loop over the parts in ci. */
+
+  /* printf( "runner_dopair: first half took %.3f ms...\n" ,
+  ((double)(getticks() - tic)) / CPU_TPS * 1000 );
+  tic = getticks(); */
+
+  /* Loop over the parts in cj. */
+  for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
+       pjd++) {
+
+    /* Get a hold of the jth part in cj. */
+    pj = &parts_j[sort_j[pjd].i];
+    hj = pj->h;
+    dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
+    if (dj > di_max) continue;
+
+    for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
+    hjg2 = hj * hj * kernel_gamma2;
+
+    /* Is this particle outside the dt? */
+    if (pj->dt > dt_step) {
+
+      /* Loop over the parts in ci. */
+      for (pid = countdt_i - 1; pid >= 0 && sortdt_i[pid].d > dj; pid--) {
+
+        /* Get a pointer to the jth particle. */
+        pi = &parts_i[sortdt_i[pid].i];
+        hi = pi->h;
+
+        /* Compute the pairwise distance. */
+        r2 = 0.0f;
+        for (k = 0; k < 3; k++) {
+          dx[k] = pi->x[k] - pjx[k];
+          r2 += dx[k] * dx[k];
         }
-    else if ( cj->dt_min <= dt_step ) {
-        if ( ( sortdt_j = (struct entry *)alloca( sizeof(struct entry) * count_j ) ) == NULL )
-            error( "Failed to allocate dt sortlists." );
-        for ( k = 0 ; k < count_j ; k++ )
-            if ( parts_j[ sort_j[ k ].i ].dt <= dt_step ) {
-                sortdt_j[ countdt_j ] = sort_j[k];
-                countdt_j += 1;
-                }
+
+        /* Hit or miss? */
+        if (r2 < hjg2 && r2 > hi * hi * kernel_gamma2) {
+
+#ifndef VECTORIZE
+
+          IACT_NONSYM(r2, dx, hi, hj, pi, pj);
+
+#else
+
+          /* Add this interaction to the queue. */
+          r2q1[icount1] = r2;
+          dxq1[3 * icount1 + 0] = dx[0];
+          dxq1[3 * icount1 + 1] = dx[1];
+          dxq1[3 * icount1 + 2] = dx[2];
+          hiq1[icount1] = hi;
+          hjq1[icount1] = hj;
+          piq1[icount1] = pi;
+          pjq1[icount1] = pj;
+          icount1 += 1;
+
+          /* Flush? */
+          if (icount1 == VEC_SIZE) {
+            IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1);
+            icount1 = 0;
+          }
+
+#endif
         }
-    
-    /* Loop over the parts in ci. */
-    for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min ; pid-- ) {
-    
-        /* Get a hold of the ith part in ci. */
-        pi = &parts_i[ sort_i[ pid ].i ];
+
+      } /* loop over the parts in cj. */
+    }
+
+    /* Otherwise, interact with all particles in cj. */
+    else {
+
+      /* Loop over the parts in ci. */
+      for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
+
+        /* Get a pointer to the jth particle. */
+        pi = &parts_i[sort_i[pid].i];
         hi = pi->h;
-        di = sort_i[pid].d + hi*kernel_gamma + dx_max - rshift;
-        if ( di < dj_min )
-            continue;
-            
-        hig2 = hi * hi * kernel_gamma2;
-        for ( k = 0 ; k < 3 ; k++ )
-            pix[k] = pi->x[k] - shift[k];
-            
-        /* Look at valid dt parts only? */
-        if ( pi->dt > dt_step ) {
-        
-            /* Loop over the parts in cj within dt. */
-            for ( pjd = 0 ; pjd < countdt_j && sortdt_j[pjd].d < di ; pjd++ ) {
-
-                /* Get a pointer to the jth particle. */
-                pj = &parts_j[ sortdt_j[pjd].i ];
-                hj = pj->h;
-
-                /* Compute the pairwise distance. */
-                r2 = 0.0f;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = pj->x[k] - pix[k];
-                    r2 += dx[k]*dx[k];
-                    }
-
-                /* Hit or miss? */
-                if ( r2 < hig2 ) {
-
-                    #ifndef VECTORIZE
-
-                        IACT_NONSYM( r2 , dx , hj , hi , pj , pi );
-
-                    #else
-
-                        /* Add this interaction to the queue. */
-                        r2q1[icount1] = r2;
-                        dxq1[3*icount1+0] = dx[0];
-                        dxq1[3*icount1+1] = dx[1];
-                        dxq1[3*icount1+2] = dx[2];
-                        hiq1[icount1] = hj;
-                        hjq1[icount1] = hi;
-                        piq1[icount1] = pj;
-                        pjq1[icount1] = pi;
-                        icount1 += 1;
-
-                        /* Flush? */
-                        if ( icount1 == VEC_SIZE ) {
-                            IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
-                            icount1 = 0;
-                            }
-
-                    #endif
-
-                    }
-        
-                } /* loop over the parts in cj. */
-                
-            }
-            
-        /* Otherwise, look at all parts. */
-        else {
-        
-            /* Loop over the parts in cj. */
-            for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d < di ; pjd++ ) {
-
-                /* Get a pointer to the jth particle. */
-                pj = &parts_j[ sort_j[pjd].i ];
-                hj = pj->h;
-
-                /* Compute the pairwise distance. */
-                r2 = 0.0f;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = pix[k] - pj->x[k];
-                    r2 += dx[k]*dx[k];
-                    }
-
-                /* Hit or miss? */
-                if ( r2 < hig2 ) {
-
-                    #ifndef VECTORIZE
-
-                        /* Does pj need to be updated too? */
-                        if ( pj->dt <= dt_step )
-                            IACT( r2 , dx , hi , hj , pi , pj );
-                        else
-                            IACT_NONSYM( r2 , dx , hi , hj , pi , pj );
-
-                    #else
-
-                        /* Does pj need to be updated too? */
-                        if ( pj->dt <= dt_step ) {
-                        
-                            /* Add this interaction to the symmetric queue. */
-                            r2q2[icount2] = r2;
-                            dxq2[3*icount2+0] = dx[0];
-                            dxq2[3*icount2+1] = dx[1];
-                            dxq2[3*icount2+2] = dx[2];
-                            hiq2[icount2] = hi;
-                            hjq2[icount2] = hj;
-                            piq2[icount2] = pi;
-                            pjq2[icount2] = pj;
-                            icount2 += 1;
-
-                            /* Flush? */
-                            if ( icount2 == VEC_SIZE ) {
-                                IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 );
-                                icount2 = 0;
-                                }
-                                
-                            }
-                            
-                        else {
-                        
-                            /* Add this interaction to the non-symmetric queue. */
-                            r2q1[icount1] = r2;
-                            dxq1[3*icount1+0] = dx[0];
-                            dxq1[3*icount1+1] = dx[1];
-                            dxq1[3*icount1+2] = dx[2];
-                            hiq1[icount1] = hi;
-                            hjq1[icount1] = hj;
-                            piq1[icount1] = pi;
-                            pjq1[icount1] = pj;
-                            icount1 += 1;
-
-                            /* Flush? */
-                            if ( icount1 == VEC_SIZE ) {
-                                IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
-                                icount1 = 0;
-                                }
-                                
-                            }
-
-                    #endif
-
-                    }
-        
-                } /* loop over the parts in cj. */
-                
-            }
-    
-        } /* loop over the parts in ci. */
-        
-    /* printf( "runner_dopair: first half took %.3f ms...\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 );
-    tic = getticks(); */
 
-    /* Loop over the parts in cj. */
-    for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max ; pjd++ ) {
-    
-        /* Get a hold of the jth part in cj. */
-        pj = &parts_j[ sort_j[ pjd ].i ];
-        hj = pj->h;
-        dj = sort_j[pjd].d - hj*kernel_gamma - dx_max - rshift;
-        if ( dj > di_max )
-            continue;
-            
-        for ( k = 0 ; k < 3 ; k++ )
-            pjx[k] = pj->x[k] + shift[k];
-        hjg2 = hj * hj * kernel_gamma2;
-        
-        /* Is this particle outside the dt? */
-        if ( pj->dt > dt_step ) {
-        
-            /* Loop over the parts in ci. */
-            for ( pid = countdt_i-1 ; pid >= 0 && sortdt_i[pid].d > dj ; pid-- ) {
-
-                /* Get a pointer to the jth particle. */
-                pi = &parts_i[ sortdt_i[pid].i ];
-                hi = pi->h;
-
-                /* Compute the pairwise distance. */
-                r2 = 0.0f;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = pi->x[k] - pjx[k];
-                    r2 += dx[k]*dx[k];
-                    }
-
-                /* Hit or miss? */
-                if ( r2 < hjg2 && r2 > hi*hi*kernel_gamma2 ) {
-
-                    #ifndef VECTORIZE
-
-                        IACT_NONSYM( r2 , dx , hi , hj , pi , pj );
-
-                    #else
-
-                        /* Add this interaction to the queue. */
-                        r2q1[icount1] = r2;
-                        dxq1[3*icount1+0] = dx[0];
-                        dxq1[3*icount1+1] = dx[1];
-                        dxq1[3*icount1+2] = dx[2];
-                        hiq1[icount1] = hi;
-                        hjq1[icount1] = hj;
-                        piq1[icount1] = pi;
-                        pjq1[icount1] = pj;
-                        icount1 += 1;
-
-                        /* Flush? */
-                        if ( icount1 == VEC_SIZE ) {
-                            IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
-                            icount1 = 0;
-                            }
-
-                    #endif
-
-                    }
-
-                } /* loop over the parts in cj. */
+        /* Compute the pairwise distance. */
+        r2 = 0.0f;
+        for (k = 0; k < 3; k++) {
+          dx[k] = pjx[k] - pi->x[k];
+          r2 += dx[k] * dx[k];
+        }
+
+        /* Hit or miss? */
+        if (r2 < hjg2 && r2 > hi * hi * kernel_gamma2) {
+
+#ifndef VECTORIZE
+
+          /* Does pi need to be updated too? */
+          if (pi->dt <= dt_step)
+            IACT(r2, dx, hj, hi, pj, pi);
+          else
+            IACT_NONSYM(r2, dx, hj, hi, pj, pi);
+
+#else
+
+          /* Does pi need to be updated too? */
+          if (pi->dt <= dt_step) {
+
+            /* Add this interaction to the symmetric queue. */
+            r2q2[icount2] = r2;
+            dxq2[3 * icount2 + 0] = dx[0];
+            dxq2[3 * icount2 + 1] = dx[1];
+            dxq2[3 * icount2 + 2] = dx[2];
+            hiq2[icount2] = hj;
+            hjq2[icount2] = hi;
+            piq2[icount2] = pj;
+            pjq2[icount2] = pi;
+            icount2 += 1;
+
+            /* Flush? */
+            if (icount2 == VEC_SIZE) {
+              IACT_VEC(r2q2, dxq2, hiq2, hjq2, piq2, pjq2);
+              icount2 = 0;
             }
-            
-        /* Otherwise, interact with all particles in cj. */
-        else {
-        
-            /* Loop over the parts in ci. */
-            for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d > dj ; pid-- ) {
-
-                /* Get a pointer to the jth particle. */
-                pi = &parts_i[ sort_i[pid].i ];
-                hi = pi->h;
-
-                /* Compute the pairwise distance. */
-                r2 = 0.0f;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = pjx[k] - pi->x[k];
-                    r2 += dx[k]*dx[k];
-                    }
-
-                /* Hit or miss? */
-                if ( r2 < hjg2 && r2 > hi*hi*kernel_gamma2 ) {
-
-                    #ifndef VECTORIZE
-
-                        /* Does pi need to be updated too? */
-                        if ( pi->dt <= dt_step )
-                            IACT( r2 , dx , hj , hi , pj , pi );
-                        else
-                            IACT_NONSYM( r2 , dx , hj , hi , pj , pi );
-
-                    #else
-
-                        /* Does pi need to be updated too? */
-                        if ( pi->dt <= dt_step ) {
-                        
-                            /* Add this interaction to the symmetric queue. */
-                            r2q2[icount2] = r2;
-                            dxq2[3*icount2+0] = dx[0];
-                            dxq2[3*icount2+1] = dx[1];
-                            dxq2[3*icount2+2] = dx[2];
-                            hiq2[icount2] = hj;
-                            hjq2[icount2] = hi;
-                            piq2[icount2] = pj;
-                            pjq2[icount2] = pi;
-                            icount2 += 1;
-
-                            /* Flush? */
-                            if ( icount2 == VEC_SIZE ) {
-                                IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 );
-                                icount2 = 0;
-                                }
-                                
-                            }
-                            
-                        else {
-                        
-                            /* Add this interaction to the non-summetric queue. */
-                            r2q1[icount1] = r2;
-                            dxq1[3*icount1+0] = dx[0];
-                            dxq1[3*icount1+1] = dx[1];
-                            dxq1[3*icount1+2] = dx[2];
-                            hiq1[icount1] = hj;
-                            hjq1[icount1] = hi;
-                            piq1[icount1] = pj;
-                            pjq1[icount1] = pi;
-                            icount1 += 1;
-
-                            /* Flush? */
-                            if ( icount1 == VEC_SIZE ) {
-                                IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
-                                icount1 = 0;
-                                }
-                                
-                            }
-
-                    #endif
-
-                    }
-
-                } /* loop over the parts in cj. */
-                
+
+          } else {
+
+            /* Add this interaction to the non-symmetric queue. */
+            r2q1[icount1] = r2;
+            dxq1[3 * icount1 + 0] = dx[0];
+            dxq1[3 * icount1 + 1] = dx[1];
+            dxq1[3 * icount1 + 2] = dx[2];
+            hiq1[icount1] = hj;
+            hjq1[icount1] = hi;
+            piq1[icount1] = pj;
+            pjq1[icount1] = pi;
+            icount1 += 1;
+
+            /* Flush? */
+            if (icount1 == VEC_SIZE) {
+              IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1);
+              icount1 = 0;
             }
-    
-        } /* loop over the parts in ci. */
-
-    #ifdef VECTORIZE
-    /* Pick up any leftovers. */
-    if ( icount1 > 0 )
-        for ( k = 0 ; k < icount1 ; k++ )
-            IACT_NONSYM( r2q1[k] , &dxq1[3*k] , hiq1[k] , hjq1[k] , piq1[k] , pjq1[k] );
-    if ( icount2 > 0 )
-        for ( k = 0 ; k < icount2 ; k++ )
-            IACT( r2q2[k] , &dxq2[3*k] , hiq2[k] , hjq2[k] , piq2[k] , pjq2[k] );
-    #endif
-    
-    #ifdef TIMER_VERBOSE
-        printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 );
-    #else
-        TIMER_TOC(TIMER_DOPAIR);
-    #endif
+          }
+
+#endif
+        }
 
+      } /* loop over the parts in cj. */
     }
 
+  } /* loop over the parts in ci. */
+
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount1 > 0)
+    for (k = 0; k < icount1; k++)
+      IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]);
+  if (icount2 > 0)
+    for (k = 0; k < icount2; k++)
+      IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]);
+#endif
+
+#ifdef TIMER_VERBOSE
+  printf(
+      "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) "
+      "took %.3f ms.\n",
+      r->id, count_i, count_j, ci->depth, ci->h_max, cj->h_max,
+      fmax(ci->h[0], fmax(ci->h[1], ci->h[2])),
+      ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000);
+#else
+  TIMER_TOC(TIMER_DOPAIR);
+#endif
+}
 
 /**
  * @brief Compute the cell self-interaction.
@@ -1353,441 +1354,425 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
  * @param c The #cell.
  */
 
-void DOSELF1 ( struct runner *r , struct cell *restrict c ) {
-
-    int k, pid, pjd, count = c->count;
-    double pix[3];
-    float dx[3], hi, hj, hig2, r2;
-    struct part *restrict parts = c->parts, *restrict pi, *restrict pj;
-    float dt_step = r->e->dt_step;
-    int firstdt = 0, countdt = 0, *indt = NULL, doj;
-    #ifdef VECTORIZE
-        int icount1 = 0;
-        float r2q1[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq1[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq1[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq1[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE];
-        int icount2 = 0;
-        float r2q2[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq2[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq2[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq2[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
-    #endif
-    TIMER_TIC
-    
-    /* Set up indt if needed. */
-    if ( c->dt_min > dt_step )
-        return;
-    else if ( c->dt_max > dt_step ) {
-        if ( ( indt = (int *)alloca( sizeof(int) * count ) ) == NULL )
-            error( "Failed to allocate indt." );
-        for ( k = 0 ; k < count ; k++ )
-            if ( parts[k].dt <= dt_step ) {
-                indt[ countdt ] = k;
-                countdt += 1;
-                }
+void DOSELF1(struct runner *r, struct cell *restrict c) {
+
+  int k, pid, pjd, count = c->count;
+  double pix[3];
+  float dx[3], hi, hj, hig2, r2;
+  struct part *restrict parts = c->parts, *restrict pi, *restrict pj;
+  float dt_step = r->e->dt_step;
+  int firstdt = 0, countdt = 0, *indt = NULL, doj;
+#ifdef VECTORIZE
+  int icount1 = 0;
+  float r2q1[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq1[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq1[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq1[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE];
+  int icount2 = 0;
+  float r2q2[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq2[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq2[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq2[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
+#endif
+  TIMER_TIC
+
+  /* Set up indt if needed. */
+  if (c->dt_min > dt_step)
+    return;
+  else if (c->dt_max > dt_step) {
+    if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
+      error("Failed to allocate indt.");
+    for (k = 0; k < count; k++)
+      if (parts[k].dt <= dt_step) {
+        indt[countdt] = k;
+        countdt += 1;
+      }
+  }
+
+  /* Loop over the particles in the cell. */
+  for (pid = 0; pid < count; pid++) {
+
+    /* Get a pointer to the ith particle. */
+    pi = &parts[pid];
+
+    /* Get the particle position and radius. */
+    for (k = 0; k < 3; k++) pix[k] = pi->x[k];
+    hi = pi->h;
+    hig2 = hi * hi * kernel_gamma2;
+
+    /* Is the ith particle inactive? */
+    if (pi->dt > dt_step) {
+
+      /* Loop over the other particles. */
+      for (pjd = firstdt; pjd < countdt; pjd++) {
+
+        /* Get a pointer to the jth particle. */
+        pj = &parts[indt[pjd]];
+        hj = pj->h;
+
+        /* Compute the pairwise distance. */
+        r2 = 0.0f;
+        for (k = 0; k < 3; k++) {
+          dx[k] = pj->x[k] - pix[k];
+          r2 += dx[k] * dx[k];
         }
-    
-    /* Loop over the particles in the cell. */
-    for ( pid = 0 ; pid < count ; pid++ ) {
-    
-        /* Get a pointer to the ith particle. */
-        pi = &parts[pid];
-    
-        /* Get the particle position and radius. */
-        for ( k = 0 ; k < 3 ; k++ )
-            pix[k] = pi->x[k];
-        hi = pi->h;
-        hig2 = hi * hi * kernel_gamma2;
-        
-        /* Is the ith particle inactive? */
-        if ( pi->dt > dt_step ) {
-        
-            /* Loop over the other particles .*/
-            for ( pjd = firstdt ; pjd < countdt ; pjd++ ) {
-
-                /* Get a pointer to the jth particle. */
-                pj = &parts[ indt[ pjd ] ];
-                hj = pj->h;
-
-                /* Compute the pairwise distance. */
-                r2 = 0.0f;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = pj->x[k] - pix[k];
-                    r2 += dx[k]*dx[k];
-                    }
-
-                /* Hit or miss? */
-                if ( r2 < hj*hj*kernel_gamma2 ) {
-
-                    #ifndef VECTORIZE
-
-                        IACT_NONSYM( r2 , dx , hj , hi , pj , pi );
-
-                    #else
-
-                        /* Add this interaction to the queue. */
-                        r2q1[icount1] = r2;
-                        dxq1[3*icount1+0] = dx[0];
-                        dxq1[3*icount1+1] = dx[1];
-                        dxq1[3*icount1+2] = dx[2];
-                        hiq1[icount1] = hj;
-                        hjq1[icount1] = hi;
-                        piq1[icount1] = pj;
-                        pjq1[icount1] = pi;
-                        icount1 += 1;
-
-                        /* Flush? */
-                        if ( icount1 == VEC_SIZE ) {
-                            IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
-                            icount1 = 0;
-                            }
-
-                    #endif
-
-                    }
-
-                } /* loop over all other particles. */
-                
+
+        /* Hit or miss? */
+        if (r2 < hj * hj * kernel_gamma2) {
+
+#ifndef VECTORIZE
+
+          IACT_NONSYM(r2, dx, hj, hi, pj, pi);
+
+#else
+
+          /* Add this interaction to the queue. */
+          r2q1[icount1] = r2;
+          dxq1[3 * icount1 + 0] = dx[0];
+          dxq1[3 * icount1 + 1] = dx[1];
+          dxq1[3 * icount1 + 2] = dx[2];
+          hiq1[icount1] = hj;
+          hjq1[icount1] = hi;
+          piq1[icount1] = pj;
+          pjq1[icount1] = pi;
+          icount1 += 1;
+
+          /* Flush? */
+          if (icount1 == VEC_SIZE) {
+            IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1);
+            icount1 = 0;
+          }
+
+#endif
+        }
+
+      } /* loop over all other particles. */
+
+    }
+
+    /* Otherwise, interact with all candidates. */
+    else {
+
+      /* We caught a live one! */
+      firstdt += 1;
+
+      /* Loop over the other particles. */
+      for (pjd = pid + 1; pjd < count; pjd++) {
+
+        /* Get a pointer to the jth particle. */
+        pj = &parts[pjd];
+        hj = pj->h;
+
+        /* Compute the pairwise distance. */
+        r2 = 0.0f;
+        for (k = 0; k < 3; k++) {
+          dx[k] = pix[k] - pj->x[k];
+          r2 += dx[k] * dx[k];
+        }
+        doj = (pj->dt <= dt_step) && (r2 < hj * hj * kernel_gamma2);
+
+        /* Hit or miss? */
+        if (r2 < hig2 || doj) {
+
+#ifndef VECTORIZE
+
+          /* Which parts need to be updated? */
+          if (r2 < hig2 && doj)
+            IACT(r2, dx, hi, hj, pi, pj);
+          else if (!doj)
+            IACT_NONSYM(r2, dx, hi, hj, pi, pj);
+          else {
+            dx[0] = -dx[0];
+            dx[1] = -dx[1];
+            dx[2] = -dx[2];
+            IACT_NONSYM(r2, dx, hj, hi, pj, pi);
+          }
+
+#else
+
+          /* Does pj need to be updated too? */
+          if (r2 < hig2 && doj) {
+
+            /* Add this interaction to the symmetric queue. */
+            r2q2[icount2] = r2;
+            dxq2[3 * icount2 + 0] = dx[0];
+            dxq2[3 * icount2 + 1] = dx[1];
+            dxq2[3 * icount2 + 2] = dx[2];
+            hiq2[icount2] = hi;
+            hjq2[icount2] = hj;
+            piq2[icount2] = pi;
+            pjq2[icount2] = pj;
+            icount2 += 1;
+
+            /* Flush? */
+            if (icount2 == VEC_SIZE) {
+              IACT_VEC(r2q2, dxq2, hiq2, hjq2, piq2, pjq2);
+              icount2 = 0;
             }
-            
-        /* Otherwise, interact with all candidates. */
-        else {
-        
-            /* We caught a live one! */
-            firstdt += 1;
-            
-            /* Loop over the other particles .*/
-            for ( pjd = pid+1 ; pjd < count ; pjd++ ) {
-
-                /* Get a pointer to the jth particle. */
-                pj = &parts[pjd];
-                hj = pj->h;
-
-                /* Compute the pairwise distance. */
-                r2 = 0.0f;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = pix[k] - pj->x[k];
-                    r2 += dx[k]*dx[k];
-                    }
-                doj = ( pj->dt <= dt_step ) && ( r2 < hj*hj*kernel_gamma2 );
-
-                /* Hit or miss? */
-                if ( r2 < hig2 || doj ) {
-
-                    #ifndef VECTORIZE
-
-                        /* Which parts need to be updated? */
-                        if ( r2 < hig2 && doj )
-                            IACT( r2 , dx , hi , hj , pi , pj );
-                        else if ( !doj )
-                            IACT_NONSYM( r2 , dx , hi , hj , pi , pj );
-                        else {
-                            dx[0] = -dx[0]; dx[1] = -dx[1]; dx[2] = -dx[2];
-                            IACT_NONSYM( r2 , dx , hj , hi , pj , pi );
-                            }
-
-                    #else
-
-                        /* Does pj need to be updated too? */
-                        if ( r2 < hig2 && doj ) {
-                        
-                            /* Add this interaction to the symmetric queue. */
-                            r2q2[icount2] = r2;
-                            dxq2[3*icount2+0] = dx[0];
-                            dxq2[3*icount2+1] = dx[1];
-                            dxq2[3*icount2+2] = dx[2];
-                            hiq2[icount2] = hi;
-                            hjq2[icount2] = hj;
-                            piq2[icount2] = pi;
-                            pjq2[icount2] = pj;
-                            icount2 += 1;
-
-                            /* Flush? */
-                            if ( icount2 == VEC_SIZE ) {
-                                IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 );
-                                icount2 = 0;
-                                }
-                                
-                            }
-                            
-                        else if ( !doj ) {
-                        
-                            /* Add this interaction to the non-symmetric queue. */
-                            r2q1[icount1] = r2;
-                            dxq1[3*icount1+0] = dx[0];
-                            dxq1[3*icount1+1] = dx[1];
-                            dxq1[3*icount1+2] = dx[2];
-                            hiq1[icount1] = hi;
-                            hjq1[icount1] = hj;
-                            piq1[icount1] = pi;
-                            pjq1[icount1] = pj;
-                            icount1 += 1;
-
-                            /* Flush? */
-                            if ( icount1 == VEC_SIZE ) {
-                                IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
-                                icount1 = 0;
-                                }
-                                
-                            }
-
-                        else  {
-                        
-                            /* Add this interaction to the non-symmetric queue. */
-                            r2q1[icount1] = r2;
-                            dxq1[3*icount1+0] = -dx[0];
-                            dxq1[3*icount1+1] = -dx[1];
-                            dxq1[3*icount1+2] = -dx[2];
-                            hiq1[icount1] = hj;
-                            hjq1[icount1] = hi;
-                            piq1[icount1] = pj;
-                            pjq1[icount1] = pi;
-                            icount1 += 1;
-
-                            /* Flush? */
-                            if ( icount1 == VEC_SIZE ) {
-                                IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
-                                icount1 = 0;
-                                }
-                                
-                            }
-
-                    #endif
-
-                    }
-
-                } /* loop over all other particles. */
-            
+
+          } else if (!doj) {
+
+            /* Add this interaction to the non-symmetric queue. */
+            r2q1[icount1] = r2;
+            dxq1[3 * icount1 + 0] = dx[0];
+            dxq1[3 * icount1 + 1] = dx[1];
+            dxq1[3 * icount1 + 2] = dx[2];
+            hiq1[icount1] = hi;
+            hjq1[icount1] = hj;
+            piq1[icount1] = pi;
+            pjq1[icount1] = pj;
+            icount1 += 1;
+
+            /* Flush? */
+            if (icount1 == VEC_SIZE) {
+              IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1);
+              icount1 = 0;
             }
-    
-        } /* loop over all particles. */
-
-    #ifdef VECTORIZE
-    /* Pick up any leftovers. */
-    if ( icount1 > 0 )
-        for ( k = 0 ; k < icount1 ; k++ )
-            IACT_NONSYM( r2q1[k] , &dxq1[3*k] , hiq1[k] , hjq1[k] , piq1[k] , pjq1[k] );
-    if ( icount2 > 0 )
-        for ( k = 0 ; k < icount2 ; k++ )
-            IACT( r2q2[k] , &dxq2[3*k] , hiq2[k] , hjq2[k] , piq2[k] , pjq2[k] );
-    #endif
-    
-    #ifdef TIMER_VERBOSE
-        printf( "runner_doself1[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 );
-    #else
-        TIMER_TOC(TIMER_DOSELF);
-    #endif
 
+          } else {
+
+            /* Add this interaction to the non-symmetric queue. */
+            r2q1[icount1] = r2;
+            dxq1[3 * icount1 + 0] = -dx[0];
+            dxq1[3 * icount1 + 1] = -dx[1];
+            dxq1[3 * icount1 + 2] = -dx[2];
+            hiq1[icount1] = hj;
+            hjq1[icount1] = hi;
+            piq1[icount1] = pj;
+            pjq1[icount1] = pi;
+            icount1 += 1;
+
+            /* Flush? */
+            if (icount1 == VEC_SIZE) {
+              IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1);
+              icount1 = 0;
+            }
+          }
+
+#endif
+        }
+
+      } /* loop over all other particles. */
     }
 
+  } /* loop over all particles. */
+
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount1 > 0)
+    for (k = 0; k < icount1; k++)
+      IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]);
+  if (icount2 > 0)
+    for (k = 0; k < icount2; k++)
+      IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]);
+#endif
+
+#ifdef TIMER_VERBOSE
+  printf("runner_doself1[%02i]: %i parts at depth %i took %.3f ms.\n", r->id,
+         count, c->depth, ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000);
+#else
+  TIMER_TOC(TIMER_DOSELF);
+#endif
+}
+
+void DOSELF2(struct runner *r, struct cell *restrict c) {
+
+  int k, pid, pjd, count = c->count;
+  double pix[3];
+  float dx[3], hi, hj, hig2, r2;
+  struct part *restrict parts = c->parts, *restrict pi, *restrict pj;
+  float dt_step = r->e->dt_step;
+  int firstdt = 0, countdt = 0, *indt = NULL;
+#ifdef VECTORIZE
+  int icount1 = 0;
+  float r2q1[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq1[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq1[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq1[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE];
+  int icount2 = 0;
+  float r2q2[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq2[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq2[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq2[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
+#endif
+  TIMER_TIC
+
+  /* Set up indt if needed. */
+  if (c->dt_min > dt_step)
+    return;
+  else if (c->dt_max > dt_step) {
+    if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
+      error("Failed to allocate indt.");
+    for (k = 0; k < count; k++)
+      if (parts[k].dt <= dt_step) {
+        indt[countdt] = k;
+        countdt += 1;
+      }
+  }
+
+  /* Loop over the particles in the cell. */
+  for (pid = 0; pid < count; pid++) {
+
+    /* Get a pointer to the ith particle. */
+    pi = &parts[pid];
+
+    /* Get the particle position and radius. */
+    for (k = 0; k < 3; k++) pix[k] = pi->x[k];
+    hi = pi->h;
+    hig2 = hi * hi * kernel_gamma2;
+
+    /* Is the ith particle not active? */
+    if (pi->dt > dt_step) {
+
+      /* Loop over the other particles. */
+      for (pjd = firstdt; pjd < countdt; pjd++) {
+
+        /* Get a pointer to the jth particle. */
+        pj = &parts[indt[pjd]];
+        hj = pj->h;
 
-void DOSELF2 ( struct runner *r , struct cell *restrict c ) {
-
-    int k, pid, pjd, count = c->count;
-    double pix[3];
-    float dx[3], hi, hj, hig2, r2;
-    struct part *restrict parts = c->parts, *restrict pi, *restrict pj;
-    float dt_step = r->e->dt_step;
-    int firstdt = 0, countdt = 0, *indt = NULL;
-    #ifdef VECTORIZE
-        int icount1 = 0;
-        float r2q1[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq1[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq1[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq1[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE];
-        int icount2 = 0;
-        float r2q2[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq2[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq2[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq2[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
-    #endif
-    TIMER_TIC
-    
-    /* Set up indt if needed. */
-    if ( c->dt_min > dt_step )
-        return;
-    else if ( c->dt_max > dt_step ) {
-        if ( ( indt = (int *)alloca( sizeof(int) * count ) ) == NULL )
-            error( "Failed to allocate indt." );
-        for ( k = 0 ; k < count ; k++ )
-            if ( parts[k].dt <= dt_step ) {
-                indt[ countdt ] = k;
-                countdt += 1;
-                }
+        /* Compute the pairwise distance. */
+        r2 = 0.0f;
+        for (k = 0; k < 3; k++) {
+          dx[k] = pj->x[k] - pix[k];
+          r2 += dx[k] * dx[k];
         }
-    
-    /* Loop over the particles in the cell. */
-    for ( pid = 0 ; pid < count ; pid++ ) {
-    
-        /* Get a pointer to the ith particle. */
-        pi = &parts[pid];
-    
-        /* Get the particle position and radius. */
-        for ( k = 0 ; k < 3 ; k++ )
-            pix[k] = pi->x[k];
-        hi = pi->h;
-        hig2 = hi * hi * kernel_gamma2;
-        
-        /* Is the ith particle not active? */
-        if ( pi->dt > dt_step ) {
-        
-            /* Loop over the other particles .*/
-            for ( pjd = firstdt ; pjd < countdt ; pjd++ ) {
-
-                /* Get a pointer to the jth particle. */
-                pj = &parts[ indt[ pjd ] ];
-                hj = pj->h;
-
-                /* Compute the pairwise distance. */
-                r2 = 0.0f;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = pj->x[k] - pix[k];
-                    r2 += dx[k]*dx[k];
-                    }
-
-                /* Hit or miss? */
-                if ( r2 < hig2 || r2 < hj*hj*kernel_gamma2 ) {
-
-                    #ifndef VECTORIZE
-
-                        IACT_NONSYM( r2 , dx , hj , hi , pj , pi );
-
-                    #else
-
-                        /* Add this interaction to the queue. */
-                        r2q1[icount1] = r2;
-                        dxq1[3*icount1+0] = dx[0];
-                        dxq1[3*icount1+1] = dx[1];
-                        dxq1[3*icount1+2] = dx[2];
-                        hiq1[icount1] = hj;
-                        hjq1[icount1] = hi;
-                        piq1[icount1] = pj;
-                        pjq1[icount1] = pi;
-                        icount1 += 1;
-
-                        /* Flush? */
-                        if ( icount1 == VEC_SIZE ) {
-                            IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
-                            icount1 = 0;
-                            }
-
-                    #endif
-
-                    }
-
-                } /* loop over all other particles. */
-                
+
+        /* Hit or miss? */
+        if (r2 < hig2 || r2 < hj * hj * kernel_gamma2) {
+
+#ifndef VECTORIZE
+
+          IACT_NONSYM(r2, dx, hj, hi, pj, pi);
+
+#else
+
+          /* Add this interaction to the queue. */
+          r2q1[icount1] = r2;
+          dxq1[3 * icount1 + 0] = dx[0];
+          dxq1[3 * icount1 + 1] = dx[1];
+          dxq1[3 * icount1 + 2] = dx[2];
+          hiq1[icount1] = hj;
+          hjq1[icount1] = hi;
+          piq1[icount1] = pj;
+          pjq1[icount1] = pi;
+          icount1 += 1;
+
+          /* Flush? */
+          if (icount1 == VEC_SIZE) {
+            IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1);
+            icount1 = 0;
+          }
+
+#endif
+        }
+
+      } /* loop over all other particles. */
+
+    }
+
+    /* Otherwise, interact with all candidates. */
+    else {
+
+      /* We caught a live one! */
+      firstdt += 1;
+
+      /* Loop over the other particles. */
+      for (pjd = pid + 1; pjd < count; pjd++) {
+
+        /* Get a pointer to the jth particle. */
+        pj = &parts[pjd];
+        hj = pj->h;
+
+        /* Compute the pairwise distance. */
+        r2 = 0.0f;
+        for (k = 0; k < 3; k++) {
+          dx[k] = pix[k] - pj->x[k];
+          r2 += dx[k] * dx[k];
+        }
+
+        /* Hit or miss? */
+        if (r2 < hig2 || r2 < hj * hj * kernel_gamma2) {
+
+#ifndef VECTORIZE
+
+          /* Does pj need to be updated too? */
+          if (pj->dt <= dt_step)
+            IACT(r2, dx, hi, hj, pi, pj);
+          else
+            IACT_NONSYM(r2, dx, hi, hj, pi, pj);
+
+#else
+
+          /* Does pj need to be updated too? */
+          if (pj->dt <= dt_step) {
+
+            /* Add this interaction to the symmetric queue. */
+            r2q2[icount2] = r2;
+            dxq2[3 * icount2 + 0] = dx[0];
+            dxq2[3 * icount2 + 1] = dx[1];
+            dxq2[3 * icount2 + 2] = dx[2];
+            hiq2[icount2] = hi;
+            hjq2[icount2] = hj;
+            piq2[icount2] = pi;
+            pjq2[icount2] = pj;
+            icount2 += 1;
+
+            /* Flush? */
+            if (icount2 == VEC_SIZE) {
+              IACT_VEC(r2q2, dxq2, hiq2, hjq2, piq2, pjq2);
+              icount2 = 0;
             }
-            
-        /* Otherwise, interact with all candidates. */
-        else {
-        
-            /* We caught a live one! */
-            firstdt += 1;
-            
-            /* Loop over the other particles .*/
-            for ( pjd = pid+1 ; pjd < count ; pjd++ ) {
-
-                /* Get a pointer to the jth particle. */
-                pj = &parts[pjd];
-                hj = pj->h;
-
-                /* Compute the pairwise distance. */
-                r2 = 0.0f;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = pix[k] - pj->x[k];
-                    r2 += dx[k]*dx[k];
-                    }
-
-                /* Hit or miss? */
-                if ( r2 < hig2 || r2 < hj*hj*kernel_gamma2 ) {
-
-                    #ifndef VECTORIZE
-
-                        /* Does pj need to be updated too? */
-                        if ( pj->dt <= dt_step )
-                            IACT( r2 , dx , hi , hj , pi , pj );
-                        else
-                            IACT_NONSYM( r2 , dx , hi , hj , pi , pj );
-
-                    #else
-
-                        /* Does pj need to be updated too? */
-                        if ( pj->dt <= dt_step ) {
-                        
-                            /* Add this interaction to the symmetric queue. */
-                            r2q2[icount2] = r2;
-                            dxq2[3*icount2+0] = dx[0];
-                            dxq2[3*icount2+1] = dx[1];
-                            dxq2[3*icount2+2] = dx[2];
-                            hiq2[icount2] = hi;
-                            hjq2[icount2] = hj;
-                            piq2[icount2] = pi;
-                            pjq2[icount2] = pj;
-                            icount2 += 1;
-
-                            /* Flush? */
-                            if ( icount2 == VEC_SIZE ) {
-                                IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 );
-                                icount2 = 0;
-                                }
-                                
-                            }
-                            
-                        else {
-                        
-                            /* Add this interaction to the non-symmetric queue. */
-                            r2q1[icount1] = r2;
-                            dxq1[3*icount1+0] = dx[0];
-                            dxq1[3*icount1+1] = dx[1];
-                            dxq1[3*icount1+2] = dx[2];
-                            hiq1[icount1] = hi;
-                            hjq1[icount1] = hj;
-                            piq1[icount1] = pi;
-                            pjq1[icount1] = pj;
-                            icount1 += 1;
-
-                            /* Flush? */
-                            if ( icount1 == VEC_SIZE ) {
-                                IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
-                                icount1 = 0;
-                                }
-                                
-                            }
-
-                    #endif
-
-                    }
-
-                } /* loop over all other particles. */
-            
+
+          } else {
+
+            /* Add this interaction to the non-symmetric queue. */
+            r2q1[icount1] = r2;
+            dxq1[3 * icount1 + 0] = dx[0];
+            dxq1[3 * icount1 + 1] = dx[1];
+            dxq1[3 * icount1 + 2] = dx[2];
+            hiq1[icount1] = hi;
+            hjq1[icount1] = hj;
+            piq1[icount1] = pi;
+            pjq1[icount1] = pj;
+            icount1 += 1;
+
+            /* Flush? */
+            if (icount1 == VEC_SIZE) {
+              IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1);
+              icount1 = 0;
             }
-    
-        } /* loop over all particles. */
-
-    #ifdef VECTORIZE
-    /* Pick up any leftovers. */
-    if ( icount1 > 0 )
-        for ( k = 0 ; k < icount1 ; k++ )
-            IACT_NONSYM( r2q1[k] , &dxq1[3*k] , hiq1[k] , hjq1[k] , piq1[k] , pjq1[k] );
-    if ( icount2 > 0 )
-        for ( k = 0 ; k < icount2 ; k++ )
-            IACT( r2q2[k] , &dxq2[3*k] , hiq2[k] , hjq2[k] , piq2[k] , pjq2[k] );
-    #endif
-    
-    #ifdef TIMER_VERBOSE
-        printf( "runner_doself2[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 );
-    #else
-        TIMER_TOC(TIMER_DOSELF);
-    #endif
+          }
+
+#endif
+        }
 
+      } /* loop over all other particles. */
     }
 
+  } /* loop over all particles. */
+
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount1 > 0)
+    for (k = 0; k < icount1; k++)
+      IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]);
+  if (icount2 > 0)
+    for (k = 0; k < icount2; k++)
+      IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]);
+#endif
+
+#ifdef TIMER_VERBOSE
+  printf("runner_doself2[%02i]: %i parts at depth %i took %.3f ms.\n", r->id,
+         count, c->depth, ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000);
+#else
+  TIMER_TOC(TIMER_DOSELF);
+#endif
+}
 
 /**
  * @brief Compute grouped sub-cell interactions
@@ -1802,1015 +1787,1156 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) {
  * redundant computations to find the sid on-the-fly.
  */
 
-void DOSUB1 ( struct runner *r , struct cell *ci , struct cell *cj , int sid , int gettimer ) {
-
-    int j = 0, k;
-    double shift[3];
-    float h;
-    struct space *s = r->e->s;
-    float dt_step = r->e->dt_step;
-
-    TIMER_TIC
-    
-    /* Is this a single cell? */
-    if ( cj == NULL ) {
-
-        /* Should we even bother? */    
-        if ( ci->dt_min > dt_step )
-            return;
-        
-        /* Recurse? */
-        if ( ci->split ) {
-        
-            /* Loop over all progeny. */
-            for ( k = 0 ; k < 8 ; k++ )
-                if ( ci->progeny[k] != NULL ) {
-                    DOSUB1( r , ci->progeny[k] , NULL , -1 , 0 );
-                    for ( j = k+1 ; j < 8 ; j++ )
-                        if ( ci->progeny[j] != NULL )
-                            DOSUB1( r , ci->progeny[k] , ci->progeny[j] , -1 , 0 );
-                    }
-        
-            }
-        
-        /* Otherwsie, compute self-interaction. */
-        else
-            DOSELF1( r , ci );
-            
-        } /* self-interaction. */
-        
-    /* Otherwise, it's a pair interaction. */
-    else {
-    
-        /* Should we even bother? */    
-        if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
-            return;
-        
-        /* Get the cell dimensions. */
-        h = fmin( ci->h[0] , fmin( ci->h[1] , ci->h[2] ) );
-        
-        /* Get the type of pair if not specified explicitly. */
-        // if ( sid < 0 )
-            sid = space_getsid( s , &ci , &cj , shift );
-            
-        /* Recurse? */
-        if ( ci->split && cj->split &&
-             fmaxf( ci->h_max , cj->h_max )*kernel_gamma + ci->dx_max + cj->dx_max < h/2 ) {
-             
-            /* Different types of flags. */
-            switch ( sid ) {
-
-                /* Regular sub-cell interactions of a single cell. */
-                case 0: /* (  1 ,  1 ,  1 ) */
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    break;
-                    
-                case 1: /* (  1 ,  1 ,  0 ) */
-                    if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[1] , -1 , 0 );
-                    break;
-
-                case 2: /* (  1 ,  1 , -1 ) */
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 );
-                    break;
-                    
-                case 3: /* (  1 ,  0 ,  1 ) */
-                    if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[2] , -1 , 0 );
-                    break;
-
-                case 4: /* (  1 ,  0 ,  0 ) */
-                    if ( ci->progeny[4] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[4] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[4] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[4] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL )
-                        DOSUB1( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[3] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[3] , -1 , 0 );
-                    break;
-
-                case 5: /* (  1 ,  0 , -1 ) */
-                    if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[4] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL )
-                        DOSUB1( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[3] , -1 , 0 );
-                    break;
-
-                case 6: /* (  1 , -1 ,  1 ) */
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 );
-                    break;
-                    
-                case 7: /* (  1 , -1 ,  0 ) */
-                    if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[4] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL )
-                        DOSUB1( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[3] , -1 , 0 );
-                    break;
-
-                case 8: /* (  1 , -1 , -1 ) */
-                    if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL )
-                        DOSUB1( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 );
-                    break;
-                    
-                case 9: /* (  0 ,  1 ,  1 ) */
-                    if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[3] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL )
-                        DOSUB1( r , ci->progeny[3] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[4] , -1 , 0 );
-                    break;
-
-                case 10: /* (  0 ,  1 ,  0 ) */
-                    if ( ci->progeny[2] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[2] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[2] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[4] != NULL )
-                        DOSUB1( r , ci->progeny[2] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL )
-                        DOSUB1( r , ci->progeny[2] , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[3] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[3] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL )
-                        DOSUB1( r , ci->progeny[3] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[5] != NULL )
-                        DOSUB1( r , ci->progeny[3] , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[4] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[5] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[5] , -1 , 0 );
-                    break;
-
-                case 11: /* (  0 ,  1 , -1 ) */
-                    if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[2] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL )
-                        DOSUB1( r , ci->progeny[2] , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL )
-                        DOSUB1( r , ci->progeny[6] , cj->progeny[5] , -1 , 0 );
-                    break;
-
-                case 12: /* (  0 ,  0 ,  1 ) */
-                    if ( ci->progeny[1] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[1] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[1] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[1] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[1] != NULL && cj->progeny[4] != NULL )
-                        DOSUB1( r , ci->progeny[1] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[1] != NULL && cj->progeny[6] != NULL )
-                        DOSUB1( r , ci->progeny[1] , cj->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[3] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[3] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL )
-                        DOSUB1( r , ci->progeny[3] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[6] != NULL )
-                        DOSUB1( r , ci->progeny[3] , cj->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[4] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[6] != NULL )
-                        DOSUB1( r , ci->progeny[5] , cj->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[6] != NULL )
-                        DOSUB1( r , ci->progeny[7] , cj->progeny[6] , -1 , 0 );
-                    break;
-
-                }
-            
-            }
-            
-        /* Otherwise, compute the pair directly. */
-        else if ( ci->dt_min <= dt_step || cj->dt_min <= dt_step ) {
-        
-            /* Do any of the cells need to be sorted first? */
-            if ( !(ci->sorted & (1 << sid) ) )
-                runner_dosort( r , ci , (1 << sid) , 1 );
-            if ( !(cj->sorted & (1 << sid) ) )
-                runner_dosort( r , cj , (1 << sid) , 1 );
-        
-            /* Compute the interactions. */
-            DOPAIR1( r , ci , cj );
-            
-            }
-    
-        } /* otherwise, pair interaction. */
-    
+void DOSUB1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
+            int gettimer) {
+
+  int j = 0, k;
+  double shift[3];
+  float h;
+  struct space *s = r->e->s;
+  float dt_step = r->e->dt_step;
+
+  TIMER_TIC
+
+  /* Is this a single cell? */
+  if (cj == NULL) {
 
-    if ( gettimer )
-        #ifdef TIMER_VERBOSE
-            printf( "runner_dosub1[%02i]: flags=%i at depth %i took %.3f ms.\n" , r->id , sid , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 );
-        #else
-            TIMER_TOC(TIMER_DOSUB);
-        #endif
+    /* Should we even bother? */
+    if (ci->dt_min > dt_step) return;
+
+    /* Recurse? */
+    if (ci->split) {
+
+      /* Loop over all progeny. */
+      for (k = 0; k < 8; k++)
+        if (ci->progeny[k] != NULL) {
+          DOSUB1(r, ci->progeny[k], NULL, -1, 0);
+          for (j = k + 1; j < 8; j++)
+            if (ci->progeny[j] != NULL)
+              DOSUB1(r, ci->progeny[k], ci->progeny[j], -1, 0);
+        }
 
     }
 
+    /* Otherwise, compute self-interaction. */
+    else
+      DOSELF1(r, ci);
+
+  } /* self-interaction. */
+
+  /* Otherwise, it's a pair interaction. */
+  else {
+
+    /* Should we even bother? */
+    if (ci->dt_min > dt_step && cj->dt_min > dt_step) return;
+
+    /* Get the cell dimensions. */
+    h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
+
+    /* Get the type of pair if not specified explicitly. */
+    // if ( sid < 0 )
+    sid = space_getsid(s, &ci, &cj, shift);
+
+    /* Recurse? */
+    if (ci->split && cj->split &&
+        fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
+            h / 2) {
+
+      /* Different types of flags. */
+      switch (sid) {
+
+        /* Regular sub-cell interactions of a single cell. */
+        case 0: /* (  1 ,  1 ,  1 ) */
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          break;
+
+        case 1: /* (  1 ,  1 ,  0 ) */
+          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0);
+          break;
+
+        case 2: /* (  1 ,  1 , -1 ) */
+          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+          break;
+
+        case 3: /* (  1 ,  0 ,  1 ) */
+          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0);
+          break;
+
+        case 4: /* (  1 ,  0 ,  0 ) */
+          if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[4], cj->progeny[0], -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[4], cj->progeny[1], -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[4], cj->progeny[2], -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[1], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[3], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[2], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[3], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[3], -1, 0);
+          break;
+
+        case 5: /* (  1 ,  0 , -1 ) */
+          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[4], cj->progeny[1], -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[3], -1, 0);
+          break;
+
+        case 6: /* (  1 , -1 ,  1 ) */
+          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+          break;
+
+        case 7: /* (  1 , -1 ,  0 ) */
+          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[4], cj->progeny[2], -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[3], -1, 0);
+          break;
+
+        case 8: /* (  1 , -1 , -1 ) */
+          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+          break;
+
+        case 9: /* (  0 ,  1 ,  1 ) */
+          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+            DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0);
+          break;
+
+        case 10: /* (  0 ,  1 ,  0 ) */
+          if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[2], cj->progeny[0], -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[2], cj->progeny[1], -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
+            DOSUB1(r, ci->progeny[2], cj->progeny[4], -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+            DOSUB1(r, ci->progeny[2], cj->progeny[5], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[3], cj->progeny[1], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+            DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
+            DOSUB1(r, ci->progeny[3], cj->progeny[5], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[4], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[5], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[5], -1, 0);
+          break;
+
+        case 11: /* (  0 ,  1 , -1 ) */
+          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[2], cj->progeny[1], -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+            DOSUB1(r, ci->progeny[2], cj->progeny[5], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+            DOSUB1(r, ci->progeny[6], cj->progeny[5], -1, 0);
+          break;
+
+        case 12: /* (  0 ,  0 ,  1 ) */
+          if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[1], cj->progeny[0], -1, 0);
+          if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[1], cj->progeny[2], -1, 0);
+          if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
+            DOSUB1(r, ci->progeny[1], cj->progeny[4], -1, 0);
+          if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
+            DOSUB1(r, ci->progeny[1], cj->progeny[6], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[3], cj->progeny[2], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+            DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
+            DOSUB1(r, ci->progeny[3], cj->progeny[6], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[4], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
+            DOSUB1(r, ci->progeny[5], cj->progeny[6], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
+            DOSUB1(r, ci->progeny[7], cj->progeny[6], -1, 0);
+          break;
+      }
 
-void DOSUB2 ( struct runner *r , struct cell *ci , struct cell *cj , int sid , int gettimer ) {
-
-    int j, k;
-    double shift[3];
-    float h;
-    struct space *s = r->e->s;
-    float dt_step = r->e->dt_step;
-
-    TIMER_TIC
-    
-    /* Is this a single cell? */
-    if ( cj == NULL ) {
-    
-        /* Should we even bother? */    
-        if ( ci->dt_min > dt_step )
-            return;
-        
-        /* Recurse? */
-        if ( ci->split ) {
-        
-            /* Loop over all progeny. */
-            for ( k = 0 ; k < 8 ; k++ )
-                if ( ci->progeny[k] != NULL ) {
-                    DOSUB2( r , ci->progeny[k] , NULL , -1 , 0 );
-                    for ( j = k+1 ; j < 8 ; j++ )
-                        if ( ci->progeny[j] != NULL )
-                            DOSUB2( r , ci->progeny[k] , ci->progeny[j] , -1 , 0 );
-                    }
-        
-            }
-        
-        /* Otherwsie, compute self-interaction. */
-        else
-            DOSELF2( r , ci );
-            
-        } /* self-interaction. */
-        
-    /* Otherwise, it's a pair interaction. */
-    else {
-    
-        /* Should we even bother? */    
-        if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
-            return;
-        
-        /* Get the cell dimensions. */
-        h = fmin( ci->h[0] , fmin( ci->h[1] , ci->h[2] ) );
-        
-        /* Get the type of pair if not specified explicitly. */
-        // if ( sid < 0 )
-            sid = space_getsid( s , &ci , &cj , shift );
-    
-        /* Recurse? */
-        if ( ci->split && cj->split &&
-             fmaxf( ci->h_max , cj->h_max )*kernel_gamma + ci->dx_max + cj->dx_max < h/2 ) {
-             
-            /* Different types of flags. */
-            switch ( sid ) {
-
-                /* Regular sub-cell interactions of a single cell. */
-                case 0: /* (  1 ,  1 ,  1 ) */
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    break;
-                    
-                case 1: /* (  1 ,  1 ,  0 ) */
-                    if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[1] , -1 , 0 );
-                    break;
-
-                case 2: /* (  1 ,  1 , -1 ) */
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 );
-                    break;
-                    
-                case 3: /* (  1 ,  0 ,  1 ) */
-                    if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[2] , -1 , 0 );
-                    break;
-
-                case 4: /* (  1 ,  0 ,  0 ) */
-                    if ( ci->progeny[4] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[4] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[4] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[4] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL )
-                        DOSUB2( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[3] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[3] , -1 , 0 );
-                    break;
-
-                case 5: /* (  1 ,  0 , -1 ) */
-                    if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[4] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL )
-                        DOSUB2( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[3] , -1 , 0 );
-                    break;
-
-                case 6: /* (  1 , -1 ,  1 ) */
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 );
-                    break;
-                    
-                case 7: /* (  1 , -1 ,  0 ) */
-                    if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[4] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL )
-                        DOSUB2( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[3] , -1 , 0 );
-                    break;
-
-                case 8: /* (  1 , -1 , -1 ) */
-                    if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL )
-                        DOSUB2( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 );
-                    break;
-                    
-                case 9: /* (  0 ,  1 ,  1 ) */
-                    if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[3] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL )
-                        DOSUB2( r , ci->progeny[3] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[4] , -1 , 0 );
-                    break;
-
-                case 10: /* (  0 ,  1 ,  0 ) */
-                    if ( ci->progeny[2] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[2] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[2] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[4] != NULL )
-                        DOSUB2( r , ci->progeny[2] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL )
-                        DOSUB2( r , ci->progeny[2] , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[3] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[3] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL )
-                        DOSUB2( r , ci->progeny[3] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[5] != NULL )
-                        DOSUB2( r , ci->progeny[3] , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[4] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[5] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[5] , -1 , 0 );
-                    break;
-
-                case 11: /* (  0 ,  1 , -1 ) */
-                    if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[2] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL )
-                        DOSUB2( r , ci->progeny[2] , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL )
-                        DOSUB2( r , ci->progeny[6] , cj->progeny[5] , -1 , 0 );
-                    break;
-
-                case 12: /* (  0 ,  0 ,  1 ) */
-                    if ( ci->progeny[1] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[1] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[1] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[1] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[1] != NULL && cj->progeny[4] != NULL )
-                        DOSUB2( r , ci->progeny[1] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[1] != NULL && cj->progeny[6] != NULL )
-                        DOSUB2( r , ci->progeny[1] , cj->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[3] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[3] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL )
-                        DOSUB2( r , ci->progeny[3] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[6] != NULL )
-                        DOSUB2( r , ci->progeny[3] , cj->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[4] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[6] != NULL )
-                        DOSUB2( r , ci->progeny[5] , cj->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[6] != NULL )
-                        DOSUB2( r , ci->progeny[7] , cj->progeny[6] , -1 , 0 );
-                    break;
-
-                }
-            
-            }
-            
-        /* Otherwise, compute the pair directly. */
-        else if ( ci->dt_min <= dt_step || cj->dt_min <= dt_step ) {
-        
-            /* Do any of the cells need to be sorted first? */
-            if ( !(ci->sorted & (1 << sid) ) )
-                runner_dosort( r , ci , (1 << sid) , 1 );
-            if ( !(cj->sorted & (1 << sid) ) )
-                runner_dosort( r , cj , (1 << sid) , 1 );
-        
-            /* Compute the interactions. */
-            DOPAIR2( r , ci , cj );
-            
-            }
-    
-        } /* otherwise, pair interaction. */
-    
+    }
 
-    if ( gettimer )
-        #ifdef TIMER_VERBOSE
-            printf( "runner_dosub2[%02i]: flags=%i at depth %i took %.3f ms.\n" , r->id , sid , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 );
-        #else
-            TIMER_TOC(TIMER_DOSUB);
-        #endif
+    /* Otherwise, compute the pair directly. */
+    else if (ci->dt_min <= dt_step || cj->dt_min <= dt_step) {
 
+      /* Do any of the cells need to be sorted first? */
+      if (!(ci->sorted & (1 << sid))) runner_dosort(r, ci, (1 << sid), 1);
+      if (!(cj->sorted & (1 << sid))) runner_dosort(r, cj, (1 << sid), 1);
+
+      /* Compute the interactions. */
+      DOPAIR1(r, ci, cj);
     }
 
+  } /* otherwise, pair interaction. */
 
-void DOSUB_SUBSET ( struct runner *r , struct cell *ci , struct part *parts , int *ind , int count , struct cell *cj , int sid , int gettimer ) {
-
-    int j, k;
-    double shift[3];
-    float h;
-    struct space *s = r->e->s;
-    struct cell *sub = NULL;
-    float dt_step = r->e->dt_step;
-
-    TIMER_TIC
-    
-    /* Find out in which sub-cell of ci the parts are. */
-    for ( k = 0 ; k < 8 ; k++ )
-        if ( ci->progeny[k] != NULL ) {
-            // if ( parts[ ind[ 0 ] ].x[0] >= ci->progeny[k]->loc[0] &&
-            //      parts[ ind[ 0 ] ].x[0] <= ci->progeny[k]->loc[0] + ci->progeny[k]->h[0] &&
-            //      parts[ ind[ 0 ] ].x[1] >= ci->progeny[k]->loc[1] &&
-            //      parts[ ind[ 0 ] ].x[1] <= ci->progeny[k]->loc[1] + ci->progeny[k]->h[1] &&
-            //      parts[ ind[ 0 ] ].x[2] >= ci->progeny[k]->loc[2] &&
-            //      parts[ ind[ 0 ] ].x[2] <= ci->progeny[k]->loc[2] + ci->progeny[k]->h[2] ) {
-            if ( &parts[ ind[0] ] >= &ci->progeny[k]->parts[0] &&
-                 &parts[ ind[0] ] <  &ci->progeny[k]->parts[ci->progeny[k]->count] ) {
-                sub = ci->progeny[k];
-                break;
-                }
-            }
-    
-    
-    /* Is this a single cell? */
-    if ( cj == NULL ) {
-    
-        /* Recurse? */
-        if ( ci->split ) {
-        
-            /* Loop over all progeny. */
-            DOSUB_SUBSET( r , sub , parts , ind , count , NULL , -1 , 0 );
-            for ( j = 0 ; j < 8 ; j++ )
-                if ( ci->progeny[j] != sub && ci->progeny[j] != NULL )
-                    DOSUB_SUBSET( r , sub , parts , ind , count , ci->progeny[j] , -1 , 0 );
-        
-            }
-        
-        /* Otherwsie, compute self-interaction. */
-        else
-            DOSELF_SUBSET( r , ci , parts , ind , count );
-            
-        } /* self-interaction. */
-        
-    /* Otherwise, it's a pair interaction. */
-    else {
-    
-        /* Get the cell dimensions. */
-        h = fmin( ci->h[0] , fmin( ci->h[1] , ci->h[2] ) );
-
-        /* Recurse? */
-        if ( ci->split && cj->split &&
-             fmaxf( ci->h_max , cj->h_max )*kernel_gamma + ci->dx_max + cj->dx_max < h/2 ) {
-             
-            /* Get the type of pair if not specified explicitly. */
-            sid = space_getsid( s , &ci , &cj , shift );
-
-            /* Different types of flags. */
-            switch ( sid ) {
-
-                /* Regular sub-cell interactions of a single cell. */
-                case 0: /* (  1 ,  1 ,  1 ) */
-                    if ( ci->progeny[7] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , ci->progeny[0] , parts , ind , count , cj->progeny[7] , -1 , 0 );
-                    break;
-                    
-                case 1: /* (  1 ,  1 ,  0 ) */
-                    if ( ci->progeny[6] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    break;
-
-                case 2: /* (  1 ,  1 , -1 ) */
-                    if ( ci->progeny[6] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    break;
-                    
-                case 3: /* (  1 ,  0 ,  1 ) */
-                    if ( ci->progeny[5] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[5] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    break;
-
-                case 4: /* (  1 ,  0 ,  0 ) */
-                    if ( ci->progeny[4] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[4] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[4] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[4] == sub && cj->progeny[3] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[3] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[5] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[5] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[5] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[5] == sub && cj->progeny[3] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[3] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[3] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[3] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[3] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[3] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    break;
-
-                case 5: /* (  1 ,  0 , -1 ) */
-                    if ( ci->progeny[4] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[4] == sub && cj->progeny[3] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[3] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[3] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[3] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    break;
-
-                case 6: /* (  1 , -1 ,  1 ) */
-                    if ( ci->progeny[5] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    break;
-                    
-                case 7: /* (  1 , -1 ,  0 ) */
-                    if ( ci->progeny[4] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[4] == sub && cj->progeny[3] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[3] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[5] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[5] == sub && cj->progeny[3] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[3] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    break;
-
-                case 8: /* (  1 , -1 , -1 ) */
-                    if ( ci->progeny[4] == sub && cj->progeny[3] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[4] != NULL && cj->progeny[3] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[4] , -1 , 0 );
-                    break;
-                    
-                case 9: /* (  0 ,  1 ,  1 ) */
-                    if ( ci->progeny[3] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[3] == sub && cj->progeny[4] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[4] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[4] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[4] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    break;
-
-                case 10: /* (  0 ,  1 ,  0 ) */
-                    if ( ci->progeny[2] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[2] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[2] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[2] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[2] == sub && cj->progeny[4] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[2] , parts , ind , count , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[4] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[2] == sub && cj->progeny[5] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[2] , parts , ind , count , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[5] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[5] , parts , ind , count , ci->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[3] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[3] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[3] == sub && cj->progeny[4] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[4] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[3] == sub && cj->progeny[5] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[5] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[5] , parts , ind , count , ci->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[4] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[4] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[5] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[5] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[5] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[4] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[4] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[5] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[5] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[5] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    break;
-
-                case 11: /* (  0 ,  1 , -1 ) */
-                    if ( ci->progeny[2] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[2] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[2] == sub && cj->progeny[5] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[2] , parts , ind , count , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[2] != NULL && cj->progeny[5] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[5] , parts , ind , count , ci->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[1] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[1] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[6] == sub && cj->progeny[5] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[6] != NULL && cj->progeny[5] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[5] , parts , ind , count , ci->progeny[6] , -1 , 0 );
-                    break;
-
-                case 12: /* (  0 ,  0 ,  1 ) */
-                    if ( ci->progeny[1] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[1] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[1] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[1] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[1] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[1] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[1] == sub && cj->progeny[4] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[1] , parts , ind , count , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[1] != NULL && cj->progeny[4] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[1] == sub && cj->progeny[6] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[1] , parts , ind , count , cj->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[1] != NULL && cj->progeny[6] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[6] , parts , ind , count , ci->progeny[1] , -1 , 0 );
-                    if ( ci->progeny[3] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[3] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[3] == sub && cj->progeny[4] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[4] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[3] == sub && cj->progeny[6] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[3] != NULL && cj->progeny[6] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[6] , parts , ind , count , ci->progeny[3] , -1 , 0 );
-                    if ( ci->progeny[5] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[5] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[5] == sub && cj->progeny[4] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[4] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[5] == sub && cj->progeny[6] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[5] != NULL && cj->progeny[6] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[6] , parts , ind , count , ci->progeny[5] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[0] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[0] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[2] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[2] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[2] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[4] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[4] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[4] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    if ( ci->progeny[7] == sub && cj->progeny[6] != NULL )
-                        DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[6] , -1 , 0 );
-                    if ( ci->progeny[7] != NULL && cj->progeny[6] == sub )
-                        DOSUB_SUBSET( r , cj->progeny[6] , parts , ind , count , ci->progeny[7] , -1 , 0 );
-                    break;
-
-                }
-            
-            }
-            
-        /* Otherwise, compute the pair directly. */
-        else if ( ci->dt_min <= dt_step || cj->dt_min <= dt_step ) {
-        
-            /* Get the relative distance between the pairs, wrapping. */
-            for ( k = 0 ; k < 3 ; k++ ) {
-                if ( cj->loc[k] - ci->loc[k] < -s->dim[k]/2 )
-                    shift[k] = s->dim[k];
-                else if ( cj->loc[k] - ci->loc[k] > s->dim[k]/2 )
-                    shift[k] = -s->dim[k];
-                }
-        
-            /* Get the sorting index. */
-            for ( sid = 0 , k = 0 ; k < 3 ; k++ )
-                sid = 3*sid + ( (cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1 );
-            sid = sortlistID[sid];
-    
-            /* Do any of the cells need to be sorted first? */
-            if ( !(cj->sorted & (1 << sid) ) )
-                runner_dosort( r , cj , (1 << sid) , 1 );
-        
-            /* Compute the interactions. */
-            DOPAIR_SUBSET( r , ci , parts , ind , count , cj );
-            
-            }
-    
-        } /* otherwise, pair interaction. */
-    
+  if (gettimer)
+#ifdef TIMER_VERBOSE
+    printf("runner_dosub1[%02i]: flags=%i at depth %i took %.3f ms.\n", r->id,
+           sid, ci->depth, ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000);
+#else
+    TIMER_TOC(TIMER_DOSUB);
+#endif
+}
+
+void DOSUB2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
+            int gettimer) {
+  /* Recursive task: self-interaction of ci if cj == NULL, else pair ci/cj. */
+  int j, k;          /* progeny loop indices (used in the self branch only) */
+  double shift[3];   /* passed to space_getsid(); not read in this function */
+  float h;           /* smallest of ci->h[0..2], see the pair branch */
+  struct space *s = r->e->s;
+  float dt_step = r->e->dt_step; /* cells with dt_min above this are skipped */
+
+  TIMER_TIC
+
+  /* Is this a single cell (self-interaction)? */
+  if (cj == NULL) {
+
+    /* Nothing to do if ci->dt_min exceeds dt_step; this skips TIMER_TOC too. */
+    if (ci->dt_min > dt_step) return;
 
-    if ( gettimer )
-        #ifdef TIMER_VERBOSE
-            printf( "runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n" , r->id , sid , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 );
-        #else
-            TIMER_TOC(TIMER_DOSUB);
-        #endif
+    /* Recurse into the progeny if ci is split. */
+    if (ci->split) {
+
+      /* Each non-NULL progeny: self task, then pair it with each later one. */
+      for (k = 0; k < 8; k++)
+        if (ci->progeny[k] != NULL) {
+          DOSUB2(r, ci->progeny[k], NULL, -1, 0);
+          for (j = k + 1; j < 8; j++)
+            if (ci->progeny[j] != NULL)
+              DOSUB2(r, ci->progeny[k], ci->progeny[j], -1, 0);
+        }
+
+    }
+
+    /* Otherwise, compute the self-interaction directly. */
+    else
+      DOSELF2(r, ci);
+
+  } /* self-interaction. */
+
+  /* Otherwise, it's a pair interaction. */
+  else {
+
+    /* Skip only if both cells' dt_min exceed dt_step (skips TIMER_TOC too). */
+    if (ci->dt_min > dt_step && cj->dt_min > dt_step) return;
+
+    /* Smallest of ci's three h[] entries, used in the recursion test below. */
+    h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
+
+    /* Get the pair type; with the guard disabled, sid is always recomputed. */
+    // if ( sid < 0 )
+    sid = space_getsid(s, &ci, &cj, shift); /* NOTE(review): may swap ci/cj? */
+
+    /* Recurse if both cells are split and the bound below is under h / 2. */
+    if (ci->split && cj->split &&
+        fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
+            h / 2) {
+
+      /* Dispatch on sid; values outside 0..12 match no case and do nothing. */
+      switch (sid) {
+
+        /* Each case recurses on the listed (ci progeny, cj progeny) pairs. */
+        case 0: /* (  1 ,  1 ,  1 ) */
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          break;
+
+        case 1: /* (  1 ,  1 ,  0 ) */
+          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0);
+          break;
+
+        case 2: /* (  1 ,  1 , -1 ) */
+          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+          break;
+
+        case 3: /* (  1 ,  0 ,  1 ) */
+          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0);
+          break;
+
+        case 4: /* (  1 ,  0 ,  0 ) */
+          if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[4], cj->progeny[0], -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[4], cj->progeny[1], -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[4], cj->progeny[2], -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[1], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[3], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[2], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[3], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[3], -1, 0);
+          break;
+
+        case 5: /* (  1 ,  0 , -1 ) */
+          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[4], cj->progeny[1], -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[3], -1, 0);
+          break;
+
+        case 6: /* (  1 , -1 ,  1 ) */
+          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+          break;
+
+        case 7: /* (  1 , -1 ,  0 ) */
+          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[4], cj->progeny[2], -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[3], -1, 0);
+          break;
+
+        case 8: /* (  1 , -1 , -1 ) */
+          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+          break;
+
+        case 9: /* (  0 ,  1 ,  1 ) */
+          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+            DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0);
+          break;
+
+        case 10: /* (  0 ,  1 ,  0 ) */
+          if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[2], cj->progeny[0], -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[2], cj->progeny[1], -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
+            DOSUB2(r, ci->progeny[2], cj->progeny[4], -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+            DOSUB2(r, ci->progeny[2], cj->progeny[5], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[3], cj->progeny[1], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+            DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
+            DOSUB2(r, ci->progeny[3], cj->progeny[5], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[4], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[5], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[5], -1, 0);
+          break;
+
+        case 11: /* (  0 ,  1 , -1 ) */
+          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[2], cj->progeny[1], -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+            DOSUB2(r, ci->progeny[2], cj->progeny[5], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+            DOSUB2(r, ci->progeny[6], cj->progeny[5], -1, 0);
+          break;
+
+        case 12: /* (  0 ,  0 ,  1 ) */
+          if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[1], cj->progeny[0], -1, 0);
+          if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[1], cj->progeny[2], -1, 0);
+          if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
+            DOSUB2(r, ci->progeny[1], cj->progeny[4], -1, 0);
+          if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
+            DOSUB2(r, ci->progeny[1], cj->progeny[6], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[3], cj->progeny[2], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+            DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
+            DOSUB2(r, ci->progeny[3], cj->progeny[6], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[4], -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
+            DOSUB2(r, ci->progeny[5], cj->progeny[6], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
+            DOSUB2(r, ci->progeny[7], cj->progeny[6], -1, 0);
+          break;
+      }
+
+    }
+
+    /* Otherwise, compute the pair directly (inverse of the early-out test). */
+    else if (ci->dt_min <= dt_step || cj->dt_min <= dt_step) {
+
+      /* Sort either cell whose sid bit is not yet set in its sorted mask. */
+      if (!(ci->sorted & (1 << sid))) runner_dosort(r, ci, (1 << sid), 1);
+      if (!(cj->sorted & (1 << sid))) runner_dosort(r, cj, (1 << sid), 1);
+
+      /* Compute the interactions. */
+      DOPAIR2(r, ci, cj);
+    }
+
+  } /* otherwise, pair interaction. */
+  /* Stop the timer if requested; the verbose build also prints the timing. */
+  if (gettimer)
+#ifdef TIMER_VERBOSE
+    printf("runner_dosub2[%02i]: flags=%i at depth %i took %.3f ms.\n", r->id,
+           sid, ci->depth, ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000);
+#else
+    TIMER_TOC(TIMER_DOSUB);
+#endif
+}
+
+void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
+                  int *ind, int count, struct cell *cj, int sid, int gettimer) {
+
+  int j, k;
+  double shift[3];
+  float h;
+  struct space *s = r->e->s;
+  struct cell *sub = NULL;
+  float dt_step = r->e->dt_step;
+
+  TIMER_TIC
+
+  /* Find out in which sub-cell of ci the parts are. */
+  for (k = 0; k < 8; k++)
+    if (ci->progeny[k] != NULL) {
+      // if ( parts[ ind[ 0 ] ].x[0] >= ci->progeny[k]->loc[0] &&
+      //      parts[ ind[ 0 ] ].x[0] <= ci->progeny[k]->loc[0] +
+      // ci->progeny[k]->h[0] &&
+      //      parts[ ind[ 0 ] ].x[1] >= ci->progeny[k]->loc[1] &&
+      //      parts[ ind[ 0 ] ].x[1] <= ci->progeny[k]->loc[1] +
+      // ci->progeny[k]->h[1] &&
+      //      parts[ ind[ 0 ] ].x[2] >= ci->progeny[k]->loc[2] &&
+      //      parts[ ind[ 0 ] ].x[2] <= ci->progeny[k]->loc[2] +
+      // ci->progeny[k]->h[2] ) {
+      if (&parts[ind[0]] >= &ci->progeny[k]->parts[0] &&
+          &parts[ind[0]] < &ci->progeny[k]->parts[ci->progeny[k]->count]) {
+        sub = ci->progeny[k];
+        break;
+      }
+    }
+
+  /* Is this a single cell? */
+  if (cj == NULL) {
+
+    /* Recurse? */
+    if (ci->split) {
+
+      /* Loop over all progeny. */
+      DOSUB_SUBSET(r, sub, parts, ind, count, NULL, -1, 0);
+      for (j = 0; j < 8; j++)
+        if (ci->progeny[j] != sub && ci->progeny[j] != NULL)
+          DOSUB_SUBSET(r, sub, parts, ind, count, ci->progeny[j], -1, 0);
+
+    }
+
+    /* Otherwise, compute self-interaction. */
+    else
+      DOSELF_SUBSET(r, ci, parts, ind, count);
+
+  } /* self-interaction. */
+
+  /* Otherwise, it's a pair interaction. */
+  else {
+
+    /* Get the cell dimensions. */
+    h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
+
+    /* Recurse? */
+    if (ci->split && cj->split &&
+        fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
+            h / 2) {
+
+      /* Get the type of pair if not specified explicitly. */
+      sid = space_getsid(s, &ci, &cj, shift);
+
+      /* Different types of flags. */
+      switch (sid) {
+
+        /* Regular sub-cell interactions of a single cell. */
+        case 0: /* (  1 ,  1 ,  1 ) */
+          if (ci->progeny[7] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, ci->progeny[0], parts, ind, count, cj->progeny[7],
+                         -1, 0);
+          break;
+
+        case 1: /* (  1 ,  1 ,  0 ) */
+          if (ci->progeny[6] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          break;
+
+        case 2: /* (  1 ,  1 , -1 ) */
+          if (ci->progeny[6] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          break;
+
+        case 3: /* (  1 ,  0 ,  1 ) */
+          if (ci->progeny[5] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          if (ci->progeny[5] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          break;
+
+        case 4: /* (  1 ,  0 ,  0 ) */
+          if (ci->progeny[4] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[4],
+                         -1, 0);
+          if (ci->progeny[4] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[4],
+                         -1, 0);
+          if (ci->progeny[4] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[4],
+                         -1, 0);
+          if (ci->progeny[4] == sub && cj->progeny[3] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[3],
+                         -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[3] == sub)
+            DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[4],
+                         -1, 0);
+          if (ci->progeny[5] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          if (ci->progeny[5] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          if (ci->progeny[5] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          if (ci->progeny[5] == sub && cj->progeny[3] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[3],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[3] == sub)
+            DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[3] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[3],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[3] == sub)
+            DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[3] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[3],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[3] == sub)
+            DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          break;
+
+        case 5: /* (  1 ,  0 , -1 ) */
+          if (ci->progeny[4] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[4],
+                         -1, 0);
+          if (ci->progeny[4] == sub && cj->progeny[3] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[3],
+                         -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[3] == sub)
+            DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[4],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[3] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[3],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[3] == sub)
+            DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          break;
+
+        case 6: /* (  1 , -1 ,  1 ) */
+          if (ci->progeny[5] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          break;
+
+        case 7: /* (  1 , -1 ,  0 ) */
+          if (ci->progeny[4] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[4],
+                         -1, 0);
+          if (ci->progeny[4] == sub && cj->progeny[3] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[3],
+                         -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[3] == sub)
+            DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[4],
+                         -1, 0);
+          if (ci->progeny[5] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          if (ci->progeny[5] == sub && cj->progeny[3] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[3],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[3] == sub)
+            DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          break;
+
+        case 8: /* (  1 , -1 , -1 ) */
+          if (ci->progeny[4] == sub && cj->progeny[3] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[3],
+                         -1, 0);
+          if (ci->progeny[4] != NULL && cj->progeny[3] == sub)
+            DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[4],
+                         -1, 0);
+          break;
+
+        case 9: /* (  0 ,  1 ,  1 ) */
+          if (ci->progeny[3] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[3],
+                         -1, 0);
+          if (ci->progeny[3] == sub && cj->progeny[4] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[4],
+                         -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[4] == sub)
+            DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[3],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[4] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[4],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[4] == sub)
+            DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          break;
+
+        case 10: /* (  0 ,  1 ,  0 ) */
+          if (ci->progeny[2] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[2], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[2],
+                         -1, 0);
+          if (ci->progeny[2] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[2], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[2],
+                         -1, 0);
+          if (ci->progeny[2] == sub && cj->progeny[4] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[2], parts, ind, count, cj->progeny[4],
+                         -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[4] == sub)
+            DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[2],
+                         -1, 0);
+          if (ci->progeny[2] == sub && cj->progeny[5] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[2], parts, ind, count, cj->progeny[5],
+                         -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[5] == sub)
+            DOSUB_SUBSET(r, cj->progeny[5], parts, ind, count, ci->progeny[2],
+                         -1, 0);
+          if (ci->progeny[3] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[3],
+                         -1, 0);
+          if (ci->progeny[3] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[3],
+                         -1, 0);
+          if (ci->progeny[3] == sub && cj->progeny[4] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[4],
+                         -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[4] == sub)
+            DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[3],
+                         -1, 0);
+          if (ci->progeny[3] == sub && cj->progeny[5] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[5],
+                         -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[5] == sub)
+            DOSUB_SUBSET(r, cj->progeny[5], parts, ind, count, ci->progeny[3],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[4] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[4],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[4] == sub)
+            DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[5] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[5],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[5] == sub)
+            DOSUB_SUBSET(r, cj->progeny[5], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[4] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[4],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[4] == sub)
+            DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[5] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[5],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[5] == sub)
+            DOSUB_SUBSET(r, cj->progeny[5], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          break;
+
+        case 11: /* (  0 ,  1 , -1 ) */
+          if (ci->progeny[2] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[2], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[2],
+                         -1, 0);
+          if (ci->progeny[2] == sub && cj->progeny[5] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[2], parts, ind, count, cj->progeny[5],
+                         -1, 0);
+          if (ci->progeny[2] != NULL && cj->progeny[5] == sub)
+            DOSUB_SUBSET(r, cj->progeny[5], parts, ind, count, ci->progeny[2],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[1] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[1],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[1] == sub)
+            DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          if (ci->progeny[6] == sub && cj->progeny[5] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[5],
+                         -1, 0);
+          if (ci->progeny[6] != NULL && cj->progeny[5] == sub)
+            DOSUB_SUBSET(r, cj->progeny[5], parts, ind, count, ci->progeny[6],
+                         -1, 0);
+          break;
+
+        case 12: /* (  0 ,  0 ,  1 ) */
+          if (ci->progeny[1] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[1], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[1] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[1],
+                         -1, 0);
+          if (ci->progeny[1] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[1], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[1] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[1],
+                         -1, 0);
+          if (ci->progeny[1] == sub && cj->progeny[4] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[1], parts, ind, count, cj->progeny[4],
+                         -1, 0);
+          if (ci->progeny[1] != NULL && cj->progeny[4] == sub)
+            DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[1],
+                         -1, 0);
+          if (ci->progeny[1] == sub && cj->progeny[6] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[1], parts, ind, count, cj->progeny[6],
+                         -1, 0);
+          if (ci->progeny[1] != NULL && cj->progeny[6] == sub)
+            DOSUB_SUBSET(r, cj->progeny[6], parts, ind, count, ci->progeny[1],
+                         -1, 0);
+          if (ci->progeny[3] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[3],
+                         -1, 0);
+          if (ci->progeny[3] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[3],
+                         -1, 0);
+          if (ci->progeny[3] == sub && cj->progeny[4] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[4],
+                         -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[4] == sub)
+            DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[3],
+                         -1, 0);
+          if (ci->progeny[3] == sub && cj->progeny[6] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[6],
+                         -1, 0);
+          if (ci->progeny[3] != NULL && cj->progeny[6] == sub)
+            DOSUB_SUBSET(r, cj->progeny[6], parts, ind, count, ci->progeny[3],
+                         -1, 0);
+          if (ci->progeny[5] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          if (ci->progeny[5] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          if (ci->progeny[5] == sub && cj->progeny[4] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[4],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[4] == sub)
+            DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          if (ci->progeny[5] == sub && cj->progeny[6] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[6],
+                         -1, 0);
+          if (ci->progeny[5] != NULL && cj->progeny[6] == sub)
+            DOSUB_SUBSET(r, cj->progeny[6], parts, ind, count, ci->progeny[5],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[0] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[0] == sub)
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[2] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[2],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[2] == sub)
+            DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[4] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[4],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[4] == sub)
+            DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          if (ci->progeny[7] == sub && cj->progeny[6] != NULL)
+            DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[6],
+                         -1, 0);
+          if (ci->progeny[7] != NULL && cj->progeny[6] == sub)
+            DOSUB_SUBSET(r, cj->progeny[6], parts, ind, count, ci->progeny[7],
+                         -1, 0);
+          break;
+      }
+
+    }
 
+    /* Otherwise, compute the pair directly. */
+    else if (ci->dt_min <= dt_step || cj->dt_min <= dt_step) {
+
+      /* Get the relative distance between the pairs, wrapping. */
+      for (k = 0; k < 3; k++) {
+        if (cj->loc[k] - ci->loc[k] < -s->dim[k] / 2)
+          shift[k] = s->dim[k];
+        else if (cj->loc[k] - ci->loc[k] > s->dim[k] / 2)
+          shift[k] = -s->dim[k];
+      }
+
+      /* Get the sorting index. */
+      for (sid = 0, k = 0; k < 3; k++)
+        sid =
+            3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
+                           ? 0
+                           : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1);
+      sid = sortlistID[sid];
+
+      /* Do any of the cells need to be sorted first? */
+      if (!(cj->sorted & (1 << sid))) runner_dosort(r, cj, (1 << sid), 1);
+
+      /* Compute the interactions. */
+      DOPAIR_SUBSET(r, ci, parts, ind, count, cj);
     }
 
+  } /* otherwise, pair interaction. */
 
+  if (gettimer)
+#ifdef TIMER_VERBOSE
+    printf("runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n", r->id,
+           sid, ci->depth, ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000);
+#else
+    TIMER_TOC(TIMER_DOSUB);
+#endif
+}
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
index ba24b6bf4a024d4ae9f6e83f325cdcd75edee145..98fd23585768b4594e84099177a5d291912230cb 100644
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -1,23 +1,27 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_RUNNER_DOIACT_GRAV_H
+#define SWIFT_RUNNER_DOIACT_GRAV_H
 
-
+/* Includes. */
+#include "cell.h"
+#include "part.h"
 
 /**
  * @brief Compute the sorted gravity interactions between a cell pair.
@@ -26,171 +30,179 @@
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
- 
-void runner_dopair_grav_new ( struct runner *r , struct cell *ci , struct cell *cj ) {
-
-    struct engine *restrict e = r->e;
-    int pid, pjd, k, sid;
-    double rshift, shift[3] = { 0.0 , 0.0 , 0.0 }, nshift[3];
-    struct entry *restrict sort_i, *restrict sort_j;
-    struct gpart *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
-    double pix[3];
-    float dx[3], r2, h_max, di, dj;
-    int count_i, count_j, cnj, cnj_new;
-    float dt_step = e->dt_step;
-    struct multipole m;
-    #ifdef VECTORIZE
-        int icount = 0;
-        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE];
-    #endif
-    TIMER_TIC
-    
-    /* Anything to do here? */
-    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
-        return;
-        
-    /* Get the sort ID. */
-    sid = space_getsid( e->s , &ci , &cj , shift );
-    
-    /* Make sure the cells are sorted. */
-    runner_dogsort( r , ci , (1 << sid) , 0 );
-    runner_dogsort( r , cj , (1 << sid) , 0 );
-    
-    /* Have the cells been sorted? */
-    if ( !(ci->gsorted & (1 << sid)) || !(cj->gsorted & (1 << sid) ) )
-        error( "Trying to interact unsorted cells." );
-    
-    /* Get the cutoff shift. */
-    for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ )
-        rshift += shift[k]*runner_shift[ 3*sid + k ];
-        
-    /* Pick-out the sorted lists. */
-    sort_i = &ci->gsort[ sid*(ci->count + 1) ];
-    sort_j = &cj->gsort[ sid*(cj->count + 1) ];
-    
-    /* Get some other useful values. */
-    h_max = sqrtf( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] ) * const_theta_max;
-    count_i = ci->gcount; count_j = cj->gcount;
-    parts_i = ci->gparts; parts_j = cj->gparts;
-    cnj = count_j;
-    multipole_reset( &m );
-    nshift[0] = -shift[0]; nshift[1] = -shift[1]; nshift[2] = -shift[2];
-
-    /* Loop over the parts in ci. */
-    for ( pid = count_i-1 ; pid >= 0 ; pid-- ) {
-    
-        /* Get a hold of the ith part in ci. */
-        pi = &parts_i[ sort_i[ pid ].i ];
-        if ( pi->dt > dt_step )
-            continue;
-        di = sort_i[pid].d + h_max - rshift;
-            
-        for ( k = 0 ; k < 3 ; k++ )
-            pix[k] = pi->x[k] - shift[k];
-        
-        /* Loop over the parts in cj. */
-        for ( pjd = 0 ; pjd < cnj && sort_j[pjd].d < di ; pjd++ ) {
-        
-            /* Get a pointer to the jth particle. */
-            pj = &parts_j[ sort_j[pjd].i ];
-        
-            /* Compute the pairwise distance. */
-            r2 = 0.0f;
-            for ( k = 0 ; k < 3 ; k++ ) {
-                dx[k] = pix[k] - pj->x[k];
-                r2 += dx[k]*dx[k];
-                }
-                
-            #ifndef VECTORIZE
-
-                // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 )
-                //     message( "interacting particles pi=%lli and pj=%lli with r=%.3e in cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long long int)ci) / sizeof(struct cell) , ((long long int)cj) / sizeof(struct cell) );
-
-                runner_iact_grav( r2 , dx , pi , pj );
-
-            #else
-
-                /* Add this interaction to the queue. */
-                r2q[icount] = r2;
-                dxq[3*icount+0] = dx[0];
-                dxq[3*icount+1] = dx[1];
-                dxq[3*icount+2] = dx[2];
-                piq[icount] = pi;
-                pjq[icount] = pj;
-                icount += 1;
-
-                /* Flush? */
-                if ( icount == VEC_SIZE ) {
-                    runner_iact_vec_grav( r2q , dxq , piq , pjq );
-                    icount = 0;
-                    }
-
-            #endif
-            
-            } /* loop over the parts in cj. */
-            
-        /* Set the new limit. */
-        cnj_new = pjd;
-        
-        /* Add trailing parts to the multipole. */
-        for ( pjd = cnj_new ; pjd < cnj ; pjd++ ) {
-        
-            /* Add the part to the multipole. */
-            multipole_addpart( &m , &parts_j[ sort_j[pjd].i ] );
-        
-            } /* add trailing parts to the multipole. */
-            
-        /* Set the new cnj. */
-        cnj = cnj_new;
-            
-        /* Interact the ith particle with the multipole. */
-        multipole_iact_mp( &m , pi , nshift );
-    
-        } /* loop over the parts in ci. */
-        
-    #ifdef VECTORIZE
-    /* Pick up any leftovers. */
-    if ( icount > 0 )
-        for ( k = 0 ; k < icount ; k++ )
-            runner_iact_grav( r2q[k] , &dxq[3*k] , piq[k] , pjq[k] );
-    #endif
-        
-    /* Re-set the multipole. */
-    multipole_reset( &m );
-        
-    /* Loop over the parts in cj and interact with the multipole in ci. */
-    for ( pid = count_i - 1 , pjd = 0 ; pjd < count_j ; pjd++ ) {
-    
-        /* Get the position of pj along the axis. */
-        dj = sort_j[pjd].d - h_max + rshift;
-        
-        /* Add any left-over parts in cell_i to the multipole. */
-        while ( pid >= 0 && sort_i[pid].d < dj ) {
-            
-            /* Add this particle to the multipole. */
-            multipole_addpart( &m , &parts_i[ sort_i[pid].i ] );
-            
-            /* Decrease pid. */
-            pid -= 1;
-            
-            }
-        
-        /* Interact pj with the multipole. */
-        multipole_iact_mp( &m , &parts_j[ sort_j[pjd].i ] , shift );
-    
-        } /* loop over the parts in cj and interact with the multipole. */
-        
-        
-    #ifdef TIMER_VERBOSE
-        printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 );
-    #else
-        TIMER_TOC(TIMER_DOPAIR);
-    #endif
 
+void runner_dopair_grav_new(struct runner *r, struct cell *ci,
+                            struct cell *cj) {
+
+  struct engine *restrict e = r->e;
+  int pid, pjd, k, sid;
+  double rshift, shift[3] = {0.0, 0.0, 0.0}, nshift[3];
+  struct entry *restrict sort_i, *restrict sort_j;
+  struct gpart *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
+  double pix[3];
+  float dx[3], r2, h_max, di, dj;
+  int count_i, count_j, cnj, cnj_new;
+  float dt_step = e->dt_step;
+  struct multipole m;
+#ifdef VECTORIZE
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+  TIMER_TIC
+
+  /* Anything to do here? */
+  if (ci->dt_min > dt_step && cj->dt_min > dt_step) return;
+
+  /* Get the sort ID. */
+  sid = space_getsid(e->s, &ci, &cj, shift);
+
+  /* Make sure the cells are sorted. */
+  runner_dogsort(r, ci, (1 << sid), 0);
+  runner_dogsort(r, cj, (1 << sid), 0);
+
+  /* Have the cells been sorted? */
+  if (!(ci->gsorted & (1 << sid)) || !(cj->gsorted & (1 << sid)))
+    error("Trying to interact unsorted cells.");
+
+  /* Get the cutoff shift. */
+  for (rshift = 0.0, k = 0; k < 3; k++)
+    rshift += shift[k] * runner_shift[3 * sid + k];
+
+  /* Pick-out the sorted lists. */
+  sort_i = &ci->gsort[sid * (ci->count + 1)];
+  sort_j = &cj->gsort[sid * (cj->count + 1)];
+
+  /* Get some other useful values. */
+  h_max =
+      sqrtf(ci->h[0] * ci->h[0] + ci->h[1] * ci->h[1] + ci->h[2] * ci->h[2]) *
+      const_theta_max;
+  count_i = ci->gcount;
+  count_j = cj->gcount;
+  parts_i = ci->gparts;
+  parts_j = cj->gparts;
+  cnj = count_j;
+  multipole_reset(&m);
+  nshift[0] = -shift[0];
+  nshift[1] = -shift[1];
+  nshift[2] = -shift[2];
+
+  /* Loop over the parts in ci. */
+  for (pid = count_i - 1; pid >= 0; pid--) {
+
+    /* Get a hold of the ith part in ci. */
+    pi = &parts_i[sort_i[pid].i];
+    if (pi->dt > dt_step) continue;
+    di = sort_i[pid].d + h_max - rshift;
+
+    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+
+    /* Loop over the parts in cj. */
+    for (pjd = 0; pjd < cnj && sort_j[pjd].d < di; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      pj = &parts_j[sort_j[pjd].i];
+
+      /* Compute the pairwise distance. */
+      r2 = 0.0f;
+      for (k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+#ifndef VECTORIZE
+
+      // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 )
+      //     message( "interacting particles pi=%lli and pj=%lli with r=%.3e in
+      // cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long
+      // long int)ci) / sizeof(struct cell) , ((long long int)cj) /
+      // sizeof(struct cell) );
+
+      runner_iact_grav(r2, dx, pi, pj);
+
+#else
+
+      /* Add this interaction to the queue. */
+      r2q[icount] = r2;
+      dxq[3 * icount + 0] = dx[0];
+      dxq[3 * icount + 1] = dx[1];
+      dxq[3 * icount + 2] = dx[2];
+      piq[icount] = pi;
+      pjq[icount] = pj;
+      icount += 1;
+
+      /* Flush? */
+      if (icount == VEC_SIZE) {
+        runner_iact_vec_grav(r2q, dxq, piq, pjq);
+        icount = 0;
+      }
+
+#endif
+
+    } /* loop over the parts in cj. */
+
+    /* Set the new limit. */
+    cnj_new = pjd;
+
+    /* Add trailing parts to the multipole. */
+    for (pjd = cnj_new; pjd < cnj; pjd++) {
+
+      /* Add the part to the multipole. */
+      multipole_addpart(&m, &parts_j[sort_j[pjd].i]);
+
+    } /* add trailing parts to the multipole. */
+
+    /* Set the new cnj. */
+    cnj = cnj_new;
+
+    /* Interact the ith particle with the multipole. */
+    multipole_iact_mp(&m, pi, nshift);
+
+  } /* loop over the parts in ci. */
+
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount > 0)
+    for (k = 0; k < icount; k++)
+      runner_iact_grav(r2q[k], &dxq[3 * k], piq[k], pjq[k]);
+#endif
+
+  /* Re-set the multipole. */
+  multipole_reset(&m);
+
+  /* Loop over the parts in cj and interact with the multipole in ci. */
+  for (pid = count_i - 1, pjd = 0; pjd < count_j; pjd++) {
+
+    /* Get the position of pj along the axis. */
+    dj = sort_j[pjd].d - h_max + rshift;
+
+    /* Add any left-over parts in cell_i to the multipole. */
+    while (pid >= 0 && sort_i[pid].d < dj) {
+
+      /* Add this particle to the multipole. */
+      multipole_addpart(&m, &parts_i[sort_i[pid].i]);
+
+      /* Decrease pid. */
+      pid -= 1;
     }
 
+    /* Interact pj with the multipole. */
+    multipole_iact_mp(&m, &parts_j[sort_j[pjd].i], shift);
+
+  } /* loop over the parts in cj and interact with the multipole. */
+
+#ifdef TIMER_VERBOSE
+  printf(
+      "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) "
+      "took %.3f ms.\n",
+      r->id, count_i, count_j, ci->depth, ci->h_max, cj->h_max,
+      fmax(ci->h[0], fmax(ci->h[1], ci->h[2])),
+      ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000);
+#else
+  TIMER_TOC(TIMER_DOPAIR);
+#endif
+}
 
 /**
  * @brief Compute the recursive upward sweep, i.e. construct the
@@ -199,36 +211,33 @@ void runner_dopair_grav_new ( struct runner *r , struct cell *ci , struct cell *
  * @param r The #runner.
  * @param c The top-level #cell.
  */
- 
-void runner_dograv_up ( struct runner *r , struct cell *c ) {
-
-    /* Re-set this cell's multipole. */
-    multipole_reset( &c->multipole );
-
-    /* Split? */
-    if ( c->split ) {
-    
-        /* Recurse. */
-        for ( int k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL )
-                runner_dograv_up( r , c->progeny[k] );
-                
-        /* Collect the multipoles from the progeny. */
-        multipole_reset( &c->multipole );
-        for ( int k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL )
-                multipole_merge( &c->multipole , &c->progeny[k]->multipole );
-    
-        }
-        
-    /* No, leaf node. */
-    else
-    
-        /* Just collect the multipole. */
-        multipole_init( &c->multipole , c->gparts , c->gcount );
 
-    }
+void runner_dograv_up(struct runner *r, struct cell *c) {
+
+  /* Re-set this cell's multipole. */
+  multipole_reset(&c->multipole);
 
+  /* Split? */
+  if (c->split) {
+
+    /* Recurse. */
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) runner_dograv_up(r, c->progeny[k]);
+
+    /* Collect the multipoles from the progeny. */
+    multipole_reset(&c->multipole);
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL)
+        multipole_merge(&c->multipole, &c->progeny[k]->multipole);
+
+  }
+
+  /* No, leaf node. */
+  else
+
+    /* Just collect the multipole. */
+    multipole_init(&c->multipole, c->gparts, c->gcount);
+}
 
 /**
  * @brief Compute the recursive downward sweep, i.e. apply the multipole
@@ -237,45 +246,41 @@ void runner_dograv_up ( struct runner *r , struct cell *c ) {
  * @param r The #runner.
  * @param c The top-level #cell.
  */
- 
-void runner_dograv_down ( struct runner *r , struct cell *c ) {
-
-    struct multipole *m = &c->multipole;
-
-    /* Split? */
-    if ( c->split ) {
-    
-        /* Apply this cell's accelleration on the multipoles below. */
-        for ( int k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL ) {
-                struct multipole *mp = &c->progeny[k]->multipole;
-                mp->a[0] += m->a[0];
-                mp->a[1] += m->a[1];
-                mp->a[2] += m->a[2];
-                }
-    
-        /* Recurse. */
-        for ( int k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL )
-                runner_dograv_down( r , c->progeny[k] );
-                
-        }
-        
-    /* No, leaf node. */
-    else {
-    
-        /* Apply the multipole accelleration to all gparts. */
-        for ( int k = 0 ; k < c->gcount ; k++ ) {
-            struct gpart *p = &c->gparts[k];
-            p->a[0] += m->a[0];
-            p->a[1] += m->a[1];
-            p->a[2] += m->a[2];
-            }
-    
-        }
 
-    }
+void runner_dograv_down(struct runner *r, struct cell *c) {
 
+  struct multipole *m = &c->multipole;
+
+  /* Split? */
+  if (c->split) {
+
+    /* Apply this cell's acceleration on the multipoles below. */
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) {
+        struct multipole *mp = &c->progeny[k]->multipole;
+        mp->a[0] += m->a[0];
+        mp->a[1] += m->a[1];
+        mp->a[2] += m->a[2];
+      }
+
+    /* Recurse. */
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) runner_dograv_down(r, c->progeny[k]);
+
+  }
+
+  /* No, leaf node. */
+  else {
+
+    /* Apply the multipole acceleration to all gparts. */
+    for (int k = 0; k < c->gcount; k++) {
+      struct gpart *p = &c->gparts[k];
+      p->a[0] += m->a[0];
+      p->a[1] += m->a[1];
+      p->a[2] += m->a[2];
+    }
+  }
+}
 
 /**
  * @brief Compute the multipole-multipole interaction between two cells.
@@ -284,48 +289,45 @@ void runner_dograv_down ( struct runner *r , struct cell *c ) {
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
- 
-void runner_dograv_mm ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) {
 
-    struct engine *e = r->e;
-    int k;
-    double shift[3] = { 0.0 , 0.0 , 0.0 };
-    float dx[3], theta;
+void runner_dograv_mm(struct runner *r, struct cell *restrict ci,
+                      struct cell *restrict cj) {
+
+  struct engine *e = r->e;
+  int k;
+  double shift[3] = {0.0, 0.0, 0.0};
+  float dx[3], theta;
+
+  /* Compute the shift between the cells. */
+  for (k = 0; k < 3; k++) {
+    dx[k] = cj->loc[k] - ci->loc[k];
+    if (r->e->s->periodic) {
+      if (dx[k] < -e->s->dim[k] / 2)
+        shift[k] = e->s->dim[k];
+      else if (dx[k] > e->s->dim[k] / 2)
+        shift[k] = -e->s->dim[k];
+      dx[k] += shift[k];
+    }
+  }
+  theta =
+      sqrt((dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]) /
+           (ci->h[0] * ci->h[0] + ci->h[1] * ci->h[1] + ci->h[2] * ci->h[2]));
 
-    /* Compute the shift between the cells. */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        dx[k] = cj->loc[k] - ci->loc[k];
-        if ( r->e->s->periodic ) {
-            if ( dx[k] < -e->s->dim[k]/2 )
-                shift[k] = e->s->dim[k];
-            else if ( dx[k] > e->s->dim[k]/2 )
-                shift[k] = -e->s->dim[k];
-            dx[k] += shift[k];
-            }
-        }
-    theta = sqrt( ( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ) /
-            ( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] ) );
-    
-    /* Do an MM or an MP/PM? */
-    if ( theta > const_theta_max*4 ) {
-        
-        /* Update the multipoles. */
-        multipole_iact_mm( &ci->multipole , &cj->multipole , shift );
-        
-        }
-        
-    else {
-
-        /* Interact the multipoles via their parts. */
-        for ( k = 0 ; k < ci->gcount ; k++ )
-            multipole_iact_mp( &cj->multipole , &ci->gparts[k] , shift );
-        for ( k = 0 ; k < cj->gcount ; k++ )
-            multipole_iact_mp( &ci->multipole , &cj->gparts[k] , shift );
-            
-        }
+  /* Do an MM or an MP/PM? */
+  if (theta > const_theta_max * 4) {
 
-    }
+    /* Update the multipoles. */
+    multipole_iact_mm(&ci->multipole, &cj->multipole, shift);
+
+  } else {
 
+    /* Interact the multipoles via their parts. */
+    for (k = 0; k < ci->gcount; k++)
+      multipole_iact_mp(&cj->multipole, &ci->gparts[k], shift);
+    for (k = 0; k < cj->gcount; k++)
+      multipole_iact_mp(&ci->multipole, &cj->gparts[k], shift);
+  }
+}
 
 /**
  * @brief Compute the interactions between a cell pair.
@@ -334,106 +336,109 @@ void runner_dograv_mm ( struct runner *r , struct cell *restrict ci , struct cel
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
- 
-void runner_dopair_grav ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) {
-
-    struct engine *e = r->e;
-    int pid, pjd, k, count_i = ci->gcount, count_j = cj->gcount;
-    double shift[3] = { 0.0 , 0.0 , 0.0 };
-    struct gpart *restrict parts_i = ci->gparts, *restrict parts_j = cj->gparts;
-    struct gpart *restrict pi, *restrict pj;
-    double pix[3];
-    float dx[3], r2;
-    float dt_step = e->dt_step;
-    #ifdef VECTORIZE
-        int icount = 0;
-        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE];
-    #endif
-    TIMER_TIC
-    
-    /* Anything to do here? */
-    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
-        return;
-    
-    /* Get the relative distance between the pairs, wrapping. */
-    if ( e->s->periodic )
-        for ( k = 0 ; k < 3 ; k++ ) {
-            if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 )
-                shift[k] = e->s->dim[k];
-            else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 )
-                shift[k] = -e->s->dim[k];
-            }
-        
-    /* Loop over the parts in ci. */
-    for ( pid = 0 ; pid < count_i ; pid++ ) {
-    
-        /* Get a hold of the ith part in ci. */
-        pi = &parts_i[ pid ];
-        for ( k = 0 ; k < 3 ; k++ )
-            pix[k] = pi->x[k] - shift[k];
-        
-        /* Loop over the parts in cj. */
-        for ( pjd = 0 ; pjd < count_j ; pjd++ ) {
-        
-            /* Get a pointer to the jth particle. */
-            pj = &parts_j[ pjd ];
-        
-            /* Compute the pairwise distance. */
-            r2 = 0.0f;
-            for ( k = 0 ; k < 3 ; k++ ) {
-                dx[k] = pix[k] - pj->x[k];
-                r2 += dx[k]*dx[k];
-                }
-                
-            /* Compute the interaction. */
-            #ifndef VECTORIZE
-            
-                // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 )
-                //     message( "interacting particles pi=%lli and pj=%lli with r=%.3e in cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long long int)ci) / sizeof(struct cell) , ((long long int)cj) / sizeof(struct cell) );
-
-                runner_iact_grav( r2 , dx , pi , pj );
-
-            #else
-
-                /* Add this interaction to the queue. */
-                r2q[icount] = r2;
-                dxq[3*icount+0] = dx[0];
-                dxq[3*icount+1] = dx[1];
-                dxq[3*icount+2] = dx[2];
-                piq[icount] = pi;
-                pjq[icount] = pj;
-                icount += 1;
-
-                /* Flush? */
-                if ( icount == VEC_SIZE ) {
-                    runner_iact_vec_grav( r2q , dxq , piq , pjq );
-                    icount = 0;
-                    }
-
-            #endif
-        
-            } /* loop over the parts in cj. */
-    
-        } /* loop over the parts in ci. */
-        
-    #ifdef VECTORIZE
-    /* Pick up any leftovers. */
-    if ( icount > 0 )
-        for ( k = 0 ; k < icount ; k++ )
-            runner_iact_grav( r2q[k] , &dxq[3*k] , piq[k] , pjq[k] );
-    #endif
-        
-    #ifdef TIMER_VERBOSE
-        printf( "runner_dopair_naive_grav[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
-    #else
-        TIMER_TOC(timer_dopair_grav);
-    #endif
-
 
+void runner_dopair_grav(struct runner *r, struct cell *restrict ci,
+                        struct cell *restrict cj) {
+
+  struct engine *e = r->e;
+  int pid, pjd, k, count_i = ci->gcount, count_j = cj->gcount;
+  double shift[3] = {0.0, 0.0, 0.0};
+  struct gpart *restrict parts_i = ci->gparts, *restrict parts_j = cj->gparts;
+  struct gpart *restrict pi, *restrict pj;
+  double pix[3];
+  float dx[3], r2;
+  float dt_step = e->dt_step;
+#ifdef VECTORIZE
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+  TIMER_TIC
+
+  /* Anything to do here? */
+  if (ci->dt_min > dt_step && cj->dt_min > dt_step) return;
+
+  /* Get the relative distance between the pairs, wrapping. */
+  if (e->s->periodic)
+    for (k = 0; k < 3; k++) {
+      if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
+        shift[k] = e->s->dim[k];
+      else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
+        shift[k] = -e->s->dim[k];
     }
 
+  /* Loop over the parts in ci. */
+  for (pid = 0; pid < count_i; pid++) {
+
+    /* Get a hold of the ith part in ci. */
+    pi = &parts_i[pid];
+    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+
+    /* Loop over the parts in cj. */
+    for (pjd = 0; pjd < count_j; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      pj = &parts_j[pjd];
+
+      /* Compute the pairwise distance. */
+      r2 = 0.0f;
+      for (k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+/* Compute the interaction. */
+#ifndef VECTORIZE
+
+      // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 )
+      //     message( "interacting particles pi=%lli and pj=%lli with r=%.3e in
+      // cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long
+      // long int)ci) / sizeof(struct cell) , ((long long int)cj) /
+      // sizeof(struct cell) );
+
+      runner_iact_grav(r2, dx, pi, pj);
+
+#else
+
+      /* Add this interaction to the queue. */
+      r2q[icount] = r2;
+      dxq[3 * icount + 0] = dx[0];
+      dxq[3 * icount + 1] = dx[1];
+      dxq[3 * icount + 2] = dx[2];
+      piq[icount] = pi;
+      pjq[icount] = pj;
+      icount += 1;
+
+      /* Flush? */
+      if (icount == VEC_SIZE) {
+        runner_iact_vec_grav(r2q, dxq, piq, pjq);
+        icount = 0;
+      }
+
+#endif
+
+    } /* loop over the parts in cj. */
+
+  } /* loop over the parts in ci. */
+
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount > 0)
+    for (k = 0; k < icount; k++)
+      runner_iact_grav(r2q[k], &dxq[3 * k], piq[k], pjq[k]);
+#endif
+
+#ifdef TIMER_VERBOSE
+  printf(
+      "runner_dopair_naive_grav[%02i]: %i/%i parts at depth %i "
+      "(r_max=%.3f/%.3f) took %.3f ms.\n",
+      r->id, count_i, count_j, ci->depth, ci->h_max, cj->h_max,
+      ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000);
+#else
+  TIMER_TOC(timer_dopair_grav);
+#endif
+}
 
 /**
  * @brief Compute the interactions within a cell.
@@ -441,186 +446,184 @@ void runner_dopair_grav ( struct runner *r , struct cell *restrict ci , struct c
  * @param r The #runner.
  * @param c The #cell.
  */
- 
-void runner_doself_grav ( struct runner *r , struct cell *restrict c ) {
-
-    struct engine *e = r->e;
-    int pid, pjd, k, count = c->gcount;
-    struct gpart *restrict parts = c->gparts;
-    struct gpart *restrict pi, *restrict pj;
-    double pix[3] = { 0.0 , 0.0 , 0.0 };
-    float dx[3], r2;
-    float dt_step = e->dt_step;
-    #ifdef VECTORIZE
-        int icount = 0;
-        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE];
-    #endif
-    TIMER_TIC
-    
-    /* Anything to do here? */
-    if ( c->dt_min > dt_step )
-        return;
-    
-    /* Loop over every part in c. */
-    for ( pid = 0 ; pid < count ; pid++ ) {
-    
-        /* Get a hold of the ith part in ci. */
-        pi = &parts[ pid ];
-        for ( k = 0 ; k < 3 ; k++ )
-            pix[k] = pi->x[k];
-        
-        /* Loop over every other part in c. */
-        for ( pjd = pid+1 ; pjd < count ; pjd++ ) {
-        
-            /* Get a pointer to the jth particle. */
-            pj = &parts[ pjd ];
-        
-            /* Compute the pairwise distance. */
-            r2 = 0.0f;
-            for ( k = 0 ; k < 3 ; k++ ) {
-                dx[k] = pix[k] - pj->x[k];
-                r2 += dx[k]*dx[k];
-                }
-                
-            /* Compute the interaction. */
-            #ifndef VECTORIZE
-
-                // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 )
-                //     message( "interacting particles pi=%lli and pj=%lli with r=%.3e." , pi->part->id , pj->part->id , sqrtf(r2) );
-
-                runner_iact_grav( r2 , dx , pi , pj );
-
-            #else
-
-                /* Add this interaction to the queue. */
-                r2q[icount] = r2;
-                dxq[3*icount+0] = dx[0];
-                dxq[3*icount+1] = dx[1];
-                dxq[3*icount+2] = dx[2];
-                piq[icount] = pi;
-                pjq[icount] = pj;
-                icount += 1;
-
-                /* Flush? */
-                if ( icount == VEC_SIZE ) {
-                    runner_iact_vec_grav( r2q , dxq , piq , pjq );
-                    icount = 0;
-                    }
-
-            #endif
-        
-            } /* loop over the remaining parts in c. */
-    
-        } /* loop over the parts in c. */
-        
-    #ifdef VECTORIZE
-    /* Pick up any leftovers. */
-    if ( icount > 0 )
-        for ( k = 0 ; k < icount ; k++ )
-            runner_iact_grav( r2q[k] , &dxq[3*k] , piq[k] , pjq[k] );
-    #endif
-        
-    #ifdef TIMER_VERBOSE
-        printf( "runner_doself_grav[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
-    #else
-        TIMER_TOC(timer_doself_grav);
-    #endif
-
-
-    }
 
+void runner_doself_grav(struct runner *r, struct cell *restrict c) {
+
+  struct engine *e = r->e;
+  int pid, pjd, k, count = c->gcount;
+  struct gpart *restrict parts = c->gparts;
+  struct gpart *restrict pi, *restrict pj;
+  double pix[3] = {0.0, 0.0, 0.0};
+  float dx[3], r2;
+  float dt_step = e->dt_step;
+#ifdef VECTORIZE
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+  TIMER_TIC
+
+  /* Skip the cell if its dt_min is larger than the current dt_step. */
+  if (c->dt_min > dt_step) return;
+
+  /* Loop over every part in c. */
+  for (pid = 0; pid < count; pid++) {
+
+    /* Get a hold of the ith part in c and cache its position. */
+    pi = &parts[pid];
+    for (k = 0; k < 3; k++) pix[k] = pi->x[k];
+
+    /* Loop over every later part in c so that each pair is visited once. */
+    for (pjd = pid + 1; pjd < count; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      pj = &parts[pjd];
+
+      /* Compute the separation vector and its squared length. */
+      r2 = 0.0f;
+      for (k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+/* Compute the interaction. */
+#ifndef VECTORIZE
+
+      /* Scalar path: hand the pair straight to the interaction kernel,
+       * which receives both particles together with their squared
+       * distance and separation vector. */
+
+      runner_iact_grav(r2, dx, pi, pj);
+
+#else
+
+      /* Add this interaction to the queue. */
+      r2q[icount] = r2;
+      dxq[3 * icount + 0] = dx[0];
+      dxq[3 * icount + 1] = dx[1];
+      dxq[3 * icount + 2] = dx[2];
+      piq[icount] = pi;
+      pjq[icount] = pj;
+      icount += 1;
+
+      /* Flush the queue once it holds VEC_SIZE interactions. */
+      if (icount == VEC_SIZE) {
+        runner_iact_vec_grav(r2q, dxq, piq, pjq);
+        icount = 0;
+      }
+
+#endif
+
+    } /* loop over the remaining parts in c. */
+
+  } /* loop over the parts in c. */
+
+#ifdef VECTORIZE
+  /* Process the interactions still left in the queue, one by one. */
+  if (icount > 0)
+    for (k = 0; k < icount; k++)
+      runner_iact_grav(r2q[k], &dxq[3 * k], piq[k], pjq[k]);
+#endif
+
+#ifdef TIMER_VERBOSE
+  /* Verbose timing report; only the single cell c is involved here. */
+  printf("runner_doself_grav[%02i]: %i parts at depth %i (r_max=%.3f) "
+         "took %.3f ms.\n",
+         r->id, count, c->depth, c->h_max,
+         ((double)TIMER_TOC(timer_doself_grav)) / CPU_TPS * 1000);
+#else
+  TIMER_TOC(timer_doself_grav);
+#endif
+}
 
 /**
  * @brief Compute a gravity sub-task.
- * 
+ *
  * @param r The #runner.
  * @param ci The first #cell.
  * @param cj The second #cell.
  * @param gettimer Flag to record timer or not.
  */
- 
-void runner_dosub_grav ( struct runner *r , struct cell *ci , struct cell *cj , int gettimer ) {
 
-    int j, k, periodic = r->e->s->periodic;
-    struct space *s = r->e->s;
+void runner_dosub_grav(struct runner *r, struct cell *ci, struct cell *cj,
+                       int gettimer) {
 
-    TIMER_TIC
+  int j, k, periodic = r->e->s->periodic;
+  struct space *s = r->e->s;
 
-    /* Self-interaction? */
-    if ( cj == NULL ) {
+  TIMER_TIC
 
-        /* If the cell is split, recurse. */
-        if ( ci->split ) {
+  /* Self-interaction? */
+  if (cj == NULL) {
 
-            /* Split this task into tasks on its progeny. */
-            for ( j = 0 ; j < 8 ; j++ )
-                if ( ci->progeny[j] != NULL ) {
-                    runner_dosub_grav( r , ci->progeny[j] , NULL , 0 );
-                    for ( k = j+1 ; k < 8 ; k++ )
-                        if ( ci->progeny[k] != NULL )
-                            runner_dosub_grav( r , ci->progeny[j] , ci->progeny[k] , 0 );
-                    }
+    /* If the cell is split, recurse. */
+    if (ci->split) {
 
-            }
+      /* Split this task into tasks on its progeny. */
+      for (j = 0; j < 8; j++)
+        if (ci->progeny[j] != NULL) {
+          runner_dosub_grav(r, ci->progeny[j], NULL, 0);
+          for (k = j + 1; k < 8; k++)
+            if (ci->progeny[k] != NULL)
+              runner_dosub_grav(r, ci->progeny[j], ci->progeny[k], 0);
+        }
 
-        /* Otherwise, just make a pp task out of it. */
-        else
-            runner_doself_grav( r , ci );
+    }
 
-        }
+    /* Otherwise, just make a pp task out of it. */
+    else
+      runner_doself_grav(r, ci);
 
-    /* Nope, pair. */
-    else {
-
-        /* Get the opening angle theta. */
-        float dx[3], theta;
-        for ( k = 0 ; k < 3 ; k++ ) {
-            dx[k] = fabsf( ci->loc[k] - cj->loc[k] );
-            if ( periodic && dx[k] > 0.5*s->dim[k] )
-                dx[k] = -dx[k] + s->dim[k];
-            if ( dx[k] > 0.0f )
-                dx[k] -= ci->h[k];
-            }
-        theta = ( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ) / 
-                ( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] );
-
-        /* Split the interacton? */
-        if ( theta < const_theta_max*const_theta_max ) {
-
-            /* Are both ci and cj split? */
-            if ( ci->split && cj->split ) {
-
-                /* Split this task into tasks on its progeny. */
-                for ( j = 0 ; j < 8 ; j++ )
-                    if ( ci->progeny[j] != NULL ) {
-                        for ( k = 0 ; k < 8 ; k++ )
-                            if ( cj->progeny[k] != NULL )
-                                runner_dosub_grav( r , ci->progeny[j] , cj->progeny[k] , 0 );
-                        }
-
-                }
-
-            /* Otherwise, make a pp task out of it. */
-            else
-                runner_dopair_grav( r , ci , cj );
-
-            }
-            
-        /* Otherwise, mm interaction is fine. */
-        else
-            runner_dograv_mm( r , ci , cj );
+  }
 
-        }
-        
-    if ( gettimer )
-        #ifdef TIMER_VERBOSE
-            printf( "runner_dosub_grav[%02i]: at depth %i took %.3f ms.\n" , r->id , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 );
-        #else
-            TIMER_TOC(timer_dosub_grav);
-        #endif
+  /* Nope, pair. */
+  else {
 
+    /* Get the opening angle theta. */
+    float dx[3], theta;
+    for (k = 0; k < 3; k++) {
+      dx[k] = fabsf(ci->loc[k] - cj->loc[k]);
+      if (periodic && dx[k] > 0.5 * s->dim[k]) dx[k] = -dx[k] + s->dim[k];
+      if (dx[k] > 0.0f) dx[k] -= ci->h[k];
     }
+    theta = (dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]) /
+            (ci->h[0] * ci->h[0] + ci->h[1] * ci->h[1] + ci->h[2] * ci->h[2]);
+
+    /* Split the interaction? */
+    if (theta < const_theta_max * const_theta_max) {
+
+      /* Are both ci and cj split? */
+      if (ci->split && cj->split) {
+
+        /* Split this task into tasks on its progeny. */
+        for (j = 0; j < 8; j++)
+          if (ci->progeny[j] != NULL) {
+            for (k = 0; k < 8; k++)
+              if (cj->progeny[k] != NULL)
+                runner_dosub_grav(r, ci->progeny[j], cj->progeny[k], 0);
+          }
 
+      }
 
+      /* Otherwise, make a pp task out of it. */
+      else
+        runner_dopair_grav(r, ci, cj);
+
+    }
+
+    /* Otherwise, mm interaction is fine. */
+    else
+      runner_dograv_mm(r, ci, cj);
+  }
+
+  if (gettimer)
+#ifdef TIMER_VERBOSE
+    printf("runner_dosub_grav[%02i]: at depth %i took %.3f ms.\n", r->id,
+           ci->depth, ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000);
+#else
+    TIMER_TOC(timer_dosub_grav);
+#endif
+}
+
+#endif /* SWIFT_RUNNER_DOIACT_GRAV_H */
diff --git a/src/runner_iact.h b/src/runner_iact.h
index 0a6b9c4ce74b41e1eb795b3c13adc1348b63aa23..e1561132af3fff847989af34c268dfb1069ed40d 100644
--- a/src/runner_iact.h
+++ b/src/runner_iact.h
@@ -2,873 +2,976 @@
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_RUNNER_IACT_H
+#define SWIFT_RUNNER_IACT_H
 
+/* Includes. */
+#include "const.h"
 #include "kernel.h"
+#include "part.h"
 #include "vector.h"
 
 /**
  * @file  runner_iact.h
  * @brief SPH interaction functions following the Gadget-2 version of SPH.
  *
- * The interactions computed here are the ones presented in the Gadget-2 paper and use the same 
- * numerical coefficients as the Gadget-2 code. When used with the Spline-3 kernel, the results
- * should be equivalent to the ones obtained with Gadget-2 up to the rounding errors and interactions
+ * The interactions computed here are the ones presented
+ * in the Gadget-2 paper and use the same numerical
+ * coefficients as the Gadget-2 code. When used with the
+ * Spline-3 kernel, the results should be equivalent to
+ * the ones obtained with Gadget-2 up to the rounding
+ * errors and interactions
  * missed by the Gadget-2 tree-code neighbours search.
  *
- * The code uses internal energy instead of entropy as a thermodynamical variable. 
+ * The code uses internal energy instead of entropy as a thermodynamic
+ * variable.
  */
 
-
 /**
  * @brief Density loop
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) {
-
-    float r = sqrtf( r2 ), ri = 1.0f / r;
-    float xi, xj;
-    float h_inv;
-    float wi, wj, wi_dx, wj_dx;
-    float mi, mj;
-    float dvdr;
-    float dv[3], curlvr[3];
-    int k;
-    
-    /* Get the masses. */
-    mi = pi->mass; mj = pj->mass;
-    
-    /* Compute dv dot r */
-    dv[0] = pi->v[0] - pj->v[0];
-    dv[1] = pi->v[1] - pj->v[1];
-    dv[2] = pi->v[2] - pj->v[2];
-    dvdr = dv[0]*dx[0] + dv[1]*dx[1] + dv[2]*dx[2];
-    dvdr *= ri;
-
-    /* Compute dv cross r */
-    curlvr[0] = dv[1]*dx[2] - dv[2]*dx[1];
-    curlvr[1] = dv[2]*dx[0] - dv[0]*dx[2];
-    curlvr[2] = dv[0]*dx[1] - dv[1]*dx[0];
-    for ( k = 0 ; k < 3 ; k++ )
-        curlvr[k] *= ri;
-            
-    /* Compute density of pi. */
-    h_inv = 1.0 / hi;
-    xi = r * h_inv;
-    kernel_deval( xi , &wi , &wi_dx );
-
-    pi->rho += mj * wi;
-    pi->rho_dh -= mj * ( 3.0*wi + xi*wi_dx );
-    pi->density.wcount += wi;
-    pi->density.wcount_dh -= xi * wi_dx;
-
-	pi->density.div_v += mj * dvdr * wi_dx;
-	for ( k = 0 ; k < 3 ; k++ )
-	    pi->density.curl_v[k] += mj * curlvr[k] * wi_dx;
-
-    /* Compute density of pj. */
-    h_inv = 1.0 / hj;
-    xj = r * h_inv;
-    kernel_deval( xj , &wj , &wj_dx );
-
-    pj->rho += mi * wj;
-    pj->rho_dh -= mi * ( 3.0*wj + xj*wj_dx );
-    pj->density.wcount += wj;
-    pj->density.wcount_dh -= xj * wj_dx;
-
-	pj->density.div_v += mi * dvdr * wj_dx;
-	for ( k = 0 ; k < 3 ; k++ )
-	    pj->density.curl_v[k] += mi * curlvr[k] * wj_dx;
-        
-    }
-    
+__attribute__((always_inline)) INLINE static void runner_iact_density(
+    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
+  /* Symmetric pair update: adds this pair's density sums to pi and to pj. */
+  float r = sqrtf(r2), ri = 1.0f / r;
+  float xi, xj;
+  float h_inv;
+  float wi, wj, wi_dx, wj_dx;
+  float mi, mj;
+  float dvdr;
+  float dv[3], curlvr[3];
+  int k;
+
+  /* Get the masses. */
+  mi = pi->mass;
+  mj = pj->mass;
+
+  /* Compute dv dot dx, then scale by 1/r. */
+  dv[0] = pi->v[0] - pj->v[0];
+  dv[1] = pi->v[1] - pj->v[1];
+  dv[2] = pi->v[2] - pj->v[2];
+  dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
+  dvdr *= ri;
+
+  /* Compute dv cross dx, then scale each component by 1/r. */
+  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
+  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
+  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
+  for (k = 0; k < 3; k++) curlvr[k] *= ri;
+
+  /* Compute density of pi: kernel_deval fills wi and wi_dx for x = r / hi. */
+  h_inv = 1.0 / hi;
+  xi = r * h_inv;
+  kernel_deval(xi, &wi, &wi_dx);
+  /* NOTE(review): 1.0 and 3.0 here are double literals, not float. */
+  pi->rho += mj * wi;
+  pi->rho_dh -= mj * (3.0 * wi + xi * wi_dx);
+  pi->density.wcount += wi;
+  pi->density.wcount_dh -= xi * wi_dx;
+  /* Velocity divergence and curl sums, weighted by the neighbour mass mj. */
+  pi->density.div_v += mj * dvdr * wi_dx;
+  for (k = 0; k < 3; k++) pi->density.curl_v[k] += mj * curlvr[k] * wi_dx;
+
+  /* Compute density of pj: same sums with hj and mi; dvdr, curlvr reused. */
+  h_inv = 1.0 / hj;
+  xj = r * h_inv;
+  kernel_deval(xj, &wj, &wj_dx);
+
+  pj->rho += mi * wj;
+  pj->rho_dh -= mi * (3.0 * wj + xj * wj_dx);
+  pj->density.wcount += wj;
+  pj->density.wcount_dh -= xj * wj_dx;
+
+  pj->density.div_v += mi * dvdr * wj_dx;
+  for (k = 0; k < 3; k++) pj->density.curl_v[k] += mi * curlvr[k] * wj_dx;
+}
+
 /**
  * @brief Density loop (Vectorized version)
  */
-__attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) {
+__attribute__((always_inline)) INLINE static void runner_iact_vec_density(
+    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
+    struct part **pj) {
 
 #ifdef VECTORIZE
 
-    vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv,  wi, wj, wi_dx, wj_dx;
-    vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
-    vector mi, mj;
-    vector dx[3], dv[3];
-    vector vi[3], vj[3];    
-    vector dvdr, div_vi, div_vj;
-    vector curlvr[3], curl_vi[3], curl_vj[3];
-    int k, j;
-    
-    #if VEC_SIZE==8
-        mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass );
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] );
-    #elif VEC_SIZE==4
-        mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass );
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
-    #endif
-
-    /* Get the radius and inverse radius. */
-    r2.v = vec_load( R2 );
-    ri.v = vec_rsqrt( r2.v );
-    ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) );
-    r.v = r2.v * ri.v;
-    
-    hi.v = vec_load( Hi );
-    hi_inv.v = vec_rcp( hi.v );
-    hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v  - vec_set1( 1.0f ) );
-    xi.v = r.v * hi_inv.v;
-
-    hj.v = vec_load( Hj );
-    hj_inv.v = vec_rcp( hj.v );
-    hj_inv.v = hj_inv.v - hj_inv.v * ( hj_inv.v * hj.v  - vec_set1( 1.0f ) );
-    xj.v = r.v * hj_inv.v;
-    
-    kernel_deval_vec( &xi , &wi , &wi_dx );
-    kernel_deval_vec( &xj , &wj , &wj_dx );
-
-    /* Compute dv. */
-    dv[0].v = vi[0].v - vj[0].v;
-    dv[1].v = vi[1].v - vj[1].v;
-    dv[2].v = vi[2].v - vj[2].v;
-
-    /* Compute dv dot r */
-    dvdr.v = ( dv[0].v * dx[0].v ) + ( dv[1].v * dx[1].v ) + ( dv[2].v * dx[2].v );
-    dvdr.v = dvdr.v * ri.v;
-
-    /* Compute dv cross r */
-    curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
-    curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
-    curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
-    for ( k = 0 ; k < 3 ; k++ )
-        curlvr[k].v *= ri.v;    
-
-    rhoi.v = mj.v * wi.v;
-    rhoi_dh.v = mj.v * ( vec_set1( 3.0f ) * wi.v + xi.v * wi_dx.v );
-    wcounti.v = wi.v;
-    wcounti_dh.v = xi.v * wi_dx.v;
-    div_vi.v = mj.v * dvdr.v * wi_dx.v;
-    for ( k = 0 ; k < 3 ; k++ )
-        curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;
-        
-    rhoj.v = mi.v * wj.v;
-    rhoj_dh.v = mi.v * ( vec_set1( 3.0f ) * wj.v + xj.v * wj_dx.v );
-    wcountj.v = wj.v;
-    wcountj_dh.v = xj.v * wj_dx.v;
-    div_vj.v = mi.v * dvdr.v * wj_dx.v;
-    for ( k = 0 ; k < 3 ; k++ )
-        curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v;
-
-        
-    for ( k = 0 ; k < VEC_SIZE ; k++ ) {
-        pi[k]->rho += rhoi.f[k];
-        pi[k]->rho_dh -= rhoi_dh.f[k];
-        pi[k]->density.wcount += wcounti.f[k];
-        pi[k]->density.wcount_dh -= wcounti_dh.f[k];
-	    pi[k]->density.div_v += div_vi.f[k];
-	    for( j = 0 ; j < 3 ; j++ )
-   	        pi[k]->density.curl_v[j] += curl_vi[j].f[k];
-        pj[k]->rho += rhoj.f[k];
-        pj[k]->rho_dh -= rhoj_dh.f[k];
-        pj[k]->density.wcount += wcountj.f[k];
-        pj[k]->density.wcount_dh -= wcountj_dh.f[k];
-	    pj[k]->density.div_v += div_vj.f[k];
-	    for( j = 0 ; j < 3 ; j++ )
-   	        pj[k]->density.curl_v[j] += curl_vj[j].f[k];
-        }
-        
+  vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx;
+  vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
+  vector mi, mj;
+  vector dx[3], dv[3];
+  vector vi[3], vj[3];
+  vector dvdr, div_vi, div_vj;
+  vector curlvr[3], curl_vi[3], curl_vj[3];
+  int k, j;
+
+#if VEC_SIZE == 8
+  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
+                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
+                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
+                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
+                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
+                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
+#elif VEC_SIZE == 4
+  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
+#endif
+
+  /* Get the radius and inverse radius. */
+  r2.v = vec_load(R2);
+  ri.v = vec_rsqrt(r2.v);
+  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
+  r.v = r2.v * ri.v;
+
+  hi.v = vec_load(Hi);
+  hi_inv.v = vec_rcp(hi.v);
+  hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f));
+  xi.v = r.v * hi_inv.v;
+
+  hj.v = vec_load(Hj);
+  hj_inv.v = vec_rcp(hj.v);
+  hj_inv.v = hj_inv.v - hj_inv.v * (hj_inv.v * hj.v - vec_set1(1.0f));
+  xj.v = r.v * hj_inv.v;
+
+  kernel_deval_vec(&xi, &wi, &wi_dx);
+  kernel_deval_vec(&xj, &wj, &wj_dx);
+
+  /* Compute dv. */
+  dv[0].v = vi[0].v - vj[0].v;
+  dv[1].v = vi[1].v - vj[1].v;
+  dv[2].v = vi[2].v - vj[2].v;
+
+  /* Compute dv dot r */
+  dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v);
+  dvdr.v = dvdr.v * ri.v;
+
+  /* Compute dv cross r */
+  curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
+  curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
+  curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
+  for (k = 0; k < 3; k++) curlvr[k].v *= ri.v;
+
+  rhoi.v = mj.v * wi.v;
+  rhoi_dh.v = mj.v * (vec_set1(3.0f) * wi.v + xi.v * wi_dx.v);
+  wcounti.v = wi.v;
+  wcounti_dh.v = xi.v * wi_dx.v;
+  div_vi.v = mj.v * dvdr.v * wi_dx.v;
+  for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;
+
+  rhoj.v = mi.v * wj.v;
+  rhoj_dh.v = mi.v * (vec_set1(3.0f) * wj.v + xj.v * wj_dx.v);
+  wcountj.v = wj.v;
+  wcountj_dh.v = xj.v * wj_dx.v;
+  div_vj.v = mi.v * dvdr.v * wj_dx.v;
+  for (k = 0; k < 3; k++) curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v;
+
+  for (k = 0; k < VEC_SIZE; k++) {
+    pi[k]->rho += rhoi.f[k];
+    pi[k]->rho_dh -= rhoi_dh.f[k];
+    pi[k]->density.wcount += wcounti.f[k];
+    pi[k]->density.wcount_dh -= wcounti_dh.f[k];
+    pi[k]->density.div_v += div_vi.f[k];
+    for (j = 0; j < 3; j++) pi[k]->density.curl_v[j] += curl_vi[j].f[k];
+    pj[k]->rho += rhoj.f[k];
+    pj[k]->rho_dh -= rhoj_dh.f[k];
+    pj[k]->density.wcount += wcountj.f[k];
+    pj[k]->density.wcount_dh -= wcountj_dh.f[k];
+    pj[k]->density.div_v += div_vj.f[k];
+    for (j = 0; j < 3; j++) pj[k]->density.curl_v[j] += curl_vj[j].f[k];
+  }
+
 #else
 
-    for ( int k = 0 ; k < VEC_SIZE ; k++ )
-        runner_iact_density( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] );
-        
-#endif
-    
-    }
-    
+  for (int k = 0; k < VEC_SIZE; k++)
+    runner_iact_density(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]);
 
+#endif
+}
 
 /**
  * @brief Density loop (non-symmetric version)
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_density ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) {
-
-    float r, ri;
-    float xi;
-    float h_inv;
-    float wi, wi_dx;
-    float mj;
-    float dvdr;
-    float dv[3], curlvr[3];
-    int k;
-
-    /* Get the masses. */
-    mj = pj->mass;
-
-    /* Get r and r inverse. */
-    r = sqrtf( r2 );
-    ri = 1.0f / r;
-
-    /* Compute dv dot r */
-    dv[0] = pi->v[0] - pj->v[0];
-    dv[1] = pi->v[1] - pj->v[1];
-    dv[2] = pi->v[2] - pj->v[2];
-    dvdr = dv[0]*dx[0] + dv[1]*dx[1] + dv[2]*dx[2];
-    dvdr *= ri;
-
-    /* Compute dv cross r */
-    curlvr[0] = dv[1]*dx[2] - dv[2]*dx[1];
-    curlvr[1] = dv[2]*dx[0] - dv[0]*dx[2];
-    curlvr[2] = dv[0]*dx[1] - dv[1]*dx[0];
-    for ( k = 0 ; k < 3 ; k++ )
-        curlvr[k] *= ri;
-
-    h_inv = 1.0 / hi;
-    xi = r * h_inv;
-    kernel_deval( xi , &wi , &wi_dx );
-
-    pi->rho += mj * wi;
-    pi->rho_dh -= mj * ( 3.0*wi + xi*wi_dx );
-    pi->density.wcount += wi;
-    pi->density.wcount_dh -= xi * wi_dx;
-
-	pi->density.div_v += mj * dvdr * wi_dx;
-	for ( k = 0 ; k < 3 ; k++ )
-	    pi->density.curl_v[k] += mj * curlvr[k] * wi_dx;
-            
-    }
-    
+__attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
+    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
+  /* One-sided update: only pi is written; pj is read and hj is unused. */
+  float r, ri;
+  float xi;
+  float h_inv;
+  float wi, wi_dx;
+  float mj;
+  float dvdr;
+  float dv[3], curlvr[3];
+  int k;
+
+  /* Get the mass of pj, the only mass this update needs. */
+  mj = pj->mass;
+
+  /* Get r and r inverse. */
+  r = sqrtf(r2);
+  ri = 1.0f / r;
+
+  /* Compute dv dot dx, then scale by 1/r. */
+  dv[0] = pi->v[0] - pj->v[0];
+  dv[1] = pi->v[1] - pj->v[1];
+  dv[2] = pi->v[2] - pj->v[2];
+  dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
+  dvdr *= ri;
+
+  /* Compute dv cross dx, then scale each component by 1/r. */
+  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
+  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
+  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
+  for (k = 0; k < 3; k++) curlvr[k] *= ri;
+  /* Compute density of pi: kernel_deval fills wi and wi_dx for x = r / hi. */
+  h_inv = 1.0 / hi;
+  xi = r * h_inv;
+  kernel_deval(xi, &wi, &wi_dx);
+
+  pi->rho += mj * wi;
+  pi->rho_dh -= mj * (3.0 * wi + xi * wi_dx);
+  pi->density.wcount += wi;
+  pi->density.wcount_dh -= xi * wi_dx;
+
+  pi->density.div_v += mj * dvdr * wi_dx;
+  for (k = 0; k < 3; k++) pi->density.curl_v[k] += mj * curlvr[k] * wi_dx;
+}
+
 /**
  * @brief Density loop (non-symmetric vectorized version)
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_density ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) {
+__attribute__((always_inline))
+    INLINE static void runner_iact_nonsym_vec_density(float *R2, float *Dx,
+                                                      float *Hi, float *Hj,
+                                                      struct part **pi,
+                                                      struct part **pj) {
 
 #ifdef VECTORIZE
 
-    vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx;
-    vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi;
-    vector mj;
-    vector dx[3], dv[3];
-    vector vi[3], vj[3];
-    vector dvdr;
-    vector curlvr[3], curl_vi[3];
-    int k, j;
-    
-    #if VEC_SIZE==8
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] );
-    #elif VEC_SIZE==4
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
-    #endif
-    
-    /* Get the radius and inverse radius. */
-    r2.v = vec_load( R2 );
-    ri.v = vec_rsqrt( r2.v );
-    ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) );
-    r.v = r2.v * ri.v;
-    
-    hi.v = vec_load( Hi );
-    hi_inv.v = vec_rcp( hi.v );
-    hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v  - vec_set1( 1.0f ) );
-    xi.v = r.v * hi_inv.v;
-
-    kernel_deval_vec( &xi , &wi , &wi_dx );
-    
-    /* Compute dv. */
-    dv[0].v = vi[0].v - vj[0].v;
-    dv[1].v = vi[1].v - vj[1].v;
-    dv[2].v = vi[2].v - vj[2].v;
-
-    /* Compute dv dot r */
-    dvdr.v = ( dv[0].v * dx[0].v ) + ( dv[1].v * dx[1].v ) + ( dv[2].v * dx[2].v );
-    dvdr.v = dvdr.v * ri.v;
-
-    /* Compute dv cross r */
-    curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
-    curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
-    curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
-    for ( k = 0 ; k < 3 ; k++ )
-        curlvr[k].v *= ri.v;    
-
-    rhoi.v = mj.v * wi.v;
-    rhoi_dh.v = mj.v * ( vec_set1( 3.0f ) * wi.v + xi.v * wi_dx.v );
-    wcounti.v = wi.v;
-    wcounti_dh.v = xi.v * wi_dx.v;
-    div_vi.v = mj.v * dvdr.v * wi_dx.v;
-    for ( k = 0 ; k < 3 ; k++ )
-        curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;
-        
-    for ( k = 0 ; k < VEC_SIZE ; k++ ) {
-        pi[k]->rho += rhoi.f[k];
-        pi[k]->rho_dh -= rhoi_dh.f[k];
-        pi[k]->density.wcount += wcounti.f[k];
-        pi[k]->density.wcount_dh -= wcounti_dh.f[k];
-	    pi[k]->density.div_v += div_vi.f[k];
-	    for( j = 0 ; j < 3 ; j++ )
-   	        pi[k]->density.curl_v[j] += curl_vi[j].f[k];
-        }
-        
+  vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx;
+  vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi;
+  vector mj;
+  vector dx[3], dv[3];
+  vector vi[3], vj[3];
+  vector dvdr;
+  vector curlvr[3], curl_vi[3];
+  int k, j;
+
+#if VEC_SIZE == 8
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
+                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
+                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
+                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
+                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
+#elif VEC_SIZE == 4
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
+#endif
+
+  /* Get the radius and inverse radius. */
+  r2.v = vec_load(R2);
+  ri.v = vec_rsqrt(r2.v);
+  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
+  r.v = r2.v * ri.v;
+
+  hi.v = vec_load(Hi);
+  hi_inv.v = vec_rcp(hi.v);
+  hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f));
+  xi.v = r.v * hi_inv.v;
+
+  kernel_deval_vec(&xi, &wi, &wi_dx);
+
+  /* Compute dv. */
+  dv[0].v = vi[0].v - vj[0].v;
+  dv[1].v = vi[1].v - vj[1].v;
+  dv[2].v = vi[2].v - vj[2].v;
+
+  /* Compute dv dot r */
+  dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v);
+  dvdr.v = dvdr.v * ri.v;
+
+  /* Compute dv cross r */
+  curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
+  curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
+  curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
+  for (k = 0; k < 3; k++) curlvr[k].v *= ri.v;
+
+  rhoi.v = mj.v * wi.v;
+  rhoi_dh.v = mj.v * (vec_set1(3.0f) * wi.v + xi.v * wi_dx.v);
+  wcounti.v = wi.v;
+  wcounti_dh.v = xi.v * wi_dx.v;
+  div_vi.v = mj.v * dvdr.v * wi_dx.v;
+  for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;
+
+  for (k = 0; k < VEC_SIZE; k++) {
+    pi[k]->rho += rhoi.f[k];
+    pi[k]->rho_dh -= rhoi_dh.f[k];
+    pi[k]->density.wcount += wcounti.f[k];
+    pi[k]->density.wcount_dh -= wcounti_dh.f[k];
+    pi[k]->density.div_v += div_vi.f[k];
+    for (j = 0; j < 3; j++) pi[k]->density.curl_v[j] += curl_vi[j].f[k];
+  }
+
 #else
 
-    for ( int k = 0 ; k < VEC_SIZE ; k++ )
-        runner_iact_nonsym_density( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] );
+  for (int k = 0; k < VEC_SIZE; k++)
+    runner_iact_nonsym_density(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]);
 
 #endif
-        
-    }
-    
+}
 
 /**
  * @brief Force loop
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) {
-
-    float r = sqrtf( r2 ), ri = 1.0f / r;
-    float xi, xj;
-    float hi_inv, hi2_inv;
-    float hj_inv, hj2_inv;
-    float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr;
-    float mi, mj, POrho2i, POrho2j, rhoi, rhoj;
-    float v_sig, omega_ij, Pi_ij, alpha_ij, tc, v_sig_u;
-    // float dt_max;
-    float f;
-    int k;
-    
-    /* Get some values in local variables. */
-    mi = pi->mass; mj = pj->mass;
-    rhoi = pi->rho; rhoj = pj->rho;
-    POrho2i = pi->force.POrho2;
-    POrho2j = pj->force.POrho2;
-    
-    /* Get the kernel for hi. */
-    hi_inv = 1.0f / hi;
-    hi2_inv = hi_inv * hi_inv;
-    xi = r * hi_inv;
-    kernel_deval( xi , &wi , &wi_dx );
-    wi_dr = hi2_inv * hi2_inv * wi_dx;
-        
-    /* Get the kernel for hj. */
-    hj_inv = 1.0f / hj;
-    hj2_inv = hj_inv * hj_inv;
-    xj = r * hj_inv;
-    kernel_deval( xj , &wj , &wj_dx );
-    wj_dr = hj2_inv * hj2_inv * wj_dx;
-                
-    /* Compute dv dot r. */
-    dvdr = ( pi->v[0] - pj->v[0] ) * dx[0] + ( pi->v[1] - pj->v[1] ) * dx[1] + ( pi->v[2] - pj->v[2] ) * dx[2];
-    dvdr *= ri;
-
-    /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */
-    omega_ij = fminf( dvdr , 0.f );
-    
-    /* Compute signal velocity */
-    v_sig = pi->force.c + pj->force.c - 2.0f*omega_ij;
-
-    /* Compute viscosity parameter */
-    alpha_ij = -0.5f * ( pi->alpha + pj->alpha );
-
-    /* Compute viscosity tensor */
-    Pi_ij = alpha_ij * v_sig * omega_ij / ( rhoi + rhoj );
-
-    /* Apply balsara switch */
-    Pi_ij *= ( pi->force.balsara + pj->force.balsara );
-
-    /* Termal conductivity */
-    v_sig_u = sqrtf( 2.f * ( const_hydro_gamma - 1.f ) * fabs( rhoi * pi->u - rhoj * pj->u  ) / ( rhoi + rhoj ) );
-    tc = const_conductivity_alpha * v_sig_u / ( rhoi + rhoj );
-    tc *= ( wi_dr + wj_dr );
-
-    /* Get the common factor out. */
-    w = ri * ( ( POrho2i * wi_dr + POrho2j * wj_dr ) + 0.25f * Pi_ij * ( wi_dr + wj_dr ) );
-
-    /* Use the force, Luke! */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        f = dx[k] * w;
-        pi->a[k] -= mj * f;
-        pj->a[k] += mi * f;
-        }
-            
-    /* Get the time derivative for u. */
-    pi->force.u_dt += mj * dvdr * ( POrho2i * wi_dr + 0.125f * Pi_ij * ( wi_dr + wj_dr )  );
-    pj->force.u_dt += mi * dvdr * ( POrho2j * wj_dr + 0.125f * Pi_ij * ( wi_dr + wj_dr ) );
-
-    /* Add the thermal conductivity */
-    pi->force.u_dt += mj * tc * ( pi->u - pj->u );
-    pj->force.u_dt += mi * tc * ( pj->u - pi->u );
-    
-    /* Get the time derivative for h. */
-    pi->force.h_dt -= mj * dvdr / rhoj * wi_dr;
-    pj->force.h_dt -= mi * dvdr / rhoi * wj_dr;
-    
-    /* Update the signal velocity. */
-    pi->force.v_sig = fmaxf( pi->force.v_sig , v_sig );
-    pj->force.v_sig = fmaxf( pj->force.v_sig , v_sig );
-    
-    }
-    
+__attribute__((always_inline)) INLINE static void runner_iact_force(
+    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
+  /* Symmetric pair update of a, force.u_dt, force.h_dt and force.v_sig. */
+  float r = sqrtf(r2), ri = 1.0f / r;
+  float xi, xj;
+  float hi_inv, hi2_inv;
+  float hj_inv, hj2_inv;
+  float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr;
+  float mi, mj, POrho2i, POrho2j, rhoi, rhoj;
+  float v_sig, omega_ij, Pi_ij, alpha_ij, tc, v_sig_u;
+  // float dt_max;
+  float f;
+  int k;
+
+  /* Get some values in local variables. */
+  mi = pi->mass;
+  mj = pj->mass;
+  rhoi = pi->rho;
+  rhoj = pj->rho;
+  POrho2i = pi->force.POrho2;
+  POrho2j = pj->force.POrho2;
+
+  /* Get the kernel for hi; wi_dr is wi_dx scaled by 1 / hi^4. */
+  hi_inv = 1.0f / hi;
+  hi2_inv = hi_inv * hi_inv;
+  xi = r * hi_inv;
+  kernel_deval(xi, &wi, &wi_dx);
+  wi_dr = hi2_inv * hi2_inv * wi_dx;
+
+  /* Get the kernel for hj; wj_dr is wj_dx scaled by 1 / hj^4. */
+  hj_inv = 1.0f / hj;
+  hj2_inv = hj_inv * hj_inv;
+  xj = r * hj_inv;
+  kernel_deval(xj, &wj, &wj_dx);
+  wj_dr = hj2_inv * hj2_inv * wj_dx;
+
+  /* Compute dv dot dx, then scale by 1/r. */
+  dvdr = (pi->v[0] - pj->v[0]) * dx[0] + (pi->v[1] - pj->v[1]) * dx[1] +
+         (pi->v[2] - pj->v[2]) * dx[2];
+  dvdr *= ri;
+
+  /* Compute the relative velocity. (This is 0 if the particles move away from
+   * each other and negative otherwise) */
+  omega_ij = fminf(dvdr, 0.f);
+
+  /* Compute signal velocity */
+  v_sig = pi->force.c + pj->force.c - 2.0f * omega_ij;
+
+  /* Compute viscosity parameter: minus the mean of the two alphas. */
+  alpha_ij = -0.5f * (pi->alpha + pj->alpha);
+
+  /* Compute viscosity tensor */
+  Pi_ij = alpha_ij * v_sig * omega_ij / (rhoi + rhoj);
+
+  /* Apply balsara switch (sum of the two particles' factors). */
+  Pi_ij *= (pi->force.balsara + pj->force.balsara);
+
+  /* Thermal conductivity (NOTE(review): fabs is the double version). */
+  v_sig_u = sqrtf(2.f * (const_hydro_gamma - 1.f) *
+                  fabs(rhoi * pi->u - rhoj * pj->u) / (rhoi + rhoj));
+  tc = const_conductivity_alpha * v_sig_u / (rhoi + rhoj);
+  tc *= (wi_dr + wj_dr);
+
+  /* Get the common factor out. */
+  w = ri *
+      ((POrho2i * wi_dr + POrho2j * wj_dr) + 0.25f * Pi_ij * (wi_dr + wj_dr));
+
+  /* Apply opposite-signed accelerations, each scaled by the other's mass. */
+  for (k = 0; k < 3; k++) {
+    f = dx[k] * w;
+    pi->a[k] -= mj * f;
+    pj->a[k] += mi * f;
+  }
+
+  /* Get the time derivative for u. */
+  pi->force.u_dt +=
+      mj * dvdr * (POrho2i * wi_dr + 0.125f * Pi_ij * (wi_dr + wj_dr));
+  pj->force.u_dt +=
+      mi * dvdr * (POrho2j * wj_dr + 0.125f * Pi_ij * (wi_dr + wj_dr));
+
+  /* Add the thermal conductivity */
+  pi->force.u_dt += mj * tc * (pi->u - pj->u);
+  pj->force.u_dt += mi * tc * (pj->u - pi->u);
+
+  /* Get the time derivative for h (pi's term uses rhoj, pj's uses rhoi). */
+  pi->force.h_dt -= mj * dvdr / rhoj * wi_dr;
+  pj->force.h_dt -= mi * dvdr / rhoi * wj_dr;
+
+  /* Update the signal velocity: each particle keeps the largest seen. */
+  pi->force.v_sig = fmaxf(pi->force.v_sig, v_sig);
+  pj->force.v_sig = fmaxf(pj->force.v_sig, v_sig);
+}
 
 /**
  * @brief Force loop (Vectorized version)
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) {
+__attribute__((always_inline)) INLINE static void runner_iact_vec_force(
+    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
+    struct part **pj) {
 
 #ifdef VECTORIZE
 
-    vector r, r2, ri;
-    vector xi, xj;
-    vector hi, hj, hi_inv, hj_inv;
-    vector hi2_inv, hj2_inv;
-    vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
-    vector w;
-    vector piPOrho2, pjPOrho2, pirho, pjrho, piu, pju;
-    vector mi, mj;
-    vector f;
-    vector dx[3];
-    vector vi[3], vj[3];
-    vector pia[3], pja[3];
-    vector piu_dt, pju_dt;
-    vector pih_dt, pjh_dt;
-    vector ci, cj, v_sig, vi_sig, vj_sig;
-    vector omega_ij, Pi_ij, balsara;
-    vector pialpha, pjalpha, alpha_ij, v_sig_u, tc;
-    int j, k;
-
-    /* Load stuff. */
-    #if VEC_SIZE==8
-        mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass );
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
-        piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 , pi[4]->force.POrho2 , pi[5]->force.POrho2 , pi[6]->force.POrho2 , pi[7]->force.POrho2 );
-        pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 , pj[4]->force.POrho2 , pj[5]->force.POrho2 , pj[6]->force.POrho2 , pj[7]->force.POrho2 );
-        pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho , pi[4]->rho , pi[5]->rho , pi[6]->rho , pi[7]->rho );
-        pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho , pj[4]->rho , pj[5]->rho , pj[6]->rho , pj[7]->rho );
-        piu.v = vec_set( pi[0]->u , pi[1]->u , pi[2]->u , pi[3]->u , pi[4]->u , pi[5]->u , pi[6]->u , pi[7]->u );
-        pju.v = vec_set( pj[0]->u , pj[1]->u , pj[2]->u , pj[3]->u , pj[4]->u , pj[5]->u , pj[6]->u , pj[7]->u );
-        ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c , pi[4]->force.c , pi[5]->force.c , pi[6]->force.c , pi[7]->force.c );
-        cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c , pj[4]->force.c , pj[5]->force.c , pj[6]->force.c , pj[7]->force.c );
-        vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig , pi[4]->force.v_sig , pi[5]->force.v_sig , pi[6]->force.v_sig , pi[7]->force.v_sig );
-        vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig , pj[4]->force.v_sig , pj[5]->force.v_sig , pj[6]->force.v_sig , pj[7]->force.v_sig );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] );
-        balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara , pi[4]->force.balsara , pi[5]->force.balsara , pi[6]->force.balsara , pi[7]->force.balsara ) + 
-	            vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara , pj[4]->force.balsara , pj[5]->force.balsara , pj[6]->force.balsara , pj[7]->force.balsara );
-	    pialpha.v = vec_set( pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha, pi[4]->alpha, pi[5]->alpha , pi[6]->alpha, pi[7]->alpha );
-	    pjalpha.v = vec_set( pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha, pj[4]->alpha, pj[5]->alpha , pj[6]->alpha, pj[7]->alpha );
-    #elif VEC_SIZE==4
-        mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass );
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass );
-        piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 );
-        pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 );
-        pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho );
-        pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho );
-        piu.v = vec_set( pi[0]->u , pi[1]->u , pi[2]->u , pi[3]->u );
-        pju.v = vec_set( pj[0]->u , pj[1]->u , pj[2]->u , pj[3]->u );
-        ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c );
-        cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c );
-        vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig );
-        vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
-        balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara ) +
-                    vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara );
-	    pialpha.v = vec_set( pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha );
-	    pjalpha.v = vec_set( pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha );
-    #else
-        #error
-    #endif
-
-    /* Get the radius and inverse radius. */
-    r2.v = vec_load( R2 );
-    ri.v = vec_rsqrt( r2.v );
-    ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) );
-    r.v = r2.v * ri.v;
-    
-    /* Get the kernel for hi. */
-    hi.v = vec_load( Hi );
-    hi_inv.v = vec_rcp( hi.v );
-    hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) );
-    hi2_inv.v = hi_inv.v * hi_inv.v;
-    xi.v = r.v * hi_inv.v;
-    kernel_deval_vec( &xi , &wi , &wi_dx );
-    wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
-        
-    /* Get the kernel for hj. */
-    hj.v = vec_load( Hj );
-    hj_inv.v = vec_rcp( hj.v );
-    hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) );
-    hj2_inv.v = hj_inv.v * hj_inv.v;
-    xj.v = r.v * hj_inv.v;
-    kernel_deval_vec( &xj , &wj , &wj_dx );
-    wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
-        
-    /* Compute dv dot r. */
-    dvdr.v = ( (vi[0].v - vj[0].v) * dx[0].v ) + ( (vi[1].v - vj[1].v) * dx[1].v ) + ( (vi[2].v - vj[2].v) * dx[2].v );
-    dvdr.v = dvdr.v * ri.v;
-        
-    /* Get the time derivative for h. */
-    pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v;
-    pjh_dt.v = mi.v / pirho.v * dvdr.v * wj_dr.v;
-    
-    /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */
-    omega_ij.v = vec_fmin( dvdr.v , vec_set1( 0.0f ) );
-    
-    /* Compute signal velocity */
-    v_sig.v = ci.v + cj.v - vec_set1( 2.0f )*omega_ij.v;
-
-    /* Compute viscosity parameter */
-    alpha_ij.v = vec_set1(-0.5f) * ( pialpha.v + pjalpha.v );
-
-    /* Compute viscosity tensor */
-    Pi_ij.v = balsara.v * alpha_ij.v  * v_sig.v * omega_ij.v / (pirho.v + pjrho.v);
-    Pi_ij.v *= ( wi_dr.v + wj_dr.v );
-
-    /* Termal conductivity */
-    v_sig_u.v = vec_sqrt( vec_set1( 2.f * ( const_hydro_gamma - 1.f ) ) * vec_fabs( pirho.v * piu.v - pjrho.v * pju.v  ) / ( pirho.v + pjrho.v ) );
-    tc.v = vec_set1( const_conductivity_alpha ) * v_sig_u.v / ( pirho.v + pjrho.v );
-    tc.v *=  ( wi_dr.v + wj_dr.v );
-    
-    /* Get the common factor out. */
-    w.v = ri.v * ( ( piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v ) + vec_set1( 0.25f ) * Pi_ij.v );
-
-    /* Use the force, Luke! */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        f.v = dx[k].v * w.v;
-        pia[k].v = mj.v * f.v;
-        pja[k].v = mi.v * f.v;
-        }
-        
-    /* Get the time derivative for u. */
-    piu_dt.v = mj.v * dvdr.v * ( piPOrho2.v * wi_dr.v + vec_set1( 0.125f ) * Pi_ij.v );
-    pju_dt.v = mi.v * dvdr.v * ( pjPOrho2.v * wj_dr.v + vec_set1( 0.125f ) * Pi_ij.v );
-
-    /* Add the thermal conductivity */
-    piu_dt.v += mj.v * tc.v * ( piu.v - pju.v );
-    pju_dt.v += mi.v * tc.v * ( pju.v - piu.v );
-    
-    /* compute the signal velocity (this is always symmetrical). */
-    vi_sig.v = vec_fmax( vi_sig.v , v_sig.v );
-    vj_sig.v = vec_fmax( vj_sig.v , v_sig.v );
-
-    /* Store the forces back on the particles. */
-    for ( k = 0 ; k < VEC_SIZE ; k++ ) {
-        pi[k]->force.u_dt += piu_dt.f[k];
-        pj[k]->force.u_dt += pju_dt.f[k];
-        pi[k]->force.h_dt -= pih_dt.f[k];
-        pj[k]->force.h_dt -= pjh_dt.f[k];
-        pi[k]->force.v_sig = vi_sig.f[k];
-        pj[k]->force.v_sig = vj_sig.f[k];
-        for ( j = 0 ; j < 3 ; j++ ) {
-            pi[k]->a[j] -= pia[j].f[k];
-            pj[k]->a[j] += pja[j].f[k];
-            }
-        }
-        
+  vector r, r2, ri;
+  vector xi, xj;
+  vector hi, hj, hi_inv, hj_inv;
+  vector hi2_inv, hj2_inv;
+  vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
+  vector w;
+  vector piPOrho2, pjPOrho2, pirho, pjrho, piu, pju;
+  vector mi, mj;
+  vector f;
+  vector dx[3];
+  vector vi[3], vj[3];
+  vector pia[3], pja[3];
+  vector piu_dt, pju_dt;
+  vector pih_dt, pjh_dt;
+  vector ci, cj, v_sig, vi_sig, vj_sig;
+  vector omega_ij, Pi_ij, balsara;
+  vector pialpha, pjalpha, alpha_ij, v_sig_u, tc;
+  int j, k;
+
+/* Load stuff. */
+#if VEC_SIZE == 8
+  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
+                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
+                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
+  piPOrho2.v =
+      vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, pi[2]->force.POrho2,
+              pi[3]->force.POrho2, pi[4]->force.POrho2, pi[5]->force.POrho2,
+              pi[6]->force.POrho2, pi[7]->force.POrho2);
+  pjPOrho2.v =
+      vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, pj[2]->force.POrho2,
+              pj[3]->force.POrho2, pj[4]->force.POrho2, pj[5]->force.POrho2,
+              pj[6]->force.POrho2, pj[7]->force.POrho2);
+  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
+                    pi[5]->rho, pi[6]->rho, pi[7]->rho);
+  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
+                    pj[5]->rho, pj[6]->rho, pj[7]->rho);
+  piu.v = vec_set(pi[0]->u, pi[1]->u, pi[2]->u, pi[3]->u, pi[4]->u, pi[5]->u,
+                  pi[6]->u, pi[7]->u);
+  pju.v = vec_set(pj[0]->u, pj[1]->u, pj[2]->u, pj[3]->u, pj[4]->u, pj[5]->u,
+                  pj[6]->u, pj[7]->u);
+  ci.v =
+      vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c,
+              pi[4]->force.c, pi[5]->force.c, pi[6]->force.c, pi[7]->force.c);
+  cj.v =
+      vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c,
+              pj[4]->force.c, pj[5]->force.c, pj[6]->force.c, pj[7]->force.c);
+  vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig,
+                     pi[3]->force.v_sig, pi[4]->force.v_sig, pi[5]->force.v_sig,
+                     pi[6]->force.v_sig, pi[7]->force.v_sig);
+  vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig,
+                     pj[3]->force.v_sig, pj[4]->force.v_sig, pj[5]->force.v_sig,
+                     pj[6]->force.v_sig, pj[7]->force.v_sig);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
+                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
+                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
+                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
+  balsara.v =
+      vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara,
+              pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara,
+              pi[6]->force.balsara, pi[7]->force.balsara) +
+      vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara,
+              pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara,
+              pj[6]->force.balsara, pj[7]->force.balsara);
+  pialpha.v = vec_set(pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha,
+                      pi[4]->alpha, pi[5]->alpha, pi[6]->alpha, pi[7]->alpha);
+  pjalpha.v = vec_set(pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha,
+                      pj[4]->alpha, pj[5]->alpha, pj[6]->alpha, pj[7]->alpha);
+#elif VEC_SIZE == 4
+  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
+  piPOrho2.v = vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2,
+                       pi[2]->force.POrho2, pi[3]->force.POrho2);
+  pjPOrho2.v = vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2,
+                       pj[2]->force.POrho2, pj[3]->force.POrho2);
+  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
+  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
+  piu.v = vec_set(pi[0]->u, pi[1]->u, pi[2]->u, pi[3]->u);
+  pju.v = vec_set(pj[0]->u, pj[1]->u, pj[2]->u, pj[3]->u);
+  ci.v =
+      vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c);
+  cj.v =
+      vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c);
+  vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig,
+                     pi[3]->force.v_sig);
+  vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig,
+                     pj[3]->force.v_sig);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
+  balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara,
+                      pi[2]->force.balsara, pi[3]->force.balsara) +
+              vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
+                      pj[2]->force.balsara, pj[3]->force.balsara);
+  pialpha.v = vec_set(pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha);
+  pjalpha.v = vec_set(pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha);
+#else
+#error
+#endif
+
+  /* Get the radius and inverse radius. */
+  r2.v = vec_load(R2);
+  ri.v = vec_rsqrt(r2.v);
+  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
+  r.v = r2.v * ri.v;
+
+  /* Get the kernel for hi. */
+  hi.v = vec_load(Hi);
+  hi_inv.v = vec_rcp(hi.v);
+  hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f));
+  hi2_inv.v = hi_inv.v * hi_inv.v;
+  xi.v = r.v * hi_inv.v;
+  kernel_deval_vec(&xi, &wi, &wi_dx);
+  wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
+
+  /* Get the kernel for hj. */
+  hj.v = vec_load(Hj);
+  hj_inv.v = vec_rcp(hj.v);
+  hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f));
+  hj2_inv.v = hj_inv.v * hj_inv.v;
+  xj.v = r.v * hj_inv.v;
+  kernel_deval_vec(&xj, &wj, &wj_dx);
+  wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
+
+  /* Compute dv dot r. */
+  dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) +
+           ((vi[2].v - vj[2].v) * dx[2].v);
+  dvdr.v = dvdr.v * ri.v;
+
+  /* Get the time derivative for h. */
+  pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v;
+  pjh_dt.v = mi.v / pirho.v * dvdr.v * wj_dr.v;
+
+  /* Compute the relative velocity. (This is 0 if the particles move away from
+   * each other and negative otherwise) */
+  omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f));
+
+  /* Compute signal velocity */
+  v_sig.v = ci.v + cj.v - vec_set1(2.0f) * omega_ij.v;
+
+  /* Compute viscosity parameter */
+  alpha_ij.v = vec_set1(-0.5f) * (pialpha.v + pjalpha.v);
+
+  /* Compute viscosity tensor */
+  Pi_ij.v = balsara.v * alpha_ij.v * v_sig.v * omega_ij.v / (pirho.v + pjrho.v);
+  Pi_ij.v *= (wi_dr.v + wj_dr.v);
+
+  /* Thermal conductivity */
+  v_sig_u.v = vec_sqrt(vec_set1(2.f * (const_hydro_gamma - 1.f)) *
+                       vec_fabs(pirho.v * piu.v - pjrho.v * pju.v) /
+                       (pirho.v + pjrho.v));
+  tc.v = vec_set1(const_conductivity_alpha) * v_sig_u.v / (pirho.v + pjrho.v);
+  tc.v *= (wi_dr.v + wj_dr.v);
+
+  /* Get the common factor out. */
+  w.v = ri.v * ((piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v) +
+                vec_set1(0.25f) * Pi_ij.v);
+
+  /* Use the force, Luke! */
+  for (k = 0; k < 3; k++) {
+    f.v = dx[k].v * w.v;
+    pia[k].v = mj.v * f.v;
+    pja[k].v = mi.v * f.v;
+  }
+
+  /* Get the time derivative for u. */
+  piu_dt.v =
+      mj.v * dvdr.v * (piPOrho2.v * wi_dr.v + vec_set1(0.125f) * Pi_ij.v);
+  pju_dt.v =
+      mi.v * dvdr.v * (pjPOrho2.v * wj_dr.v + vec_set1(0.125f) * Pi_ij.v);
+
+  /* Add the thermal conductivity */
+  piu_dt.v += mj.v * tc.v * (piu.v - pju.v);
+  pju_dt.v += mi.v * tc.v * (pju.v - piu.v);
+
+  /* compute the signal velocity (this is always symmetrical). */
+  vi_sig.v = vec_fmax(vi_sig.v, v_sig.v);
+  vj_sig.v = vec_fmax(vj_sig.v, v_sig.v);
+
+  /* Store the forces back on the particles. */
+  for (k = 0; k < VEC_SIZE; k++) {
+    pi[k]->force.u_dt += piu_dt.f[k];
+    pj[k]->force.u_dt += pju_dt.f[k];
+    pi[k]->force.h_dt -= pih_dt.f[k];
+    pj[k]->force.h_dt -= pjh_dt.f[k];
+    pi[k]->force.v_sig = vi_sig.f[k];
+    pj[k]->force.v_sig = vj_sig.f[k];
+    for (j = 0; j < 3; j++) {
+      pi[k]->a[j] -= pia[j].f[k];
+      pj[k]->a[j] += pja[j].f[k];
+    }
+  }
+
 #else
 
-    for ( int k = 0 ; k < VEC_SIZE ; k++ )
-        runner_iact_force( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] );
+  for (int k = 0; k < VEC_SIZE; k++)
+    runner_iact_force(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]);
 
 #endif
-        
-    }
-    
+}
 
 /**
  * @brief Force loop (non-symmetric version)
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) {
-
-    float r = sqrtf( r2 ), ri = 1.0f / r;
-    float xi, xj;
-    float hi_inv, hi2_inv;
-    float hj_inv, hj2_inv;
-    float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr;
-    float /*mi,*/ mj, POrho2i, POrho2j, rhoi, rhoj;
-    float v_sig, omega_ij, Pi_ij, alpha_ij, tc, v_sig_u;
-    // float dt_max;
-    float f;
-    int k;
-    
-    /* Get some values in local variables. */
-    // mi = pi->mass;
-    mj = pj->mass;
-    rhoi = pi->rho; rhoj = pj->rho;
-    POrho2i = pi->force.POrho2;
-    POrho2j = pj->force.POrho2;
-    
-    /* Get the kernel for hi. */
-    hi_inv = 1.0f / hi;
-    hi2_inv = hi_inv * hi_inv;
-    xi = r * hi_inv;
-    kernel_deval( xi , &wi , &wi_dx );
-    wi_dr = hi2_inv * hi2_inv * wi_dx;
-        
-    /* Get the kernel for hj. */
-    hj_inv = 1.0f / hj;
-    hj2_inv = hj_inv * hj_inv;
-    xj = r * hj_inv;
-    kernel_deval( xj , &wj , &wj_dx );
-    wj_dr = hj2_inv * hj2_inv * wj_dx;
-                
-    /* Compute dv dot r. */
-    dvdr = ( pi->v[0] - pj->v[0] ) * dx[0] + ( pi->v[1] - pj->v[1] ) * dx[1] + ( pi->v[2] - pj->v[2] ) * dx[2];
-    dvdr *= ri;
-
-    /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */
-    omega_ij = fminf( dvdr , 0.f );
-    
-    /* Compute signal velocity */
-    v_sig = pi->force.c + pj->force.c - 2.0f*omega_ij;
-
-    /* Compute viscosity parameter */
-    alpha_ij = -0.5f * ( pi->alpha + pj->alpha );
-
-    /* Compute viscosity tensor */
-    Pi_ij = alpha_ij * v_sig * omega_ij / ( rhoi + rhoj );
-
-    /* Apply balsara switch */
-    Pi_ij *= ( pi->force.balsara + pj->force.balsara );
-
-    /* Termal conductivity */
-    v_sig_u = sqrtf( 2.f * ( const_hydro_gamma - 1.f ) * fabs( rhoi * pi->u - rhoj * pj->u  ) / ( rhoi + rhoj ) );
-    tc = const_conductivity_alpha * v_sig_u / ( rhoi + rhoj );
-    tc *= ( wi_dr + wj_dr );
-
-    /* Get the common factor out. */
-    w = ri * ( ( POrho2i * wi_dr + POrho2j * wj_dr ) + 0.25f * Pi_ij * ( wi_dr + wj_dr ) );
-
-    /* Use the force, Luke! */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        f = dx[k] * w;
-        pi->a[k] -= mj * f;
-        }
-                
-    /* Get the time derivative for u. */
-    pi->force.u_dt += mj * dvdr * ( POrho2i * wi_dr + 0.125f * Pi_ij * ( wi_dr + wj_dr ) );
-    
-    /* Add the thermal conductivity */
-    pi->force.u_dt += mj * tc * ( pi->u - pj->u );
-
-    /* Get the time derivative for h. */
-    pi->force.h_dt -= mj * dvdr / rhoj * wi_dr;
-    
-    /* Update the signal velocity. */
-    pi->force.v_sig = fmaxf( pi->force.v_sig , v_sig );
-    pj->force.v_sig = fmaxf( pj->force.v_sig , v_sig );
-    
-    }
-    
+__attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
+    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
+  /* Updates pi only (a, u_dt, h_dt); on pj just force.v_sig is raised. */
+  float r = sqrtf(r2), ri = 1.0f / r; /* no guard for r2 == 0 */
+  float xi, xj;
+  float hi_inv, hi2_inv;
+  float hj_inv, hj2_inv;
+  float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr;
+  float /*mi,*/ mj, POrho2i, POrho2j, rhoi, rhoj;
+  float v_sig, omega_ij, Pi_ij, alpha_ij, tc, v_sig_u;
+  // float dt_max;
+  float f;
+  int k;
+
+  /* Copy mass, density and POrho2 into locals (mi is not needed here). */
+  // mi = pi->mass;
+  mj = pj->mass;
+  rhoi = pi->rho;
+  rhoj = pj->rho;
+  POrho2i = pi->force.POrho2;
+  POrho2j = pj->force.POrho2;
+
+  /* Kernel for hi at xi = r / hi; wi_dr = wi_dx / hi^4 (wi is unused). */
+  hi_inv = 1.0f / hi;
+  hi2_inv = hi_inv * hi_inv;
+  xi = r * hi_inv;
+  kernel_deval(xi, &wi, &wi_dx);
+  wi_dr = hi2_inv * hi2_inv * wi_dx;
+
+  /* Kernel for hj at xj = r / hj; wj_dr = wj_dx / hj^4 (wj is unused). */
+  hj_inv = 1.0f / hj;
+  hj2_inv = hj_inv * hj_inv;
+  xj = r * hj_inv;
+  kernel_deval(xj, &wj, &wj_dx);
+  wj_dr = hj2_inv * hj2_inv * wj_dx;
+
+  /* Compute (vi - vj) . dx, then scale the result by 1/r. */
+  dvdr = (pi->v[0] - pj->v[0]) * dx[0] + (pi->v[1] - pj->v[1]) * dx[1] +
+         (pi->v[2] - pj->v[2]) * dx[2];
+  dvdr *= ri;
+
+  /* Compute the relative velocity. (This is 0 if the particles move away from
+   * each other and negative otherwise) */
+  omega_ij = fminf(dvdr, 0.f);
+
+  /* Signal velocity: sum of both force.c minus twice omega_ij. */
+  v_sig = pi->force.c + pj->force.c - 2.0f * omega_ij;
+
+  /* Viscosity parameter: minus the mean of the two alphas. */
+  alpha_ij = -0.5f * (pi->alpha + pj->alpha);
+
+  /* Viscosity term, not yet weighted by the kernel derivatives. */
+  Pi_ij = alpha_ij * v_sig * omega_ij / (rhoi + rhoj);
+
+  /* Scale by the summed balsara factors of both particles. */
+  Pi_ij *= (pi->force.balsara + pj->force.balsara);
+
+  /* Thermal conductivity. NOTE(review): fabs, not fabsf, is used below. */
+  v_sig_u = sqrtf(2.f * (const_hydro_gamma - 1.f) *
+                  fabs(rhoi * pi->u - rhoj * pj->u) / (rhoi + rhoj));
+  tc = const_conductivity_alpha * v_sig_u / (rhoi + rhoj);
+  tc *= (wi_dr + wj_dr);
+
+  /* Scalar factor shared by all three acceleration components. */
+  w = ri *
+      ((POrho2i * wi_dr + POrho2j * wj_dr) + 0.25f * Pi_ij * (wi_dr + wj_dr));
+
+  /* Accelerate pi only; pj->a is left alone in this variant. */
+  for (k = 0; k < 3; k++) {
+    f = dx[k] * w;
+    pi->a[k] -= mj * f;
+  }
+
+  /* Add the POrho2 and viscosity terms to pi's u_dt. */
+  pi->force.u_dt +=
+      mj * dvdr * (POrho2i * wi_dr + 0.125f * Pi_ij * (wi_dr + wj_dr));
+
+  /* Add the conduction term, driven by the difference in u. */
+  pi->force.u_dt += mj * tc * (pi->u - pj->u);
+
+  /* Accumulate pi's h_dt (the contribution is subtracted). */
+  pi->force.h_dt -= mj * dvdr / rhoj * wi_dr;
+
+  /* Raise v_sig on both particles; this is the only write to pj. */
+  pi->force.v_sig = fmaxf(pi->force.v_sig, v_sig);
+  pj->force.v_sig = fmaxf(pj->force.v_sig, v_sig);
+}
 
 /**
  * @brief Force loop (Vectorized non-symmetric version)
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) {
+__attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
+    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
+    struct part **pj) {
 
 #ifdef VECTORIZE
 
-    vector r, r2, ri;
-    vector xi, xj;
-    vector hi, hj, hi_inv, hj_inv;
-    vector hi2_inv, hj2_inv;
-    vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
-    vector w;
-    vector piPOrho2, pjPOrho2, pirho, pjrho, piu, pju;
-    vector mj;
-    vector f;
-    vector dx[3];
-    vector vi[3], vj[3];
-    vector pia[3];
-    vector piu_dt;
-    vector pih_dt;
-    vector ci, cj, v_sig, vi_sig, vj_sig;
-    vector omega_ij, Pi_ij, balsara;
-    vector pialpha, pjalpha, alpha_ij, v_sig_u, tc;
-    int j, k;
-
-    /* Load stuff. */
-    #if VEC_SIZE==8
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
-        piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 , pi[4]->force.POrho2 , pi[5]->force.POrho2 , pi[6]->force.POrho2 , pi[7]->force.POrho2 );
-        pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 , pj[4]->force.POrho2 , pj[5]->force.POrho2 , pj[6]->force.POrho2 , pj[7]->force.POrho2 );
-        pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho , pi[4]->rho , pi[5]->rho , pi[6]->rho , pi[7]->rho );
-        pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho , pj[4]->rho , pj[5]->rho , pj[6]->rho , pj[7]->rho );
-        piu.v = vec_set( pi[0]->u , pi[1]->u , pi[2]->u , pi[3]->u , pi[4]->u , pi[5]->u , pi[6]->u , pi[7]->u );
-        pju.v = vec_set( pj[0]->u , pj[1]->u , pj[2]->u , pj[3]->u , pj[4]->u , pj[5]->u , pj[6]->u , pj[7]->u );
-        ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c , pi[4]->force.c , pi[5]->force.c , pi[6]->force.c , pi[7]->force.c );
-        cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c , pj[4]->force.c , pj[5]->force.c , pj[6]->force.c , pj[7]->force.c );
-        vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig , pi[4]->force.v_sig , pi[5]->force.v_sig , pi[6]->force.v_sig , pi[7]->force.v_sig );
-        vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig , pj[4]->force.v_sig , pj[5]->force.v_sig , pj[6]->force.v_sig , pj[7]->force.v_sig );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] );
-        balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara , pi[4]->force.balsara , pi[5]->force.balsara , pi[6]->force.balsara , pi[7]->force.balsara ) +
-                    vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara , pj[4]->force.balsara , pj[5]->force.balsara , pj[6]->force.balsara , pj[7]->force.balsara );
-	    pialpha.v = vec_set( pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha, pi[4]->alpha, pi[5]->alpha , pi[6]->alpha, pi[7]->alpha );
-	    pjalpha.v = vec_set( pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha, pj[4]->alpha, pj[5]->alpha , pj[6]->alpha, pj[7]->alpha );
-    #elif VEC_SIZE==4
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass );
-        piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 );
-        pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 );
-        pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho );
-        pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho );
-        piu.v = vec_set( pi[0]->u , pi[1]->u , pi[2]->u , pi[3]->u );
-        pju.v = vec_set( pj[0]->u , pj[1]->u , pj[2]->u , pj[3]->u );
-        ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c );
-        cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c );
-        vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig );
-        vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
-        balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara ) +
-                    vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara );
-	    pialpha.v = vec_set( pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha );
-	    pjalpha.v = vec_set( pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha );
-    #else
-        #error
-    #endif
-
-    /* Get the radius and inverse radius. */
-    r2.v = vec_load( R2 );
-    ri.v = vec_rsqrt( r2.v );
-    ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) );
-    r.v = r2.v * ri.v;
-    
-    /* Get the kernel for hi. */
-    hi.v = vec_load( Hi );
-    hi_inv.v = vec_rcp( hi.v );
-    hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) );
-    hi2_inv.v = hi_inv.v * hi_inv.v;
-    xi.v = r.v * hi_inv.v;
-    kernel_deval_vec( &xi , &wi , &wi_dx );
-    wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
-        
-    /* Get the kernel for hj. */
-    hj.v = vec_load( Hj );
-    hj_inv.v = vec_rcp( hj.v );
-    hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) );
-    hj2_inv.v = hj_inv.v * hj_inv.v;
-    xj.v = r.v * hj_inv.v;
-    kernel_deval_vec( &xj , &wj , &wj_dx );
-    wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
-        
-    /* Compute dv dot r. */
-    dvdr.v = ( (vi[0].v - vj[0].v) * dx[0].v ) + ( (vi[1].v - vj[1].v) * dx[1].v ) + ( (vi[2].v - vj[2].v) * dx[2].v );
-    dvdr.v = dvdr.v * ri.v;
-        
-    /* Get the time derivative for h. */
-    pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v;
-    
-    /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */
-    omega_ij.v = vec_fmin( dvdr.v , vec_set1( 0.0f ) );
-    
-    /* Compute signal velocity */
-    v_sig.v = ci.v + cj.v - vec_set1( 2.0f )*omega_ij.v;
-
-    /* Compute viscosity parameter */
-    alpha_ij.v = vec_set1(-0.5f) * ( pialpha.v + pjalpha.v );
-
-    /* Compute viscosity tensor */
-    Pi_ij.v = balsara.v * alpha_ij.v  * v_sig.v * omega_ij.v / (pirho.v + pjrho.v);
-    Pi_ij.v *= ( wi_dr.v + wj_dr.v );
-
-    /* Termal conductivity */
-    v_sig_u.v = vec_sqrt( vec_set1( 2.f * ( const_hydro_gamma - 1.f ) ) * vec_fabs( pirho.v * piu.v - pjrho.v * pju.v  ) / ( pirho.v + pjrho.v ) );
-    tc.v = vec_set1( const_conductivity_alpha ) * v_sig_u.v / ( pirho.v + pjrho.v );
-    tc.v *=  ( wi_dr.v + wj_dr.v );
-
-    /* Get the common factor out. */
-    w.v = ri.v * ( ( piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v ) + vec_set1( 0.25f ) * Pi_ij.v );
-
-    /* Use the force, Luke! */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        f.v = dx[k].v * w.v;
-        pia[k].v = mj.v * f.v;
-        }
-        
-    /* Get the time derivative for u. */
-    piu_dt.v = mj.v * dvdr.v * ( piPOrho2.v * wi_dr.v + vec_set1( 0.125f ) * Pi_ij.v );
-
-    /* Add the thermal conductivity */
-    piu_dt.v += mj.v * tc.v * ( piu.v - pju.v );
-    
-    /* compute the signal velocity (this is always symmetrical). */
-    vi_sig.v = vec_fmax( vi_sig.v , v_sig.v );
-    vj_sig.v = vec_fmax( vj_sig.v , v_sig.v );
-
-    /* Store the forces back on the particles. */
-    for ( k = 0 ; k < VEC_SIZE ; k++ ) {
-        pi[k]->force.u_dt += piu_dt.f[k];
-        pi[k]->force.h_dt -= pih_dt.f[k];
-        pi[k]->force.v_sig = vi_sig.f[k];
-        pj[k]->force.v_sig = vj_sig.f[k];
-        for ( j = 0 ; j < 3 ; j++ )
-            pi[k]->a[j] -= pia[j].f[k];
-        }
-
+  vector r, r2, ri;
+  vector xi, xj;
+  vector hi, hj, hi_inv, hj_inv;
+  vector hi2_inv, hj2_inv;
+  vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
+  vector w;
+  vector piPOrho2, pjPOrho2, pirho, pjrho, piu, pju;
+  vector mj;
+  vector f;
+  vector dx[3];
+  vector vi[3], vj[3];
+  vector pia[3];
+  vector piu_dt;
+  vector pih_dt;
+  vector ci, cj, v_sig, vi_sig, vj_sig;
+  vector omega_ij, Pi_ij, balsara;
+  vector pialpha, pjalpha, alpha_ij, v_sig_u, tc;
+  int j, k;
+
+/* Load stuff. */
+#if VEC_SIZE == 8
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
+                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
+  piPOrho2.v =
+      vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, pi[2]->force.POrho2,
+              pi[3]->force.POrho2, pi[4]->force.POrho2, pi[5]->force.POrho2,
+              pi[6]->force.POrho2, pi[7]->force.POrho2);
+  pjPOrho2.v =
+      vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, pj[2]->force.POrho2,
+              pj[3]->force.POrho2, pj[4]->force.POrho2, pj[5]->force.POrho2,
+              pj[6]->force.POrho2, pj[7]->force.POrho2);
+  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
+                    pi[5]->rho, pi[6]->rho, pi[7]->rho);
+  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
+                    pj[5]->rho, pj[6]->rho, pj[7]->rho);
+  piu.v = vec_set(pi[0]->u, pi[1]->u, pi[2]->u, pi[3]->u, pi[4]->u, pi[5]->u,
+                  pi[6]->u, pi[7]->u);
+  pju.v = vec_set(pj[0]->u, pj[1]->u, pj[2]->u, pj[3]->u, pj[4]->u, pj[5]->u,
+                  pj[6]->u, pj[7]->u);
+  ci.v =
+      vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c,
+              pi[4]->force.c, pi[5]->force.c, pi[6]->force.c, pi[7]->force.c);
+  cj.v =
+      vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c,
+              pj[4]->force.c, pj[5]->force.c, pj[6]->force.c, pj[7]->force.c);
+  vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig,
+                     pi[3]->force.v_sig, pi[4]->force.v_sig, pi[5]->force.v_sig,
+                     pi[6]->force.v_sig, pi[7]->force.v_sig);
+  vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig,
+                     pj[3]->force.v_sig, pj[4]->force.v_sig, pj[5]->force.v_sig,
+                     pj[6]->force.v_sig, pj[7]->force.v_sig);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
+                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
+                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
+                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
+  balsara.v =
+      vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara,
+              pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara,
+              pi[6]->force.balsara, pi[7]->force.balsara) +
+      vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara,
+              pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara,
+              pj[6]->force.balsara, pj[7]->force.balsara);
+  pialpha.v = vec_set(pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha,
+                      pi[4]->alpha, pi[5]->alpha, pi[6]->alpha, pi[7]->alpha);
+  pjalpha.v = vec_set(pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha,
+                      pj[4]->alpha, pj[5]->alpha, pj[6]->alpha, pj[7]->alpha);
+#elif VEC_SIZE == 4
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
+  piPOrho2.v = vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2,
+                       pi[2]->force.POrho2, pi[3]->force.POrho2);
+  pjPOrho2.v = vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2,
+                       pj[2]->force.POrho2, pj[3]->force.POrho2);
+  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
+  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
+  piu.v = vec_set(pi[0]->u, pi[1]->u, pi[2]->u, pi[3]->u);
+  pju.v = vec_set(pj[0]->u, pj[1]->u, pj[2]->u, pj[3]->u);
+  ci.v =
+      vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c);
+  cj.v =
+      vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c);
+  vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig,
+                     pi[3]->force.v_sig);
+  vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig,
+                     pj[3]->force.v_sig);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
+  balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara,
+                      pi[2]->force.balsara, pi[3]->force.balsara) +
+              vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
+                      pj[2]->force.balsara, pj[3]->force.balsara);
+  pialpha.v = vec_set(pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha);
+  pjalpha.v = vec_set(pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha);
 #else
+#error
+#endif
 
-    for ( int k = 0 ; k < VEC_SIZE ; k++ )
-        runner_iact_nonsym_force( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] );
+  /* Get the radius and inverse radius. */
+  r2.v = vec_load(R2);
+  ri.v = vec_rsqrt(r2.v);
+  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
+  r.v = r2.v * ri.v;
+
+  /* Get the kernel for hi. */
+  hi.v = vec_load(Hi);
+  hi_inv.v = vec_rcp(hi.v);
+  hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f));
+  hi2_inv.v = hi_inv.v * hi_inv.v;
+  xi.v = r.v * hi_inv.v;
+  kernel_deval_vec(&xi, &wi, &wi_dx);
+  wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
+
+  /* Get the kernel for hj. */
+  hj.v = vec_load(Hj);
+  hj_inv.v = vec_rcp(hj.v);
+  hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f));
+  hj2_inv.v = hj_inv.v * hj_inv.v;
+  xj.v = r.v * hj_inv.v;
+  kernel_deval_vec(&xj, &wj, &wj_dx);
+  wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
+
+  /* Compute dv dot r. */
+  dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) +
+           ((vi[2].v - vj[2].v) * dx[2].v);
+  dvdr.v = dvdr.v * ri.v;
+
+  /* Get the time derivative for h. */
+  pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v;
+
+  /* Compute the relative velocity. (This is 0 if the particles move away from
+   * each other and negative otherwise) */
+  omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f));
+
+  /* Compute signal velocity */
+  v_sig.v = ci.v + cj.v - vec_set1(2.0f) * omega_ij.v;
+
+  /* Compute viscosity parameter */
+  alpha_ij.v = vec_set1(-0.5f) * (pialpha.v + pjalpha.v);
+
+  /* Compute viscosity tensor */
+  Pi_ij.v = balsara.v * alpha_ij.v * v_sig.v * omega_ij.v / (pirho.v + pjrho.v);
+  Pi_ij.v *= (wi_dr.v + wj_dr.v);
+
+  /* Thermal conductivity */
+  v_sig_u.v = vec_sqrt(vec_set1(2.f * (const_hydro_gamma - 1.f)) *
+                       vec_fabs(pirho.v * piu.v - pjrho.v * pju.v) /
+                       (pirho.v + pjrho.v));
+  tc.v = vec_set1(const_conductivity_alpha) * v_sig_u.v / (pirho.v + pjrho.v);
+  tc.v *= (wi_dr.v + wj_dr.v);
+
+  /* Get the common factor out. */
+  w.v = ri.v * ((piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v) +
+                vec_set1(0.25f) * Pi_ij.v);
+
+  /* Use the force, Luke! */
+  for (k = 0; k < 3; k++) {
+    f.v = dx[k].v * w.v;
+    pia[k].v = mj.v * f.v;
+  }
+
+  /* Get the time derivative for u. */
+  piu_dt.v =
+      mj.v * dvdr.v * (piPOrho2.v * wi_dr.v + vec_set1(0.125f) * Pi_ij.v);
+
+  /* Add the thermal conductivity */
+  piu_dt.v += mj.v * tc.v * (piu.v - pju.v);
+
+  /* compute the signal velocity (this is always symmetrical). */
+  vi_sig.v = vec_fmax(vi_sig.v, v_sig.v);
+  vj_sig.v = vec_fmax(vj_sig.v, v_sig.v);
+
+  /* Store the forces back on the particles. */
+  for (k = 0; k < VEC_SIZE; k++) {
+    pi[k]->force.u_dt += piu_dt.f[k];
+    pi[k]->force.h_dt -= pih_dt.f[k];
+    pi[k]->force.v_sig = vi_sig.f[k];
+    pj[k]->force.v_sig = vj_sig.f[k];
+    for (j = 0; j < 3; j++) pi[k]->a[j] -= pia[j].f[k];
+  }
 
-#endif
-        
-    }
-    
+#else
 
+  for (int k = 0; k < VEC_SIZE; k++)
+    runner_iact_nonsym_force(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]);
 
+#endif
+}
 
+#endif /* SWIFT_RUNNER_IACT_H */
diff --git a/src/runner_iact_grav.h b/src/runner_iact_grav.h
index da1f552ae073aab3575de03255a3919d7a14cf95..2fd30c1c3854db56564300f0a3e1a13a6dc31251 100644
--- a/src/runner_iact_grav.h
+++ b/src/runner_iact_grav.h
@@ -2,22 +2,26 @@
  * This file is part of SWIFT.
  * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_RUNNER_IACT_GRAV_H
+#define SWIFT_RUNNER_IACT_GRAV_H
 
+/* Includes. */
+#include "const.h"
 #include "kernel.h"
 #include "vector.h"
 
@@ -27,93 +31,93 @@
  *
  */
 
-
 /**
  * @brief Gravity potential
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_grav ( float r2 , float *dx , struct gpart *pi , struct gpart *pj ) {
-
-    float ir, r;
-    float w, acc;
-    float mi = pi->mass, mj = pj->mass;
-    int k;
-    
-    /* Get the absolute distance. */
-    ir = 1.0f / sqrtf( r2 );
-    r = r2 * ir;
-    
-    /* Evaluate the gravity kernel. */
-    kernel_grav_eval( r , &acc );
-    
-    /* Scale the acceleration. */
-    acc *= const_G * ir * ir * ir;
-    
-    /* Aggregate the accellerations. */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        w = acc * dx[k];
-        pi->a[k] -= w * mj;
-        pj->a[k] += w * mi;
-        }
+__attribute__((always_inline)) INLINE static void runner_iact_grav(
+    float r2, float *dx, struct gpart *pi, struct gpart *pj) {
+  /* Symmetric pair update: both pi->a and pj->a are modified. */
+  float ir, r;
+  float w, acc;
+  float mi = pi->mass, mj = pj->mass;
+  int k;
+
+  /* Inverse distance and distance; r2 == 0 is not guarded against. */
+  ir = 1.0f / sqrtf(r2);
+  r = r2 * ir;
+
+  /* Evaluate the gravity kernel at r; it is expected to fill acc. */
+  kernel_grav_eval(r, &acc);
+
+  /* Scale the kernel value by const_G / r^3 (ir cubed). */
+  acc *= const_G * ir * ir * ir;
+
+  /* Aggregate the accelerations: pi loses w * mj, pj gains w * mi. */
+  for (k = 0; k < 3; k++) {
+    w = acc * dx[k];
+    pi->a[k] -= w * mj;
+    pj->a[k] += w * mi;
+  }
+}
 
-    }
-    
-    
 /**
  * @brief Gravity potential (Vectorized version)
  */
-__attribute__ ((always_inline)) INLINE static void runner_iact_vec_grav ( float *R2 , float *Dx , struct gpart **pi , struct gpart **pj ) {
+__attribute__((always_inline)) INLINE static void runner_iact_vec_grav(
+    float *R2, float *Dx, struct gpart **pi, struct gpart **pj) {
 
 #ifdef VECTORIZE
 
-    vector ir, r, r2, dx[3];
-    vector w, acc, ai, aj;
-    vector mi, mj;
-    int j, k;
-    
-    #if VEC_SIZE==8
-        mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass );
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] );
-    #elif VEC_SIZE==4
-        mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass );
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass );
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
-    #endif
-
-        
-    /* Get the radius and inverse radius. */
-    r2.v = vec_load( R2 );
-    ir.v = vec_rsqrt( r2.v );
-    ir.v = ir.v - vec_set1( 0.5f ) * ir.v * ( r2.v * ir.v * ir.v - vec_set1( 1.0f ) );
-    r.v = r2.v * ir.v;
-    
-    /* Evaluate the gravity kernel. */
-    blender_eval_vec( &r , &acc );
-    
-    /* Scale the acceleration. */
-    acc.v *= vec_set1( const_G ) * ir.v * ir.v * ir.v;
-    
-    /* Aggregate the accellerations. */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        w.v = acc.v * dx[k].v;
-        ai.v = w.v * mj.v;
-        aj.v = w.v * mi.v;
-        for ( j = 0 ; j < VEC_SIZE ; j++ ) {
-            pi[j]->a[k] -= ai.f[j];
-            pj[j]->a[k] += aj.f[j];
-            }
-        }
+  vector ir, r, r2, dx[3];
+  vector w, acc, ai, aj;
+  vector mi, mj;
+  int j, k;
+
+#if VEC_SIZE == 8
+  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
+                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
+                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
+                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
+#elif VEC_SIZE == 4
+  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
+#endif
+
+  /* Get the radius and inverse radius. */
+  r2.v = vec_load(R2);
+  ir.v = vec_rsqrt(r2.v);
+  ir.v = ir.v - vec_set1(0.5f) * ir.v * (r2.v * ir.v * ir.v - vec_set1(1.0f));
+  r.v = r2.v * ir.v;
+
+  /* Evaluate the gravity kernel. */
+  blender_eval_vec(&r, &acc);
+
+  /* Scale the acceleration. */
+  acc.v *= vec_set1(const_G) * ir.v * ir.v * ir.v;
+
+  /* Aggregate the accelerations. */
+  for (k = 0; k < 3; k++) {
+    w.v = acc.v * dx[k].v;
+    ai.v = w.v * mj.v;
+    aj.v = w.v * mi.v;
+    for (j = 0; j < VEC_SIZE; j++) {
+      pi[j]->a[k] -= ai.f[j];
+      pj[j]->a[k] += aj.f[j];
+    }
+  }
 
 #else
 
-    for ( int k = 0 ; k < VEC_SIZE ; k++ )
-        runner_iact_grav( R2[k] , &Dx[3*k] , pi[k] , pj[k] );
-        
+  for (int k = 0; k < VEC_SIZE; k++)
+    runner_iact_grav(R2[k], &Dx[3 * k], pi[k], pj[k]);
+
 #endif
-    
-    }
-    
+}
 
+#endif /* SWIFT_RUNNER_IACT_GRAV_H */
diff --git a/src/runner_iact_legacy.h b/src/runner_iact_legacy.h
index aa50cc1fe2c09fa558a21eaf8b9079ffc08b6cbb..3f5df4cd40668862a2e2c0a01c5b28069f184377 100644
--- a/src/runner_iact_legacy.h
+++ b/src/runner_iact_legacy.h
@@ -2,809 +2,900 @@
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_RUNNER_IACT_LEGACY_H
+#define SWIFT_RUNNER_IACT_LEGACY_H
 
+/* Includes. */
+#include "const.h"
 #include "kernel.h"
+#include "part.h"
 #include "vector.h"
 
 /**
  * @file  runner_iact_legacy.h
  * @brief SPH interaction functions following the Gadget-2 version of SPH.
  *
- * The interactions computed here are the ones presented in the Gadget-2 paper and use the same 
- * numerical coefficients as the Gadget-2 code. When used with the Spline-3 kernel, the results
- * should be equivalent to the ones obtained with Gadget-2 up to the rounding errors and interactions
+ * The interactions computed here are the ones
+ * presented in the Gadget-2 paper and use the same
+ * numerical coefficients as the Gadget-2 code. When
+ * used with the Spline-3 kernel, the results should
+ * be equivalent to the ones obtained with Gadget-2
+ * up to the rounding errors and interactions
  * missed by the Gadget-2 tree-code neighbours search.
  *
- * The code uses internal energy instead of entropy as a thermodynamical variable. 
+ * The code uses internal energy instead of entropy as a
+ * thermodynamical variable.
  */
 
-
 /**
  * @brief Density loop
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) {
-
-    float r = sqrtf( r2 ), ri = 1.0f / r;
-    float xi, xj;
-    float h_inv;
-    float wi, wj, wi_dx, wj_dx;
-    float mi, mj;
-    float dvdr;
-    float dv[3], curlvr[3];
-    int k;
-    
-    /* Get the masses. */
-    mi = pi->mass; mj = pj->mass;
-    
-    /* Compute dv dot r */
-    dv[0] = pi->v[0] - pj->v[0];
-    dv[1] = pi->v[1] - pj->v[1];
-    dv[2] = pi->v[2] - pj->v[2];
-    dvdr = dv[0]*dx[0] + dv[1]*dx[1] + dv[2]*dx[2];
-    dvdr *= ri;
-
-    /* Compute dv cross r */
-    curlvr[0] = dv[1]*dx[2] - dv[2]*dx[1];
-    curlvr[1] = dv[2]*dx[0] - dv[0]*dx[2];
-    curlvr[2] = dv[0]*dx[1] - dv[1]*dx[0];
-    for ( k = 0 ; k < 3 ; k++ )
-        curlvr[k] *= ri;
-            
-    /* Compute density of pi. */
-    h_inv = 1.0 / hi;
-    xi = r * h_inv;
-    kernel_deval( xi , &wi , &wi_dx );
-
-    pi->rho += mj * wi;
-    pi->rho_dh -= mj * ( 3.0*wi + xi*wi_dx );
-    pi->density.wcount += wi;
-    pi->density.wcount_dh -= xi * wi_dx;
-
-	pi->density.div_v += mj * dvdr * wi_dx;
-	for ( k = 0 ; k < 3 ; k++ )
-	    pi->density.curl_v[k] += mj * curlvr[k] * wi_dx;
-
-    /* Compute density of pj. */
-    h_inv = 1.0 / hj;
-    xj = r * h_inv;
-    kernel_deval( xj , &wj , &wj_dx );
-
-    pj->rho += mi * wj;
-    pj->rho_dh -= mi * ( 3.0*wj + xj*wj_dx );
-    pj->density.wcount += wj;
-    pj->density.wcount_dh -= xj * wj_dx;
-
-	pj->density.div_v += mi * dvdr * wj_dx;
-	for ( k = 0 ; k < 3 ; k++ )
-	    pj->density.curl_v[k] += mi * curlvr[k] * wj_dx;
-        
-    }
-    
+__attribute__((always_inline)) INLINE static void runner_iact_density(
+    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
+  /* Symmetric update: the density fields of both pi and pj are accumulated. */
+  float r = sqrtf(r2), ri = 1.0f / r;
+  float xi, xj;
+  float h_inv;
+  float wi, wj, wi_dx, wj_dx;
+  float mi, mj;
+  float dvdr;
+  float dv[3], curlvr[3];
+  int k;
+
+  /* Get the masses. */
+  mi = pi->mass;
+  mj = pj->mass;
+
+  /* Compute dv dot r, scaled by the inverse distance ri. */
+  dv[0] = pi->v[0] - pj->v[0];
+  dv[1] = pi->v[1] - pj->v[1];
+  dv[2] = pi->v[2] - pj->v[2];
+  dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
+  dvdr *= ri;
+
+  /* Compute dv cross r, also scaled by ri. */
+  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
+  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
+  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
+  for (k = 0; k < 3; k++) curlvr[k] *= ri;
+
+  /* Compute density of pi: kernel evaluated at xi = r / hi. */
+  h_inv = 1.0 / hi;  /* 1.0 is a double literal: divide in double, then narrow */
+  xi = r * h_inv;
+  kernel_deval(xi, &wi, &wi_dx);  /* expected to fill wi and wi_dx */
+
+  pi->rho += mj * wi;
+  pi->rho_dh -= mj * (3.0 * wi + xi * wi_dx);
+  pi->density.wcount += wi;
+  pi->density.wcount_dh -= xi * wi_dx;
+  /* Velocity divergence and curl terms for pi, weighted by mj * wi_dx. */
+  pi->density.div_v += mj * dvdr * wi_dx;
+  for (k = 0; k < 3; k++) pi->density.curl_v[k] += mj * curlvr[k] * wi_dx;
+
+  /* Compute density of pj: same r, kernel evaluated at xj = r / hj. */
+  h_inv = 1.0 / hj;
+  xj = r * h_inv;
+  kernel_deval(xj, &wj, &wj_dx);
+
+  pj->rho += mi * wj;
+  pj->rho_dh -= mi * (3.0 * wj + xj * wj_dx);
+  pj->density.wcount += wj;
+  pj->density.wcount_dh -= xj * wj_dx;
+  /* pj reuses the same dvdr and curlvr as pi, weighted by mi * wj_dx. */
+  pj->density.div_v += mi * dvdr * wj_dx;
+  for (k = 0; k < 3; k++) pj->density.curl_v[k] += mi * curlvr[k] * wj_dx;
+}
+
 /**
  * @brief Density loop (Vectorized version)
  */
-__attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) {
+__attribute__((always_inline)) INLINE static void runner_iact_vec_density(
+    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
+    struct part **pj) {
 
 #ifdef VECTORIZE
 
-    vector r, r2, ri, xi, xj, hi, hj, hi_inv, hj_inv,  wi, wj, wi_dx, wj_dx;
-    vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
-    vector mi, mj;
-    vector dx[3], dv[3];
-    vector vi[3], vj[3];    
-    vector dvdr, div_vi, div_vj;
-    vector curlvr[3], curl_vi[3], curl_vj[3];
-    int k, j;
-    
-    #if VEC_SIZE==8
-        mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass );
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] );
-    #elif VEC_SIZE==4
-        mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass );
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
-    #endif
-
-    /* Get the radius and inverse radius. */
-    r2.v = vec_load( R2 );
-    ri.v = vec_rsqrt( r2.v );
-    ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) );
-    r.v = r2.v * ri.v;
-    
-    hi.v = vec_load( Hi );
-    hi_inv.v = vec_rcp( hi.v );
-    hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v  - vec_set1( 1.0f ) );
-    xi.v = r.v * hi_inv.v;
-
-    hj.v = vec_load( Hj );
-    hj_inv.v = vec_rcp( hj.v );
-    hj_inv.v = hj_inv.v - hj_inv.v * ( hj_inv.v * hj.v  - vec_set1( 1.0f ) );
-    xj.v = r.v * hj_inv.v;
-    
-    kernel_deval_vec( &xi , &wi , &wi_dx );
-    kernel_deval_vec( &xj , &wj , &wj_dx );
-
-    /* Compute dv. */
-    dv[0].v = vi[0].v - vj[0].v;
-    dv[1].v = vi[1].v - vj[1].v;
-    dv[2].v = vi[2].v - vj[2].v;
-
-    /* Compute dv dot r */
-    dvdr.v = ( dv[0].v * dx[0].v ) + ( dv[1].v * dx[1].v ) + ( dv[2].v * dx[2].v );
-    dvdr.v = dvdr.v * ri.v;
-
-    /* Compute dv cross r */
-    curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
-    curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
-    curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
-    for ( k = 0 ; k < 3 ; k++ )
-        curlvr[k].v *= ri.v;    
-
-    rhoi.v = mj.v * wi.v;
-    rhoi_dh.v = mj.v * ( vec_set1( 3.0f ) * wi.v + xi.v * wi_dx.v );
-    wcounti.v = wi.v;
-    wcounti_dh.v = xi.v * wi_dx.v;
-    div_vi.v = mj.v * dvdr.v * wi_dx.v;
-    for ( k = 0 ; k < 3 ; k++ )
-        curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;
-        
-    rhoj.v = mi.v * wj.v;
-    rhoj_dh.v = mi.v * ( vec_set1( 3.0f ) * wj.v + xj.v * wj_dx.v );
-    wcountj.v = wj.v;
-    wcountj_dh.v = xj.v * wj_dx.v;
-    div_vj.v = mi.v * dvdr.v * wj_dx.v;
-    for ( k = 0 ; k < 3 ; k++ )
-        curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v;
-
-        
-    for ( k = 0 ; k < VEC_SIZE ; k++ ) {
-        pi[k]->rho += rhoi.f[k];
-        pi[k]->rho_dh -= rhoi_dh.f[k];
-        pi[k]->density.wcount += wcounti.f[k];
-        pi[k]->density.wcount_dh -= wcounti_dh.f[k];
-	    pi[k]->density.div_v += div_vi.f[k];
-	    for( j = 0 ; j < 3 ; j++ )
-   	        pi[k]->density.curl_v[j] += curl_vi[j].f[k];
-        pj[k]->rho += rhoj.f[k];
-        pj[k]->rho_dh -= rhoj_dh.f[k];
-        pj[k]->density.wcount += wcountj.f[k];
-        pj[k]->density.wcount_dh -= wcountj_dh.f[k];
-	    pj[k]->density.div_v += div_vj.f[k];
-	    for( j = 0 ; j < 3 ; j++ )
-   	        pj[k]->density.curl_v[j] += curl_vj[j].f[k];
-        }
-        
+  vector r, r2, ri, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx;
+  vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
+  vector mi, mj;
+  vector dx[3], dv[3];
+  vector vi[3], vj[3];
+  vector dvdr, div_vi, div_vj;
+  vector curlvr[3], curl_vi[3], curl_vj[3];
+  int k, j;
+
+#if VEC_SIZE == 8
+  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
+                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
+                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
+                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
+                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
+                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
+#elif VEC_SIZE == 4
+  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
+#endif
+
+  /* Get the radius and inverse radius. */
+  r2.v = vec_load(R2);
+  ri.v = vec_rsqrt(r2.v);
+  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
+  r.v = r2.v * ri.v;
+
+  hi.v = vec_load(Hi);
+  hi_inv.v = vec_rcp(hi.v);
+  hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f));
+  xi.v = r.v * hi_inv.v;
+
+  hj.v = vec_load(Hj);
+  hj_inv.v = vec_rcp(hj.v);
+  hj_inv.v = hj_inv.v - hj_inv.v * (hj_inv.v * hj.v - vec_set1(1.0f));
+  xj.v = r.v * hj_inv.v;
+
+  kernel_deval_vec(&xi, &wi, &wi_dx);
+  kernel_deval_vec(&xj, &wj, &wj_dx);
+
+  /* Compute dv. */
+  dv[0].v = vi[0].v - vj[0].v;
+  dv[1].v = vi[1].v - vj[1].v;
+  dv[2].v = vi[2].v - vj[2].v;
+
+  /* Compute dv dot r */
+  dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v);
+  dvdr.v = dvdr.v * ri.v;
+
+  /* Compute dv cross r */
+  curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
+  curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
+  curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
+  for (k = 0; k < 3; k++) curlvr[k].v *= ri.v;
+
+  rhoi.v = mj.v * wi.v;
+  rhoi_dh.v = mj.v * (vec_set1(3.0f) * wi.v + xi.v * wi_dx.v);
+  wcounti.v = wi.v;
+  wcounti_dh.v = xi.v * wi_dx.v;
+  div_vi.v = mj.v * dvdr.v * wi_dx.v;
+  for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;
+
+  rhoj.v = mi.v * wj.v;
+  rhoj_dh.v = mi.v * (vec_set1(3.0f) * wj.v + xj.v * wj_dx.v);
+  wcountj.v = wj.v;
+  wcountj_dh.v = xj.v * wj_dx.v;
+  div_vj.v = mi.v * dvdr.v * wj_dx.v;
+  for (k = 0; k < 3; k++) curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v;
+
+  for (k = 0; k < VEC_SIZE; k++) {
+    pi[k]->rho += rhoi.f[k];
+    pi[k]->rho_dh -= rhoi_dh.f[k];
+    pi[k]->density.wcount += wcounti.f[k];
+    pi[k]->density.wcount_dh -= wcounti_dh.f[k];
+    pi[k]->density.div_v += div_vi.f[k];
+    for (j = 0; j < 3; j++) pi[k]->density.curl_v[j] += curl_vi[j].f[k];
+    pj[k]->rho += rhoj.f[k];
+    pj[k]->rho_dh -= rhoj_dh.f[k];
+    pj[k]->density.wcount += wcountj.f[k];
+    pj[k]->density.wcount_dh -= wcountj_dh.f[k];
+    pj[k]->density.div_v += div_vj.f[k];
+    for (j = 0; j < 3; j++) pj[k]->density.curl_v[j] += curl_vj[j].f[k];
+  }
+
 #else
 
-    for ( int k = 0 ; k < VEC_SIZE ; k++ )
-        runner_iact_density( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] );
-        
-#endif
-    
-    }
-    
+  for (int k = 0; k < VEC_SIZE; k++)
+    runner_iact_density(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]);
 
+#endif
+}
 
 /**
  * @brief Density loop (non-symmetric version)
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_density ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) {
-
-    float r, ri;
-    float xi;
-    float h_inv;
-    float wi, wi_dx;
-    float mj;
-    float dvdr;
-    float dv[3], curlvr[3];
-    int k;
-
-    /* Get the masses. */
-    mj = pj->mass;
-
-    /* Get r and r inverse. */
-    r = sqrtf( r2 );
-    ri = 1.0f / r;
-
-    /* Compute dv dot r */
-    dv[0] = pi->v[0] - pj->v[0];
-    dv[1] = pi->v[1] - pj->v[1];
-    dv[2] = pi->v[2] - pj->v[2];
-    dvdr = dv[0]*dx[0] + dv[1]*dx[1] + dv[2]*dx[2];
-    dvdr *= ri;
-
-    /* Compute dv cross r */
-    curlvr[0] = dv[1]*dx[2] - dv[2]*dx[1];
-    curlvr[1] = dv[2]*dx[0] - dv[0]*dx[2];
-    curlvr[2] = dv[0]*dx[1] - dv[1]*dx[0];
-    for ( k = 0 ; k < 3 ; k++ )
-        curlvr[k] *= ri;
-
-    h_inv = 1.0 / hi;
-    xi = r * h_inv;
-    kernel_deval( xi , &wi , &wi_dx );
-
-    pi->rho += mj * wi;
-    pi->rho_dh -= mj * ( 3.0*wi + xi*wi_dx );
-    pi->density.wcount += wi;
-    pi->density.wcount_dh -= xi * wi_dx;
-
-	pi->density.div_v += mj * dvdr * wi_dx;
-	for ( k = 0 ; k < 3 ; k++ )
-	    pi->density.curl_v[k] += mj * curlvr[k] * wi_dx;
-            
-    }
-    
+__attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
+    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
+  /* One-sided update: only pi is written; pj is only read, hj is unused. */
+  float r, ri;
+  float xi;
+  float h_inv;
+  float wi, wi_dx;
+  float mj;
+  float dvdr;
+  float dv[3], curlvr[3];
+  int k;
+
+  /* Get the mass of the neighbour pj. */
+  mj = pj->mass;
+
+  /* Get r and r inverse; r2 == 0 is not guarded against. */
+  r = sqrtf(r2);
+  ri = 1.0f / r;
+
+  /* Compute dv dot r, scaled by the inverse distance ri. */
+  dv[0] = pi->v[0] - pj->v[0];
+  dv[1] = pi->v[1] - pj->v[1];
+  dv[2] = pi->v[2] - pj->v[2];
+  dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
+  dvdr *= ri;
+
+  /* Compute dv cross r, also scaled by ri. */
+  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
+  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
+  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
+  for (k = 0; k < 3; k++) curlvr[k] *= ri;
+  /* Kernel evaluated at xi = r / hi; 1.0 is a double literal (see below). */
+  h_inv = 1.0 / hi;  /* divided in double, then narrowed to float */
+  xi = r * h_inv;
+  kernel_deval(xi, &wi, &wi_dx);  /* expected to fill wi and wi_dx */
+
+  pi->rho += mj * wi;
+  pi->rho_dh -= mj * (3.0 * wi + xi * wi_dx);
+  pi->density.wcount += wi;
+  pi->density.wcount_dh -= xi * wi_dx;
+  /* Velocity divergence and curl terms for pi, weighted by mj * wi_dx. */
+  pi->density.div_v += mj * dvdr * wi_dx;
+  for (k = 0; k < 3; k++) pi->density.curl_v[k] += mj * curlvr[k] * wi_dx;
+}
+
 /**
  * @brief Density loop (non-symmetric vectorized version)
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_density ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) {
+__attribute__((always_inline))
+    INLINE static void runner_iact_nonsym_vec_density(float *R2, float *Dx,
+                                                      float *Hi, float *Hj,
+                                                      struct part **pi,
+                                                      struct part **pj) {
 
 #ifdef VECTORIZE
 
-    vector r, r2, ri, xi, hi, hi_inv, wi, wi_dx;
-    vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi;
-    vector mj;
-    vector dx[3], dv[3];
-    vector vi[3], vj[3];
-    vector dvdr;
-    vector curlvr[3], curl_vi[3];
-    int k, j;
-    
-    #if VEC_SIZE==8
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] );
-    #elif VEC_SIZE==4
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
-    #endif
-    
-    /* Get the radius and inverse radius. */
-    r2.v = vec_load( R2 );
-    ri.v = vec_rsqrt( r2.v );
-    ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) );
-    r.v = r2.v * ri.v;
-    
-    hi.v = vec_load( Hi );
-    hi_inv.v = vec_rcp( hi.v );
-    hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v  - vec_set1( 1.0f ) );
-    xi.v = r.v * hi_inv.v;
-
-    kernel_deval_vec( &xi , &wi , &wi_dx );
-    
-    /* Compute dv. */
-    dv[0].v = vi[0].v - vj[0].v;
-    dv[1].v = vi[1].v - vj[1].v;
-    dv[2].v = vi[2].v - vj[2].v;
-
-    /* Compute dv dot r */
-    dvdr.v = ( dv[0].v * dx[0].v ) + ( dv[1].v * dx[1].v ) + ( dv[2].v * dx[2].v );
-    dvdr.v = dvdr.v * ri.v;
-
-    /* Compute dv cross r */
-    curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
-    curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
-    curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
-    for ( k = 0 ; k < 3 ; k++ )
-        curlvr[k].v *= ri.v;    
-
-    rhoi.v = mj.v * wi.v;
-    rhoi_dh.v = mj.v * ( vec_set1( 3.0f ) * wi.v + xi.v * wi_dx.v );
-    wcounti.v = wi.v;
-    wcounti_dh.v = xi.v * wi_dx.v;
-    div_vi.v = mj.v * dvdr.v * wi_dx.v;
-    for ( k = 0 ; k < 3 ; k++ )
-        curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;
-        
-    for ( k = 0 ; k < VEC_SIZE ; k++ ) {
-        pi[k]->rho += rhoi.f[k];
-        pi[k]->rho_dh -= rhoi_dh.f[k];
-        pi[k]->density.wcount += wcounti.f[k];
-        pi[k]->density.wcount_dh -= wcounti_dh.f[k];
-	    pi[k]->density.div_v += div_vi.f[k];
-	    for( j = 0 ; j < 3 ; j++ )
-   	        pi[k]->density.curl_v[j] += curl_vi[j].f[k];
-        }
-        
+  vector r, r2, ri, xi, hi, hi_inv, wi, wi_dx;
+  vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi;
+  vector mj;
+  vector dx[3], dv[3];
+  vector vi[3], vj[3];
+  vector dvdr;
+  vector curlvr[3], curl_vi[3];
+  int k, j;
+
+#if VEC_SIZE == 8
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
+                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
+                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
+                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
+                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
+#elif VEC_SIZE == 4
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
+#endif
+
+  /* Get the radius and inverse radius (rsqrt + one Newton-Raphson step). */
+  r2.v = vec_load(R2);
+  ri.v = vec_rsqrt(r2.v);
+  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
+  r.v = r2.v * ri.v;
+
+  hi.v = vec_load(Hi);
+  hi_inv.v = vec_rcp(hi.v);
+  hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f));
+  xi.v = r.v * hi_inv.v;
+
+  kernel_deval_vec(&xi, &wi, &wi_dx);
+
+  /* Compute dv. */
+  dv[0].v = vi[0].v - vj[0].v;
+  dv[1].v = vi[1].v - vj[1].v;
+  dv[2].v = vi[2].v - vj[2].v;
+
+  /* Compute dv dot r */
+  dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v);
+  dvdr.v = dvdr.v * ri.v;
+
+  /* Compute dv cross r */
+  curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
+  curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
+  curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
+  for (k = 0; k < 3; k++) curlvr[k].v *= ri.v;
+
+  rhoi.v = mj.v * wi.v;
+  rhoi_dh.v = mj.v * (vec_set1(3.0f) * wi.v + xi.v * wi_dx.v);
+  wcounti.v = wi.v;
+  wcounti_dh.v = xi.v * wi_dx.v;
+  div_vi.v = mj.v * dvdr.v * wi_dx.v;
+  for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;
+
+  for (k = 0; k < VEC_SIZE; k++) {
+    pi[k]->rho += rhoi.f[k];
+    pi[k]->rho_dh -= rhoi_dh.f[k];
+    pi[k]->density.wcount += wcounti.f[k];
+    pi[k]->density.wcount_dh -= wcounti_dh.f[k];
+    pi[k]->density.div_v += div_vi.f[k];
+    for (j = 0; j < 3; j++) pi[k]->density.curl_v[j] += curl_vi[j].f[k];
+  }
+
 #else
 
-    for ( int k = 0 ; k < VEC_SIZE ; k++ )
-        runner_iact_nonsym_density( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] );
+  for (int k = 0; k < VEC_SIZE; k++)
+    runner_iact_nonsym_density(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]);
 
 #endif
-        
-    }
-    
+}
 
 /**
  * @brief Force loop
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) {
-
-    float r = sqrtf( r2 ), ri = 1.0f / r;
-    float xi, xj;
-    float hi_inv, hi2_inv;
-    float hj_inv, hj2_inv;
-    float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr;
-    float mi, mj, POrho2i, POrho2j, rhoi, rhoj;
-    float v_sig, omega_ij, Pi_ij;
-    // float dt_max;
-    float f;
-    int k;
-    
-    /* Get some values in local variables. */
-    mi = pi->mass; mj = pj->mass;
-    rhoi = pi->rho; rhoj = pj->rho;
-    POrho2i = pi->force.POrho2;
-    POrho2j = pj->force.POrho2;
-    
-    /* Get the kernel for hi. */
-    hi_inv = 1.0f / hi;
-    hi2_inv = hi_inv * hi_inv;
-    xi = r * hi_inv;
-    kernel_deval( xi , &wi , &wi_dx );
-    wi_dr = hi2_inv * hi2_inv * wi_dx;
-        
-    /* Get the kernel for hj. */
-    hj_inv = 1.0f / hj;
-    hj2_inv = hj_inv * hj_inv;
-    xj = r * hj_inv;
-    kernel_deval( xj , &wj , &wj_dx );
-    wj_dr = hj2_inv * hj2_inv * wj_dx;
-                
-    /* Compute dv dot r. */
-    dvdr = ( pi->v[0] - pj->v[0] ) * dx[0] + ( pi->v[1] - pj->v[1] ) * dx[1] + ( pi->v[2] - pj->v[2] ) * dx[2];
-    dvdr *= ri;
-
-    /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */
-    omega_ij = fminf( dvdr , 0.f );
-    
-    /* Compute signal velocity */
-    v_sig = pi->force.c + pj->force.c - 3.0f*omega_ij;
-
-    /* Compute viscosity tensor */
-    Pi_ij = -const_viscosity_alpha * v_sig * omega_ij / ( rhoi + rhoj );
-
-    /* Apply balsara switch */
-    Pi_ij *= ( pi->force.balsara + pj->force.balsara );
-
-    /* Get the common factor out. */
-    w = ri * ( ( POrho2i * wi_dr + POrho2j * wj_dr ) + 0.25f * Pi_ij * ( wi_dr + wj_dr ) );
-
-    /* Use the force, Luke! */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        f = dx[k] * w;
-        pi->a[k] -= mj * f;
-        pj->a[k] += mi * f;
-        }
-                
-    /* Get the time derivative for u. */
-    pi->force.u_dt += mj * dvdr * ( POrho2i * wi_dr + 0.125f * Pi_ij * ( wi_dr + wj_dr ) );
-    pj->force.u_dt += mi * dvdr * ( POrho2j * wj_dr + 0.125f * Pi_ij * ( wi_dr + wj_dr ) );
-    
-    /* Get the time derivative for h. */
-    pi->force.h_dt -= mj * dvdr / rhoj * wi_dr;
-    pj->force.h_dt -= mi * dvdr / rhoi * wj_dr;
-    
-    /* Update the signal velocity. */
-    pi->force.v_sig = fmaxf( pi->force.v_sig , v_sig );
-    pj->force.v_sig = fmaxf( pj->force.v_sig , v_sig );
-    
-    }
-    
+__attribute__((always_inline)) INLINE static void runner_iact_force(
+    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
+
+  float r = sqrtf(r2), ri = 1.0f / r;
+  float xi, xj;
+  float hi_inv, hi2_inv;
+  float hj_inv, hj2_inv;
+  float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr;
+  float mi, mj, POrho2i, POrho2j, rhoi, rhoj;
+  float v_sig, omega_ij, Pi_ij;
+  // float dt_max;
+  float f;
+  int k;
+
+  /* Get some values in local variables. */
+  mi = pi->mass;
+  mj = pj->mass;
+  rhoi = pi->rho;
+  rhoj = pj->rho;
+  POrho2i = pi->force.POrho2;
+  POrho2j = pj->force.POrho2;
+
+  /* Get the kernel for hi. */
+  hi_inv = 1.0f / hi;
+  hi2_inv = hi_inv * hi_inv;
+  xi = r * hi_inv;
+  kernel_deval(xi, &wi, &wi_dx);
+  wi_dr = hi2_inv * hi2_inv * wi_dx;
+
+  /* Get the kernel for hj. */
+  hj_inv = 1.0f / hj;
+  hj2_inv = hj_inv * hj_inv;
+  xj = r * hj_inv;
+  kernel_deval(xj, &wj, &wj_dx);
+  wj_dr = hj2_inv * hj2_inv * wj_dx;
+
+  /* Compute dv dot r. */
+  dvdr = (pi->v[0] - pj->v[0]) * dx[0] + (pi->v[1] - pj->v[1]) * dx[1] +
+         (pi->v[2] - pj->v[2]) * dx[2];
+  dvdr *= ri;
+
+  /* Clamp dvdr from above at zero: omega_ij is 0 if the particles move away
+   * from each other and negative (equal to dvdr) otherwise. */
+  omega_ij = fminf(dvdr, 0.f);
+
+  /* Compute signal velocity */
+  v_sig = pi->force.c + pj->force.c - 3.0f * omega_ij;
+
+  /* Compute viscosity tensor */
+  Pi_ij = -const_viscosity_alpha * v_sig * omega_ij / (rhoi + rhoj);
+
+  /* Apply balsara switch */
+  Pi_ij *= (pi->force.balsara + pj->force.balsara);
+
+  /* Get the common factor out. */
+  w = ri *
+      ((POrho2i * wi_dr + POrho2j * wj_dr) + 0.25f * Pi_ij * (wi_dr + wj_dr));
+
+  /* Use the force, Luke! */
+  for (k = 0; k < 3; k++) {
+    f = dx[k] * w;
+    pi->a[k] -= mj * f;
+    pj->a[k] += mi * f;
+  }
+
+  /* Get the time derivative for u. */
+  pi->force.u_dt +=
+      mj * dvdr * (POrho2i * wi_dr + 0.125f * Pi_ij * (wi_dr + wj_dr));
+  pj->force.u_dt +=
+      mi * dvdr * (POrho2j * wj_dr + 0.125f * Pi_ij * (wi_dr + wj_dr));
+
+  /* Get the time derivative for h. */
+  pi->force.h_dt -= mj * dvdr / rhoj * wi_dr;
+  pj->force.h_dt -= mi * dvdr / rhoi * wj_dr;
+
+  /* Update the signal velocity. */
+  pi->force.v_sig = fmaxf(pi->force.v_sig, v_sig);
+  pj->force.v_sig = fmaxf(pj->force.v_sig, v_sig);
+}
 
 /**
  * @brief Force loop (Vectorized version)
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) {
+__attribute__((always_inline)) INLINE static void runner_iact_vec_force(
+    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
+    struct part **pj) {
 
 #ifdef VECTORIZE
 
-    vector r, r2, ri;
-    vector xi, xj;
-    vector hi, hj, hi_inv, hj_inv;
-    vector hi2_inv, hj2_inv;
-    vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
-    vector w;
-    vector piPOrho2, pjPOrho2, pirho, pjrho;
-    vector mi, mj;
-    vector f;
-    vector dx[3];
-    vector vi[3], vj[3];
-    vector pia[3], pja[3];
-    vector piu_dt, pju_dt;
-    vector pih_dt, pjh_dt;
-    vector ci, cj, v_sig, vi_sig, vj_sig;
-    vector omega_ij, Pi_ij, balsara;
-    int j, k;
-
-    /* Load stuff. */
-    #if VEC_SIZE==8
-        mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass );
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
-        piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 , pi[4]->force.POrho2 , pi[5]->force.POrho2 , pi[6]->force.POrho2 , pi[7]->force.POrho2 );
-        pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 , pj[4]->force.POrho2 , pj[5]->force.POrho2 , pj[6]->force.POrho2 , pj[7]->force.POrho2 );
-        pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho , pi[4]->rho , pi[5]->rho , pi[6]->rho , pi[7]->rho );
-        pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho , pj[4]->rho , pj[5]->rho , pj[6]->rho , pj[7]->rho );
-        ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c , pi[4]->force.c , pi[5]->force.c , pi[6]->force.c , pi[7]->force.c );
-        cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c , pj[4]->force.c , pj[5]->force.c , pj[6]->force.c , pj[7]->force.c );
-        vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig , pi[4]->force.v_sig , pi[5]->force.v_sig , pi[6]->force.v_sig , pi[7]->force.v_sig );
-        vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig , pj[4]->force.v_sig , pj[5]->force.v_sig , pj[6]->force.v_sig , pj[7]->force.v_sig );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] );
-        balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara , pi[4]->force.balsara , pi[5]->force.balsara , pi[6]->force.balsara , pi[7]->force.balsara ) +
-                    vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara , pj[4]->force.balsara , pj[5]->force.balsara , pj[6]->force.balsara , pj[7]->force.balsara );
-    #elif VEC_SIZE==4
-        mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass );
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass );
-        piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 );
-        pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 );
-        pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho );
-        pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho );
-        ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c );
-        cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c );
-        vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig );
-        vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
-        balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara ) +
-                    vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara );
-    #else
-        #error
-    #endif
-
-    /* Get the radius and inverse radius. */
-    r2.v = vec_load( R2 );
-    ri.v = vec_rsqrt( r2.v );
-    ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) );
-    r.v = r2.v * ri.v;
-    
-    /* Get the kernel for hi. */
-    hi.v = vec_load( Hi );
-    hi_inv.v = vec_rcp( hi.v );
-    hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) );
-    hi2_inv.v = hi_inv.v * hi_inv.v;
-    xi.v = r.v * hi_inv.v;
-    kernel_deval_vec( &xi , &wi , &wi_dx );
-    wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
-        
-    /* Get the kernel for hj. */
-    hj.v = vec_load( Hj );
-    hj_inv.v = vec_rcp( hj.v );
-    hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) );
-    hj2_inv.v = hj_inv.v * hj_inv.v;
-    xj.v = r.v * hj_inv.v;
-    kernel_deval_vec( &xj , &wj , &wj_dx );
-    wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
-        
-    /* Compute dv dot r. */
-    dvdr.v = ( (vi[0].v - vj[0].v) * dx[0].v ) + ( (vi[1].v - vj[1].v) * dx[1].v ) + ( (vi[2].v - vj[2].v) * dx[2].v );
-    dvdr.v = dvdr.v * ri.v;
-        
-    /* Get the time derivative for h. */
-    pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v;
-    pjh_dt.v = mi.v / pirho.v * dvdr.v * wj_dr.v;
-    
-    /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */
-    omega_ij.v = vec_fmin( dvdr.v , vec_set1( 0.0f ) );
-    
-    /* Compute signal velocity */
-    v_sig.v = ci.v + cj.v - vec_set1( 3.0f )*omega_ij.v;
-
-    /* Compute viscosity tensor */
-    Pi_ij.v = -balsara.v * vec_set1( const_viscosity_alpha ) * v_sig.v * omega_ij.v / (pirho.v + pjrho.v);
-    Pi_ij.v *= ( wi_dr.v + wj_dr.v );
-
-    /* Get the common factor out. */
-    w.v = ri.v * ( ( piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v ) + vec_set1( 0.25f ) * Pi_ij.v );
-
-    /* Use the force, Luke! */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        f.v = dx[k].v * w.v;
-        pia[k].v = mj.v * f.v;
-        pja[k].v = mi.v * f.v;
-        }
-        
-    /* Get the time derivative for u. */
-    piu_dt.v = mj.v * dvdr.v * ( piPOrho2.v * wi_dr.v + vec_set1( 0.125f ) * Pi_ij.v );
-    pju_dt.v = mi.v * dvdr.v * ( pjPOrho2.v * wj_dr.v + vec_set1( 0.125f ) * Pi_ij.v );
-    
-    /* compute the signal velocity (this is always symmetrical). */
-    vi_sig.v = vec_fmax( vi_sig.v , v_sig.v );
-    vj_sig.v = vec_fmax( vj_sig.v , v_sig.v );
-
-    /* Store the forces back on the particles. */
-    for ( k = 0 ; k < VEC_SIZE ; k++ ) {
-        pi[k]->force.u_dt += piu_dt.f[k];
-        pj[k]->force.u_dt += pju_dt.f[k];
-        pi[k]->force.h_dt -= pih_dt.f[k];
-        pj[k]->force.h_dt -= pjh_dt.f[k];
-        pi[k]->force.v_sig = vi_sig.f[k];
-        pj[k]->force.v_sig = vj_sig.f[k];
-        for ( j = 0 ; j < 3 ; j++ ) {
-            pi[k]->a[j] -= pia[j].f[k];
-            pj[k]->a[j] += pja[j].f[k];
-            }
-        }
-        
+  vector r, r2, ri;
+  vector xi, xj;
+  vector hi, hj, hi_inv, hj_inv;
+  vector hi2_inv, hj2_inv;
+  vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
+  vector w;
+  vector piPOrho2, pjPOrho2, pirho, pjrho;
+  vector mi, mj;
+  vector f;
+  vector dx[3];
+  vector vi[3], vj[3];
+  vector pia[3], pja[3];
+  vector piu_dt, pju_dt;
+  vector pih_dt, pjh_dt;
+  vector ci, cj, v_sig, vi_sig, vj_sig;
+  vector omega_ij, Pi_ij, balsara;
+  int j, k;
+
+/* Gather the per-particle quantities into vectors. */
+#if VEC_SIZE == 8
+  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
+                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
+                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
+  piPOrho2.v =
+      vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, pi[2]->force.POrho2,
+              pi[3]->force.POrho2, pi[4]->force.POrho2, pi[5]->force.POrho2,
+              pi[6]->force.POrho2, pi[7]->force.POrho2);
+  pjPOrho2.v =
+      vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, pj[2]->force.POrho2,
+              pj[3]->force.POrho2, pj[4]->force.POrho2, pj[5]->force.POrho2,
+              pj[6]->force.POrho2, pj[7]->force.POrho2);
+  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
+                    pi[5]->rho, pi[6]->rho, pi[7]->rho);
+  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
+                    pj[5]->rho, pj[6]->rho, pj[7]->rho);
+  ci.v =
+      vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c,
+              pi[4]->force.c, pi[5]->force.c, pi[6]->force.c, pi[7]->force.c);
+  cj.v =
+      vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c,
+              pj[4]->force.c, pj[5]->force.c, pj[6]->force.c, pj[7]->force.c);
+  vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig,
+                     pi[3]->force.v_sig, pi[4]->force.v_sig, pi[5]->force.v_sig,
+                     pi[6]->force.v_sig, pi[7]->force.v_sig);
+  vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig,
+                     pj[3]->force.v_sig, pj[4]->force.v_sig, pj[5]->force.v_sig,
+                     pj[6]->force.v_sig, pj[7]->force.v_sig);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
+                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
+                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
+                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
+  balsara.v =
+      vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara,
+              pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara,
+              pi[6]->force.balsara, pi[7]->force.balsara) +
+      vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara,
+              pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara,
+              pj[6]->force.balsara, pj[7]->force.balsara);
+#elif VEC_SIZE == 4
+  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
+  piPOrho2.v = vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2,
+                       pi[2]->force.POrho2, pi[3]->force.POrho2);
+  pjPOrho2.v = vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2,
+                       pj[2]->force.POrho2, pj[3]->force.POrho2);
+  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
+  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
+  ci.v =
+      vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c);
+  cj.v =
+      vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c);
+  vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig,
+                     pi[3]->force.v_sig);
+  vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig,
+                     pj[3]->force.v_sig);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
+  balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara,
+                      pi[2]->force.balsara, pi[3]->force.balsara) +
+              vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
+                      pj[2]->force.balsara, pj[3]->force.balsara);
+#else
+#error
+#endif
+
+  /* Get the radius and inverse radius (rsqrt + one Newton-Raphson step). */
+  r2.v = vec_load(R2);
+  ri.v = vec_rsqrt(r2.v);
+  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
+  r.v = r2.v * ri.v;
+
+  /* Get the kernel for hi. */
+  hi.v = vec_load(Hi);
+  hi_inv.v = vec_rcp(hi.v);
+  hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f));
+  hi2_inv.v = hi_inv.v * hi_inv.v;
+  xi.v = r.v * hi_inv.v;
+  kernel_deval_vec(&xi, &wi, &wi_dx);
+  wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
+
+  /* Get the kernel for hj. */
+  hj.v = vec_load(Hj);
+  hj_inv.v = vec_rcp(hj.v);
+  hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f));
+  hj2_inv.v = hj_inv.v * hj_inv.v;
+  xj.v = r.v * hj_inv.v;
+  kernel_deval_vec(&xj, &wj, &wj_dx);
+  wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
+
+  /* Compute dv dot r. */
+  dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) +
+           ((vi[2].v - vj[2].v) * dx[2].v);
+  dvdr.v = dvdr.v * ri.v;
+
+  /* Get the time derivative for h. */
+  pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v;
+  pjh_dt.v = mi.v / pirho.v * dvdr.v * wj_dr.v;
+
+  /* Clamp dvdr from above at zero: omega_ij is 0 if the particles move away
+   * from each other and negative (equal to dvdr) otherwise. */
+  omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f));
+
+  /* Compute signal velocity */
+  v_sig.v = ci.v + cj.v - vec_set1(3.0f) * omega_ij.v;
+
+  /* Compute viscosity tensor */
+  Pi_ij.v = -balsara.v * vec_set1(const_viscosity_alpha) * v_sig.v *
+            omega_ij.v / (pirho.v + pjrho.v);
+  Pi_ij.v *= (wi_dr.v + wj_dr.v);
+
+  /* Get the common factor out. */
+  w.v = ri.v * ((piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v) +
+                vec_set1(0.25f) * Pi_ij.v);
+
+  /* Use the force, Luke! */
+  for (k = 0; k < 3; k++) {
+    f.v = dx[k].v * w.v;
+    pia[k].v = mj.v * f.v;
+    pja[k].v = mi.v * f.v;
+  }
+
+  /* Get the time derivative for u. */
+  piu_dt.v =
+      mj.v * dvdr.v * (piPOrho2.v * wi_dr.v + vec_set1(0.125f) * Pi_ij.v);
+  pju_dt.v =
+      mi.v * dvdr.v * (pjPOrho2.v * wj_dr.v + vec_set1(0.125f) * Pi_ij.v);
+
+  /* Update the signal velocities (v_sig is the same for both particles). */
+  vi_sig.v = vec_fmax(vi_sig.v, v_sig.v);
+  vj_sig.v = vec_fmax(vj_sig.v, v_sig.v);
+
+  /* Store the forces back on the particles. */
+  for (k = 0; k < VEC_SIZE; k++) {
+    pi[k]->force.u_dt += piu_dt.f[k];
+    pj[k]->force.u_dt += pju_dt.f[k];
+    pi[k]->force.h_dt -= pih_dt.f[k];
+    pj[k]->force.h_dt -= pjh_dt.f[k];
+    pi[k]->force.v_sig = vi_sig.f[k];
+    pj[k]->force.v_sig = vj_sig.f[k];
+    for (j = 0; j < 3; j++) {
+      pi[k]->a[j] -= pia[j].f[k];
+      pj[k]->a[j] += pja[j].f[k];
+    }
+  }
+
 #else
 
-    for ( int k = 0 ; k < VEC_SIZE ; k++ )
-        runner_iact_force( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] );
+  for (int k = 0; k < VEC_SIZE; k++)
+    runner_iact_force(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]);
 
 #endif
-        
-    }
-    
+}
 
 /**
  * @brief Force loop (non-symmetric version)
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) {
-
-    float r = sqrtf( r2 ), ri = 1.0f / r;
-    float xi, xj;
-    float hi_inv, hi2_inv;
-    float hj_inv, hj2_inv;
-    float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr;
-    float /*mi,*/ mj, POrho2i, POrho2j, rhoi, rhoj;
-    float v_sig, omega_ij, Pi_ij;
-    // float dt_max;
-    float f;
-    int k;
-    
-    /* Get some values in local variables. */
-    // mi = pi->mass;
-    mj = pj->mass;
-    rhoi = pi->rho; rhoj = pj->rho;
-    POrho2i = pi->force.POrho2;
-    POrho2j = pj->force.POrho2;
-    
-    /* Get the kernel for hi. */
-    hi_inv = 1.0f / hi;
-    hi2_inv = hi_inv * hi_inv;
-    xi = r * hi_inv;
-    kernel_deval( xi , &wi , &wi_dx );
-    wi_dr = hi2_inv * hi2_inv * wi_dx;
-        
-    /* Get the kernel for hj. */
-    hj_inv = 1.0f / hj;
-    hj2_inv = hj_inv * hj_inv;
-    xj = r * hj_inv;
-    kernel_deval( xj , &wj , &wj_dx );
-    wj_dr = hj2_inv * hj2_inv * wj_dx;
-                
-    /* Compute dv dot r. */
-    dvdr = ( pi->v[0] - pj->v[0] ) * dx[0] + ( pi->v[1] - pj->v[1] ) * dx[1] + ( pi->v[2] - pj->v[2] ) * dx[2];
-    dvdr *= ri;
-
-    /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */
-    omega_ij = fminf( dvdr , 0.f );
-    
-    /* Compute signal velocity */
-    v_sig = pi->force.c + pj->force.c - 3.0f*omega_ij;
-
-    /* Compute viscosity tensor */
-    Pi_ij = -const_viscosity_alpha * v_sig * omega_ij / ( rhoi + rhoj );
-
-    /* Apply balsara switch */
-    Pi_ij *= ( pi->force.balsara + pj->force.balsara );
-
-    /* Get the common factor out. */
-    w = ri * ( ( POrho2i * wi_dr + POrho2j * wj_dr ) + 0.25f * Pi_ij * ( wi_dr + wj_dr ) );
-
-    /* Use the force, Luke! */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        f = dx[k] * w;
-        pi->a[k] -= mj * f;
-        }
-                
-    /* Get the time derivative for u. */
-    pi->force.u_dt += mj * dvdr * ( POrho2i * wi_dr + 0.125f * Pi_ij * ( wi_dr + wj_dr ) );
-    
-    /* Get the time derivative for h. */
-    pi->force.h_dt -= mj * dvdr / rhoj * wi_dr;
-    
-    /* Update the signal velocity. */
-    pi->force.v_sig = fmaxf( pi->force.v_sig , v_sig );
-    pj->force.v_sig = fmaxf( pj->force.v_sig , v_sig );
-    
-    }
-    
+__attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
+    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
+
+  float r = sqrtf(r2), ri = 1.0f / r;
+  float xi, xj;
+  float hi_inv, hi2_inv;
+  float hj_inv, hj2_inv;
+  float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr;
+  float /*mi,*/ mj, POrho2i, POrho2j, rhoi, rhoj;
+  float v_sig, omega_ij, Pi_ij;
+  // float dt_max;
+  float f;
+  int k;
+
+  /* Get some values in local variables. */
+  // mi = pi->mass;
+  mj = pj->mass;
+  rhoi = pi->rho;
+  rhoj = pj->rho;
+  POrho2i = pi->force.POrho2;
+  POrho2j = pj->force.POrho2;
+
+  /* Get the kernel for hi. */
+  hi_inv = 1.0f / hi;
+  hi2_inv = hi_inv * hi_inv;
+  xi = r * hi_inv;
+  kernel_deval(xi, &wi, &wi_dx);
+  wi_dr = hi2_inv * hi2_inv * wi_dx;
+
+  /* Get the kernel for hj. */
+  hj_inv = 1.0f / hj;
+  hj2_inv = hj_inv * hj_inv;
+  xj = r * hj_inv;
+  kernel_deval(xj, &wj, &wj_dx);
+  wj_dr = hj2_inv * hj2_inv * wj_dx;
+
+  /* Compute dv dot r. */
+  dvdr = (pi->v[0] - pj->v[0]) * dx[0] + (pi->v[1] - pj->v[1]) * dx[1] +
+         (pi->v[2] - pj->v[2]) * dx[2];
+  dvdr *= ri;
+
+  /* Clamp dvdr from above at zero: omega_ij is 0 if the particles move away
+   * from each other and negative (equal to dvdr) otherwise. */
+  omega_ij = fminf(dvdr, 0.f);
+
+  /* Compute signal velocity */
+  v_sig = pi->force.c + pj->force.c - 3.0f * omega_ij;
+
+  /* Compute viscosity tensor */
+  Pi_ij = -const_viscosity_alpha * v_sig * omega_ij / (rhoi + rhoj);
+
+  /* Apply balsara switch */
+  Pi_ij *= (pi->force.balsara + pj->force.balsara);
+
+  /* Get the common factor out. */
+  w = ri *
+      ((POrho2i * wi_dr + POrho2j * wj_dr) + 0.25f * Pi_ij * (wi_dr + wj_dr));
+
+  /* Use the force, Luke! */
+  for (k = 0; k < 3; k++) {
+    f = dx[k] * w;
+    pi->a[k] -= mj * f;
+  }
+
+  /* Get the time derivative for u. */
+  pi->force.u_dt +=
+      mj * dvdr * (POrho2i * wi_dr + 0.125f * Pi_ij * (wi_dr + wj_dr));
+
+  /* Get the time derivative for h. */
+  pi->force.h_dt -= mj * dvdr / rhoj * wi_dr;
+
+  /* Update the signal velocity. */
+  pi->force.v_sig = fmaxf(pi->force.v_sig, v_sig);
+  pj->force.v_sig = fmaxf(pj->force.v_sig, v_sig);
+}
 
 /**
  * @brief Force loop (Vectorized non-symmetric version)
  */
 
-__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) {
+__attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
+    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
+    struct part **pj) {
 
 #ifdef VECTORIZE
 
-    vector r, r2, ri;
-    vector xi, xj;
-    vector hi, hj, hi_inv, hj_inv;
-    vector hi2_inv, hj2_inv;
-    vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
-    vector w;
-    vector piPOrho2, pjPOrho2, pirho, pjrho;
-    vector mj;
-    vector f;
-    vector dx[3];
-    vector vi[3], vj[3];
-    vector pia[3];
-    vector piu_dt;
-    vector pih_dt;
-    vector ci, cj, v_sig, vi_sig, vj_sig;
-    vector omega_ij, Pi_ij, balsara;
-    int j, k;
-
-    /* Load stuff. */
-    #if VEC_SIZE==8
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
-        piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 , pi[4]->force.POrho2 , pi[5]->force.POrho2 , pi[6]->force.POrho2 , pi[7]->force.POrho2 );
-        pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 , pj[4]->force.POrho2 , pj[5]->force.POrho2 , pj[6]->force.POrho2 , pj[7]->force.POrho2 );
-        pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho , pi[4]->rho , pi[5]->rho , pi[6]->rho , pi[7]->rho );
-        pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho , pj[4]->rho , pj[5]->rho , pj[6]->rho , pj[7]->rho );
-        ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c , pi[4]->force.c , pi[5]->force.c , pi[6]->force.c , pi[7]->force.c );
-        cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c , pj[4]->force.c , pj[5]->force.c , pj[6]->force.c , pj[7]->force.c );
-        vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig , pi[4]->force.v_sig , pi[5]->force.v_sig , pi[6]->force.v_sig , pi[7]->force.v_sig );
-        vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig , pj[4]->force.v_sig , pj[5]->force.v_sig , pj[6]->force.v_sig , pj[7]->force.v_sig );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] );
-        balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara , pi[4]->force.balsara , pi[5]->force.balsara , pi[6]->force.balsara , pi[7]->force.balsara ) +
-                    vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara , pj[4]->force.balsara , pj[5]->force.balsara , pj[6]->force.balsara , pj[7]->force.balsara );
-    #elif VEC_SIZE==4
-        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass );
-        piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 );
-        pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 );
-        pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho );
-        pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho );
-        ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c );
-        cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c );
-        vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig );
-        vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig );
-        for ( k = 0 ; k < 3 ; k++ ) {
-            vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] );
-            vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] );
-            }
-        for ( k = 0 ; k < 3 ; k++ )
-            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
-        balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara ) +
-                    vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara );
-    #else
-        #error
-    #endif
-
-    /* Get the radius and inverse radius. */
-    r2.v = vec_load( R2 );
-    ri.v = vec_rsqrt( r2.v );
-    ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) );
-    r.v = r2.v * ri.v;
-    
-    /* Get the kernel for hi. */
-    hi.v = vec_load( Hi );
-    hi_inv.v = vec_rcp( hi.v );
-    hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) );
-    hi2_inv.v = hi_inv.v * hi_inv.v;
-    xi.v = r.v * hi_inv.v;
-    kernel_deval_vec( &xi , &wi , &wi_dx );
-    wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
-        
-    /* Get the kernel for hj. */
-    hj.v = vec_load( Hj );
-    hj_inv.v = vec_rcp( hj.v );
-    hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) );
-    hj2_inv.v = hj_inv.v * hj_inv.v;
-    xj.v = r.v * hj_inv.v;
-    kernel_deval_vec( &xj , &wj , &wj_dx );
-    wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
-        
-    /* Compute dv dot r. */
-    dvdr.v = ( (vi[0].v - vj[0].v) * dx[0].v ) + ( (vi[1].v - vj[1].v) * dx[1].v ) + ( (vi[2].v - vj[2].v) * dx[2].v );
-    dvdr.v = dvdr.v * ri.v;
-        
-    /* Get the time derivative for h. */
-    pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v;
-    
-    /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */
-    omega_ij.v = vec_fmin( dvdr.v , vec_set1( 0.0f ) );
-    
-    /* Compute signal velocity */
-    v_sig.v = ci.v + cj.v - vec_set1( 3.0f )*omega_ij.v;
-
-    /* Compute viscosity tensor */
-    Pi_ij.v = -balsara.v * vec_set1( const_viscosity_alpha ) * v_sig.v * omega_ij.v / (pirho.v + pjrho.v);
-    Pi_ij.v *= ( wi_dr.v + wj_dr.v );
-
-    /* Get the common factor out. */
-    w.v = ri.v * ( ( piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v ) + vec_set1( 0.25f ) * Pi_ij.v );
-
-    /* Use the force, Luke! */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        f.v = dx[k].v * w.v;
-        pia[k].v = mj.v * f.v;
-        }
-        
-    /* Get the time derivative for u. */
-    piu_dt.v = mj.v * dvdr.v * ( piPOrho2.v * wi_dr.v + vec_set1( 0.125f ) * Pi_ij.v );
-    
-    /* compute the signal velocity (this is always symmetrical). */
-    vi_sig.v = vec_fmax( vi_sig.v , v_sig.v );
-    vj_sig.v = vec_fmax( vj_sig.v , v_sig.v );
-
-    /* Store the forces back on the particles. */
-    for ( k = 0 ; k < VEC_SIZE ; k++ ) {
-        pi[k]->force.u_dt += piu_dt.f[k];
-        pi[k]->force.h_dt -= pih_dt.f[k];
-        pi[k]->force.v_sig = vi_sig.f[k];
-        pj[k]->force.v_sig = vj_sig.f[k];
-        for ( j = 0 ; j < 3 ; j++ )
-            pi[k]->a[j] -= pia[j].f[k];
-        }
-
+  vector r, r2, ri;
+  vector xi, xj;
+  vector hi, hj, hi_inv, hj_inv;
+  vector hi2_inv, hj2_inv;
+  vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
+  vector w;
+  vector piPOrho2, pjPOrho2, pirho, pjrho;
+  vector mj;
+  vector f;
+  vector dx[3];
+  vector vi[3], vj[3];
+  vector pia[3];
+  vector piu_dt;
+  vector pih_dt;
+  vector ci, cj, v_sig, vi_sig, vj_sig;
+  vector omega_ij, Pi_ij, balsara;
+  int j, k;
+
+/* Load stuff. */
+#if VEC_SIZE == 8
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
+                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
+  piPOrho2.v =
+      vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, pi[2]->force.POrho2,
+              pi[3]->force.POrho2, pi[4]->force.POrho2, pi[5]->force.POrho2,
+              pi[6]->force.POrho2, pi[7]->force.POrho2);
+  pjPOrho2.v =
+      vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, pj[2]->force.POrho2,
+              pj[3]->force.POrho2, pj[4]->force.POrho2, pj[5]->force.POrho2,
+              pj[6]->force.POrho2, pj[7]->force.POrho2);
+  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
+                    pi[5]->rho, pi[6]->rho, pi[7]->rho);
+  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
+                    pj[5]->rho, pj[6]->rho, pj[7]->rho);
+  ci.v =
+      vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c,
+              pi[4]->force.c, pi[5]->force.c, pi[6]->force.c, pi[7]->force.c);
+  cj.v =
+      vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c,
+              pj[4]->force.c, pj[5]->force.c, pj[6]->force.c, pj[7]->force.c);
+  vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig,
+                     pi[3]->force.v_sig, pi[4]->force.v_sig, pi[5]->force.v_sig,
+                     pi[6]->force.v_sig, pi[7]->force.v_sig);
+  vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig,
+                     pj[3]->force.v_sig, pj[4]->force.v_sig, pj[5]->force.v_sig,
+                     pj[6]->force.v_sig, pj[7]->force.v_sig);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
+                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
+                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
+                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
+  balsara.v =
+      vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara,
+              pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara,
+              pi[6]->force.balsara, pi[7]->force.balsara) +
+      vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara,
+              pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara,
+              pj[6]->force.balsara, pj[7]->force.balsara);
+#elif VEC_SIZE == 4
+  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
+  piPOrho2.v = vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2,
+                       pi[2]->force.POrho2, pi[3]->force.POrho2);
+  pjPOrho2.v = vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2,
+                       pj[2]->force.POrho2, pj[3]->force.POrho2);
+  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
+  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
+  ci.v =
+      vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c);
+  cj.v =
+      vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c);
+  vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig,
+                     pi[3]->force.v_sig);
+  vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig,
+                     pj[3]->force.v_sig);
+  for (k = 0; k < 3; k++) {
+    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
+    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
+  }
+  for (k = 0; k < 3; k++)
+    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
+  balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara,
+                      pi[2]->force.balsara, pi[3]->force.balsara) +
+              vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
+                      pj[2]->force.balsara, pj[3]->force.balsara);
 #else
+#error
+#endif
 
-    for ( int k = 0 ; k < VEC_SIZE ; k++ )
-        runner_iact_nonsym_force( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] );
+  /* Get the radius and inverse radius. */
+  r2.v = vec_load(R2);
+  ri.v = vec_rsqrt(r2.v);
+  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
+  r.v = r2.v * ri.v;
+
+  /* Get the kernel for hi. */
+  hi.v = vec_load(Hi);
+  hi_inv.v = vec_rcp(hi.v);
+  hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f));
+  hi2_inv.v = hi_inv.v * hi_inv.v;
+  xi.v = r.v * hi_inv.v;
+  kernel_deval_vec(&xi, &wi, &wi_dx);
+  wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
+
+  /* Get the kernel for hj. */
+  hj.v = vec_load(Hj);
+  hj_inv.v = vec_rcp(hj.v);
+  hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f));
+  hj2_inv.v = hj_inv.v * hj_inv.v;
+  xj.v = r.v * hj_inv.v;
+  kernel_deval_vec(&xj, &wj, &wj_dx);
+  wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
+
+  /* Compute dv dot r. */
+  dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) +
+           ((vi[2].v - vj[2].v) * dx[2].v);
+  dvdr.v = dvdr.v * ri.v;
+
+  /* Get the time derivative for h. */
+  pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v;
+
+  /* Compute the relative velocity. (This is 0 if the particles move away from
+   * each other and negative otherwise) */
+  omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f));
+
+  /* Compute signal velocity */
+  v_sig.v = ci.v + cj.v - vec_set1(3.0f) * omega_ij.v;
+
+  /* Compute viscosity tensor */
+  Pi_ij.v = -balsara.v * vec_set1(const_viscosity_alpha) * v_sig.v *
+            omega_ij.v / (pirho.v + pjrho.v);
+  Pi_ij.v *= (wi_dr.v + wj_dr.v);
+
+  /* Get the common factor out. */
+  w.v = ri.v * ((piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v) +
+                vec_set1(0.25f) * Pi_ij.v);
+
+  /* Use the force, Luke! */
+  for (k = 0; k < 3; k++) {
+    f.v = dx[k].v * w.v;
+    pia[k].v = mj.v * f.v;
+  }
+
+  /* Get the time derivative for u. */
+  piu_dt.v =
+      mj.v * dvdr.v * (piPOrho2.v * wi_dr.v + vec_set1(0.125f) * Pi_ij.v);
+
+  /* compute the signal velocity (this is always symmetrical). */
+  vi_sig.v = vec_fmax(vi_sig.v, v_sig.v);
+  vj_sig.v = vec_fmax(vj_sig.v, v_sig.v);
+
+  /* Store the forces back on the particles. */
+  for (k = 0; k < VEC_SIZE; k++) {
+    pi[k]->force.u_dt += piu_dt.f[k];
+    pi[k]->force.h_dt -= pih_dt.f[k];
+    pi[k]->force.v_sig = vi_sig.f[k];
+    pj[k]->force.v_sig = vj_sig.f[k];
+    for (j = 0; j < 3; j++) pi[k]->a[j] -= pia[j].f[k];
+  }
 
-#endif
-        
-    }
-    
+#else
 
+  for (int k = 0; k < VEC_SIZE; k++)
+    runner_iact_nonsym_force(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]);
 
+#endif
+}
 
+#endif /* SWIFT_RUNNER_IACT_LEGACY_H */
diff --git a/src/scheduler.c b/src/scheduler.c
index 4c45303f1fb60a4ae2daf80dfda70de1c21361bc..02defc31710e3ab0de067cf359d4281b94c17d90 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -1,56 +1,48 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
 
 /* Some standard headers. */
+#include <limits.h>
+#include <math.h>
+#include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <pthread.h>
-#include <limits.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
-    #include <mpi.h>
+#include <mpi.h>
 #endif
 
+/* This object's header. */
+#include "scheduler.h"
+
 /* Local headers. */
-#include "error.h"
-#include "cycle.h"
 #include "atomic.h"
-#include "timers.h"
 #include "const.h"
-#include "vector.h"
-#include "lock.h"
-#include "task.h"
-#include "part.h"
-#include "debug.h"
-#include "space.h"
-#include "multipole.h"
-#include "cell.h"
-#include "queue.h"
+#include "cycle.h"
+#include "error.h"
 #include "kernel.h"
-#include "scheduler.h"
-
+#include "timers.h"
 
 /**
  * @brief Add an unlock_task to the given task.
@@ -59,490 +51,588 @@
  * @param ta The unlocking #task.
  * @param tb The #task that will be unlocked.
  */
- 
-void scheduler_addunlock ( struct scheduler *s , struct task *ta , struct task *tb ) {
-
-    /* Main loop. */
-    while ( 1 ) {
 
-        /* Follow the links. */
-        while ( ta->nr_unlock_tasks == task_maxunlock+1 )
-            ta = ta->unlock_tasks[ task_maxunlock ];
+void scheduler_addunlock(struct scheduler *s, struct task *ta,
+                         struct task *tb) {
 
-        /* Get the index of the next free task. */
-        int ind = atomic_inc( &ta->nr_unlock_tasks );
+  /* Main loop. */
+  while (1) {
 
-        /* Is there room in this task? */
-        if ( ind < task_maxunlock ) {
-            ta->unlock_tasks[ ind ] = tb;
-            break;
-            }
+    /* Follow the links. */
+    while (ta->nr_unlock_tasks == task_maxunlock + 1)
+      ta = ta->unlock_tasks[task_maxunlock];
 
-        /* Otherwise, generate a link task. */
-        else {
-        
-            /* Only one thread should have to do this. */
-            if ( ind == task_maxunlock ) {
-                ta->unlock_tasks[ task_maxunlock ] = scheduler_addtask( s , task_type_link , task_subtype_none , ta->flags , 0 , ta->ci , ta->cj , 0 );
-                ta->unlock_tasks[ task_maxunlock ]->implicit = 1;
-                }
+    /* Get the index of the next free task. */
+    int ind = atomic_inc(&ta->nr_unlock_tasks);
 
-            /* Otherwise, reduce the count. */
-            else
-                atomic_dec( &ta->nr_unlock_tasks );
+    /* Is there room in this task? */
+    if (ind < task_maxunlock) {
+      ta->unlock_tasks[ind] = tb;
+      break;
+    }
 
-            }
-            
-        }
+    /* Otherwise, generate a link task. */
+    else {
 
+      /* Only one thread should have to do this. */
+      if (ind == task_maxunlock) {
+        ta->unlock_tasks[task_maxunlock] =
+            scheduler_addtask(s, task_type_link, task_subtype_none, ta->flags,
+                              0, ta->ci, ta->cj, 0);
+        ta->unlock_tasks[task_maxunlock]->implicit = 1;
+      }
+
+      /* Otherwise, reduce the count. */
+      else
+        atomic_dec(&ta->nr_unlock_tasks);
     }
-    
+  }
+}
 
 /**
  * @brief Split tasks that may be too large.
  *
  * @param s The #scheduler we are working in.
  */
- 
-void scheduler_splittasks ( struct scheduler *s ) {
-
-    int j, k, ind, sid, tid = 0, redo;
-    struct cell *ci, *cj;
-    double hi, hj, shift[3];
-    struct task *t, *t_old;
-    // float dt_step = s->dt_step;
-    int pts[7][8] = { { -1 , 12 , 10 ,  9 ,  4 ,  3 ,  1 ,  0 } ,
-                      { -1 , -1 , 11 , 10 ,  5 ,  4 ,  2 ,  1 } ,
-                      { -1 , -1 , -1 , 12 ,  7 ,  6 ,  4 ,  3 } , 
-                      { -1 , -1 , -1 , -1 ,  8 ,  7 ,  5 ,  4 } ,
-                      { -1 , -1 , -1 , -1 , -1 , 12 , 10 ,  9 } ,
-                      { -1 , -1 , -1 , -1 , -1 , -1 , 11 , 10 } ,
-                      { -1 , -1 , -1 , -1 , -1 , -1 , -1 , 12 } };
-    float sid_scale[13] = { 0.1897 , 0.4025 , 0.1897 , 0.4025 , 0.5788 , 0.4025 ,
-                            0.1897 , 0.4025 , 0.1897 , 0.4025 , 0.5788 , 0.4025 , 
-                            0.5788 };
-
-    /* Loop through the tasks... */
-    redo = 0; t_old = t = NULL;
-    while ( 1 ) {
-    
-        /* Get a pointer on the task. */
-        if ( redo ) {
-            redo = 0;
-            t = t_old;
-            }
+
+void scheduler_splittasks(struct scheduler *s) {
+
+  int j, k, ind, sid, tid = 0, redo;
+  struct cell *ci, *cj;
+  double hi, hj, shift[3];
+  struct task *t, *t_old;
+  // float dt_step = s->dt_step;
+  int pts[7][8] = {{-1, 12, 10, 9, 4, 3, 1, 0},
+                   {-1, -1, 11, 10, 5, 4, 2, 1},
+                   {-1, -1, -1, 12, 7, 6, 4, 3},
+                   {-1, -1, -1, -1, 8, 7, 5, 4},
+                   {-1, -1, -1, -1, -1, 12, 10, 9},
+                   {-1, -1, -1, -1, -1, -1, 11, 10},
+                   {-1, -1, -1, -1, -1, -1, -1, 12}};
+  float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.1897,
+                         0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.5788};
+
+  /* Loop through the tasks... */
+  redo = 0;
+  t_old = t = NULL;
+  while (1) {
+
+    /* Get a pointer on the task. */
+    if (redo) {
+      redo = 0;
+      t = t_old;
+    } else {
+      if ((ind = atomic_inc(&tid)) < s->nr_tasks)
+        t_old = t = &s->tasks[s->tasks_ind[ind]];
+      else
+        break;
+    }
+
+    /* Empty task? */
+    if (t->ci == NULL || (t->type == task_type_pair && t->cj == NULL)) {
+      t->type = task_type_none;
+      t->skip = 1;
+      continue;
+    }
+
+    /* Non-local kick task? */
+    if ((t->type == task_type_kick1 || t->type == task_type_kick2) &&
+        t->ci->nodeID != s->nodeID) {
+      t->type = task_type_none;
+      t->skip = 1;
+      continue;
+    }
+
+    /* Self-interaction? */
+    if (t->type == task_type_self) {
+
+      /* Get a handle on the cell involved. */
+      ci = t->ci;
+
+      /* Foreign task? */
+      if (ci->nodeID != s->nodeID) {
+        t->skip = 1;
+        continue;
+      }
+
+      /* Is this cell even split? */
+      if (ci->split) {
+
+        /* Make a sub? */
+        if (scheduler_dosub && ci->count < space_subsize / ci->count) {
+
+          /* convert to a self-subtask. */
+          t->type = task_type_sub;
+
+        }
+
+        /* Otherwise, make tasks explicitly. */
         else {
-            if ( ( ind = atomic_inc( &tid ) ) < s->nr_tasks )
-                t_old = t = &s->tasks[ s->tasks_ind[ ind ] ];
-            else
-                break;
-            }
-        
-        /* Empty task? */
-        if ( t->ci == NULL || ( t->type == task_type_pair && t->cj == NULL ) ) {
-            t->type = task_type_none;
-            t->skip = 1;
-            continue;
-            }
-            
-        /* Non-local kick task? */
-        if ( (t->type == task_type_kick1 || t->type == task_type_kick2 ) &&
-             t->ci->nodeID != s->nodeID ) {
+
+          /* Take a step back (we're going to recycle the current task)... */
+          redo = 1;
+
+          /* Add the self tasks. */
+          for (k = 0; ci->progeny[k] == NULL; k++)
+            ;
+          t->ci = ci->progeny[k];
+          for (k += 1; k < 8; k++)
+            if (ci->progeny[k] != NULL)
+              scheduler_addtask(s, task_type_self, task_subtype_density, 0, 0,
+                                ci->progeny[k], NULL, 0);
+
+          /* Make a task for each pair of progeny. */
+          for (j = 0; j < 8; j++)
+            if (ci->progeny[j] != NULL)
+              for (k = j + 1; k < 8; k++)
+                if (ci->progeny[k] != NULL)
+                  scheduler_addtask(s, task_type_pair, task_subtype_density,
+                                    pts[j][k], 0, ci->progeny[j],
+                                    ci->progeny[k], 0);
+        }
+      }
+
+    }
+
+    /* Pair interaction? */
+    else if (t->type == task_type_pair) {
+
+      /* Get a handle on the cells involved. */
+      ci = t->ci;
+      cj = t->cj;
+      hi = ci->dmin;
+      hj = cj->dmin;
+
+      /* Foreign task? */
+      if (ci->nodeID != s->nodeID && cj->nodeID != s->nodeID) {
+        t->skip = 1;
+        continue;
+      }
+
+      /* Get the sort ID, use space_getsid and not t->flags
+         to make sure we get ci and cj swapped if needed. */
+      sid = space_getsid(s->space, &ci, &cj, shift);
+
+      /* Should this task be split-up? */
+      if (ci->split && cj->split &&
+          ci->h_max * kernel_gamma * space_stretch < hi / 2 &&
+          cj->h_max * kernel_gamma * space_stretch < hj / 2) {
+
+        /* Replace by a single sub-task? */
+        if (scheduler_dosub &&
+            ci->count * sid_scale[sid] < space_subsize / cj->count &&
+            sid != 0 && sid != 2 && sid != 6 && sid != 8) {
+
+          /* Make this task a sub task. */
+          t->type = task_type_sub;
+
+        }
+
+        /* Otherwise, split it. */
+        else {
+
+          /* Take a step back (we're going to recycle the current task)... */
+          redo = 1;
+
+          /* For each different sorting type... */
+          switch (sid) {
+
+            case 0: /* (  1 ,  1 ,  1 ) */
+              t->ci = ci->progeny[7];
+              t->cj = cj->progeny[0];
+              t->flags = 0;
+              break;
+
+            case 1: /* (  1 ,  1 ,  0 ) */
+              t->ci = ci->progeny[6];
+              t->cj = cj->progeny[0];
+              t->flags = 1;
+              t->tight = 1;
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 1, 0,
+                                    ci->progeny[7], cj->progeny[1], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
+                                    ci->progeny[6], cj->progeny[1], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 2, 0,
+                                    ci->progeny[7], cj->progeny[0], 1);
+              break;
+
+            case 2: /* (  1 ,  1 , -1 ) */
+              t->ci = ci->progeny[6];
+              t->cj = cj->progeny[1];
+              t->flags = 2;
+              t->tight = 1;
+              break;
+
+            case 3: /* (  1 ,  0 ,  1 ) */
+              t->ci = ci->progeny[5];
+              t->cj = cj->progeny[0];
+              t->flags = 3;
+              t->tight = 1;
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 3, 0,
+                                    ci->progeny[7], cj->progeny[2], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
+                                    ci->progeny[5], cj->progeny[2], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 6, 0,
+                                    ci->progeny[7], cj->progeny[0], 1);
+              break;
+
+            case 4: /* (  1 ,  0 ,  0 ) */
+              t->ci = ci->progeny[4];
+              t->cj = cj->progeny[0];
+              t->flags = 4;
+              t->tight = 1;
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 5, 0,
+                                    ci->progeny[5], cj->progeny[0], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 7, 0,
+                                    ci->progeny[6], cj->progeny[0], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 8, 0,
+                                    ci->progeny[7], cj->progeny[0], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 3, 0,
+                                    ci->progeny[4], cj->progeny[1], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 4, 0,
+                                    ci->progeny[5], cj->progeny[1], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 6, 0,
+                                    ci->progeny[6], cj->progeny[1], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 7, 0,
+                                    ci->progeny[7], cj->progeny[1], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 1, 0,
+                                    ci->progeny[4], cj->progeny[2], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 2, 0,
+                                    ci->progeny[5], cj->progeny[2], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 4, 0,
+                                    ci->progeny[6], cj->progeny[2], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 5, 0,
+                                    ci->progeny[7], cj->progeny[2], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
+                                    ci->progeny[4], cj->progeny[3], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 1, 0,
+                                    ci->progeny[5], cj->progeny[3], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 3, 0,
+                                    ci->progeny[6], cj->progeny[3], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 4, 0,
+                                    ci->progeny[7], cj->progeny[3], 1);
+              break;
+
+            case 5: /* (  1 ,  0 , -1 ) */
+              t->ci = ci->progeny[4];
+              t->cj = cj->progeny[1];
+              t->flags = 5;
+              t->tight = 1;
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 5, 0,
+                                    ci->progeny[6], cj->progeny[3], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 2, 0,
+                                    ci->progeny[4], cj->progeny[3], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 8, 0,
+                                    ci->progeny[6], cj->progeny[1], 1);
+              break;
+
+            case 6: /* (  1 , -1 ,  1 ) */
+              t->ci = ci->progeny[5];
+              t->cj = cj->progeny[2];
+              t->flags = 6;
+              t->tight = 1;
+              break;
+
+            case 7: /* (  1 , -1 ,  0 ) */
+              t->ci = ci->progeny[4];
+              t->cj = cj->progeny[3];
+              t->flags = 6;
+              t->tight = 1;
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 8, 0,
+                                    ci->progeny[5], cj->progeny[2], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 7, 0,
+                                    ci->progeny[4], cj->progeny[2], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 7, 0,
+                                    ci->progeny[5], cj->progeny[3], 1);
+              break;
+
+            case 8: /* (  1 , -1 , -1 ) */
+              t->ci = ci->progeny[4];
+              t->cj = cj->progeny[3];
+              t->flags = 8;
+              t->tight = 1;
+              break;
+
+            case 9: /* (  0 ,  1 ,  1 ) */
+              t->ci = ci->progeny[3];
+              t->cj = cj->progeny[0];
+              t->flags = 9;
+              t->tight = 1;
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 9, 0,
+                                    ci->progeny[7], cj->progeny[4], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
+                                    ci->progeny[3], cj->progeny[4], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 8, 0,
+                                    ci->progeny[7], cj->progeny[0], 1);
+              break;
+
+            case 10: /* (  0 ,  1 ,  0 ) */
+              t->ci = ci->progeny[2];
+              t->cj = cj->progeny[0];
+              t->flags = 10;
+              t->tight = 1;
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 11, 0,
+                                    ci->progeny[3], cj->progeny[0], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 7, 0,
+                                    ci->progeny[6], cj->progeny[0], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 6, 0,
+                                    ci->progeny[7], cj->progeny[0], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 9, 0,
+                                    ci->progeny[2], cj->progeny[1], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 10, 0,
+                                    ci->progeny[3], cj->progeny[1], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 8, 0,
+                                    ci->progeny[6], cj->progeny[1], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 7, 0,
+                                    ci->progeny[7], cj->progeny[1], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 1, 0,
+                                    ci->progeny[2], cj->progeny[4], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 2, 0,
+                                    ci->progeny[3], cj->progeny[4], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 10, 0,
+                                    ci->progeny[6], cj->progeny[4], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 11, 0,
+                                    ci->progeny[7], cj->progeny[4], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
+                                    ci->progeny[2], cj->progeny[5], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 1, 0,
+                                    ci->progeny[3], cj->progeny[5], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 9, 0,
+                                    ci->progeny[6], cj->progeny[5], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 10, 0,
+                                    ci->progeny[7], cj->progeny[5], 1);
+              break;
+
+            case 11: /* (  0 ,  1 , -1 ) */
+              t->ci = ci->progeny[2];
+              t->cj = cj->progeny[1];
+              t->flags = 11;
+              t->tight = 1;
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 11, 0,
+                                    ci->progeny[6], cj->progeny[5], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 2, 0,
+                                    ci->progeny[2], cj->progeny[5], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 6, 0,
+                                    ci->progeny[6], cj->progeny[1], 1);
+              break;
+
+            case 12: /* (  0 ,  0 ,  1 ) */
+              t->ci = ci->progeny[1];
+              t->cj = cj->progeny[0];
+              t->flags = 12;
+              t->tight = 1;
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 11, 0,
+                                    ci->progeny[3], cj->progeny[0], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 5, 0,
+                                    ci->progeny[5], cj->progeny[0], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 2, 0,
+                                    ci->progeny[7], cj->progeny[0], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 9, 0,
+                                    ci->progeny[1], cj->progeny[2], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 12, 0,
+                                    ci->progeny[3], cj->progeny[2], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 8, 0,
+                                    ci->progeny[5], cj->progeny[2], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 5, 0,
+                                    ci->progeny[7], cj->progeny[2], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 3, 0,
+                                    ci->progeny[1], cj->progeny[4], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 6, 0,
+                                    ci->progeny[3], cj->progeny[4], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 12, 0,
+                                    ci->progeny[5], cj->progeny[4], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 11, 0,
+                                    ci->progeny[7], cj->progeny[4], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
+                                    ci->progeny[1], cj->progeny[6], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 3, 0,
+                                    ci->progeny[3], cj->progeny[6], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 9, 0,
+                                    ci->progeny[5], cj->progeny[6], 1);
+              t = scheduler_addtask(s, task_type_pair, t->subtype, 12, 0,
+                                    ci->progeny[7], cj->progeny[6], 1);
+              break;
+          }
+        }
+
+      } /* split this task? */
+
+      /* Otherwise, break it up if it is too large? */
+      else if (scheduler_doforcesplit && ci->split && cj->split &&
+               (ci->count > space_maxsize / cj->count)) {
+
+        // message( "force splitting pair with %i and %i parts." , ci->count ,
+        // cj->count );
+
+        /* Replace the current task. */
+        t->type = task_type_none;
+
+        for (j = 0; j < 8; j++)
+          if (ci->progeny[j] != NULL)
+            for (k = 0; k < 8; k++)
+              if (cj->progeny[k] != NULL) {
+                t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
+                                      ci->progeny[j], cj->progeny[k], 0);
+                t->flags = space_getsid(s->space, &t->ci, &t->cj, shift);
+              }
+
+      }
+
+      /* Otherwise, if not split, stitch-up the sorting. */
+      else {
+
+        /* Create the sort for ci. */
+        // lock_lock( &ci->lock );
+        if (ci->sorts == NULL)
+          ci->sorts =
+              scheduler_addtask(s, task_type_sort, 0, 1 << sid, 0, ci, NULL, 0);
+        else
+          ci->sorts->flags |= (1 << sid);
+        // lock_unlock_blind( &ci->lock );
+        scheduler_addunlock(s, ci->sorts, t);
+
+        /* Create the sort for cj. */
+        // lock_lock( &cj->lock );
+        if (cj->sorts == NULL)
+          cj->sorts =
+              scheduler_addtask(s, task_type_sort, 0, 1 << sid, 0, cj, NULL, 0);
+        else
+          cj->sorts->flags |= (1 << sid);
+        // lock_unlock_blind( &cj->lock );
+        scheduler_addunlock(s, cj->sorts, t);
+      }
+
+    } /* pair interaction? */
+
+    /* Gravity interaction? */
+    else if (t->type == task_type_grav_mm) {
+
+      /* Get a handle on the cells involved. */
+      ci = t->ci;
+      cj = t->cj;
+
+      /* Self-interaction? */
+      if (cj == NULL) {
+
+        /* Ignore this task if the cell has no gparts. */
+        if (ci->gcount == 0) t->type = task_type_none;
+
+        /* If the cell is split, recurse. */
+        else if (ci->split) {
+
+          /* Make a single sub-task? */
+          if (scheduler_dosub && ci->count < space_subsize / ci->count) {
+
+            t->type = task_type_sub;
+            t->subtype = task_subtype_grav;
+
+          }
+
+          /* Otherwise, just split the task. */
+          else {
+
+            /* Split this task into tasks on its progeny. */
             t->type = task_type_none;
-            t->skip = 1;
-            continue;
-            }
-            
-        /* Self-interaction? */
-        if ( t->type == task_type_self ) {
-        
-            /* Get a handle on the cell involved. */
-            ci = t->ci;
-            
-            /* Foreign task? */
-            if ( ci->nodeID != s->nodeID ) {
-                t->skip = 1;
-                continue;
-                }
-            
-            /* Is this cell even split? */
-            if ( ci->split ) {
-            
-                /* Make a sub? */
-                if ( scheduler_dosub && ci->count < space_subsize/ci->count ) {
+            for (j = 0; j < 8; j++)
+              if (ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0) {
+                if (t->type == task_type_none) {
+                  t->type = task_type_grav_mm;
+                  t->ci = ci->progeny[j];
+                  t->cj = NULL;
+                } else
+                  t = scheduler_addtask(s, task_type_grav_mm, task_subtype_none,
+                                        0, 0, ci->progeny[j], NULL, 0);
+                for (k = j + 1; k < 8; k++)
+                  if (ci->progeny[k] != NULL && ci->progeny[k]->gcount > 0) {
+                    if (t->type == task_type_none) {
+                      t->type = task_type_grav_mm;
+                      t->ci = ci->progeny[j];
+                      t->cj = ci->progeny[k];
+                    } else
+                      t = scheduler_addtask(s, task_type_grav_mm,
+                                            task_subtype_none, 0, 0,
+                                            ci->progeny[j], ci->progeny[k], 0);
+                  }
+              }
+            redo = (t->type != task_type_none);
+          }
 
-                    /* convert to a self-subtask. */
-                    t->type = task_type_sub;
+        }
 
-                    }
+        /* Otherwise, just make a pp task out of it. */
+        else
+          t->type = task_type_grav_pp;
 
-                /* Otherwise, make tasks explicitly. */
-                else {
-
-                    /* Take a step back (we're going to recycle the current task)... */
-                    redo = 1;
-
-                    /* Add the self taks. */
-                    for ( k = 0 ; ci->progeny[k] == NULL ; k++ );
-                    t->ci = ci->progeny[k];
-                    for ( k += 1 ; k < 8 ; k++ )
-                        if ( ci->progeny[k] != NULL )
-                            scheduler_addtask( s , task_type_self , task_subtype_density , 0 , 0 , ci->progeny[k] , NULL , 0 );
-
-                    /* Make a task for each pair of progeny. */
-                    for ( j = 0 ; j < 8 ; j++ )
-                        if ( ci->progeny[j] != NULL )
-                            for ( k = j + 1 ; k < 8 ; k++ )
-                                if ( ci->progeny[k] != NULL )
-                                    scheduler_addtask( s , task_type_pair , task_subtype_density , pts[j][k] , 0 , ci->progeny[j] , ci->progeny[k] , 0 );
-                    }
+      }
 
-                }
-        
-            }
-    
-        /* Pair interaction? */
-        else if ( t->type == task_type_pair ) {
-            
-            /* Get a handle on the cells involved. */
-            ci = t->ci;
-            cj = t->cj;
-            hi = ci->dmin;
-            hj = cj->dmin;
-
-            /* Foreign task? */
-            if ( ci->nodeID != s->nodeID && cj->nodeID != s->nodeID ) {
-                t->skip = 1;
-                continue;
-                }
-            
-            /* Get the sort ID, use space_getsid and not t->flags
-               to make sure we get ci and cj swapped if needed. */
-            sid = space_getsid( s->space , &ci , &cj , shift );
-                
-            /* Should this task be split-up? */
-            if ( ci->split && cj->split &&
-                 ci->h_max*kernel_gamma*space_stretch < hi/2 &&
-                 cj->h_max*kernel_gamma*space_stretch < hj/2 ) {
-                 
-                /* Replace by a single sub-task? */
-                if ( scheduler_dosub &&
-                     ci->count * sid_scale[sid] < space_subsize/cj->count &&
-                     sid != 0 && sid != 2 && sid != 6 && sid != 8 ) {
-                
-                    /* Make this task a sub task. */
-                    t->type = task_type_sub;
+      /* Nope, pair. */
+      else {
 
-                    }
-                    
-                /* Otherwise, split it. */
-                else {
-
-                    /* Take a step back (we're going to recycle the current task)... */
-                    redo = 1;
-
-                    /* For each different sorting type... */
-                    switch ( sid ) {
-
-                        case 0: /* (  1 ,  1 ,  1 ) */
-                            t->ci = ci->progeny[7]; t->cj = cj->progeny[0]; t->flags = 0;
-                            break;
-
-                        case 1: /* (  1 ,  1 ,  0 ) */
-                            t->ci = ci->progeny[6]; t->cj = cj->progeny[0]; t->flags = 1; t->tight = 1;
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 1 , 0 , ci->progeny[7] , cj->progeny[1] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[6] , cj->progeny[1] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 2 , 0 , ci->progeny[7] , cj->progeny[0] , 1 );
-                            break;
-
-                        case 2: /* (  1 ,  1 , -1 ) */
-                            t->ci = ci->progeny[6]; t->cj = cj->progeny[1]; t->flags = 2; t->tight = 1;
-                            break;
-
-                        case 3: /* (  1 ,  0 ,  1 ) */
-                            t->ci = ci->progeny[5]; t->cj = cj->progeny[0]; t->flags = 3; t->tight = 1;
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 3 , 0 , ci->progeny[7] , cj->progeny[2] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[5] , cj->progeny[2] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 6 , 0 , ci->progeny[7] , cj->progeny[0] , 1 );
-                            break;
-
-                        case 4: /* (  1 ,  0 ,  0 ) */
-                            t->ci = ci->progeny[4]; t->cj = cj->progeny[0]; t->flags = 4; t->tight = 1;
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 5 , 0 , ci->progeny[5] , cj->progeny[0] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 7 , 0 , ci->progeny[6] , cj->progeny[0] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 8 , 0 , ci->progeny[7] , cj->progeny[0] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 3 , 0 , ci->progeny[4] , cj->progeny[1] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 4 , 0 , ci->progeny[5] , cj->progeny[1] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 6 , 0 , ci->progeny[6] , cj->progeny[1] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 7 , 0 , ci->progeny[7] , cj->progeny[1] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 1 , 0 , ci->progeny[4] , cj->progeny[2] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 2 , 0 , ci->progeny[5] , cj->progeny[2] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 4 , 0 , ci->progeny[6] , cj->progeny[2] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 5 , 0 , ci->progeny[7] , cj->progeny[2] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[4] , cj->progeny[3] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 1 , 0 , ci->progeny[5] , cj->progeny[3] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 3 , 0 , ci->progeny[6] , cj->progeny[3] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 4 , 0 , ci->progeny[7] , cj->progeny[3] , 1 );
-                            break;
-
-                        case 5: /* (  1 ,  0 , -1 ) */
-                            t->ci = ci->progeny[4]; t->cj = cj->progeny[1]; t->flags = 5; t->tight = 1;
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 5 , 0 , ci->progeny[6] , cj->progeny[3] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 2 , 0 , ci->progeny[4] , cj->progeny[3] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 8 , 0 , ci->progeny[6] , cj->progeny[1] , 1 );
-                            break;
-
-                        case 6: /* (  1 , -1 ,  1 ) */
-                            t->ci = ci->progeny[5]; t->cj = cj->progeny[2]; t->flags = 6; t->tight = 1;
-                            break;
-
-                        case 7: /* (  1 , -1 ,  0 ) */
-                            t->ci = ci->progeny[4]; t->cj = cj->progeny[3]; t->flags = 6; t->tight = 1;
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 8 , 0 , ci->progeny[5] , cj->progeny[2] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 7 , 0 , ci->progeny[4] , cj->progeny[2] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 7 , 0 , ci->progeny[5] , cj->progeny[3] , 1 );
-                            break;
-
-                        case 8: /* (  1 , -1 , -1 ) */
-                            t->ci = ci->progeny[4]; t->cj = cj->progeny[3]; t->flags = 8; t->tight = 1;
-                            break;
-
-                        case 9: /* (  0 ,  1 ,  1 ) */
-                            t->ci = ci->progeny[3]; t->cj = cj->progeny[0]; t->flags = 9; t->tight = 1;
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 9 , 0 , ci->progeny[7] , cj->progeny[4] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[3] , cj->progeny[4] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 8 , 0 , ci->progeny[7] , cj->progeny[0] , 1 );
-                            break;
-
-                        case 10: /* (  0 ,  1 ,  0 ) */
-                            t->ci = ci->progeny[2]; t->cj = cj->progeny[0]; t->flags = 10; t->tight = 1;
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 11 , 0 , ci->progeny[3] , cj->progeny[0] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 7 , 0 , ci->progeny[6] , cj->progeny[0] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 6 , 0 , ci->progeny[7] , cj->progeny[0] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 9 , 0 , ci->progeny[2] , cj->progeny[1] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 10 , 0 , ci->progeny[3] , cj->progeny[1] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 8 , 0 , ci->progeny[6] , cj->progeny[1] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 7 , 0 , ci->progeny[7] , cj->progeny[1] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 1 , 0 , ci->progeny[2] , cj->progeny[4] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 2 , 0 , ci->progeny[3] , cj->progeny[4] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 10 , 0 , ci->progeny[6] , cj->progeny[4] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 11 , 0 , ci->progeny[7] , cj->progeny[4] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[2] , cj->progeny[5] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 1 , 0 , ci->progeny[3] , cj->progeny[5] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 9 , 0 , ci->progeny[6] , cj->progeny[5] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 10 , 0 , ci->progeny[7] , cj->progeny[5] , 1 );
-                            break;
-
-                        case 11: /* (  0 ,  1 , -1 ) */
-                            t->ci = ci->progeny[2]; t->cj = cj->progeny[1]; t->flags = 11; t->tight = 1;
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 11 , 0 , ci->progeny[6] , cj->progeny[5] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 2 , 0 , ci->progeny[2] , cj->progeny[5] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 6 , 0 , ci->progeny[6] , cj->progeny[1] , 1 );
-                            break;
-
-                        case 12: /* (  0 ,  0 ,  1 ) */
-                            t->ci = ci->progeny[1]; t->cj = cj->progeny[0]; t->flags = 12; t->tight = 1;
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 11 , 0 , ci->progeny[3] , cj->progeny[0] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 5 , 0 , ci->progeny[5] , cj->progeny[0] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 2 , 0 , ci->progeny[7] , cj->progeny[0] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 9 , 0 , ci->progeny[1] , cj->progeny[2] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 12 , 0 , ci->progeny[3] , cj->progeny[2] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 8 , 0 , ci->progeny[5] , cj->progeny[2] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 5 , 0 , ci->progeny[7] , cj->progeny[2] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 3 , 0 , ci->progeny[1] , cj->progeny[4] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 6 , 0 , ci->progeny[3] , cj->progeny[4] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 12 , 0 , ci->progeny[5] , cj->progeny[4] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 11 , 0 , ci->progeny[7] , cj->progeny[4] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[1] , cj->progeny[6] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 3 , 0 , ci->progeny[3] , cj->progeny[6] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 9 , 0 , ci->progeny[5] , cj->progeny[6] , 1 );
-                            t = scheduler_addtask( s , task_type_pair , t->subtype , 12 , 0 , ci->progeny[7] , cj->progeny[6] , 1 );
-                            break;
-
-                        }
-                        
-                    }
+        /* Make a sub-task? */
+        if (scheduler_dosub && ci->count < space_subsize / cj->count) {
 
-                } /* split this task? */
-                
-            /* Otherwise, break it up if it is too large? */
-            else if ( scheduler_doforcesplit && ci->split && cj->split &&
-                      ( ci->count > space_maxsize / cj->count ) ) {
-                      
-                // message( "force splitting pair with %i and %i parts." , ci->count , cj->count );
-                      
-                /* Replace the current task. */
-                t->type = task_type_none;
-                
-                for ( j = 0 ; j < 8 ; j++ )
-                    if ( ci->progeny[j] != NULL )
-                        for ( k = 0 ; k < 8 ; k++ )
-                            if ( cj->progeny[k] != NULL ) {
-                                t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[j] , cj->progeny[k] , 0 );
-                                t->flags = space_getsid( s->space , &t->ci , &t->cj , shift );
-                                }
-                      
-                }
-                
-            /* Otherwise, if not spilt, stitch-up the sorting. */
-            else {
-            
-                /* Create the sort for ci. */
-                // lock_lock( &ci->lock );
-                if ( ci->sorts == NULL )
-                    ci->sorts = scheduler_addtask( s , task_type_sort , 0 , 1 << sid , 0 , ci , NULL , 0 );
-                else
-                    ci->sorts->flags |= (1 << sid);
-                // lock_unlock_blind( &ci->lock );
-                scheduler_addunlock( s , ci->sorts , t );
-                
-                /* Create the sort for cj. */
-                // lock_lock( &cj->lock );
-                if ( cj->sorts == NULL )
-                    cj->sorts = scheduler_addtask( s , task_type_sort , 0 , 1 << sid , 0 , cj , NULL , 0 );
-                else
-                    cj->sorts->flags |= (1 << sid);
-                // lock_unlock_blind( &cj->lock );
-                scheduler_addunlock( s , cj->sorts , t );
-                
-                }
-                
-            } /* pair interaction? */
-            
-        /* Gravity interaction? */
-        else if ( t->type == task_type_grav_mm ) {
-        
-            /* Get a handle on the cells involved. */
-            ci = t->ci;
-            cj = t->cj;
-            
-            /* Self-interaction? */
-            if ( cj == NULL ) {
-            
-                /* Ignore this task if the cell has no gparts. */
-                if ( ci->gcount == 0 )
-                    t->type = task_type_none;
-            
-                /* If the cell is split, recurse. */
-                else if ( ci->split ) {
-                
-                    /* Make a single sub-task? */
-                    if ( scheduler_dosub && ci->count < space_subsize/ci->count ) {
-                    
-                        t->type = task_type_sub;
-                        t->subtype = task_subtype_grav;
-                    
-                        }
-                        
-                    /* Otherwise, just split the task. */
-                    else {
-                
-                        /* Split this task into tasks on its progeny. */
-                        t->type = task_type_none;
-                        for ( j = 0 ; j < 8 ; j++ )
-                            if ( ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0 ) {
-                                if ( t->type == task_type_none ) {
-                                    t->type = task_type_grav_mm;
-                                    t->ci = ci->progeny[j];
-                                    t->cj = NULL;
-                                    }
-                                else
-                                    t = scheduler_addtask( s , task_type_grav_mm , task_subtype_none , 0 , 0 , ci->progeny[j] , NULL , 0 );
-                                for ( k = j+1 ; k < 8 ; k++ )
-                                    if ( ci->progeny[k] != NULL && ci->progeny[k]->gcount > 0 ) {
-                                        if ( t->type == task_type_none ) {
-                                            t->type = task_type_grav_mm;
-                                            t->ci = ci->progeny[j];
-                                            t->cj = ci->progeny[k];
-                                            }
-                                        else
-                                            t = scheduler_addtask( s , task_type_grav_mm , task_subtype_none , 0 , 0 , ci->progeny[j] , ci->progeny[k] , 0 );
-                                        }
-                                }
-                        redo = ( t->type != task_type_none );
-                        
-                        }
-                      
+          t->type = task_type_sub;
+          t->subtype = task_subtype_grav;
+
+        }
+
+        /* Otherwise, split the task. */
+        else {
+
+          /* Get the opening angle theta. */
+          float dx[3], theta;
+          for (k = 0; k < 3; k++) {
+            dx[k] = fabsf(ci->loc[k] - cj->loc[k]);
+            if (s->space->periodic && dx[k] > 0.5 * s->space->dim[k])
+              dx[k] = -dx[k] + s->space->dim[k];
+            if (dx[k] > 0.0f) dx[k] -= ci->h[k];
+          }
+          theta =
+              (dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]) /
+              (ci->h[0] * ci->h[0] + ci->h[1] * ci->h[1] + ci->h[2] * ci->h[2]);
+
+          /* Ignore this task if either cell has no gparts. */
+          if (ci->gcount == 0 || cj->gcount == 0) t->type = task_type_none;
+
+          /* Split the interaction? */
+          else if (theta < const_theta_max * const_theta_max) {
+
+            /* Are both ci and cj split? */
+            if (ci->split && cj->split) {
+
+              /* Split this task into tasks on its progeny. */
+              t->type = task_type_none;
+              for (j = 0; j < 8; j++)
+                if (ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0) {
+                  for (k = 0; k < 8; k++)
+                    if (cj->progeny[k] != NULL && cj->progeny[k]->gcount > 0) {
+                      if (t->type == task_type_none) {
+                        t->type = task_type_grav_mm;
+                        t->ci = ci->progeny[j];
+                        t->cj = cj->progeny[k];
+                      } else
+                        t = scheduler_addtask(
+                            s, task_type_grav_mm, task_subtype_none, 0, 0,
+                            ci->progeny[j], cj->progeny[k], 0);
                     }
-                    
-                /* Otherwise, just make a pp task out of it. */
-                else
-                    t->type = task_type_grav_pp;
-                
                 }
-                
-            /* Nope, pair. */
-            else {
-            
-                /* Make a sub-task? */
-                if ( scheduler_dosub && ci->count < space_subsize/cj->count ) {
-                
-                    t->type = task_type_sub;
-                    t->subtype = task_subtype_grav;
-                
-                    }
-                    
-                /* Otherwise, split the task. */
-                else {
-        
-                    /* Get the opening angle theta. */
-                    float dx[3], theta;
-                    for ( k = 0 ; k < 3 ; k++ ) {
-                        dx[k] = fabsf( ci->loc[k] - cj->loc[k] );
-                        if ( s->space->periodic && dx[k] > 0.5*s->space->dim[k] )
-                            dx[k] = -dx[k] + s->space->dim[k];
-                        if ( dx[k] > 0.0f )
-                            dx[k] -= ci->h[k];
-                        }
-                    theta = ( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ) / 
-                            ( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] );
-
-                    /* Ignore this task if the cell has no gparts. */
-                    if ( ci->gcount == 0 || cj->gcount == 0 )
-                        t->type = task_type_none;
-
-                    /* Split the interacton? */
-                    else if ( theta < const_theta_max*const_theta_max ) {
-
-                        /* Are both ci and cj split? */
-                        if ( ci->split && cj->split ) {
-
-                            /* Split this task into tasks on its progeny. */
-                            t->type = task_type_none;
-                            for ( j = 0 ; j < 8 ; j++ )
-                                if ( ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0 ) {
-                                    for ( k = 0 ; k < 8 ; k++ )
-                                        if ( cj->progeny[k] != NULL && cj->progeny[k]->gcount > 0 ) {
-                                            if ( t->type == task_type_none ) {
-                                                t->type = task_type_grav_mm;
-                                                t->ci = ci->progeny[j];
-                                                t->cj = cj->progeny[k];
-                                                }
-                                            else
-                                                t = scheduler_addtask( s , task_type_grav_mm , task_subtype_none , 0 , 0 , ci->progeny[j] , cj->progeny[k] , 0 );
-                                            }
-                                    }
-                            redo = ( t->type != task_type_none );
-
-                            }
-
-                        /* Otherwise, make a pp task out of it. */
-                        else
-                            t->type = task_type_grav_pp;
-
-                        }
-                        
-                    }
-                
-                } /* gravity pair interaction? */
-        
-            } /* gravity interaction? */
-    
-        } /* loop over all tasks. */
-        
-    }
-    
-    
+              redo = (t->type != task_type_none);
+
+            }
+
+            /* Otherwise, make a pp task out of it. */
+            else
+              t->type = task_type_grav_pp;
+          }
+        }
+
+      } /* gravity pair interaction? */
+
+    } /* gravity interaction? */
+
+  } /* loop over all tasks. */
+}
+
 /**
  * @brief Add a #task to the #scheduler.
  *
@@ -550,116 +640,111 @@ void scheduler_splittasks ( struct scheduler *s ) {
  * @param type The type of the task.
  * @param subtype The sub-type of the task.
  * @param flags The flags of the task.
- * @param wait 
+ * @param wait The initial value of the task's wait counter.
  * @param ci The first cell to interact.
  * @param cj The second cell to interact.
  * @param tight
  */
- 
-struct task *scheduler_addtask ( struct scheduler *s , int type , int subtype , int flags , int wait , struct cell *ci , struct cell *cj , int tight ) {
-
-    int ind;
-    struct task *t;
-    
-    /* Get the next free task. */
-    ind = atomic_inc( &s->tasks_next );
-    
-    /* Overflow? */
-    if ( ind >= s->size )
-        error( "Task list overflow." );
-    
-    /* Get a pointer to the new task. */
-    t = &s->tasks[ ind ];
-    
-    /* Copy the data. */
-    t->type = type;
-    t->subtype = subtype;
-    t->flags = flags;
-    t->wait = wait;
-    t->ci = ci;
-    t->cj = cj;
-    t->skip = 0;
-    t->tight = tight;
-    t->implicit = 0;
-    t->weight = 0;
-    t->rank = 0;
-    t->tic = 0;
-    t->toc = 0;
-    t->nr_unlock_tasks = 0;
-    
-    /* Init the lock. */
-    lock_init( &t->lock );
-    
-    /* Add an index for it. */
-    // lock_lock( &s->lock );
-    s->tasks_ind[ atomic_inc( &s->nr_tasks ) ] = ind;
-    // lock_unlock_blind( &s->lock );
-    
-    /* Return a pointer to the new task. */
-    return t;
-
-    }
-
 
+struct task *scheduler_addtask(struct scheduler *s, int type, int subtype,
+                               int flags, int wait, struct cell *ci,
+                               struct cell *cj, int tight) {
+
+  int ind;
+  struct task *t;
+
+  /* Get the next free task. */
+  ind = atomic_inc(&s->tasks_next);
+
+  /* Overflow? */
+  if (ind >= s->size) error("Task list overflow.");
+
+  /* Get a pointer to the new task. */
+  t = &s->tasks[ind];
+
+  /* Copy the data. */
+  t->type = type;
+  t->subtype = subtype;
+  t->flags = flags;
+  t->wait = wait;
+  t->ci = ci;
+  t->cj = cj;
+  t->skip = 0;
+  t->tight = tight;
+  t->implicit = 0;
+  t->weight = 0;
+  t->rank = 0;
+  t->tic = 0;
+  t->toc = 0;
+  t->nr_unlock_tasks = 0;
+
+  /* Init the lock. */
+  lock_init(&t->lock);
+
+  /* Add an index for it. */
+  // lock_lock( &s->lock );
+  s->tasks_ind[atomic_inc(&s->nr_tasks)] = ind;
+  // lock_unlock_blind( &s->lock );
+
+  /* Return a pointer to the new task. */
+  return t;
+}
 
-/** 
+/**
  * @brief Sort the tasks in topological order over all queues.
  *
  * @param s The #scheduler.
  */
- 
-void scheduler_ranktasks ( struct scheduler *s ) {
-
-    int i, j = 0, k, temp, left = 0, rank;
-    struct task *t, *tasks = s->tasks;
-    int *tid = s->tasks_ind, nr_tasks = s->nr_tasks;
-    
-    /* Run throught the tasks and get all the waits right. */
-    for ( i = 0 , k = 0 ; k < nr_tasks ; k++ ) {
-        tid[k] = k;
-        for ( j = 0 ; j < tasks[k].nr_unlock_tasks ; j++ )
-            tasks[k].unlock_tasks[j]->wait += 1;
-        }
-        
-    /* Main loop. */
-    for ( j = 0 , rank = 0 ; left < nr_tasks ; rank++ ) {
-        
-        /* Load the tids of tasks with no waits. */
-        for ( k = left ; k < nr_tasks ; k++ )
-            if ( tasks[ tid[k] ].wait == 0 ) {
-                temp = tid[j]; tid[j] = tid[k]; tid[k] = temp;
-                j += 1;
-                }
-                
-        /* Did we get anything? */
-        if ( j == left )
-            error( "Unsatisfiable task dependencies detected." );
-
-        /* Unlock the next layer of tasks. */
-        for ( i = left ; i < j ; i++ ) {
-            t = &tasks[ tid[i] ];
-            t->rank = rank;
-            tid[i] = t - tasks;
-            if ( tid[i] >= nr_tasks )
-                error( "Task index overshoot." );
-            /* message( "task %i of type %s has rank %i." , i , 
-                (t->type == task_type_self) ? "self" : (t->type == task_type_pair) ? "pair" : "sort" , rank ); */
-            for ( k = 0 ; k < t->nr_unlock_tasks ; k++ )
-                t->unlock_tasks[k]->wait -= 1;
-            }
-            
-        /* The new left (no, not tony). */
-        left = j;
-            
-        }
-        
-    /* Verify that the tasks were ranked correctly. */
-    /* for ( k = 1 ; k < s->nr_tasks ; k++ )
-        if ( tasks[ tid[k-1] ].rank > tasks[ tid[k-1] ].rank )
-            error( "Task ranking failed." ); */
-        
+
+void scheduler_ranktasks(struct scheduler *s) {
+
+  int i, j = 0, k, temp, left = 0, rank;
+  struct task *t, *tasks = s->tasks;
+  int *tid = s->tasks_ind, nr_tasks = s->nr_tasks;
+
+  /* Run through the tasks and get all the waits right. */
+  for (i = 0, k = 0; k < nr_tasks; k++) {
+    tid[k] = k;
+    for (j = 0; j < tasks[k].nr_unlock_tasks; j++)
+      tasks[k].unlock_tasks[j]->wait += 1;
+  }
+
+  /* Main loop. */
+  for (j = 0, rank = 0; left < nr_tasks; rank++) {
+
+    /* Load the tids of tasks with no waits. */
+    for (k = left; k < nr_tasks; k++)
+      if (tasks[tid[k]].wait == 0) {
+        temp = tid[j];
+        tid[j] = tid[k];
+        tid[k] = temp;
+        j += 1;
+      }
+
+    /* Did we get anything? */
+    if (j == left) error("Unsatisfiable task dependencies detected.");
+
+    /* Unlock the next layer of tasks. */
+    for (i = left; i < j; i++) {
+      t = &tasks[tid[i]];
+      t->rank = rank;
+      tid[i] = t - tasks;
+      if (tid[i] >= nr_tasks) error("Task index overshoot.");
+      /* message( "task %i of type %s has rank %i." , i ,
+          (t->type == task_type_self) ? "self" : (t->type == task_type_pair) ?
+         "pair" : "sort" , rank ); */
+      for (k = 0; k < t->nr_unlock_tasks; k++) t->unlock_tasks[k]->wait -= 1;
     }
 
+    /* The new left (no, not tony). */
+    left = j;
+  }
+
+  /* Verify that the tasks were ranked correctly. */
+  /* for ( k = 1 ; k < s->nr_tasks ; k++ )
+      if ( tasks[ tid[k-1] ].rank > tasks[ tid[k] ].rank )
+          error( "Task ranking failed." ); */
+}
 
 /**
  * @brief (Re)allocate the task arrays.
@@ -667,131 +752,123 @@ void scheduler_ranktasks ( struct scheduler *s ) {
  * @param s The #scheduler.
  * @param size The maximum number of tasks in the #scheduler.
  */
- 
-void scheduler_reset ( struct scheduler *s , int size ) {
 
-    int k;
+void scheduler_reset(struct scheduler *s, int size) {
 
-    /* Do we need to re-allocate? */
-    if ( size > s->size ) {
+  int k;
 
-        /* Free exising task lists if necessary. */
-        if ( s->tasks != NULL )
-            free( s->tasks );
-        if ( s->tasks_ind != NULL )
-            free( s->tasks_ind );
+  /* Do we need to re-allocate? */
+  if (size > s->size) {
 
-        /* Allocate the new lists. */
-        if ( ( s->tasks = (struct task *)malloc( sizeof(struct task) * size ) ) == NULL ||
-             ( s->tasks_ind = (int *)malloc( sizeof(int) * size ) ) == NULL )
-            error( "Failed to allocate task lists." );
-            
-        }
-        
-    /* Reset the task data. */
-    bzero( s->tasks , sizeof(struct task) * size );
-        
-    /* Reset the counters. */
-    s->size = size;
-    s->nr_tasks = 0;
-    s->tasks_next = 0;
-    s->waiting = 0;
-    
-    /* Set the task pointers in the queues. */
-    for ( k = 0 ; k < s->nr_queues ; k++ )
-        s->queues[k].tasks = s->tasks;
+    /* Free existing task lists if necessary. */
+    if (s->tasks != NULL) free(s->tasks);
+    if (s->tasks_ind != NULL) free(s->tasks_ind);
 
-    }
+    /* Allocate the new lists. */
+    if ((s->tasks = (struct task *)malloc(sizeof(struct task) *size)) == NULL ||
+        (s->tasks_ind = (int *)malloc(sizeof(int) * size)) == NULL)
+      error("Failed to allocate task lists.");
+  }
 
+  /* Reset the task data. */
+  bzero(s->tasks, sizeof(struct task) * size);
+
+  /* Reset the counters. */
+  s->size = size;
+  s->nr_tasks = 0;
+  s->tasks_next = 0;
+  s->waiting = 0;
+
+  /* Set the task pointers in the queues. */
+  for (k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks;
+}
 
 /**
  * @brief Compute the task weights
  *
  * @param s The #scheduler.
  */
- 
-void scheduler_reweight ( struct scheduler *s ) {
-
-    int k, j, nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
-    struct task *t, *tasks = s->tasks;
-    int nodeID = s->nodeID;
-    float sid_scale[13] = { 0.1897 , 0.4025 , 0.1897 , 0.4025 , 0.5788 , 0.4025 ,
-                            0.1897 , 0.4025 , 0.1897 , 0.4025 , 0.5788 , 0.4025 , 
-                            0.5788 };
-    float wscale = 0.001;
-    // ticks tic;
-    
-    /* Run throught the tasks backwards and set their waits and
-       weights. */
-    // tic = getticks();
-    for ( k = nr_tasks-1 ; k >= 0 ; k-- ) {
-        t = &tasks[ tid[k] ];
-        t->weight = 0;
-        for ( j = 0 ; j < t->nr_unlock_tasks ; j++ )
-            if ( t->unlock_tasks[j]->weight > t->weight )
-                t->weight = t->unlock_tasks[j]->weight;
-        if ( !t->implicit && t->tic > 0 )
-            t->weight += wscale * (t->toc - t->tic);
-        else
-            switch ( t->type ) {
-                case task_type_sort:
-                    t->weight += wscale * __builtin_popcount( t->flags ) * t->ci->count * ( sizeof(int)*8 - __builtin_clz( t->ci->count ) );
-                    break;
-                case task_type_self:
-                    t->weight += 1 * t->ci->count * t->ci->count;
-                    break;
-                case task_type_pair:
-                    if ( t->ci->nodeID != nodeID || t->cj->nodeID != nodeID )
-                        t->weight += 3 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
-                    else
-                        t->weight += 2 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
-                    break;
-                case task_type_sub:
-                    if ( t->cj != NULL ) {
-                        if ( t->ci->nodeID != nodeID || t->cj->nodeID != nodeID ) {
-                            if ( t->flags < 0 )
-                                t->weight += 3 * wscale * t->ci->count * t->cj->count;
-                            else
-                                t->weight += 3 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
-                            }
-                        else {
-                            if ( t->flags < 0 )
-                                t->weight += 2 * wscale * t->ci->count * t->cj->count;
-                            else
-                                t->weight += 2 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
-                            }
-                        }
-                    else
-                        t->weight += 1 * wscale * t->ci->count * t->ci->count;
-                    break;
-                case task_type_ghost:
-                    if ( t->ci == t->ci->super )
-                        t->weight += wscale * t->ci->count;
-                    break;
-                case task_type_kick1:
-                case task_type_kick2:
-                    t->weight += wscale * t->ci->count;
-                    break;
-                default:
-                    break;
-                }
-        if ( t->type == task_type_send )
-            t->weight  = INT_MAX / 8;
-        if ( t->type == task_type_recv )
-            t->weight *= 1.41; 
-        }
-    // message( "weighting tasks took %.3f ms." , (double)( getticks() - tic ) / CPU_TPS * 1000 );
-
-    /* int min = tasks[0].weight, max = tasks[0].weight;
-    for ( k = 1 ; k < nr_tasks ; k++ )
-    	if ( tasks[k].weight < min )
-	    min = tasks[k].weight;
-	else if ( tasks[k].weight > max )
-	    max = tasks[k].weight;
-    message( "task weights are in [ %i , %i ]." , min , max ); */
-        
-    }
 
+void scheduler_reweight(struct scheduler *s) {
+
+  int k, j, nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
+  struct task *t, *tasks = s->tasks;
+  int nodeID = s->nodeID;
+  float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.1897,
+                         0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.5788};
+  float wscale = 0.001;
+  // ticks tic;
+
+  /* Run through the tasks backwards and set their waits and
+     weights. */
+  // tic = getticks();
+  for (k = nr_tasks - 1; k >= 0; k--) {
+    t = &tasks[tid[k]];
+    t->weight = 0;
+    for (j = 0; j < t->nr_unlock_tasks; j++)
+      if (t->unlock_tasks[j]->weight > t->weight)
+        t->weight = t->unlock_tasks[j]->weight;
+    if (!t->implicit && t->tic > 0)
+      t->weight += wscale * (t->toc - t->tic);
+    else
+      switch (t->type) {
+        case task_type_sort:
+          t->weight += wscale * __builtin_popcount(t->flags) * t->ci->count *
+                       (sizeof(int) * 8 - __builtin_clz(t->ci->count));
+          break;
+        case task_type_self:
+          t->weight += 1 * t->ci->count * t->ci->count;
+          break;
+        case task_type_pair:
+          if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID)
+            t->weight +=
+                3 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags];
+          else
+            t->weight +=
+                2 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags];
+          break;
+        case task_type_sub:
+          if (t->cj != NULL) {
+            if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) {
+              if (t->flags < 0)
+                t->weight += 3 * wscale * t->ci->count * t->cj->count;
+              else
+                t->weight += 3 * wscale * t->ci->count * t->cj->count *
+                             sid_scale[t->flags];
+            } else {
+              if (t->flags < 0)
+                t->weight += 2 * wscale * t->ci->count * t->cj->count;
+              else
+                t->weight += 2 * wscale * t->ci->count * t->cj->count *
+                             sid_scale[t->flags];
+            }
+          } else
+            t->weight += 1 * wscale * t->ci->count * t->ci->count;
+          break;
+        case task_type_ghost:
+          if (t->ci == t->ci->super) t->weight += wscale * t->ci->count;
+          break;
+        case task_type_kick1:
+        case task_type_kick2:
+          t->weight += wscale * t->ci->count;
+          break;
+        default:
+          break;
+      }
+    if (t->type == task_type_send) t->weight = INT_MAX / 8;
+    if (t->type == task_type_recv) t->weight *= 1.41;
+  }
+  // message( "weighting tasks took %.3f ms." , (double)( getticks() - tic ) /
+  // CPU_TPS * 1000 );
+
+  /* int min = tasks[0].weight, max = tasks[0].weight;
+  for ( k = 1 ; k < nr_tasks ; k++ )
+      if ( tasks[k].weight < min )
+          min = tasks[k].weight;
+      else if ( tasks[k].weight > max )
+          max = tasks[k].weight;
+  message( "task weights are in [ %i , %i ]." , min , max ); */
+}
 
 /**
  * @brief Start the scheduler, i.e. fill the queues with ready tasks.
@@ -799,46 +876,44 @@ void scheduler_reweight ( struct scheduler *s ) {
  * @param s The #scheduler.
  * @param mask The task types to enqueue.
  */
- 
-void scheduler_start ( struct scheduler *s , unsigned int mask ) {
-
-    int k, j, nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
-    struct task *t, *tasks = s->tasks;
-    // ticks tic;
-    
-    /* Run throught the tasks and set their waits. */
-    // tic = getticks();
-    for ( k = nr_tasks - 1 ; k >= 0 ; k-- ) {
-        t = &tasks[ tid[k] ];
-        t->wait = 0;
-        t->rid = -1;
-        if ( !( (1 << t->type) & mask ) || t->skip )
-            continue;
-        for ( j = 0 ; j < t->nr_unlock_tasks ; j++ )
-            atomic_inc( &t->unlock_tasks[j]->wait );
-        }
-    // message( "waiting tasks took %.3f ms." , (double)( getticks() - tic ) / CPU_TPS * 1000 );
-        
-    /* Don't enqueue link tasks directly. */
-    mask &= ~(1 << task_type_link);
-        
-    /* Loop over the tasks and enqueue whoever is ready. */
-    // tic = getticks();
-    for ( k = 0 ; k < nr_tasks ; k++) {
-        t = &tasks[ tid[k] ];
-        if ( ( (1 << t->type) & mask ) && !t->skip ) {
-            if ( t->wait == 0 ) {
-		        scheduler_enqueue( s , t );
-		        pthread_cond_broadcast( &s->sleep_cond );
-		        }
-	        else
-	            break;
-            }
-        }
-    // message( "enqueueing tasks took %.3f ms." , (double)( getticks() - tic ) / CPU_TPS * 1000 );
-        
-    }
 
+void scheduler_start(struct scheduler *s, unsigned int mask) {
+
+  int k, j, nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
+  struct task *t, *tasks = s->tasks;
+  // ticks tic;
+
+  /* Run through the tasks and set their waits. */
+  // tic = getticks();
+  for (k = nr_tasks - 1; k >= 0; k--) {
+    t = &tasks[tid[k]];
+    t->wait = 0;
+    t->rid = -1;
+    if (!((1 << t->type) & mask) || t->skip) continue;
+    for (j = 0; j < t->nr_unlock_tasks; j++)
+      atomic_inc(&t->unlock_tasks[j]->wait);
+  }
+  // message( "waiting tasks took %.3f ms." , (double)( getticks() - tic ) /
+  // CPU_TPS * 1000 );
+
+  /* Don't enqueue link tasks directly. */
+  mask &= ~(1 << task_type_link);
+
+  /* Loop over the tasks and enqueue whoever is ready. */
+  // tic = getticks();
+  for (k = 0; k < nr_tasks; k++) {
+    t = &tasks[tid[k]];
+    if (((1 << t->type) & mask) && !t->skip) {
+      if (t->wait == 0) {
+        scheduler_enqueue(s, t);
+        pthread_cond_broadcast(&s->sleep_cond);
+      } else
+        break;
+    }
+  }
+  // message( "enqueueing tasks took %.3f ms." , (double)( getticks() - tic ) /
+  // CPU_TPS * 1000 );
+}
 
 /**
  * @brief Put a task on one of the queues.
@@ -846,97 +921,97 @@ void scheduler_start ( struct scheduler *s , unsigned int mask ) {
  * @param s The #scheduler.
  * @param t The #task.
  */
- 
-void scheduler_enqueue ( struct scheduler *s , struct task *t ) {
-
-    int qid = -1;
-    #ifdef WITH_MPI
-        int err;
-    #endif
-    
-    /* Ignore skipped tasks. */
-    if ( t->skip  || atomic_cas( &t->rid , -1 , 0 ) != -1 )
-        return;
-        
-    /* If this is an implicit task, just pretend it's done. */
-    if ( t->implicit ) {
-        for ( int j = 0 ; j < t->nr_unlock_tasks ; j++ ) {
-            struct task *t2 = t->unlock_tasks[j];
-            if ( atomic_dec( &t2->wait ) == 1 && !t2->skip )
-                scheduler_enqueue( s , t2 );
-            }
+
+void scheduler_enqueue(struct scheduler *s, struct task *t) {
+
+  int qid = -1;
+#ifdef WITH_MPI
+  int err;
+#endif
+
+  /* Ignore skipped tasks and tasks whose rid is no longer -1. */
+  if (t->skip || atomic_cas(&t->rid, -1, 0) != -1) return;
+
+  /* If this is an implicit task, just pretend it's done. */
+  if (t->implicit) {
+    for (int j = 0; j < t->nr_unlock_tasks; j++) {
+      struct task *t2 = t->unlock_tasks[j];
+      if (atomic_dec(&t2->wait) == 1 && !t2->skip) scheduler_enqueue(s, t2);
+    }
+  }
+
+  /* Otherwise, look for a suitable queue. */
+  else {
+
+    /* Find the previous owner for each task type, and do
+       any pre-processing needed. */
+    switch (t->type) {
+      case task_type_self:
+      case task_type_sort:
+      case task_type_ghost:
+      case task_type_kick2:
+        qid = t->ci->super->owner;
+        break;
+      case task_type_pair:
+      case task_type_sub:
+        qid = t->ci->super->owner;
+        if (t->cj != NULL &&
+            (qid < 0 ||
+             s->queues[qid].count > s->queues[t->cj->super->owner].count))
+          qid = t->cj->super->owner;
+        break;
+      case task_type_recv:
+#ifdef WITH_MPI
+        if ((err = MPI_Irecv(t->ci->parts, sizeof(struct part) * t->ci->count,
+                             MPI_BYTE, t->ci->nodeID, t->flags, MPI_COMM_WORLD,
+                             &t->req)) != MPI_SUCCESS) {
+          char buff[MPI_MAX_ERROR_STRING];
+          int len;
+          MPI_Error_string(err, buff, &len);
+          error("Failed to emit irecv for particle data (%s).", buff);
         }
-        
-    /* Otherwise, look for a suitable queue. */
-    else {
-        
-        /* Find the previous owner for each task type, and do
-           any pre-processing needed. */
-        switch ( t->type ) {
-            case task_type_self:
-            case task_type_sort:
-            case task_type_ghost:
-            case task_type_kick2:
-                qid = t->ci->super->owner;
-                break;
-            case task_type_pair:
-            case task_type_sub:
-                qid = t->ci->super->owner;
-                if ( t->cj != NULL && 
-                     ( qid < 0 || s->queues[qid].count > s->queues[t->cj->super->owner].count ) )
-                    qid = t->cj->super->owner;
-                break;
-            case task_type_recv:
-                #ifdef WITH_MPI
-                    if ( ( err = MPI_Irecv( t->ci->parts , sizeof(struct part) * t->ci->count , MPI_BYTE , t->ci->nodeID , t->flags , MPI_COMM_WORLD , &t->req ) ) != MPI_SUCCESS ) {
-                        char buff[ MPI_MAX_ERROR_STRING ];
-                        int len;
-                        MPI_Error_string( err , buff , &len );
-                        error( "Failed to emit irecv for particle data (%s)." , buff );
-                        }
-                    // message( "recieving %i parts with tag=%i from %i to %i." ,
-                    //     t->ci->count , t->flags , t->ci->nodeID , s->nodeID ); fflush(stdout);
-                    qid = 1 % s->nr_queues;
-                #else
-                    error( "SWIFT was not compiled with MPI support." );
-                #endif
-                break;
-            case task_type_send:
-                #ifdef WITH_MPI
-                    if ( ( err = MPI_Isend( t->ci->parts , sizeof(struct part) * t->ci->count , MPI_BYTE , t->cj->nodeID , t->flags , MPI_COMM_WORLD , &t->req ) ) != MPI_SUCCESS ) {
-                        char buff[ MPI_MAX_ERROR_STRING ];
-                        int len;
-                        MPI_Error_string( err , buff , &len );
-                        error( "Failed to emit isend for particle data (%s)." , buff );
-                        }
-                    // message( "sending %i parts with tag=%i from %i to %i." ,
-                    //     t->ci->count , t->flags , s->nodeID , t->cj->nodeID ); fflush(stdout);
-                    qid = 0;
-                #else
-                    error( "SWIFT was not compiled with MPI support." );
-                #endif
-                break;
-            default:
-                qid = -1;
-            }
-            
-        if ( qid >= s->nr_queues )
-            error( "Bad computed qid." );
-            
-        /* If no previous owner, find the shortest queue. */
-        if ( qid < 0 )
-            qid = rand() % s->nr_queues;
-
-        /* Increase the waiting counter. */
-        atomic_inc( &s->waiting );
-
-        /* Insert the task into that queue. */
-        queue_insert( &s->queues[qid] , t );
-        
+        // message( "receiving %i parts with tag=%i from %i to %i." ,
+        //     t->ci->count , t->flags , t->ci->nodeID , s->nodeID );
+        // fflush(stdout);
+        qid = 1 % s->nr_queues;
+#else
+        error("SWIFT was not compiled with MPI support.");
+#endif
+        break;
+      case task_type_send:
+#ifdef WITH_MPI
+        if ((err = MPI_Isend(t->ci->parts, sizeof(struct part) * t->ci->count,
+                             MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD,
+                             &t->req)) != MPI_SUCCESS) {
+          char buff[MPI_MAX_ERROR_STRING];
+          int len;
+          MPI_Error_string(err, buff, &len);
+          error("Failed to emit isend for particle data (%s).", buff);
         }
-        
+        // message( "sending %i parts with tag=%i from %i to %i." ,
+        //     t->ci->count , t->flags , s->nodeID , t->cj->nodeID );
+        // fflush(stdout);
+        qid = 0;
+#else
+        error("SWIFT was not compiled with MPI support.");
+#endif
+        break;
+      default:
+        qid = -1;
     }
 
+    if (qid >= s->nr_queues) error("Bad computed qid.");
+
+    /* If no previous owner, pick a queue at random. */
+    if (qid < 0) qid = rand() % s->nr_queues;
+
+    /* Increase the waiting counter. */
+    atomic_inc(&s->waiting);
+
+    /* Insert the task into that queue. */
+    queue_insert(&s->queues[qid], t);
+  }
+}
 
 /**
  * @brief Take care of a tasks dependencies.
@@ -947,58 +1022,49 @@ void scheduler_enqueue ( struct scheduler *s , struct task *t ) {
  * @return A pointer to the next task, if a suitable one has
  *         been identified.
  */
- 
-struct task *scheduler_done ( struct scheduler *s , struct task *t ) {
-
-    int k, res;
-    struct task *t2, *next = NULL;
-    struct cell *super = t->ci->super;
-    
-    /* Release whatever locks this task held. */
-    if ( !t->implicit )
-        task_unlock( t );
-        
-    /* Loop through the dependencies and add them to a queue if
-       they are ready. */
-    for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) {
-        t2 = t->unlock_tasks[k];
-        if ( ( res = atomic_dec( &t2->wait ) ) < 1 )
-            error( "Negative wait!" );
-        if ( res == 1 && !t2->skip ) {
-            if ( 0 && !t2->implicit &&
-                 t2->ci->super == super &&
-                 ( next == NULL || t2->weight > next->weight ) &&
-                 task_lock( t2 ) ) {
-                if ( next != NULL ) {
-                    task_unlock( next );
-                    scheduler_enqueue( s , next );
-                    }
-                next = t2;
-                }
-            else
-                scheduler_enqueue( s , t2 );
-            }
-        }
-        
-    /* Task definitely done. */
-    if ( !t->implicit ) {
-        t->toc = getticks();
-        pthread_mutex_lock( &s->sleep_mutex );
-        if ( next == NULL )
-            atomic_dec( &s->waiting );
-        pthread_cond_broadcast( &s->sleep_cond );
-        pthread_mutex_unlock( &s->sleep_mutex );
+
+struct task *scheduler_done(struct scheduler *s, struct task *t) {
+
+  int k, res;
+  struct task *t2, *next = NULL;
+  struct cell *super = t->ci->super;
+
+  /* Release whatever locks this task held. */
+  if (!t->implicit) task_unlock(t);
+
+  /* Loop through the dependencies and add them to a queue if
+     they are ready. */
+  for (k = 0; k < t->nr_unlock_tasks; k++) {
+    t2 = t->unlock_tasks[k];
+    if ((res = atomic_dec(&t2->wait)) < 1) error("Negative wait!");
+    if (res == 1 && !t2->skip) {
+      if (0 && !t2->implicit && t2->ci->super == super &&
+          (next == NULL || t2->weight > next->weight) && task_lock(t2)) {
+        if (next != NULL) {
+          task_unlock(next);
+          scheduler_enqueue(s, next);
         }
+        next = t2;
+      } else
+        scheduler_enqueue(s, t2);
+    }
+  }
 
-    /* Start the clock on the follow-up task. */
-    if ( next != NULL )
-        next->tic = getticks();
-        
-    /* Return the next best task. */
-    return next;
+  /* Task definitely done. */
+  if (!t->implicit) {
+    t->toc = getticks();
+    pthread_mutex_lock(&s->sleep_mutex);
+    if (next == NULL) atomic_dec(&s->waiting);
+    pthread_cond_broadcast(&s->sleep_cond);
+    pthread_mutex_unlock(&s->sleep_mutex);
+  }
 
-    }
+  /* Start the clock on the follow-up task. */
+  if (next != NULL) next->tic = getticks();
 
+  /* Return the next best task. */
+  return next;
+}
 
 /**
  * @brief Resolve a single dependency by hand.
@@ -1009,41 +1075,35 @@ struct task *scheduler_done ( struct scheduler *s , struct task *t ) {
  * @return A pointer to the next task, if a suitable one has
  *         been identified.
  */
- 
-struct task *scheduler_unlock ( struct scheduler *s , struct task *t ) {
-
-    int k, res;
-    struct task *t2, *next = NULL;
-    
-    /* Loop through the dependencies and add them to a queue if
-       they are ready. */
-    for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) {
-        t2 = t->unlock_tasks[k];
-        if ( ( res = atomic_dec( &t2->wait ) ) < 1 )
-            error( "Negative wait!" );
-        if ( res == 1 && !t2->skip )
-            scheduler_enqueue( s , t2 );
-        }
-        
-    /* Task definitely done. */
-    if ( !t->implicit ) {
-        t->toc = getticks();
-        pthread_mutex_lock( &s->sleep_mutex );
-        if ( next == NULL )
-            atomic_dec( &s->waiting );
-        pthread_cond_broadcast( &s->sleep_cond );
-        pthread_mutex_unlock( &s->sleep_mutex );
-        }
 
-    /* Start the clock on the follow-up task. */
-    if ( next != NULL )
-        next->tic = getticks();
-        
-    /* Return the next best task. */
-    return next;
+struct task *scheduler_unlock(struct scheduler *s, struct task *t) {
 
-    }
+  int k, res;
+  struct task *t2, *next = NULL;
+
+  /* Loop through the dependencies and add them to a queue if
+     they are ready. */
+  for (k = 0; k < t->nr_unlock_tasks; k++) {
+    t2 = t->unlock_tasks[k];
+    if ((res = atomic_dec(&t2->wait)) < 1) error("Negative wait!");
+    if (res == 1 && !t2->skip) scheduler_enqueue(s, t2);
+  }
 
+  /* Task definitely done. */
+  if (!t->implicit) {
+    t->toc = getticks();
+    pthread_mutex_lock(&s->sleep_mutex);
+    if (next == NULL) atomic_dec(&s->waiting);
+    pthread_cond_broadcast(&s->sleep_cond);
+    pthread_mutex_unlock(&s->sleep_mutex);
+  }
+
+  /* Start the clock on the follow-up task. */
+  if (next != NULL) next->tic = getticks();
+
+  /* Return the next best task. */
+  return next;
+}
 
 /**
  * @brief Get a task, preferably from the given queue.
@@ -1054,79 +1114,72 @@ struct task *scheduler_unlock ( struct scheduler *s , struct task *t ) {
  *
  * @return A pointer to a #task or @c NULL if there are no available tasks.
  */
- 
-struct task *scheduler_gettask ( struct scheduler *s , int qid , struct cell *super ) {
-
-    struct task *res = NULL;
-    int k, nr_queues = s->nr_queues;
-    unsigned int seed = qid;
-    
-    /* Check qid. */
-    if ( qid >= nr_queues || qid < 0 )
-	    error( "Bad queue ID." );
-
-    /* Loop as long as there are tasks... */
-    while ( s->waiting > 0 && res == NULL ) {
-        
-        /* Try more than once before sleeping. */
-        for ( int tries = 0 ; res == NULL && s->waiting && tries < scheduler_maxtries ; tries++ ) {
-        
-            /* Try to get a task from the suggested queue. */
-            if ( s->queues[qid].count > 0 ) {
-                TIMER_TIC
-                res = queue_gettask( &s->queues[qid] , super , 0 );
-                TIMER_TOC( timer_qget );
-                if ( res != NULL )
-                    break;
-                }
 
-            /* If unsucessful, try stealing from the other queues. */
-            if ( s->flags & scheduler_flag_steal ) {
-                int count = 0, qids[ nr_queues ];
-                for ( k = 0 ; k < nr_queues ; k++ )
-                    if ( s->queues[k].count > 0 )
-                        qids[ count++ ] = k;
-                for ( k = 0 ; k < scheduler_maxsteal && count > 0 ; k++ ) {
-                    int ind = rand_r( &seed ) % count;
-                    TIMER_TIC
-                    res = queue_gettask( &s->queues[ qids[ ind ] ] , super , 0 );
-                    TIMER_TOC( timer_qsteal );
-                    if ( res != NULL )
-                        break;
-                    else 
-                        qids[ ind ] = qids[ --count ];
-                    }
-                if ( res != NULL )
-                    break;
-                }
-                
-            }
-
-        /* If we failed, take a short nap. */
-        #ifdef WITH_MPI
-	    if ( res == NULL && qid > 1 ) {
-	#else
-            if ( res == NULL ) {
-	#endif
-                pthread_mutex_lock( &s->sleep_mutex );
-                if ( s->waiting > 0 )
-                    pthread_cond_wait( &s->sleep_cond , &s->sleep_mutex );
-                pthread_mutex_unlock( &s->sleep_mutex );
-                }
-
-        }
-        
-    /* Start the timer on this task, if we got one. */
-    if ( res != NULL ) {
-        res->tic = getticks();
-        res->rid = qid;
+struct task *scheduler_gettask(struct scheduler *s, int qid,
+                               struct cell *super) {
+
+  struct task *res = NULL;
+  int k, nr_queues = s->nr_queues;
+  unsigned int seed = qid;
+
+  /* Check qid. */
+  if (qid >= nr_queues || qid < 0) error("Bad queue ID.");
+
+  /* Loop as long as there are tasks... */
+  while (s->waiting > 0 && res == NULL) {
+
+    /* Try more than once before sleeping. */
+    for (int tries = 0; res == NULL && s->waiting && tries < scheduler_maxtries;
+         tries++) {
+
+      /* Try to get a task from the suggested queue. */
+      if (s->queues[qid].count > 0) {
+        TIMER_TIC
+        res = queue_gettask(&s->queues[qid], super, 0);
+        TIMER_TOC(timer_qget);
+        if (res != NULL) break;
+      }
+
+      /* If unsuccessful, try stealing from the other queues. */
+      if (s->flags & scheduler_flag_steal) {
+        int count = 0, qids[nr_queues];
+        for (k = 0; k < nr_queues; k++)
+          if (s->queues[k].count > 0) qids[count++] = k;
+        for (k = 0; k < scheduler_maxsteal && count > 0; k++) {
+          int ind = rand_r(&seed) % count;
+          TIMER_TIC
+          res = queue_gettask(&s->queues[qids[ind]], super, 0);
+          TIMER_TOC(timer_qsteal);
+          if (res != NULL)
+            break;
+          else
+            qids[ind] = qids[--count];
         }
-        
-    /* No milk today. */
-    return res;
+        if (res != NULL) break;
+      }
+    }
 
+/* If we failed, take a short nap. */
+#ifdef WITH_MPI
+    if (res == NULL && qid > 1) {
+#else
+    if (res == NULL) {
+#endif
+      pthread_mutex_lock(&s->sleep_mutex);
+      if (s->waiting > 0) pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
+      pthread_mutex_unlock(&s->sleep_mutex);
     }
+  }
+
+  /* Start the timer on this task, if we got one. */
+  if (res != NULL) {
+    res->tic = getticks();
+    res->rid = qid;
+  }
 
+  /* No milk today. */
+  return res;
+}
 
 /**
  * @brief Initialize the #scheduler.
@@ -1137,40 +1190,39 @@ struct task *scheduler_gettask ( struct scheduler *s , int qid , struct cell *su
  * @param flags The #scheduler flags.
  * @param nodeID The MPI rank
  */
- 
-void scheduler_init ( struct scheduler *s , struct space *space , int nr_queues , unsigned int flags , int nodeID ) {
-    
-    int k;
-    
-    /* Init the lock. */
-    lock_init( &s->lock );
-
-    /* Allocate the queues. */
-    if ( ( s->queues = (struct queue *)malloc( sizeof(struct queue) * nr_queues ) ) == NULL )
-        error( "Failed to allocate queues." );
-        
-    /* Initialize each queue. */
-    for ( k = 0 ; k < nr_queues ; k++ )
-        queue_init( &s->queues[k] , NULL );
-        
-    /* Init the sleep mutex and cond. */
-    if ( pthread_cond_init( &s->sleep_cond , NULL ) != 0 ||
-         pthread_mutex_init( &s->sleep_mutex , NULL ) != 0 )
-        error( "Failed to initialize sleep barrier." );
-        
-    /* Set the scheduler variables. */
-    s->nr_queues = nr_queues;
-    s->flags = flags;
-    s->space = space;
-    s->nodeID = nodeID;
-    
-    /* Init other values. */
-    s->tasks = NULL;
-    s->tasks_ind = NULL;
-    s->waiting = 0;
-    s->size = 0;
-    s->nr_tasks = 0;
-    s->tasks_next = 0;
-
-    }
 
+void scheduler_init(struct scheduler *s, struct space *space, int nr_queues,
+                    unsigned int flags, int nodeID) {
+
+  int k;
+
+  /* Init the lock. */
+  lock_init(&s->lock);
+
+  /* Allocate the queues. */
+  if ((s->queues = (struct queue *)malloc(sizeof(struct queue) * nr_queues)) ==
+      NULL)
+    error("Failed to allocate queues.");
+
+  /* Initialize each queue. */
+  for (k = 0; k < nr_queues; k++) queue_init(&s->queues[k], NULL);
+
+  /* Init the sleep mutex and cond. */
+  if (pthread_cond_init(&s->sleep_cond, NULL) != 0 ||
+      pthread_mutex_init(&s->sleep_mutex, NULL) != 0)
+    error("Failed to initialize sleep barrier.");
+
+  /* Set the scheduler variables. */
+  s->nr_queues = nr_queues;
+  s->flags = flags;
+  s->space = space;
+  s->nodeID = nodeID;
+
+  /* Init other values. */
+  s->tasks = NULL;
+  s->tasks_ind = NULL;
+  s->waiting = 0;
+  s->size = 0;
+  s->nr_tasks = 0;
+  s->tasks_next = 0;
+}
diff --git a/src/scheduler.h b/src/scheduler.h
index a71683db102a65e7c053677a3f3789c979626198..620b712885a1653397b3e9fd0e632cc0e562cf19 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -1,88 +1,103 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_SCHEDULER_H
+#define SWIFT_SCHEDULER_H
 
+/* Some standard headers. */
+#include <pthread.h>
+
+/* Includes. */
+#include "cell.h"
+#include "lock.h"
+#include "queue.h"
+#include "space.h"
+#include "task.h"
 
 /* Some constants. */
-#define scheduler_maxwait                    3
-#define scheduler_maxunlock                  40
-#define scheduler_dosub                      1
-#define scheduler_maxsteal                   10
-#define scheduler_maxtries                   2
-#define scheduler_doforcesplit               0 /* Beware: switching this on can/will
-                                                  break engine_addlink as it assumes
-                                                  a maximum number of tasks per cell. */
+#define scheduler_maxwait 3
+#define scheduler_maxunlock 40
+#define scheduler_dosub 1
+#define scheduler_maxsteal 10
+#define scheduler_maxtries 2
+#define scheduler_doforcesplit            \
+  0 /* Beware: switching this on can/will \
+       break engine_addlink as it assumes \
+       a maximum number of tasks per cell. */
 
 /* Flags . */
-#define scheduler_flag_none                  0
-#define scheduler_flag_steal                 1
-
+#define scheduler_flag_none 0
+#define scheduler_flag_steal 1
 
 /* Data of a scheduler. */
 struct scheduler {
 
-    /* Scheduler flags. */
-    unsigned int flags;
-
-    /* Number of queues in this scheduler. */
-    int nr_queues;
-    
-    /* Array of queues. */
-    struct queue *queues;
-    
-    /* Total number of tasks. */
-    int nr_tasks, size, tasks_next;
-    
-    /* Total number of waiting tasks. */
-    int waiting;
-    
-    /* The task array. */
-    struct task *tasks;
-    
-    /* The task indices. */
-    int *tasks_ind;
-    
-    /* Lock for this scheduler. */
-    lock_type lock;
-    
-    /* Waiting queue. */
-    pthread_mutex_t sleep_mutex;
-    pthread_cond_t sleep_cond;
-    
-    /* The space associated with this scheduler. */
-    struct space *space;
-    
-    /* The node we are working on. */
-    int nodeID;
-
-    };
+  /* Scheduler flags. */
+  unsigned int flags;
+
+  /* Number of queues in this scheduler. */
+  int nr_queues;
+
+  /* Array of queues. */
+  struct queue *queues;
+
+  /* Total number of tasks. */
+  int nr_tasks, size, tasks_next;
+
+  /* Total number of waiting tasks. */
+  int waiting;
 
+  /* The task array. */
+  struct task *tasks;
+
+  /* The task indices. */
+  int *tasks_ind;
+
+  /* Lock for this scheduler. */
+  lock_type lock;
+
+  /* Waiting queue. */
+  pthread_mutex_t sleep_mutex;
+  pthread_cond_t sleep_cond;
+
+  /* The space associated with this scheduler. */
+  struct space *space;
+
+  /* The node we are working on. */
+  int nodeID;
+};
 
 /* Function prototypes. */
-void scheduler_init ( struct scheduler *s , struct space *space , int nr_queues , unsigned int flags , int nodeID );
-struct task *scheduler_gettask ( struct scheduler *s , int qid , struct cell *super );
-void scheduler_enqueue ( struct scheduler *s , struct task *t );
-void scheduler_start ( struct scheduler *s , unsigned int mask );
-void scheduler_reset ( struct scheduler *s , int nr_tasks );
-void scheduler_ranktasks ( struct scheduler *s );
-void scheduler_reweight ( struct scheduler *s );
-struct task *scheduler_addtask ( struct scheduler *s , int type , int subtype , int flags , int wait , struct cell *ci , struct cell *cj , int tight );
-void scheduler_splittasks ( struct scheduler *s );
-struct task *scheduler_done ( struct scheduler *s , struct task *t );
-struct task *scheduler_unlock ( struct scheduler *s , struct task *t );
-void scheduler_addunlock ( struct scheduler *s , struct task *ta , struct task *tb );
+void scheduler_init(struct scheduler *s, struct space *space, int nr_queues,
+                    unsigned int flags, int nodeID);
+struct task *scheduler_gettask(struct scheduler *s, int qid,
+                               struct cell *super);
+void scheduler_enqueue(struct scheduler *s, struct task *t);
+void scheduler_start(struct scheduler *s, unsigned int mask);
+void scheduler_reset(struct scheduler *s, int nr_tasks);
+void scheduler_ranktasks(struct scheduler *s);
+void scheduler_reweight(struct scheduler *s);
+struct task *scheduler_addtask(struct scheduler *s, int type, int subtype,
+                               int flags, int wait, struct cell *ci,
+                               struct cell *cj, int tight);
+void scheduler_splittasks(struct scheduler *s);
+struct task *scheduler_done(struct scheduler *s, struct task *t);
+struct task *scheduler_unlock(struct scheduler *s, struct task *t);
+void scheduler_addunlock(struct scheduler *s, struct task *ta, struct task *tb);
+
+#endif /* SWIFT_SCHEDULER_H */
diff --git a/src/serial_io.c b/src/serial_io.c
index fa54be0b30a3f9e9c33221ab4ba5f668d274b209..f771f3d4f1b0ed5b94ff27e9e54fea484e02f381 100644
--- a/src/serial_io.c
+++ b/src/serial_io.c
@@ -2,20 +2,20 @@
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
@@ -23,30 +23,25 @@
 
 #if defined(HAVE_HDF5) && defined(WITH_MPI) && !defined(HAVE_PARALLEL_HDF5)
 
-
 /* Some standard headers. */
+#include <hdf5.h>
+#include <math.h>
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <stddef.h>
-#include <hdf5.h>
-#include <math.h>
 
-#include "mpi.h"
-
-#include "const.h"
-#include "cycle.h"
-#include "lock.h"
-#include "task.h"
-#include "part.h"
-#include "space.h"
-#include "scheduler.h"
-#include "engine.h"
-#include "error.h"
-#include "kernel.h"
-#include "common_io.h"
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
 
+/* This object's header. */
+#include "serial_io.h"
 
+/* Local includes. */
+#include "common_io.h"
+#include "error.h"
 
 /*-----------------------------------------------------------------------------
  * Routines reading an IC file
@@ -60,21 +55,25 @@
  * @param type The #DATA_TYPE of the attribute.
  * @param N The number of particles.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
- * @param part_c A (char*) pointer on the first occurence of the field of interest in the parts array
- * @param importance If COMPULSORY, the data must be present in the IC file. If OPTIONAL, the array will be zeroed when the data is not present.
+ * @param part_c A (char*) pointer to the first occurrence of the field of
+ * interest in the parts array
+ * @param importance If COMPULSORY, the data must be present in the IC file. If
+ * OPTIONAL, the array will be zeroed when the data is not present.
  *
- * @todo A better version using HDF5 hyperslabs to read the file directly into the part array 
+ * @todo A better version using HDF5 hyperslabs to read the file directly into
+ * the part array
  * will be written once the strucutres have been stabilized.
- *  
+ *
  * Calls #error() if an error occurs.
  */
-void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim, long long N_total, long long offset, char* part_c, enum DATA_IMPORTANCE importance)
-{
-  hid_t h_data=0, h_err=0, h_type=0, h_memspace=0, h_filespace=0;
+void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
+                      int dim, long long N_total, long long offset,
+                      char* part_c, enum DATA_IMPORTANCE importance) {
+  hid_t h_data = 0, h_err = 0, h_type = 0, h_memspace = 0, h_filespace = 0;
   hsize_t shape[2], offsets[2];
-  htri_t exist=0;
+  htri_t exist = 0;
   void* temp;
-  int i=0, rank=0;
+  int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
   const size_t partSize = sizeof(struct part);
@@ -82,56 +81,48 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim
 
   /* Check whether the dataspace exists or not */
   exist = H5Lexists(grp, name, 0);
-  if(exist < 0)
-    {
-      error( "Error while checking the existence of data set '%s'." , name );
-    }
-  else if(exist == 0)
-    {
-      if(importance == COMPULSORY)
-	{
-	  error( "Compulsory data set '%s' not present in the file." , name );
-	}
-      else
-	{
-	  for(i=0; i<N; ++i)
-	    memset(part_c+i*partSize, 0, copySize);	  
-	  return;
-	}
+  if (exist < 0) {
+    error("Error while checking the existence of data set '%s'.", name);
+  } else if (exist == 0) {
+    if (importance == COMPULSORY) {
+      error("Compulsory data set '%s' not present in the file.", name);
+    } else {
+      for (i = 0; i < N; ++i) memset(part_c + i * partSize, 0, copySize);
+      return;
     }
+  }
 
-  /* message( "Reading %s '%s' array...", importance == COMPULSORY ? "compulsory": "optional  ", name); */
+  /* message( "Reading %s '%s' array...", importance == COMPULSORY ?
+   * "compulsory": "optional  ", name); */
 
   /* Open data space */
   h_data = H5Dopen1(grp, name);
-  if(h_data < 0)
-    error( "Error while opening data space '%s'." , name );
+  if (h_data < 0) error("Error while opening data space '%s'.", name);
 
   /* Check data type */
   h_type = H5Dget_type(h_data);
-  if(h_type < 0)
-    error("Unable to retrieve data type from the file");
-  if(!H5Tequal(h_type, hdf5Type(type)))
+  if (h_type < 0) error("Unable to retrieve data type from the file");
+  if (!H5Tequal(h_type, hdf5Type(type)))
     error("Non-matching types between the code and the file");
-  
+
   /* Allocate temporary buffer */
   temp = malloc(N * dim * sizeOfType(type));
-  if(temp == NULL)
-    error("Unable to allocate memory for temporary buffer");
+  if (temp == NULL) error("Unable to allocate memory for temporary buffer");
 
   /* Prepare information for hyperslab */
-  if(dim > 1)
-    {
-      rank = 2;
-      shape[0] = N; shape[1] = dim;
-      offsets[0] = offset; offsets[1] = 0;
-    }
-  else
-    {
-      rank = 1;
-      shape[0] = N; shape[1] = 0;
-      offsets[0] = offset; offsets[1] = 0;
-    }
+  if (dim > 1) {
+    rank = 2;
+    shape[0] = N;
+    shape[1] = dim;
+    offsets[0] = offset;
+    offsets[1] = 0;
+  } else {
+    rank = 1;
+    shape[0] = N;
+    shape[1] = 0;
+    offsets[0] = offset;
+    offsets[1] = 0;
+  }
 
   /* Create data space in memory */
   h_memspace = H5Screate_simple(rank, shape, NULL);
@@ -140,21 +131,20 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim
   h_filespace = H5Dget_space(h_data);
   H5Sselect_hyperslab(h_filespace, H5S_SELECT_SET, offsets, NULL, shape, NULL);
 
-
   /* Read HDF5 dataspace in temporary buffer */
   /* Dirty version that happens to work for vectors but should be improved */
   /* Using HDF5 dataspaces would be better */
-  h_err = H5Dread(h_data, hdf5Type(type), h_memspace, h_filespace, H5P_DEFAULT, temp);
-  if(h_err < 0)
-    {
-      error( "Error while reading data array '%s'." , name );
-    }
+  h_err = H5Dread(h_data, hdf5Type(type), h_memspace, h_filespace, H5P_DEFAULT,
+                  temp);
+  if (h_err < 0) {
+    error("Error while reading data array '%s'.", name);
+  }
 
   /* Copy temporary buffer to particle data */
   temp_c = temp;
-  for(i=0; i<N; ++i)
-    memcpy(part_c+i*partSize, &temp_c[i*copySize], copySize);
-  
+  for (i = 0; i < N; ++i)
+    memcpy(part_c + i * partSize, &temp_c[i * copySize], copySize);
+
   /* Free and close everything */
   free(temp);
   H5Sclose(h_filespace);
@@ -178,7 +168,10 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim
  * @param importance Is the data compulsory or not
  *
  */
-#define readArray(grp, name, type, N, dim, part, N_total, offset, field, importance) readArrayBackEnd(grp, name, type, N, dim, N_total, offset, (char*)(&(part[0]).field), importance)
+#define readArray(grp, name, type, N, dim, part, N_total, offset, field, \
+                  importance)                                            \
+  readArrayBackEnd(grp, name, type, N, dim, N_total, offset,             \
+                   (char*)(&(part[0]).field), importance)
 
 /**
  * @brief Reads an HDF5 initial condition file (GADGET-3 type)
@@ -199,12 +192,15 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim
  * Calls #error() if an error occurs.
  *
  */
-void read_ic_serial ( char* fileName, double dim[3], struct part **parts,  int* N, int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info)
-{
-  hid_t h_file=0, h_grp=0;
-  double boxSize[3]={0.0,-1.0,-1.0};         /* GADGET has only cubic boxes (in cosmological mode) */
-  int numParticles[6]={0};   /* GADGET has 6 particle types. We only keep the type 0*/
-  int numParticles_highWord[6]={0};
+void read_ic_serial(char* fileName, double dim[3], struct part** parts, int* N,
+                    int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
+                    MPI_Info info) {
+  hid_t h_file = 0, h_grp = 0;
+  double boxSize[3] = {
+      0.0, -1.0, -1.0}; /* GADGET has only cubic boxes (in cosmological mode) */
+  int numParticles[6] = {
+      0}; /* GADGET has 6 particle types. We only keep the type 0*/
+  int numParticles_highWord[6] = {0};
   long long offset = 0;
   long long N_total = 0;
   int rank;
@@ -215,40 +211,39 @@ void read_ic_serial ( char* fileName, double dim[3], struct part **parts,  int*
     /* Open file */
     /* message("Opening file '%s' as IC.", fileName); */
     h_file = H5Fopen(fileName, H5F_ACC_RDONLY, H5P_DEFAULT);
-    if(h_file < 0)
-      error( "Error while opening file '%s' for inital read." , fileName );
-    
+    if (h_file < 0)
+      error("Error while opening file '%s' for inital read.", fileName);
+
     /* Open header to read simulation properties */
     /* message("Reading runtime parameters..."); */
     h_grp = H5Gopen1(h_file, "/RuntimePars");
-    if(h_grp < 0)
-      error("Error while opening runtime parameters\n");
-    
+    if (h_grp < 0) error("Error while opening runtime parameters\n");
+
     /* Read the relevant information */
     readAttribute(h_grp, "PeriodicBoundariesOn", INT, periodic);
 
     /* Close runtime parameters */
     H5Gclose(h_grp);
-  
+
     /* Open header to read simulation properties */
     /* message("Reading file header..."); */
     h_grp = H5Gopen1(h_file, "/Header");
-    if(h_grp < 0)
-      error("Error while opening file header\n");
-    
+    if (h_grp < 0) error("Error while opening file header\n");
+
     /* Read the relevant information and print status */
     readAttribute(h_grp, "BoxSize", DOUBLE, boxSize);
     readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
     readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord);
 
-    N_total = ((long long) numParticles[0]) + ((long long) numParticles_highWord[0] << 32);   
+    N_total = ((long long)numParticles[0]) +
+              ((long long)numParticles_highWord[0] << 32);
     dim[0] = boxSize[0];
-    dim[1] = ( boxSize[1] < 0 ) ? boxSize[0] : boxSize[1];
-    dim[2] = ( boxSize[2] < 0 ) ? boxSize[0] : boxSize[2];
+    dim[1] = (boxSize[1] < 0) ? boxSize[0] : boxSize[1];
+    dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2];
 
     /* message("Found %d particles in a %speriodic box of size [%f %f %f].",  */
     /* 	 *N, (periodic ? "": "non-"), dim[0], dim[1], dim[2]); */
-    
+
     /* Close header */
     H5Gclose(h_grp);
 
@@ -256,78 +251,82 @@ void read_ic_serial ( char* fileName, double dim[3], struct part **parts,  int*
     H5Fclose(h_file);
   }
 
-
   /* Now need to broadcast that information to all ranks. */
   MPI_Bcast(periodic, 1, MPI_INT, 0, comm);
   MPI_Bcast(&N_total, 1, MPI_LONG_LONG, 0, comm);
   MPI_Bcast(dim, 3, MPI_DOUBLE, 0, comm);
-  
 
   /* Divide the particles among the tasks. */
   offset = mpi_rank * N_total / mpi_size;
   *N = (mpi_rank + 1) * N_total / mpi_size - offset;
 
-
   /* Allocate memory to store particles */
-  if(posix_memalign( (void*)parts , part_align , (*N) * sizeof(struct part)) != 0)
+  if (posix_memalign((void*)parts, part_align, (*N) * sizeof(struct part)) != 0)
     error("Error while allocating memory for particles");
-  bzero( *parts , *N * sizeof(struct part) );
-  /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) / (1024.*1024.)); */
+  bzero(*parts, *N * sizeof(struct part));
+  /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) /
+   * (1024.*1024.)); */
 
   /* Now loop over ranks and read the data */
-  for ( rank = 0; rank < mpi_size ; ++ rank ) {
+  for (rank = 0; rank < mpi_size; ++rank) {
 
     /* Is it this rank's turn to read ? */
-    if ( rank == mpi_rank ) {
+    if (rank == mpi_rank) {
 
       h_file = H5Fopen(fileName, H5F_ACC_RDONLY, H5P_DEFAULT);
-      if(h_file < 0)
-	error( "Error while opening file '%s' on rank %d." , fileName, mpi_rank );	
-      		  
+      if (h_file < 0)
+        error("Error while opening file '%s' on rank %d.", fileName, mpi_rank);
+
       /* Open SPH particles group */
       /* message("Reading particle arrays..."); */
       h_grp = H5Gopen1(h_file, "/PartType0");
-      if(h_grp < 0)
-	error( "Error while opening particle group on rank %d.\n", mpi_rank);
-      
+      if (h_grp < 0)
+        error("Error while opening particle group on rank %d.\n", mpi_rank);
+
       /* Read arrays */
-      readArray(h_grp, "Coordinates", DOUBLE, *N, 3, *parts, N_total, offset, x, COMPULSORY);
-      readArray(h_grp, "Velocities", FLOAT, *N, 3, *parts, N_total, offset, v, COMPULSORY);
-      readArray(h_grp, "Masses", FLOAT, *N, 1, *parts, N_total, offset, mass, COMPULSORY);
-      readArray(h_grp, "SmoothingLength", FLOAT, *N, 1, *parts, N_total, offset, h, COMPULSORY);
-      readArray(h_grp, "InternalEnergy", FLOAT, *N, 1, *parts, N_total, offset, u, COMPULSORY);
-      readArray(h_grp, "ParticleIDs", ULONGLONG, *N, 1, *parts, N_total, offset, id, COMPULSORY);
-      readArray(h_grp, "TimeStep", FLOAT, *N, 1, *parts, N_total, offset, dt, OPTIONAL);
-      readArray(h_grp, "Acceleration", FLOAT, *N, 3, *parts, N_total, offset, a, OPTIONAL);
-      readArray(h_grp, "Density", FLOAT, *N, 1, *parts, N_total, offset, rho, OPTIONAL );
-      
+      readArray(h_grp, "Coordinates", DOUBLE, *N, 3, *parts, N_total, offset, x,
+                COMPULSORY);
+      readArray(h_grp, "Velocities", FLOAT, *N, 3, *parts, N_total, offset, v,
+                COMPULSORY);
+      readArray(h_grp, "Masses", FLOAT, *N, 1, *parts, N_total, offset, mass,
+                COMPULSORY);
+      readArray(h_grp, "SmoothingLength", FLOAT, *N, 1, *parts, N_total, offset,
+                h, COMPULSORY);
+      readArray(h_grp, "InternalEnergy", FLOAT, *N, 1, *parts, N_total, offset,
+                u, COMPULSORY);
+      readArray(h_grp, "ParticleIDs", ULONGLONG, *N, 1, *parts, N_total, offset,
+                id, COMPULSORY);
+      readArray(h_grp, "TimeStep", FLOAT, *N, 1, *parts, N_total, offset, dt,
+                OPTIONAL);
+      readArray(h_grp, "Acceleration", FLOAT, *N, 3, *parts, N_total, offset, a,
+                OPTIONAL);
+      readArray(h_grp, "Density", FLOAT, *N, 1, *parts, N_total, offset, rho,
+                OPTIONAL);
+
       /* Close particle group */
       H5Gclose(h_grp);
 
       /* Close file */
       H5Fclose(h_file);
-
     }
 
     /* Wait for the read of the reading to complete */
     MPI_Barrier(comm);
-
   }
 
   /* message("Done Reading particles..."); */
-
 }
 
-
 /*-----------------------------------------------------------------------------
  * Routines writing an output file
  *-----------------------------------------------------------------------------*/
 
-void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name, enum DATA_TYPE type, long long N_total, int dim,  struct UnitSystem* us, enum UnitConversionFactor convFactor)
-{
-  hid_t h_data=0, h_err=0, h_space=0;
+void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name,
+                  enum DATA_TYPE type, long long N_total, int dim,
+                  struct UnitSystem* us, enum UnitConversionFactor convFactor) {
+  hid_t h_data = 0, h_err = 0, h_space = 0;
   void* temp = 0;
-  int i=0, rank=0;
+  int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
   const size_t partSize = sizeof(struct part);
@@ -337,45 +336,42 @@ void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name, enum DAT
 
   /* Create data space */
   h_space = H5Screate(H5S_SIMPLE);
-  if(h_space < 0)
-    {
-      error( "Error while creating data space for field '%s'." , name );
-    }
-  
-  if(dim > 1)
-    {
-      rank = 2;
-      shape[0] = N_total; shape[1] = dim;
-    }
-  else
-    {
-      rank = 1;
-      shape[0] = N_total; shape[1] = 0;
-    }
-  
+  if (h_space < 0) {
+    error("Error while creating data space for field '%s'.", name);
+  }
+
+  if (dim > 1) {
+    rank = 2;
+    shape[0] = N_total;
+    shape[1] = dim;
+  } else {
+    rank = 1;
+    shape[0] = N_total;
+    shape[1] = 0;
+  }
+
   /* Change shape of data space */
   h_err = H5Sset_extent_simple(h_space, rank, shape, NULL);
-  if(h_err < 0)
-    {
-      error( "Error while changing data space shape for field '%s'." , name );
-    }
-  
+  if (h_err < 0) {
+    error("Error while changing data space shape for field '%s'.", name);
+  }
+
   /* Create dataset */
   h_data = H5Dcreate1(grp, name, hdf5Type(type), h_space, H5P_DEFAULT);
-  if(h_data < 0)
-    {
-      error( "Error while creating dataspace '%s'." , name );
-    }
+  if (h_data < 0) {
+    error("Error while creating dataspace '%s'.", name);
+  }
 
   /* Write XMF description for this data set */
   writeXMFline(xmfFile, fileName, name, N_total, dim, type);
 
   /* Write unit conversion factors for this data set */
-  conversionString( buffer, us, convFactor );
-  writeAttribute_d( h_data, "CGS conversion factor", conversionFactor( us, convFactor ) );
-  writeAttribute_f( h_data, "h-scale exponant", hFactor( us, convFactor ) );
-  writeAttribute_f( h_data, "a-scale exponant", aFactor( us, convFactor ) );
-  writeAttribute_s( h_data, "Conversion factor", buffer );
+  conversionString(buffer, us, convFactor);
+  writeAttribute_d(h_data, "CGS conversion factor",
+                   conversionFactor(us, convFactor));
+  writeAttribute_f(h_data, "h-scale exponant", hFactor(us, convFactor));
+  writeAttribute_f(h_data, "a-scale exponant", aFactor(us, convFactor));
+  writeAttribute_s(h_data, "Conversion factor", buffer);
 
   H5Dclose(h_data);
   H5Sclose(h_space);
@@ -391,19 +387,21 @@ void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name, enum DAT
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
- * @param part_c A (char*) pointer on the first occurence of the field of interest in the parts array
+ * @param part_c A (char*) pointer to the first occurrence of the field of
+ * interest in the parts array
  * @param us The UnitSystem currently in use
  * @param convFactor The UnitConversionFactor for this array
  *
  *
  * Calls #error() if an error occurs.
  */
-void writeArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim, long long N_total, long long offset, char* part_c)
-{
-  hid_t h_data=0, h_err=0, h_memspace=0, h_filespace=0;
+void writeArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
+                       int dim, long long N_total, long long offset,
+                       char* part_c) {
+  hid_t h_data = 0, h_err = 0, h_memspace = 0, h_filespace = 0;
   hsize_t shape[2], shape_total[2], offsets[2];
   void* temp = 0;
-  int i=0, rank=0;
+  int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
   const size_t partSize = sizeof(struct part);
@@ -413,55 +411,56 @@ void writeArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int di
 
   /* Allocate temporary buffer */
   temp = malloc(N * dim * sizeOfType(type));
-  if(temp == NULL)
-    error("Unable to allocate memory for temporary buffer");
+  if (temp == NULL) error("Unable to allocate memory for temporary buffer");
 
   /* Copy particle data to temporary buffer */
   temp_c = temp;
-  for(i=0; i<N; ++i)
-    memcpy(&temp_c[i*copySize], part_c+i*partSize, copySize);
+  for (i = 0; i < N; ++i)
+    memcpy(&temp_c[i * copySize], part_c + i * partSize, copySize);
 
   /* Construct information for the hyperslab */
-  if(dim > 1)
-    {
-      rank = 2;
-      shape[0] = N; shape[1] = dim;
-      shape_total[0] = N_total; shape_total[1] = dim;
-      offsets[0] = offset; offsets[1] = 0;
-    }
-  else
-    {
-      rank = 1;
-      shape[0] = N; shape[1] = 0;
-      shape_total[0] = N_total; shape_total[1] = 0;
-      offsets[0] = offset; offsets[1] = 0;
-    }
+  if (dim > 1) {
+    rank = 2;
+    shape[0] = N;
+    shape[1] = dim;
+    shape_total[0] = N_total;
+    shape_total[1] = dim;
+    offsets[0] = offset;
+    offsets[1] = 0;
+  } else {
+    rank = 1;
+    shape[0] = N;
+    shape[1] = 0;
+    shape_total[0] = N_total;
+    shape_total[1] = 0;
+    offsets[0] = offset;
+    offsets[1] = 0;
+  }
 
-  
   /* Create data space in memory */
   h_memspace = H5Screate(H5S_SIMPLE);
-  if(h_memspace < 0)
-      error( "Error while creating data space (memory) for field '%s'." , name );
+  if (h_memspace < 0)
+    error("Error while creating data space (memory) for field '%s'.", name);
 
   /* Change shape of memory data space */
   h_err = H5Sset_extent_simple(h_memspace, rank, shape, NULL);
-  if(h_err < 0)
-      error( "Error while changing data space (memory) shape for field '%s'." , name );
-  
+  if (h_err < 0)
+    error("Error while changing data space (memory) shape for field '%s'.",
+          name);
+
   /* Open pre-existing data set */
   h_data = H5Dopen(grp, name, H5P_DEFAULT);
-  if(h_data < 0)
-      error( "Error while opening dataset '%s'." , name );
+  if (h_data < 0) error("Error while opening dataset '%s'.", name);
 
   /* Select data space in that data set */
   h_filespace = H5Dget_space(h_data);
   H5Sselect_hyperslab(h_filespace, H5S_SELECT_SET, offsets, NULL, shape, NULL);
 
   /* Write temporary buffer to HDF5 dataspace */
-  h_err = H5Dwrite(h_data, hdf5Type(type), h_memspace, h_filespace, H5P_DEFAULT, temp);
-  if(h_err < 0)
-    error( "Error while writing data array '%s'." , name );
-    
+  h_err = H5Dwrite(h_data, hdf5Type(type), h_memspace, h_filespace, H5P_DEFAULT,
+                   temp);
+  if (h_err < 0) error("Error while writing data array '%s'.", name);
+
   /* Free and close everything */
   free(temp);
   H5Dclose(h_data);
@@ -479,14 +478,16 @@ void writeArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int di
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
- * @param part A (char*) pointer on the first occurence of the field of interest in the parts array
+ * @param part A (char*) pointer to the first occurrence of the field of
+ * interest in the parts array
  * @param field The name (code name) of the field to read from.
  * @param us The UnitSystem currently in use
  * @param convFactor The UnitConversionFactor for this array
  *
  */
-#define writeArray(grp, name, type, N, dim, N_total, offset, part, field) writeArrayBackEnd(grp, name, type, N, dim, N_total, offset, (char*)(&(part[0]).field))
-
+#define writeArray(grp, name, type, N, dim, N_total, offset, part, field) \
+  writeArrayBackEnd(grp, name, type, N, dim, N_total, offset,             \
+                    (char*)(&(part[0]).field))
 
 /**
  * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor
@@ -496,20 +497,20 @@ void writeArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int di
  *
  * Creates an HDF5 output file and writes the particles contained
  * in the engine. If such a file already exists, it is erased and replaced
- * by the new one. 
+ * by the new one.
  * The companion XMF file is also updated accordingly.
  *
  * Calls #error() if an error occurs.
  *
  */
-void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info )
-{  
-  hid_t h_file=0, h_grp=0;
+void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
+                         int mpi_size, MPI_Comm comm, MPI_Info info) {
+  hid_t h_file = 0, h_grp = 0;
   int N = e->s->nr_parts;
   int periodic = e->s->periodic;
-  int numParticles[6]={N,0};
-  int numParticlesHighWord[6]={0};
-  unsigned int flagEntropy[6]={0};
+  int numParticles[6] = {N, 0};
+  int numParticlesHighWord[6] = {0};
+  unsigned int flagEntropy[6] = {0};
   long long N_total = 0, offset = 0;
   double offset_d = 0., N_d = 0., N_total_d = 0.;
   int numFiles = 1;
@@ -517,7 +518,7 @@ void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank
   struct part* parts = e->s->parts;
   FILE* xmfFile = 0;
   static int outputCount = 0;
-  
+
   /* File name */
   char fileName[200];
   sprintf(fileName, "output_%03i.hdf5", outputCount);
@@ -527,62 +528,60 @@ void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank
   N_d = (double)N;
   MPI_Exscan(&N_d, &offset_d, 1, MPI_DOUBLE, MPI_SUM, comm);
   N_total_d = offset_d + N_d;
-  MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size-1, comm);
-  if(N_total_d > 1.e15)
-    error("Error while computing the offest for parallel output: Simulation has more than 10^15 particles.\n");
-  N_total = (long long) N_total_d;
-  offset = (long long) offset_d;
-
+  MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size - 1, comm);
+  if (N_total_d > 1.e15)
+    error(
+        "Error while computing the offest for parallel output: Simulation has "
+        "more than 10^15 particles.\n");
+  N_total = (long long)N_total_d;
+  offset = (long long)offset_d;
 
   /* Do common stuff first */
-  if ( mpi_rank == 0 ) {
+  if (mpi_rank == 0) {
 
     /* First time, we need to create the XMF file */
-    if(outputCount == 0)
-      createXMFfile();
-    
+    if (outputCount == 0) createXMFfile();
+
     /* Prepare the XMF file for the new entry */
     xmfFile = prepareXMFfile();
-    
+
     /* Write the part corresponding to this specific output */
     writeXMFheader(xmfFile, N_total, fileName, e->time);
 
     /* Open file */
     /* message("Opening file '%s'.", fileName); */
-    h_file = H5Fcreate(fileName, H5F_ACC_TRUNC, H5P_DEFAULT,H5P_DEFAULT);
-    if(h_file < 0)
-      {
-	error( "Error while opening file '%s'." , fileName );
-      }
+    h_file = H5Fcreate(fileName, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
+    if (h_file < 0) {
+      error("Error while opening file '%s'.", fileName);
+    }
 
     /* Open header to write simulation properties */
     /* message("Writing runtime parameters..."); */
     h_grp = H5Gcreate1(h_file, "/RuntimePars", 0);
-    if(h_grp < 0)
-      error("Error while creating runtime parameters group\n");
+    if (h_grp < 0) error("Error while creating runtime parameters group\n");
 
     /* Write the relevant information */
     writeAttribute(h_grp, "PeriodicBoundariesOn", INT, &periodic, 1);
 
     /* Close runtime parameters */
     H5Gclose(h_grp);
-  
+
     /* Open header to write simulation properties */
     /* message("Writing file header..."); */
     h_grp = H5Gcreate1(h_file, "/Header", 0);
-    if(h_grp < 0)
-      error("Error while creating file header\n");
-    
+    if (h_grp < 0) error("Error while creating file header\n");
+
     /* Print the relevant information and print status */
     writeAttribute(h_grp, "BoxSize", DOUBLE, e->s->dim, 3);
     writeAttribute(h_grp, "Time", DOUBLE, &e->time, 1);
 
     /* GADGET-2 legacy values */
-    numParticles[0] = (unsigned int) N_total ;
+    numParticles[0] = (unsigned int)N_total;
     writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles, 6);
     writeAttribute(h_grp, "NumPart_Total", UINT, numParticles, 6);
-    numParticlesHighWord[0] = (unsigned int) (N_total >> 32);
-    writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord, 6);
+    numParticlesHighWord[0] = (unsigned int)(N_total >> 32);
+    writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord,
+                   6);
     double MassTable[6] = {0., 0., 0., 0., 0., 0.};
     writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, 6);
     writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy, 6);
@@ -596,24 +595,32 @@ void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank
 
     /* Print the system of Units */
     writeUnitSystem(h_file, us);
-		  
+
     /* Create SPH particles group */
     /* message("Writing particle arrays..."); */
     h_grp = H5Gcreate1(h_file, "/PartType0", 0);
-    if(h_grp < 0)
-      error( "Error while creating particle group.\n");
+    if (h_grp < 0) error("Error while creating particle group.\n");
 
     /* Prepare the arrays in the file */
-    prepareArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N_total, 3, us, UNIT_CONV_LENGTH);
-    prepareArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N_total, 3, us, UNIT_CONV_SPEED);
-    prepareArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N_total, 1, us, UNIT_CONV_MASS);
-    prepareArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N_total, 1, us, UNIT_CONV_LENGTH);
-    prepareArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N_total, 1, us, UNIT_CONV_ENERGY_PER_UNIT_MASS);
-    prepareArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N_total, 1, us, UNIT_CONV_NO_UNITS);
-    prepareArray(h_grp, fileName, xmfFile, "TimeStep", FLOAT, N_total, 1, us, UNIT_CONV_TIME);
-    prepareArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N_total, 3, us, UNIT_CONV_ACCELERATION);
-    prepareArray(h_grp, fileName, xmfFile, "Density", FLOAT, N_total, 1, us, UNIT_CONV_DENSITY);
- 
+    prepareArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N_total, 3,
+                 us, UNIT_CONV_LENGTH);
+    prepareArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N_total, 3, us,
+                 UNIT_CONV_SPEED);
+    prepareArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N_total, 1, us,
+                 UNIT_CONV_MASS);
+    prepareArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N_total, 1,
+                 us, UNIT_CONV_LENGTH);
+    prepareArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N_total, 1,
+                 us, UNIT_CONV_ENERGY_PER_UNIT_MASS);
+    prepareArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N_total, 1,
+                 us, UNIT_CONV_NO_UNITS);
+    prepareArray(h_grp, fileName, xmfFile, "TimeStep", FLOAT, N_total, 1, us,
+                 UNIT_CONV_TIME);
+    prepareArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N_total, 3,
+                 us, UNIT_CONV_ACCELERATION);
+    prepareArray(h_grp, fileName, xmfFile, "Density", FLOAT, N_total, 1, us,
+                 UNIT_CONV_DENSITY);
+
     /* Close particle group */
     H5Gclose(h_grp);
 
@@ -624,31 +631,32 @@ void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank
     writeXMFfooter(xmfFile);
   }
 
-
-
   /* Now loop over ranks and write the data */
-  for ( rank = 0; rank < mpi_size ; ++ rank ) {
+  for (rank = 0; rank < mpi_size; ++rank) {
 
     /* Is it this rank's turn to write ? */
-    if ( rank == mpi_rank ) {
+    if (rank == mpi_rank) {
 
       h_file = H5Fopen(fileName, H5F_ACC_RDWR, H5P_DEFAULT);
-      if(h_file < 0)
-	error( "Error while opening file '%s' on rank %d." , fileName, mpi_rank );	
+      if (h_file < 0)
+        error("Error while opening file '%s' on rank %d.", fileName, mpi_rank);
 
       /* Open SPH particles group */
       /* message("Reading particle arrays..."); */
       h_grp = H5Gopen1(h_file, "/PartType0");
-      if(h_grp < 0)
-	error( "Error while opening particle group on rank %d.\n", mpi_rank);
+      if (h_grp < 0)
+        error("Error while opening particle group on rank %d.\n", mpi_rank);
 
       /* Write arrays */
       writeArray(h_grp, "Coordinates", DOUBLE, N, 3, N_total, offset, parts, x);
       writeArray(h_grp, "Velocities", FLOAT, N, 3, N_total, offset, parts, v);
       writeArray(h_grp, "Masses", FLOAT, N, 1, N_total, offset, parts, mass);
-      writeArray(h_grp, "SmoothingLength", FLOAT, N, 1, N_total, offset, parts, h);
-      writeArray(h_grp, "InternalEnergy", FLOAT, N, 1, N_total, offset, parts, u);
-      writeArray(h_grp, "ParticleIDs", ULONGLONG, N, 1, N_total, offset, parts, id);
+      writeArray(h_grp, "SmoothingLength", FLOAT, N, 1, N_total, offset, parts,
+                 h);
+      writeArray(h_grp, "InternalEnergy", FLOAT, N, 1, N_total, offset, parts,
+                 u);
+      writeArray(h_grp, "ParticleIDs", ULONGLONG, N, 1, N_total, offset, parts,
+                 id);
       writeArray(h_grp, "TimeStep", FLOAT, N, 1, N_total, offset, parts, dt);
       writeArray(h_grp, "Acceleration", FLOAT, N, 3, N_total, offset, parts, a);
       writeArray(h_grp, "Density", FLOAT, N, 1, N_total, offset, parts, rho);
@@ -658,19 +666,14 @@ void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank
 
       /* Close file */
       H5Fclose(h_file);
-
     }
 
     /* Wait for the read of the reading to complete */
     MPI_Barrier(comm);
-
   }
 
   /* message("Done writing particles..."); */
   ++outputCount;
 }
 
-
-#endif  /* HAVE_HDF5 */
-
-
+#endif /* HAVE_HDF5 */
diff --git a/src/serial_io.h b/src/serial_io.h
index 3349f221531ce7c4a2a290b121500e5d4336ed6b..bb05fc61bdca1b0db36386e6773a034cc17ea6b9 100644
--- a/src/serial_io.h
+++ b/src/serial_io.h
@@ -1,28 +1,43 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_SERIAL_IO_H
+#define SWIFT_SERIAL_IO_H
 
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+/* Includes. */
+#include "engine.h"
+#include "part.h"
+#include "units.h"
 
 #if defined(HAVE_HDF5) && defined(WITH_MPI) && !defined(HAVE_PARALLEL_HDF5)
 
-void read_ic_serial ( char* fileName, double dim[3], struct part **parts,  int* N, int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info);
+void read_ic_serial(char* fileName, double dim[3], struct part** parts, int* N,
+                    int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
+                    MPI_Info info);
 
-void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info );
+void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
+                         int mpi_size, MPI_Comm comm, MPI_Info info);
 
 #endif
 
+#endif /* SWIFT_SERIAL_IO_H */
diff --git a/src/single_io.c b/src/single_io.c
index 485cb60aa51140682ef868d0323b31f00ce4ed9e..0874442982df747ed0eff38bf060e50e0d205034 100644
--- a/src/single_io.c
+++ b/src/single_io.c
@@ -2,20 +2,20 @@
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
@@ -23,28 +23,20 @@
 
 #if defined(HAVE_HDF5) && !defined(WITH_MPI)
 
-
 /* Some standard headers. */
+#include <hdf5.h>
+#include <math.h>
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <stddef.h>
-#include <hdf5.h>
-#include <math.h>
-
-#include "const.h"
-#include "cycle.h"
-#include "lock.h"
-#include "task.h"
-#include "part.h"
-#include "space.h"
-#include "scheduler.h"
-#include "engine.h"
-#include "error.h"
-#include "kernel.h"
-#include "common_io.h"
 
+/* This object's header. */
+#include "single_io.h"
 
+/* Local includes. */
+#include "common_io.h"
+#include "error.h"
 
 /*-----------------------------------------------------------------------------
  * Routines reading an IC file
@@ -58,20 +50,23 @@
  * @param type The #DATA_TYPE of the attribute.
  * @param N The number of particles.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
- * @param part_c A (char*) pointer on the first occurence of the field of interest in the parts array
- * @param importance If COMPULSORY, the data must be present in the IC file. If OPTIONAL, the array will be zeroed when the data is not present.
+ * @param part_c A (char*) pointer to the first occurrence of the field of
+ * interest in the parts array
+ * @param importance If COMPULSORY, the data must be present in the IC file. If
+ * OPTIONAL, the array will be zeroed when the data is not present.
  *
- * @todo A better version using HDF5 hyperslabs to read the file directly into the part array 
+ * @todo A better version using HDF5 hyperslabs to read the file directly into
+ * the part array
  * will be written once the strucutres have been stabilized.
- *  
+ *
  * Calls #error() if an error occurs.
  */
-void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim, char* part_c, enum DATA_IMPORTANCE importance)
-{
-  hid_t h_data=0, h_err=0, h_type=0;
-  htri_t exist=0;
+void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
+                      int dim, char* part_c, enum DATA_IMPORTANCE importance) {
+  hid_t h_data = 0, h_err = 0, h_type = 0;
+  htri_t exist = 0;
   void* temp;
-  int i=0;
+  int i = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
   const size_t partSize = sizeof(struct part);
@@ -79,62 +74,53 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim
 
   /* Check whether the dataspace exists or not */
   exist = H5Lexists(grp, name, 0);
-  if(exist < 0)
-    {
-      error( "Error while checking the existence of data set '%s'." , name );
+  if (exist < 0) {
+    error("Error while checking the existence of data set '%s'.", name);
+  } else if (exist == 0) {
+    if (importance == COMPULSORY) {
+      error("Compulsory data set '%s' not present in the file.", name);
+    } else {
+      /* message("Optional data set '%s' not present. Zeroing this particle
+       * field...", name);	   */
+
+      for (i = 0; i < N; ++i) memset(part_c + i * partSize, 0, copySize);
+
+      return;
     }
-  else if(exist == 0)
-    {
-      if(importance == COMPULSORY)
-	{
-	  error( "Compulsory data set '%s' not present in the file." , name );
-	}
-      else
-	{
-	  /* message("Optional data set '%s' not present. Zeroing this particle field...", name);	   */
-	  
-	  for(i=0; i<N; ++i)
-	    memset(part_c+i*partSize, 0, copySize);
-	  
-	  return;
-	}
-   }
-
-  /* message( "Reading %s '%s' array...", importance == COMPULSORY ? "compulsory": "optional  ", name); */
+  }
+
+  /* message( "Reading %s '%s' array...", importance == COMPULSORY ?
+   * "compulsory": "optional  ", name); */
 
   /* Open data space */
   h_data = H5Dopen1(grp, name);
-  if(h_data < 0)
-    {
-      error( "Error while opening data space '%s'." , name );
-    }
+  if (h_data < 0) {
+    error("Error while opening data space '%s'.", name);
+  }
 
   /* Check data type */
   h_type = H5Dget_type(h_data);
-  if(h_type < 0)
-    error("Unable to retrieve data type from the file");
-  if(!H5Tequal(h_type, hdf5Type(type)))
+  if (h_type < 0) error("Unable to retrieve data type from the file");
+  if (!H5Tequal(h_type, hdf5Type(type)))
     error("Non-matching types between the code and the file");
-  
+
   /* Allocate temporary buffer */
   temp = malloc(N * dim * sizeOfType(type));
-  if(temp == NULL)
-    error("Unable to allocate memory for temporary buffer");
+  if (temp == NULL) error("Unable to allocate memory for temporary buffer");
 
   /* Read HDF5 dataspace in temporary buffer */
   /* Dirty version that happens to work for vectors but should be improved */
   /* Using HDF5 dataspaces would be better */
   h_err = H5Dread(h_data, hdf5Type(type), H5S_ALL, H5S_ALL, H5P_DEFAULT, temp);
-  if(h_err < 0)
-    {
-      error( "Error while reading data array '%s'." , name );
-    }
+  if (h_err < 0) {
+    error("Error while reading data array '%s'.", name);
+  }
 
   /* Copy temporary buffer to particle data */
   temp_c = temp;
-  for(i=0; i<N; ++i)
-    memcpy(part_c+i*partSize, &temp_c[i*copySize], copySize);
-  
+  for (i = 0; i < N; ++i)
+    memcpy(part_c + i * partSize, &temp_c[i * copySize], copySize);
+
   /* Free and close everything */
   free(temp);
   H5Tclose(h_type);
@@ -154,7 +140,9 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim
  * @param importance Is the data compulsory or not
  *
  */
-#define readArray(grp, name, type, N, dim, part, field, importance) readArrayBackEnd(grp, name, type, N, dim, (char*)(&(part[0]).field), importance)
+#define readArray(grp, name, type, N, dim, part, field, importance)    \
+  readArrayBackEnd(grp, name, type, N, dim, (char*)(&(part[0]).field), \
+                   importance)
 
 /**
  * @brief Reads an HDF5 initial condition file (GADGET-3 type)
@@ -175,46 +163,45 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim
  * Calls #error() if an error occurs.
  *
  */
-void read_ic_single ( char* fileName, double dim[3], struct part **parts,  int* N, int* periodic)
-{
-  hid_t h_file=0, h_grp=0;
-  double boxSize[3]={0.0,-1.0,-1.0};         /* GADGET has only cubic boxes (in cosmological mode) */
-  int numParticles[6]={0};   /* GADGET has 6 particle types. We only keep the type 0*/
+void read_ic_single(char* fileName, double dim[3], struct part** parts, int* N,
+                    int* periodic) {
+  hid_t h_file = 0, h_grp = 0;
+  double boxSize[3] = {
+      0.0, -1.0, -1.0}; /* GADGET has only cubic boxes (in cosmological mode) */
+  int numParticles[6] = {
+      0}; /* GADGET has 6 particle types. We only keep the type 0*/
 
   /* Open file */
   /* message("Opening file '%s' as IC.", fileName); */
   h_file = H5Fopen(fileName, H5F_ACC_RDONLY, H5P_DEFAULT);
-  if(h_file < 0)
-    {
-      error( "Error while opening file '%s'." , fileName );
-    }
+  if (h_file < 0) {
+    error("Error while opening file '%s'.", fileName);
+  }
 
   /* Open header to read simulation properties */
   /* message("Reading runtime parameters..."); */
   h_grp = H5Gopen1(h_file, "/RuntimePars");
-  if(h_grp < 0)
-    error("Error while opening runtime parameters\n");
+  if (h_grp < 0) error("Error while opening runtime parameters\n");
 
   /* Read the relevant information */
   readAttribute(h_grp, "PeriodicBoundariesOn", INT, periodic);
 
   /* Close runtime parameters */
   H5Gclose(h_grp);
-  
+
   /* Open header to read simulation properties */
   /* message("Reading file header..."); */
   h_grp = H5Gopen1(h_file, "/Header");
-  if(h_grp < 0)
-    error("Error while opening file header\n");
-    
+  if (h_grp < 0) error("Error while opening file header\n");
+
   /* Read the relevant information and print status */
   readAttribute(h_grp, "BoxSize", DOUBLE, boxSize);
   readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
 
   *N = numParticles[0];
   dim[0] = boxSize[0];
-  dim[1] = ( boxSize[1] < 0 ) ? boxSize[0] : boxSize[1];
-  dim[2] = ( boxSize[2] < 0 ) ? boxSize[0] : boxSize[2];
+  dim[1] = (boxSize[1] < 0) ? boxSize[0] : boxSize[1];
+  dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2];
 
   /* message("Found %d particles in a %speriodic box of size [%f %f %f].",  */
   /* 	 *N, (periodic ? "": "non-"), dim[0], dim[1], dim[2]); */
@@ -223,17 +210,17 @@ void read_ic_single ( char* fileName, double dim[3], struct part **parts,  int*
   H5Gclose(h_grp);
 
   /* Allocate memory to store particles */
-  if(posix_memalign( (void*)parts , part_align , *N * sizeof(struct part)) != 0)
+  if (posix_memalign((void*)parts, part_align, *N * sizeof(struct part)) != 0)
     error("Error while allocating memory for particles");
-  bzero( *parts , *N * sizeof(struct part) );
+  bzero(*parts, *N * sizeof(struct part));
+
+  /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) /
+   * (1024.*1024.)); */
 
-  /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) / (1024.*1024.)); */
-		  
   /* Open SPH particles group */
   /* message("Reading particle arrays..."); */
   h_grp = H5Gopen1(h_file, "/PartType0");
-  if(h_grp < 0)
-    error( "Error while opening particle group.\n");
+  if (h_grp < 0) error("Error while opening particle group.\n");
 
   /* Read arrays */
   readArray(h_grp, "Coordinates", DOUBLE, *N, 3, *parts, x, COMPULSORY);
@@ -244,7 +231,7 @@ void read_ic_single ( char* fileName, double dim[3], struct part **parts,  int*
   readArray(h_grp, "ParticleIDs", ULONGLONG, *N, 1, *parts, id, COMPULSORY);
   readArray(h_grp, "TimeStep", FLOAT, *N, 1, *parts, dt, OPTIONAL);
   readArray(h_grp, "Acceleration", FLOAT, *N, 3, *parts, a, OPTIONAL);
-  readArray(h_grp, "Density", FLOAT, *N, 1, *parts, rho, OPTIONAL );
+  readArray(h_grp, "Density", FLOAT, *N, 1, *parts, rho, OPTIONAL);
 
   /* Close particle group */
   H5Gclose(h_grp);
@@ -255,7 +242,6 @@ void read_ic_single ( char* fileName, double dim[3], struct part **parts,  int*
   H5Fclose(h_file);
 }
 
-
 /*-----------------------------------------------------------------------------
  * Routines writing an output file
  *-----------------------------------------------------------------------------*/
@@ -270,20 +256,24 @@ void read_ic_single ( char* fileName, double dim[3], struct part **parts,  int*
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
- * @param part_c A (char*) pointer on the first occurence of the field of interest in the parts array
+ * @param part_c A (char*) pointer to the first occurrence of the field of
+ * interest in the parts array
  * @param us The UnitSystem currently in use
  * @param convFactor The UnitConversionFactor for this array
  *
- * @todo A better version using HDF5 hyperslabs to write the file directly from the part array
+ * @todo A better version using HDF5 hyperslabs to write the file directly from
+ * the part array
  * will be written once the strucutres have been stabilized.
  *
  * Calls #error() if an error occurs.
  */
-void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enum DATA_TYPE type, int N, int dim, char* part_c, struct UnitSystem* us, enum UnitConversionFactor convFactor)
-{
-  hid_t h_data=0, h_err=0, h_space=0;
+void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
+                       enum DATA_TYPE type, int N, int dim, char* part_c,
+                       struct UnitSystem* us,
+                       enum UnitConversionFactor convFactor) {
+  hid_t h_data = 0, h_err = 0, h_space = 0;
   void* temp = 0;
-  int i=0, rank=0;
+  int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
   const size_t partSize = sizeof(struct part);
@@ -295,63 +285,58 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu
 
   /* Allocate temporary buffer */
   temp = malloc(N * dim * sizeOfType(type));
-  if(temp == NULL)
-    error("Unable to allocate memory for temporary buffer");
+  if (temp == NULL) error("Unable to allocate memory for temporary buffer");
 
   /* Copy particle data to temporary buffer */
   temp_c = temp;
-  for(i=0; i<N; ++i)
-    memcpy(&temp_c[i*copySize], part_c+i*partSize, copySize);
+  for (i = 0; i < N; ++i)
+    memcpy(&temp_c[i * copySize], part_c + i * partSize, copySize);
 
   /* Create data space */
   h_space = H5Screate(H5S_SIMPLE);
-  if(h_space < 0)
-    {
-      error( "Error while creating data space for field '%s'." , name );
-    }
-  
-  if(dim > 1)
-    {
-      rank = 2;
-      shape[0] = N; shape[1] = dim;
-    }
-  else
-    {
-      rank = 1;
-      shape[0] = N; shape[1] = 0;
-    }
-  
+  if (h_space < 0) {
+    error("Error while creating data space for field '%s'.", name);
+  }
+
+  if (dim > 1) {
+    rank = 2;
+    shape[0] = N;
+    shape[1] = dim;
+  } else {
+    rank = 1;
+    shape[0] = N;
+    shape[1] = 0;
+  }
+
   /* Change shape of data space */
   h_err = H5Sset_extent_simple(h_space, rank, shape, NULL);
-  if(h_err < 0)
-    {
-      error( "Error while changing data space shape for field '%s'." , name );
-    }
-  
+  if (h_err < 0) {
+    error("Error while changing data space shape for field '%s'.", name);
+  }
+
   /* Create dataset */
   h_data = H5Dcreate1(grp, name, hdf5Type(type), h_space, H5P_DEFAULT);
-  if(h_data < 0)
-    {
-      error( "Error while creating dataspace '%s'." , name );
-    }
-  
+  if (h_data < 0) {
+    error("Error while creating dataspace '%s'.", name);
+  }
+
   /* Write temporary buffer to HDF5 dataspace */
   h_err = H5Dwrite(h_data, hdf5Type(type), h_space, H5S_ALL, H5P_DEFAULT, temp);
-  if(h_err < 0)
-    {
-      error( "Error while writing data array '%s'." , name );
-    }
+  if (h_err < 0) {
+    error("Error while writing data array '%s'.", name);
+  }
 
   /* Write XMF description for this data set */
   writeXMFline(xmfFile, fileName, name, N, dim, type);
 
   /* Write unit conversion factors for this data set */
-  conversionString( buffer, us, convFactor );
-  writeAttribute_d( h_data, "CGS conversion factor", conversionFactor( us, convFactor ) );
-  writeAttribute_f( h_data, "h-scale exponant", hFactor( us, convFactor ) );
-  writeAttribute_f( h_data, "a-scale exponant", aFactor( us, convFactor ) );
-  writeAttribute_s( h_data, "Conversion factor", buffer );
-  
+  conversionString(buffer, us, convFactor);
+  writeAttribute_d(h_data, "CGS conversion factor",
+                   conversionFactor(us, convFactor));
+  writeAttribute_f(h_data, "h-scale exponant", hFactor(us, convFactor));
+  writeAttribute_f(h_data, "a-scale exponant", aFactor(us, convFactor));
+  writeAttribute_s(h_data, "Conversion factor", buffer);
+
   /* Free and close everything */
   free(temp);
   H5Dclose(h_data);
@@ -368,13 +353,17 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
- * @param part A (char*) pointer on the first occurence of the field of interest in the parts array
+ * @param part A (char*) pointer to the first occurrence of the field of
+ * interest in the parts array
  * @param field The name (code name) of the field to read from.
  * @param us The UnitSystem currently in use
  * @param convFactor The UnitConversionFactor for this array
  *
  */
-#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, field, us, convFactor) writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, (char*)(&(part[0]).field), us, convFactor)
+#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, field, \
+                   us, convFactor)                                          \
+  writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim,             \
+                    (char*)(&(part[0]).field), us, convFactor)
 
 /**
  * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor
@@ -384,66 +373,60 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu
  *
  * Creates an HDF5 output file and writes the particles contained
  * in the engine. If such a file already exists, it is erased and replaced
- * by the new one. 
+ * by the new one.
  * The companion XMF file is also updated accordingly.
  *
  * Calls #error() if an error occurs.
  *
  */
-void write_output_single (struct engine *e, struct UnitSystem* us)
-{
-  
-  hid_t h_file=0, h_grp=0;
+void write_output_single(struct engine* e, struct UnitSystem* us) {
+
+  hid_t h_file = 0, h_grp = 0;
   int N = e->s->nr_parts;
   int periodic = e->s->periodic;
-  int numParticles[6]={N,0};
-  int numParticlesHighWord[6]={0};
+  int numParticles[6] = {N, 0};
+  int numParticlesHighWord[6] = {0};
   int numFiles = 1;
   struct part* parts = e->s->parts;
   FILE* xmfFile = 0;
   static int outputCount = 0;
-  
+
   /* File name */
   char fileName[200];
   sprintf(fileName, "output_%03i.hdf5", outputCount);
 
   /* First time, we need to create the XMF file */
-  if(outputCount == 0)
-    createXMFfile();
-  
+  if (outputCount == 0) createXMFfile();
+
   /* Prepare the XMF file for the new entry */
   xmfFile = prepareXMFfile();
 
   /* Write the part corresponding to this specific output */
   writeXMFheader(xmfFile, N, fileName, e->time);
 
-
   /* Open file */
   /* message("Opening file '%s'.", fileName); */
-  h_file = H5Fcreate(fileName, H5F_ACC_TRUNC, H5P_DEFAULT,H5P_DEFAULT);
-  if(h_file < 0)
-    {
-      error( "Error while opening file '%s'." , fileName );
-    }
+  h_file = H5Fcreate(fileName, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
+  if (h_file < 0) {
+    error("Error while opening file '%s'.", fileName);
+  }
 
   /* Open header to write simulation properties */
   /* message("Writing runtime parameters..."); */
   h_grp = H5Gcreate1(h_file, "/RuntimePars", 0);
-  if(h_grp < 0)
-    error("Error while creating runtime parameters group\n");
+  if (h_grp < 0) error("Error while creating runtime parameters group\n");
 
   /* Write the relevant information */
   writeAttribute(h_grp, "PeriodicBoundariesOn", INT, &periodic, 1);
 
   /* Close runtime parameters */
   H5Gclose(h_grp);
-  
+
   /* Open header to write simulation properties */
   /* message("Writing file header..."); */
   h_grp = H5Gcreate1(h_file, "/Header", 0);
-  if(h_grp < 0)
-    error("Error while creating file header\n");
-    
+  if (h_grp < 0) error("Error while creating file header\n");
+
   /* Print the relevant information and print status */
   writeAttribute(h_grp, "BoxSize", DOUBLE, e->s->dim, 3);
   writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles, 6);
@@ -451,7 +434,8 @@ void write_output_single (struct engine *e, struct UnitSystem* us)
 
   /* GADGET-2 legacy values */
   writeAttribute(h_grp, "NumPart_Total", UINT, numParticles, 6);
-  writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord, 6);
+  writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord,
+                 6);
   double MassTable[6] = {0., 0., 0., 0., 0., 0.};
   writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, 6);
   writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, numParticlesHighWord, 6);
@@ -465,23 +449,31 @@ void write_output_single (struct engine *e, struct UnitSystem* us)
 
   /* Print the system of Units */
   writeUnitSystem(h_file, us);
-		  
+
   /* Create SPH particles group */
   /* message("Writing particle arrays..."); */
   h_grp = H5Gcreate1(h_file, "/PartType0", 0);
-  if(h_grp < 0)
-    error( "Error while creating particle group.\n");
+  if (h_grp < 0) error("Error while creating particle group.\n");
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts, v, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts, h, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts, u, us, UNIT_CONV_ENERGY_PER_UNIT_MASS);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts, id, us, UNIT_CONV_NO_UNITS);
-  writeArray(h_grp, fileName, xmfFile, "TimeStep", FLOAT, N, 1, parts, dt, us, UNIT_CONV_TIME);
-  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts, a, us, UNIT_CONV_ACCELERATION);
-  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, rho, us, UNIT_CONV_DENSITY);
+  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts, x,
+             us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts, v, us,
+             UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, mass, us,
+             UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts, h,
+             us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts, u,
+             us, UNIT_CONV_ENERGY_PER_UNIT_MASS);
+  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts,
+             id, us, UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, "TimeStep", FLOAT, N, 1, parts, dt, us,
+             UNIT_CONV_TIME);
+  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts, a,
+             us, UNIT_CONV_ACCELERATION);
+  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, rho, us,
+             UNIT_CONV_DENSITY);
 
   /* Close particle group */
   H5Gclose(h_grp);
@@ -497,7 +489,4 @@ void write_output_single (struct engine *e, struct UnitSystem* us)
   ++outputCount;
 }
 
-
-#endif  /* HAVE_HDF5 */
-
-
+#endif /* HAVE_HDF5 */
diff --git a/src/single_io.h b/src/single_io.h
index 3cc58a46cc5398affd63e5d7e22b317ae79db3f5..91d229178bbd45df3ba358172d1f52c70008adb7 100644
--- a/src/single_io.h
+++ b/src/single_io.h
@@ -1,28 +1,36 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_SINGLE_IO_H
+#define SWIFT_SINGLE_IO_H
 
+/* Includes. */
+#include "engine.h"
+#include "part.h"
+#include "units.h"
 
 #if defined(HAVE_HDF5) && !defined(WITH_MPI)
 
-void read_ic_single ( char* fileName, double dim[3], struct part **parts,  int* N, int* periodic);
+void read_ic_single(char* fileName, double dim[3], struct part** parts, int* N,
+                    int* periodic);
 
-void write_output_single ( struct engine* e, struct UnitSystem* us );
+void write_output_single(struct engine* e, struct UnitSystem* us);
 
 #endif
 
+#endif /* SWIFT_SINGLE_IO_H */
diff --git a/src/space.c b/src/space.c
index f9aa0d142a55007a9aa1bdaa83147034d7111048..fcdbfa24906153252f3b8a8855ca63703c648da6 100644
--- a/src/space.c
+++ b/src/space.c
@@ -1,54 +1,46 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
 
 /* Some standard headers. */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <pthread.h>
 #include <float.h>
 #include <limits.h>
 #include <math.h>
-
+#include <string.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
-    #include <mpi.h>
+#include <mpi.h>
 #endif
 
-/* Local headers. */
-#include "const.h"
-#include "cycle.h"
-#include "lock.h"
-#include "task.h"
-#include "kernel.h"
-#include "part.h"
+/* This object's header. */
 #include "space.h"
-#include "multipole.h"
-#include "cell.h"
-#include "scheduler.h"
+
+/* Local headers. */
+#include "atomic.h"
 #include "engine.h"
-#include "runner.h"
 #include "error.h"
+#include "kernel.h"
+#include "lock.h"
+#include "runner.h"
 
 /* Split size. */
 int space_splitsize = space_splitsize_default;
@@ -57,36 +49,34 @@ int space_maxsize = space_maxsize_default;
 
 /* Map shift vector to sortlist. */
 const int sortlistID[27] = {
-    /* ( -1 , -1 , -1 ) */   0 ,
-    /* ( -1 , -1 ,  0 ) */   1 , 
-    /* ( -1 , -1 ,  1 ) */   2 ,
-    /* ( -1 ,  0 , -1 ) */   3 ,
-    /* ( -1 ,  0 ,  0 ) */   4 , 
-    /* ( -1 ,  0 ,  1 ) */   5 ,
-    /* ( -1 ,  1 , -1 ) */   6 ,
-    /* ( -1 ,  1 ,  0 ) */   7 , 
-    /* ( -1 ,  1 ,  1 ) */   8 ,
-    /* (  0 , -1 , -1 ) */   9 ,
-    /* (  0 , -1 ,  0 ) */   10 , 
-    /* (  0 , -1 ,  1 ) */   11 ,
-    /* (  0 ,  0 , -1 ) */   12 ,
-    /* (  0 ,  0 ,  0 ) */   0 , 
-    /* (  0 ,  0 ,  1 ) */   12 ,
-    /* (  0 ,  1 , -1 ) */   11 ,
-    /* (  0 ,  1 ,  0 ) */   10 , 
-    /* (  0 ,  1 ,  1 ) */   9 ,
-    /* (  1 , -1 , -1 ) */   8 ,
-    /* (  1 , -1 ,  0 ) */   7 , 
-    /* (  1 , -1 ,  1 ) */   6 ,
-    /* (  1 ,  0 , -1 ) */   5 ,
-    /* (  1 ,  0 ,  0 ) */   4 , 
-    /* (  1 ,  0 ,  1 ) */   3 ,
-    /* (  1 ,  1 , -1 ) */   2 ,
-    /* (  1 ,  1 ,  0 ) */   1 , 
-    /* (  1 ,  1 ,  1 ) */   0 
-    };
-    
-    
+    /* ( -1 , -1 , -1 ) */ 0,
+    /* ( -1 , -1 ,  0 ) */ 1,
+    /* ( -1 , -1 ,  1 ) */ 2,
+    /* ( -1 ,  0 , -1 ) */ 3,
+    /* ( -1 ,  0 ,  0 ) */ 4,
+    /* ( -1 ,  0 ,  1 ) */ 5,
+    /* ( -1 ,  1 , -1 ) */ 6,
+    /* ( -1 ,  1 ,  0 ) */ 7,
+    /* ( -1 ,  1 ,  1 ) */ 8,
+    /* (  0 , -1 , -1 ) */ 9,
+    /* (  0 , -1 ,  0 ) */ 10,
+    /* (  0 , -1 ,  1 ) */ 11,
+    /* (  0 ,  0 , -1 ) */ 12,
+    /* (  0 ,  0 ,  0 ) */ 0,
+    /* (  0 ,  0 ,  1 ) */ 12,
+    /* (  0 ,  1 , -1 ) */ 11,
+    /* (  0 ,  1 ,  0 ) */ 10,
+    /* (  0 ,  1 ,  1 ) */ 9,
+    /* (  1 , -1 , -1 ) */ 8,
+    /* (  1 , -1 ,  0 ) */ 7,
+    /* (  1 , -1 ,  1 ) */ 6,
+    /* (  1 ,  0 , -1 ) */ 5,
+    /* (  1 ,  0 ,  0 ) */ 4,
+    /* (  1 ,  0 ,  1 ) */ 3,
+    /* (  1 ,  1 , -1 ) */ 2,
+    /* (  1 ,  1 ,  0 ) */ 1,
+    /* (  1 ,  1 ,  1 ) */ 0};
+
 /**
  * @brief Get the shift-id of the given pair of cells, swapping them
  *      if need be.
@@ -98,199 +88,200 @@ const int sortlistID[27] = {
  *
  * @return The shift ID and set shift, may or may not swap ci and cj.
  */
- 
-int space_getsid ( struct space *s , struct cell **ci , struct cell **cj , double *shift ) {
-
-    int k, sid = 0, periodic = s->periodic;
-    struct cell *temp;
-    double dx[3];
-
-    /* Get the relative distance between the pairs, wrapping. */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        dx[k] = (*cj)->loc[k] - (*ci)->loc[k];
-        if ( periodic && dx[k] < -s->dim[k]/2 )
-            shift[k] = s->dim[k];
-        else if ( periodic && dx[k] > s->dim[k]/2 )
-            shift[k] = -s->dim[k];
-        else
-            shift[k] = 0.0;
-        dx[k] += shift[k];
-        }
-        
-    /* Get the sorting index. */
-    for ( k = 0 ; k < 3 ; k++ )
-        sid = 3*sid + ( (dx[k] < 0.0) ? 0 : ( (dx[k] > 0.0) ? 2 : 1 ) );
-
-    /* Switch the cells around? */
-    if ( runner_flip[sid] ) {
-        temp = *ci; *ci = *cj; *cj = temp;
-        for ( k = 0 ; k < 3 ; k++ )
-            shift[k] = -shift[k];
-        }
-    sid = sortlistID[sid];
-    
-    /* Return the sort ID. */
-    return sid;
-
-    }
 
+int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
+                 double *shift) {
+
+  int k, sid = 0, periodic = s->periodic;
+  struct cell *temp;
+  double dx[3];
+
+  /* Get the relative distance between the pairs, wrapping. */
+  for (k = 0; k < 3; k++) {
+    dx[k] = (*cj)->loc[k] - (*ci)->loc[k];
+    if (periodic && dx[k] < -s->dim[k] / 2)
+      shift[k] = s->dim[k];
+    else if (periodic && dx[k] > s->dim[k] / 2)
+      shift[k] = -s->dim[k];
+    else
+      shift[k] = 0.0;
+    dx[k] += shift[k];
+  }
+
+  /* Get the sorting index. */
+  for (k = 0; k < 3; k++)
+    sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1));
+
+  /* Switch the cells around? */
+  if (runner_flip[sid]) {
+    temp = *ci;
+    *ci = *cj;
+    *cj = temp;
+    for (k = 0; k < 3; k++) shift[k] = -shift[k];
+  }
+  sid = sortlistID[sid];
+
+  /* Return the sort ID. */
+  return sid;
+}
 
 /**
  * @brief Recursively dismantle a cell tree.
  *
  */
- 
-void space_rebuild_recycle ( struct space *s , struct cell *c ) {
-    
-    int k;
-    
-    if ( c->split )
-        for ( k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL ) {
-                space_rebuild_recycle( s , c->progeny[k] );
-                space_recycle( s , c->progeny[k] );
-                c->progeny[k] = NULL;
-                }
-    
-    }
-    
-    
+
+void space_rebuild_recycle(struct space *s, struct cell *c) {
+
+  int k;
+
+  if (c->split)
+    for (k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) {
+        space_rebuild_recycle(s, c->progeny[k]);
+        space_recycle(s, c->progeny[k]);
+        c->progeny[k] = NULL;
+      }
+}
+
 /**
  * @brief Re-build the cell grid.
  *
  * @param s The #space.
  * @param cell_max Maximum cell edge length.
  */
- 
-void space_regrid ( struct space *s , double cell_max ) {
-
-    float h_max = s->cell_min / kernel_gamma / space_stretch, dmin;
-    int i, j, k, cdim[3], nr_parts = s->nr_parts;
-    struct cell *restrict c;
-    // ticks tic;
-    
-    /* Run through the parts and get the current h_max. */
-    // tic = getticks();
-    if ( s->cells != NULL ) {
-        for ( k = 0 ; k < s->nr_cells ; k++ ) {
-            if ( s->cells[k].h_max > h_max )
-                h_max = s->cells[k].h_max;
-            }
-        }
-    else {
-        for ( k = 0 ; k < nr_parts ; k++ ) {
-            if ( s->parts[k].h > h_max )
-                h_max = s->parts[k].h;
-            }
-        s->h_max = h_max;
-        }
-        
-    /* If we are running in parallel, make sure everybody agrees on
-       how large the largest cell should be. */
-    #ifdef WITH_MPI
-    {
-      float buff;
-      if ( MPI_Allreduce( &h_max , &buff , 1 , MPI_FLOAT , MPI_MAX , MPI_COMM_WORLD ) != MPI_SUCCESS )
-          error( "Failed to aggreggate the rebuild flag accross nodes." );
-      h_max = buff;
+
+void space_regrid(struct space *s, double cell_max) {
+
+  float h_max = s->cell_min / kernel_gamma / space_stretch, dmin;
+  int i, j, k, cdim[3], nr_parts = s->nr_parts;
+  struct cell *restrict c;
+  // ticks tic;
+
+  /* Run through the parts and get the current h_max. */
+  // tic = getticks();
+  if (s->cells != NULL) {
+    for (k = 0; k < s->nr_cells; k++) {
+      if (s->cells[k].h_max > h_max) h_max = s->cells[k].h_max;
+    }
+  } else {
+    for (k = 0; k < nr_parts; k++) {
+      if (s->parts[k].h > h_max) h_max = s->parts[k].h;
     }
-    #endif
-    message( "h_max is %.3e (cell_max=%.3e)." , h_max , cell_max );
-    
-    /* Get the new putative cell dimensions. */
-    for ( k = 0 ; k < 3 ; k++ )
-        cdim[k] = floor( s->dim[k] / fmax( h_max*kernel_gamma*space_stretch , cell_max ) );
-        
-    /* Check if we have enough cells for periodicity. */
-    if ( s->periodic && (cdim[0] < 3 || cdim[1] < 3 || cdim[2] < 3) )
-        error( "Must have at least 3 cells in each spatial dimension when periodicity is switched on." );
-        
-    /* In MPI-Land, we're not allowed to change the top-level cell size. */
-    #ifdef WITH_MPI
-        if ( cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || cdim[2] < s->cdim[2] )
-            error( "Root-level change of cell size not allowed." );
-    #endif
-        
-    /* Do we need to re-build the upper-level cells? */
-    // tic = getticks();
-    if ( s->cells == NULL ||
-         cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || cdim[2] < s->cdim[2] ) {
-    
-        /* Free the old cells, if they were allocated. */
-        if ( s->cells != NULL ) {
-            for ( k = 0 ; k < s->nr_cells ; k++ ) {
-                space_rebuild_recycle( s , &s->cells[k] );
-                if ( s->cells[k].sort != NULL )
-                    free( s->cells[k].sort );
-                }
-            free( s->cells );
-            s->maxdepth = 0;
-            }
-            
-        /* Set the new cell dimensions only if smaller. */
-        for ( k = 0 ; k < 3 ; k++ ) {
-            s->cdim[k] = cdim[k];
-            s->h[k] = s->dim[k] / cdim[k];
-            s->ih[k] = 1.0 / s->h[k];
-            }
-        dmin = fminf( s->h[0] , fminf( s->h[1] , s->h[2] ) );
-
-        /* Allocate the highest level of cells. */
-        s->tot_cells = s->nr_cells = cdim[0] * cdim[1] * cdim[2];
-        if ( posix_memalign( (void *)&s->cells , 64 , s->nr_cells * sizeof(struct cell) ) != 0 )
-            error( "Failed to allocate cells." );
-        bzero( s->cells , s->nr_cells * sizeof(struct cell) );
-        for ( k = 0 ; k < s->nr_cells ; k++ )
-            if ( lock_init( &s->cells[k].lock ) != 0 )
-                error( "Failed to init spinlock." );
-
-        /* Set the cell location and sizes. */
-        for ( i = 0 ; i < cdim[0] ; i++ )
-            for ( j = 0 ; j < cdim[1] ; j++ )
-                for ( k = 0 ; k < cdim[2] ; k++ ) {
-                    c = &s->cells[ cell_getid( cdim , i , j , k ) ];
-                    c->loc[0] = i*s->h[0]; c->loc[1] = j*s->h[1]; c->loc[2] = k*s->h[2];
-                    c->h[0] = s->h[0]; c->h[1] = s->h[1]; c->h[2] = s->h[2];
-                    c->dmin = dmin;
-                    c->depth = 0;
-                    c->count = 0;
-                    c->gcount = 0;
-                    c->super = c;
-                    lock_init( &c->lock );
-                    }
-           
-        /* Be verbose about the change. */         
-        message( "set cell dimensions to [ %i %i %i ]." , cdim[0] , cdim[1] , cdim[2] ); fflush(stdout);
-                    
-        } /* re-build upper-level cells? */
-    // message( "rebuilding upper-level cells took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
-        
-    /* Otherwise, just clean up the cells. */
-    else {
-    
-        /* Free the old cells, if they were allocated. */
-        for ( k = 0 ; k < s->nr_cells ; k++ ) {
-            space_rebuild_recycle( s , &s->cells[k] );
-            s->cells[k].sorts = NULL;
-            s->cells[k].nr_tasks = 0;
-            s->cells[k].nr_density = 0;
-            s->cells[k].nr_force = 0;
-            s->cells[k].density = NULL;
-            s->cells[k].force = NULL;
-            s->cells[k].dx_max = 0.0f;
-            s->cells[k].sorted = 0;
-            s->cells[k].count = 0;
-            s->cells[k].gcount = 0;
-            s->cells[k].kick1 = NULL;
-            s->cells[k].kick2 = NULL;
-            s->cells[k].super = &s->cells[k];
-            }
-        s->maxdepth = 0;
-    
+    s->h_max = h_max;
+  }
+
+/* If we are running in parallel, make sure everybody agrees on
+   how large the largest cell should be. */
+#ifdef WITH_MPI
+  {
+    float buff;
+    if (MPI_Allreduce(&h_max, &buff, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD) !=
+        MPI_SUCCESS)
+      error("Failed to aggreggate the rebuild flag accross nodes.");
+    h_max = buff;
+  }
+#endif
+  message("h_max is %.3e (cell_max=%.3e).", h_max, cell_max);
+
+  /* Get the new putative cell dimensions. */
+  for (k = 0; k < 3; k++)
+    cdim[k] =
+        floor(s->dim[k] / fmax(h_max * kernel_gamma * space_stretch, cell_max));
+
+  /* Check if we have enough cells for periodicity. */
+  if (s->periodic && (cdim[0] < 3 || cdim[1] < 3 || cdim[2] < 3))
+    error(
+        "Must have at least 3 cells in each spatial dimension when periodicity "
+        "is switched on.");
+
+/* In MPI-Land, we're not allowed to change the top-level cell size. */
+#ifdef WITH_MPI
+  if (cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || cdim[2] < s->cdim[2])
+    error("Root-level change of cell size not allowed.");
+#endif
+
+  /* Do we need to re-build the upper-level cells? */
+  // tic = getticks();
+  if (s->cells == NULL || cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] ||
+      cdim[2] < s->cdim[2]) {
+
+    /* Free the old cells, if they were allocated. */
+    if (s->cells != NULL) {
+      for (k = 0; k < s->nr_cells; k++) {
+        space_rebuild_recycle(s, &s->cells[k]);
+        if (s->cells[k].sort != NULL) free(s->cells[k].sort);
+      }
+      free(s->cells);
+      s->maxdepth = 0;
+    }
+
+    /* Set the new cell dimensions only if smaller. */
+    for (k = 0; k < 3; k++) {
+      s->cdim[k] = cdim[k];
+      s->h[k] = s->dim[k] / cdim[k];
+      s->ih[k] = 1.0 / s->h[k];
+    }
+    dmin = fminf(s->h[0], fminf(s->h[1], s->h[2]));
+
+    /* Allocate the highest level of cells. */
+    s->tot_cells = s->nr_cells = cdim[0] * cdim[1] * cdim[2];
+    if (posix_memalign((void *)&s->cells, 64,
+                       s->nr_cells * sizeof(struct cell)) != 0)
+      error("Failed to allocate cells.");
+    bzero(s->cells, s->nr_cells * sizeof(struct cell));
+    for (k = 0; k < s->nr_cells; k++)
+      if (lock_init(&s->cells[k].lock) != 0) error("Failed to init spinlock.");
+
+    /* Set the cell location and sizes. */
+    for (i = 0; i < cdim[0]; i++)
+      for (j = 0; j < cdim[1]; j++)
+        for (k = 0; k < cdim[2]; k++) {
+          c = &s->cells[cell_getid(cdim, i, j, k)];
+          c->loc[0] = i * s->h[0];
+          c->loc[1] = j * s->h[1];
+          c->loc[2] = k * s->h[2];
+          c->h[0] = s->h[0];
+          c->h[1] = s->h[1];
+          c->h[2] = s->h[2];
+          c->dmin = dmin;
+          c->depth = 0;
+          c->count = 0;
+          c->gcount = 0;
+          c->super = c;
+          lock_init(&c->lock);
         }
-        
+
+    /* Be verbose about the change. */
+    message("set cell dimensions to [ %i %i %i ].", cdim[0], cdim[1], cdim[2]);
+    fflush(stdout);
+
+  } /* re-build upper-level cells? */
+  // message( "rebuilding upper-level cells took %.3f ms." , (double)(getticks()
+  // - tic) / CPU_TPS * 1000 );
+
+  /* Otherwise, just clean up the cells. */
+  else {
+
+    /* Recycle the sub-cells and reset the data of the top-level cells. */
+    for (k = 0; k < s->nr_cells; k++) {
+      space_rebuild_recycle(s, &s->cells[k]);
+      s->cells[k].sorts = NULL;
+      s->cells[k].nr_tasks = 0;
+      s->cells[k].nr_density = 0;
+      s->cells[k].nr_force = 0;
+      s->cells[k].density = NULL;
+      s->cells[k].force = NULL;
+      s->cells[k].dx_max = 0.0f;
+      s->cells[k].sorted = 0;
+      s->cells[k].count = 0;
+      s->cells[k].gcount = 0;
+      s->cells[k].kick1 = NULL;
+      s->cells[k].kick2 = NULL;
+      s->cells[k].super = &s->cells[k];
     }
-    
+    s->maxdepth = 0;
+  }
+}
 
 /**
  * @brief Re-build the cells as well as the tasks.
@@ -299,177 +290,187 @@ void space_regrid ( struct space *s , double cell_max ) {
  * @param cell_max Maximal cell size.
  *
  */
- 
-void space_rebuild ( struct space *s , double cell_max ) {
-
-    int j, k, cdim[3], nr_parts = s->nr_parts, nr_gparts = s->nr_gparts;
-    struct cell *restrict c, *restrict cells;
-    struct part *restrict finger, *restrict p, *parts = s->parts;
-    struct xpart *xfinger, *xparts = s->xparts;
-    struct gpart *gp, *gparts = s->gparts, *gfinger;
-    int *ind;
-    double ih[3], dim[3];
-    // ticks tic;
-    
-    /* Be verbose about this. */
-    // message( "re)building space..." ); fflush(stdout);
-    
-    /* Re-grid if necessary, or just re-set the cell data. */
-    space_regrid( s , cell_max );
-    cells = s->cells;
-        
-    /* Run through the particles and get their cell index. */
-    // tic = getticks();
-    const int ind_size = s->size_parts;
-    if ( ( ind = (int *)malloc( sizeof(int) * ind_size ) ) == NULL )
-        error( "Failed to allocate temporary particle indices." );
-    ih[0] = s->ih[0]; ih[1] = s->ih[1]; ih[2] = s->ih[2];
-    dim[0] = s->dim[0]; dim[1] = s->dim[1]; dim[2] = s->dim[2];
-    cdim[0] = s->cdim[0]; cdim[1] = s->cdim[1]; cdim[2] = s->cdim[2];
-    for ( k = 0 ; k < nr_parts ; k++ )  {
-        p = &parts[k];
-        for ( j = 0 ; j < 3 ; j++ )
-            if ( p->x[j] < 0.0 )
-                p->x[j] += dim[j];
-            else if ( p->x[j] >= dim[j] )
-                p->x[j] -= dim[j];
-        ind[k] = cell_getid( cdim , p->x[0]*ih[0] , p->x[1]*ih[1] , p->x[2]*ih[2] );
-        cells[ ind[k] ].count++;
-        }
-    // message( "getting particle indices took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
-
-
-    #ifdef WITH_MPI
-        /* Move non-local parts to the end of the list. */
-        int nodeID = s->e->nodeID;
-        for ( k = 0 ; k < nr_parts ; k++ )
-            if ( cells[ ind[k] ].nodeID != nodeID ) {
-                cells[ ind[k] ].count -= 1;
-                nr_parts -= 1;
-                struct part tp = parts[k];
-                parts[k] = parts[ nr_parts ];
-                parts[ nr_parts ] = tp;
-                struct xpart txp = xparts[k];
-                xparts[k] = xparts[ nr_parts ];
-                xparts[ nr_parts ] = txp;
-                int t = ind[k];
-                ind[k] = ind[ nr_parts ];
-                ind[ nr_parts ] = t;
-                }
-                
-        /* Exchange the strays, note that this potentially re-allocates
-           the parts arrays. */
-        s->nr_parts = nr_parts + engine_exchange_strays( s->e , nr_parts , &ind[nr_parts] , s->nr_parts - nr_parts );
-        parts = s->parts;
-        xparts = s->xparts;
-        
-        /* Re-allocate the index array if needed.. */
-        if (s->nr_parts > ind_size) {
-          int *ind_new;
-          if ( ( ind_new = (int *)malloc( sizeof(int) * s->nr_parts ) ) == NULL )
-              error( "Failed to allocate temporary particle indices." );
-          memcpy(ind_new, ind, sizeof(int) * nr_parts);
-          free(ind); ind = ind_new;
-        }
-        
-        /* Assign each particle to its cell. */
-        for ( k = nr_parts ; k < s->nr_parts ; k++ ) {
-            p = &parts[k];
-            ind[k] = cell_getid( cdim , p->x[0]*ih[0] , p->x[1]*ih[1] , p->x[2]*ih[2] );
-            cells[ ind[k] ].count += 1;
-            /* if ( cells[ ind[k] ].nodeID != nodeID )
-                error( "Received part that does not belong to me (nodeID=%i)." , cells[ ind[k] ].nodeID ); */
-            }
-        nr_parts = s->nr_parts;
-    #endif
-    
-
-    /* Sort the parts according to their cells. */
-    // tic = getticks();
-    parts_sort( parts , xparts , ind , nr_parts , 0 , s->nr_cells-1 );
-    // message( "parts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
-    
-    /* Re-link the gparts. */
-    for ( k = 0 ; k < nr_parts ; k++ )
-        if ( parts[k].gpart != NULL )
-            parts[k].gpart->part = &parts[k];
-    
-    /* Verify sort. */
-    /* for ( k = 1 ; k < nr_parts ; k++ ) {
-        if ( ind[k-1] > ind[k] ) {
-            error( "Sort failed!" );
-            }
-        else if ( ind[k] != cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) )
-            error( "Incorrect indices!" );
-        } */
-    
-    /* We no longer need the indices as of here. */
-    free( ind );    
-
-
-
-    /* Run through the gravity particles and get their cell index. */
-    // tic = getticks();
-    if ( ( ind = (int *)malloc( sizeof(int) * s->size_gparts ) ) == NULL )
-        error( "Failed to allocate temporary particle indices." );
-    for ( k = 0 ; k < nr_gparts ; k++ )  {
-        gp = &gparts[k];
-        for ( j = 0 ; j < 3 ; j++ )
-            if ( gp->x[j] < 0.0 )
-                gp->x[j] += dim[j];
-            else if ( gp->x[j] >= dim[j] )
-                gp->x[j] -= dim[j];
-        ind[k] = cell_getid( cdim , gp->x[0]*ih[0] , gp->x[1]*ih[1] , gp->x[2]*ih[2] );
-        cells[ ind[k] ].gcount++;
-        }
-    // message( "getting particle indices took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
-
-    /* TODO: Here we should exchange the gparts as well! */
-
-    /* Sort the parts according to their cells. */
-    // tic = getticks();
-    gparts_sort( gparts ,ind , nr_gparts , 0 , s->nr_cells-1 );
-    // message( "gparts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
-    
-    /* Re-link the parts. */
-    for ( k = 0 ; k < nr_gparts ; k++ )
-        if ( gparts[k].id > 0 )
-            gparts[k].part->gpart = &gparts[k];
-
-    /* We no longer need the indices as of here. */
-    free( ind );    
-
-
-
-    /* Hook the cells up to the parts. */
-    // tic = getticks();
-    finger = parts;
-    xfinger = xparts;
-    gfinger = gparts;
-    for ( k = 0 ; k < s->nr_cells ; k++ ) {
-        c = &cells[ k ];
-        c->parts = finger;
-        c->xparts = xfinger;
-        c->gparts = gfinger;
-        finger = &finger[ c->count ];
-        xfinger = &xfinger[ c->count ];
-        gfinger = &gfinger[ c->gcount ];
-        }
-    // message( "hooking up cells took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
-        
-    /* At this point, we have the upper-level cells, old or new. Now make
-       sure that the parts in each cell are ok. */
-    // tic = getticks();
-    for ( k = 0; k < s->nr_cells; k++ )
-      space_split( s , &cells[k] );
-     
-    // message( "space_split took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
-    
+
+void space_rebuild(struct space *s, double cell_max) {
+
+  int j, k, cdim[3], nr_parts = s->nr_parts, nr_gparts = s->nr_gparts;
+  struct cell *restrict c, *restrict cells;
+  struct part *restrict finger, *restrict p, *parts = s->parts;
+  struct xpart *xfinger, *xparts = s->xparts;
+  struct gpart *gp, *gparts = s->gparts, *gfinger;
+  int *ind;
+  double ih[3], dim[3];
+  // ticks tic;
+
+  /* Be verbose about this. */
+  // message( "re)building space..." ); fflush(stdout);
+
+  /* Re-grid if necessary, or just re-set the cell data. */
+  space_regrid(s, cell_max);
+  cells = s->cells;
+
+  /* Run through the particles and get their cell index. */
+  // tic = getticks();
+  const int ind_size = s->size_parts;
+  if ((ind = (int *)malloc(sizeof(int) * ind_size)) == NULL)
+    error("Failed to allocate temporary particle indices.");
+  ih[0] = s->ih[0];
+  ih[1] = s->ih[1];
+  ih[2] = s->ih[2];
+  dim[0] = s->dim[0];
+  dim[1] = s->dim[1];
+  dim[2] = s->dim[2];
+  cdim[0] = s->cdim[0];
+  cdim[1] = s->cdim[1];
+  cdim[2] = s->cdim[2];
+  for (k = 0; k < nr_parts; k++) {
+    p = &parts[k];
+    for (j = 0; j < 3; j++)
+      if (p->x[j] < 0.0)
+        p->x[j] += dim[j];
+      else if (p->x[j] >= dim[j])
+        p->x[j] -= dim[j];
+    ind[k] =
+        cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
+    cells[ind[k]].count++;
+  }
+// message( "getting particle indices took %.3f ms." , (double)(getticks() -
+// tic) / CPU_TPS * 1000 );
+
+#ifdef WITH_MPI
+  /* Move non-local parts to the end of the list. */
+  int nodeID = s->e->nodeID;
+  for (k = 0; k < nr_parts; k++)
+    if (cells[ind[k]].nodeID != nodeID) {
+      cells[ind[k]].count -= 1;
+      nr_parts -= 1;
+      struct part tp = parts[k];
+      parts[k] = parts[nr_parts];
+      parts[nr_parts] = tp;
+      struct xpart txp = xparts[k];
+      xparts[k] = xparts[nr_parts];
+      xparts[nr_parts] = txp;
+      int t = ind[k];
+      ind[k] = ind[nr_parts];
+      ind[nr_parts] = t;
     }
 
+  /* Exchange the strays, note that this potentially re-allocates
+     the parts arrays. */
+  s->nr_parts =
+      nr_parts + engine_exchange_strays(s->e, nr_parts, &ind[nr_parts],
+                                        s->nr_parts - nr_parts);
+  parts = s->parts;
+  xparts = s->xparts;
+
+  /* Re-allocate the index array if needed. */
+  if (s->nr_parts > ind_size) {
+    int *ind_new;
+    if ((ind_new = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
+      error("Failed to allocate temporary particle indices.");
+    memcpy(ind_new, ind, sizeof(int) * nr_parts);
+    free(ind);
+    ind = ind_new;
+  }
+
+  /* Assign each newly received particle to its cell. */
+  for (k = nr_parts; k < s->nr_parts; k++) {
+    p = &parts[k];
+    ind[k] =
+        cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
+    cells[ind[k]].count += 1;
+    /* if ( cells[ ind[k] ].nodeID != nodeID )
+        error( "Received part that does not belong to me (nodeID=%i)." , cells[
+       ind[k] ].nodeID ); */
+  }
+  nr_parts = s->nr_parts;
+#endif
+
+  /* Sort the parts according to their cells. */
+  // tic = getticks();
+  parts_sort(parts, xparts, ind, nr_parts, 0, s->nr_cells - 1);
+  // message( "parts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS
+  // * 1000 );
+
+  /* Re-link the gparts. */
+  for (k = 0; k < nr_parts; k++)
+    if (parts[k].gpart != NULL) parts[k].gpart->part = &parts[k];
+
+  /* Verify sort. */
+  /* for ( k = 1 ; k < nr_parts ; k++ ) {
+      if ( ind[k-1] > ind[k] ) {
+          error( "Sort failed!" );
+          }
+      else if ( ind[k] != cell_getid( cdim , parts[k].x[0]*ih[0] ,
+     parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) )
+          error( "Incorrect indices!" );
+      } */
+
+  /* We no longer need the indices as of here. */
+  free(ind);
+
+  /* Run through the gravity particles and get their cell index. */
+  // tic = getticks();
+  if ((ind = (int *)malloc(sizeof(int) * s->size_gparts)) == NULL)
+    error("Failed to allocate temporary particle indices.");
+  for (k = 0; k < nr_gparts; k++) {
+    gp = &gparts[k];
+    for (j = 0; j < 3; j++)
+      if (gp->x[j] < 0.0)
+        gp->x[j] += dim[j];
+      else if (gp->x[j] >= dim[j])
+        gp->x[j] -= dim[j];
+    ind[k] =
+        cell_getid(cdim, gp->x[0] * ih[0], gp->x[1] * ih[1], gp->x[2] * ih[2]);
+    cells[ind[k]].gcount++;
+  }
+  // message( "getting particle indices took %.3f ms." , (double)(getticks() -
+  // tic) / CPU_TPS * 1000 );
+
+  /* TODO: Here we should exchange the gparts as well! */
+
+  /* Sort the gparts according to their cells. */
+  // tic = getticks();
+  gparts_sort(gparts, ind, nr_gparts, 0, s->nr_cells - 1);
+  // message( "gparts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS
+  // * 1000 );
+
+  /* Re-link the parts. */
+  for (k = 0; k < nr_gparts; k++)
+    if (gparts[k].id > 0) gparts[k].part->gpart = &gparts[k];
+
+  /* We no longer need the indices as of here. */
+  free(ind);
+
+  /* Hook the cells up to the parts. */
+  // tic = getticks();
+  finger = parts;
+  xfinger = xparts;
+  gfinger = gparts;
+  for (k = 0; k < s->nr_cells; k++) {
+    c = &cells[k];
+    c->parts = finger;
+    c->xparts = xfinger;
+    c->gparts = gfinger;
+    finger = &finger[c->count];
+    xfinger = &xfinger[c->count];
+    gfinger = &gfinger[c->gcount];
+  }
+  // message( "hooking up cells took %.3f ms." , (double)(getticks() - tic) /
+  // CPU_TPS * 1000 );
+
+  /* At this point, we have the upper-level cells, old or new. Now make
+     sure that the parts in each cell are ok. */
+  // tic = getticks();
+  for (k = 0; k < s->nr_cells; k++) space_split(s, &cells[k]);
+
+  // message( "space_split took %.3f ms." , (double)(getticks() - tic) / CPU_TPS
+  // * 1000 );
+}
 
 /**
- * @brief Sort the particles and condensed particles according to the given indices.
+ * @brief Sort the particles and condensed particles according to the given
+ * indices.
  *
  * @param parts The list of #part
  * @param xparts The list of reduced particles
@@ -478,313 +479,303 @@ void space_rebuild ( struct space *s , double cell_max ) {
  * @param min Lowest index.
  * @param max highest index.
  */
- 
-void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N , int min , int max ) {
-
-    struct qstack {
-        volatile int i, j, min, max;
-        volatile int ready;
-        };
-    struct qstack *qstack;
-    int qstack_size = 2*(max-min) + 10;
-    volatile unsigned int first, last, waiting;
-    
-    int pivot;
-    int i, ii, j, jj, temp_i, qid;
-    struct part temp_p;
-    struct xpart temp_xp;
-
-    /* for ( int k = 0 ; k < N ; k++ )
-        if ( ind[k] > max || ind[k] < min )
-	    error( "ind[%i]=%i is not in [%i,%i]." , k , ind[k] , min , max ); */
-    
-    /* Allocate the stack. */
-    if ( ( qstack = malloc( sizeof(struct qstack) * qstack_size ) ) == NULL )
-        error( "Failed to allocate qstack." );
-    
-    /* Init the interval stack. */
-    qstack[0].i = 0;
-    qstack[0].j = N-1;
-    qstack[0].min = min;
-    qstack[0].max = max;
-    qstack[0].ready = 1;
-    for ( i = 1 ; i < qstack_size ; i++ )
-        qstack[i].ready = 0;
-    first = 0; last = 1; waiting = 1;
-    
-    /* Main loop. */
-    while ( waiting > 0 ) {
-        
-      /* Grab an interval off the queue. */
-      qid = ( first++ ) % qstack_size;
-            
-        
-      /* Get the stack entry. */
-      i = qstack[qid].i;
-      j = qstack[qid].j;
-      min = qstack[qid].min;
-      max = qstack[qid].max;
-      qstack[qid].ready = 0;
-      
-            
-      /* Loop over sub-intervals. */
-      while ( 1 ) {
-	
-	/* Bring beer. */
-	pivot = (min + max) / 2;
-        
-	/* One pass of QuickSort's partitioning. */
-	ii = i; jj = j;
-	while ( ii < jj ) {
-	  while ( ii <= j && ind[ii] <= pivot )
-	    ii++;
-	  while ( jj >= i && ind[jj] > pivot )
-	    jj--;
-	  if ( ii < jj ) {
-	    temp_i = ind[ii]; ind[ii] = ind[jj]; ind[jj] = temp_i;
-	    temp_p = parts[ii]; parts[ii] = parts[jj]; parts[jj] = temp_p;
-	    temp_xp = xparts[ii]; xparts[ii] = xparts[jj]; xparts[jj] = temp_xp;
-	  }
-	}
-	
-	/* Verify sort. */
-	/* for ( int k = i ; k <= jj ; k++ )
-	   if ( ind[k] > pivot ) {
-	   message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i." , k , ind[k] , pivot , i , j , N );
-	   error( "Partition failed (<=pivot)." );
-	   }
-	   for ( int k = jj+1 ; k <= j ; k++ )
-	   if ( ind[k] <= pivot ) {
-	   message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i." , k , ind[k] , pivot , i , j , N );
-	   error( "Partition failed (>pivot)." );
-	   } */
-                        
-	/* Split-off largest interval. */
-	if ( jj - i > j - jj+1 ) {
-	  
-	  /* Recurse on the left? */
-	  if ( jj > i  && pivot > min ) {
-	    qid = ( last++ ) % qstack_size;
-	    qstack[qid].i = i;
-	    qstack[qid].j = jj;
-	    qstack[qid].min = min;
-	    qstack[qid].max = pivot;
-	    qstack[qid].ready = 1;
-	    if ( waiting++  >= qstack_size )
-	      error( "Qstack overflow." );
-	  }
-	  
-	  /* Recurse on the right? */
-	  if ( jj+1 < j && pivot+1 < max ) {
-	    i = jj+1;
-	    min = pivot+1;
-	  }
-	  else
-	    break;
-	  
-	}
-        
-	else {
-	  
-	  /* Recurse on the right? */
-	  if ( jj+1 < j && pivot+1 < max ) {
-	    qid = ( last++ ) % qstack_size;
-	    qstack[qid].i = jj+1;
-	    qstack[qid].j = j;
-	    qstack[qid].min = pivot+1;
-	    qstack[qid].max = max;
-	    qstack[qid].ready = 1;
-	    if ( ( waiting++ ) >= qstack_size )
-	      error( "Qstack overflow." );
-	  }
-          
-	  /* Recurse on the left? */
-	  if ( jj > i  && pivot > min ) {
-	    j = jj;
-	    max = pivot;
-	  }
-	  else
-	    break;
-	  
-	}
-        
-      } /* loop over sub-intervals. */
-      
-      waiting--;
-      
-    } /* main loop. */
-    
-    
-    /* Verify sort. */
-    /* for ( i = 1 ; i < N ; i++ )
-        if ( ind[i-1] > ind[i] )
-            error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i , ind[i] ); */
-            
-    /* Clean up. */
-    free( qstack );
 
-    }
+void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N,
+                int min, int max) {
+
+  struct qstack {
+    volatile int i, j, min, max;
+    volatile int ready;
+  };
+  struct qstack *qstack;
+  int qstack_size = 2 * (max - min) + 10;
+  volatile unsigned int first, last, waiting;
+
+  int pivot;
+  int i, ii, j, jj, temp_i, qid;
+  struct part temp_p;
+  struct xpart temp_xp;
+
+  /* for ( int k = 0 ; k < N ; k++ )
+      if ( ind[k] > max || ind[k] < min )
+          error( "ind[%i]=%i is not in [%i,%i]." , k , ind[k] , min , max ); */
+
+  /* Allocate the stack. */
+  if ((qstack = malloc(sizeof(struct qstack) * qstack_size)) == NULL)
+    error("Failed to allocate qstack.");
+
+  /* Init the interval stack. */
+  qstack[0].i = 0;
+  qstack[0].j = N - 1;
+  qstack[0].min = min;
+  qstack[0].max = max;
+  qstack[0].ready = 1;
+  for (i = 1; i < qstack_size; i++) qstack[i].ready = 0;
+  first = 0;
+  last = 1;
+  waiting = 1;
+
+  /* Main loop. */
+  while (waiting > 0) {
+
+    /* Grab an interval off the queue. */
+    qid = (first++) % qstack_size;
+
+    /* Get the stack entry. */
+    i = qstack[qid].i;
+    j = qstack[qid].j;
+    min = qstack[qid].min;
+    max = qstack[qid].max;
+    qstack[qid].ready = 0;
+
+    /* Loop over sub-intervals. */
+    while (1) {
+
+      /* Pick the pivot as the midpoint of the current index range. */
+      pivot = (min + max) / 2;
+
+      /* One pass of QuickSort's partitioning. */
+      ii = i;
+      jj = j;
+      while (ii < jj) {
+        while (ii <= j && ind[ii] <= pivot) ii++;
+        while (jj >= i && ind[jj] > pivot) jj--;
+        if (ii < jj) {
+          temp_i = ind[ii];
+          ind[ii] = ind[jj];
+          ind[jj] = temp_i;
+          temp_p = parts[ii];
+          parts[ii] = parts[jj];
+          parts[jj] = temp_p;
+          temp_xp = xparts[ii];
+          xparts[ii] = xparts[jj];
+          xparts[jj] = temp_xp;
+        }
+      }
 
+      /* Verify sort. */
+      /* for ( int k = i ; k <= jj ; k++ )
+         if ( ind[k] > pivot ) {
+         message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i,
+         N=%i." , k , ind[k] , pivot , i , j , N );
+         error( "Partition failed (<=pivot)." );
+         }
+         for ( int k = jj+1 ; k <= j ; k++ )
+         if ( ind[k] <= pivot ) {
+         message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i,
+         N=%i." , k , ind[k] , pivot , i , j , N );
+         error( "Partition failed (>pivot)." );
+         } */
+
+      /* Queue the larger interval and keep working on the smaller one. */
+      if (jj - i > j - jj + 1) {
+
+        /* Recurse on the left? */
+        if (jj > i && pivot > min) {
+          qid = (last++) % qstack_size;
+          qstack[qid].i = i;
+          qstack[qid].j = jj;
+          qstack[qid].min = min;
+          qstack[qid].max = pivot;
+          qstack[qid].ready = 1;
+          if (waiting++ >= qstack_size) error("Qstack overflow.");
+        }
 
-void gparts_sort ( struct gpart *gparts , int *ind , int N , int min , int max ) {
-
-    struct qstack {
-        volatile int i, j, min, max;
-        volatile int ready;
-        };
-    struct qstack *qstack;
-    int qstack_size = 2*(max-min) + 10;
-    volatile unsigned int first, last, waiting;
-    
-    int pivot;
-    int i, ii, j, jj, temp_i, qid;
-    struct gpart temp_p;
-
-    /* for ( int k = 0 ; k < N ; k++ )
-        if ( ind[k] > max || ind[k] < min )
-	    error( "ind[%i]=%i is not in [%i,%i]." , k , ind[k] , min , max ); */
-    
-    /* Allocate the stack. */
-    if ( ( qstack = malloc( sizeof(struct qstack) * qstack_size ) ) == NULL )
-        error( "Failed to allocate qstack." );
-    
-    /* Init the interval stack. */
-    qstack[0].i = 0;
-    qstack[0].j = N-1;
-    qstack[0].min = min;
-    qstack[0].max = max;
-    qstack[0].ready = 1;
-    for ( i = 1 ; i < qstack_size ; i++ )
-        qstack[i].ready = 0;
-    first = 0; last = 1; waiting = 1;
-    
-    /* Main loop. */
-    while ( waiting > 0 ) {
-      
-      /* Grab an interval off the queue. */
-      qid = ( first++ ) % qstack_size;
-            
-                 
-      /* Get the stack entry. */
-      i = qstack[qid].i;
-      j = qstack[qid].j;
-      min = qstack[qid].min;
-      max = qstack[qid].max;
-      qstack[qid].ready = 0;
-      
-            
-      /* Loop over sub-intervals. */
-      while ( 1 ) {
-            
-	/* Bring beer. */
-	pivot = (min + max) / 2;
-        
-	/* One pass of QuickSort's partitioning. */
-	ii = i; jj = j;
-	while ( ii < jj ) {
-	  while ( ii <= j && ind[ii] <= pivot )
-	    ii++;
-	  while ( jj >= i && ind[jj] > pivot )
-	    jj--;
-	  if ( ii < jj ) {
-	    temp_i = ind[ii]; ind[ii] = ind[jj]; ind[jj] = temp_i;
-	    temp_p = gparts[ii]; gparts[ii] = gparts[jj]; gparts[jj] = temp_p;
-	  }
-	}
-
-	/* Verify sort. */
-	/* for ( int k = i ; k <= jj ; k++ )
-	   if ( ind[k] > pivot ) {
-	   message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i." , k , ind[k] , pivot , i , j , N );
-	   error( "Partition failed (<=pivot)." );
-	   }
-	   for ( int k = jj+1 ; k <= j ; k++ )
-	   if ( ind[k] <= pivot ) {
-	   message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i." , k , ind[k] , pivot , i , j , N );
-	   error( "Partition failed (>pivot)." );
-	   } */
-	
-	/* Split-off largest interval. */
-	if ( jj - i > j - jj+1 ) {
-	  
-	  /* Recurse on the left? */
-	  if ( jj > i  && pivot > min ) {
-	    qid = ( last++ ) % qstack_size;
-	    qstack[qid].i = i;
-	    qstack[qid].j = jj;
-	    qstack[qid].min = min;
-	    qstack[qid].max = pivot;
-	    qstack[qid].ready = 1;
-	    if ( ( waiting++ ) >= qstack_size )
-	      error( "Qstack overflow." );
-	  }
-	  
-	  /* Recurse on the right? */
-	  if ( jj+1 < j && pivot+1 < max ) {
-	    i = jj+1;
-	    min = pivot+1;
-	  }
-	  else
-	    break;
-	  
-	}
-                    
-	else {
-                
-	  /* Recurse on the right? */
-	  if ( jj+1 < j && pivot+1 < max ) {
-	    qid = ( last++ ) % qstack_size;
-	    qstack[qid].i = jj+1;
-	    qstack[qid].j = j;
-	    qstack[qid].min = pivot+1;
-	    qstack[qid].max = max;
-	    qstack[qid].ready = 1;
-	    if ( ( waiting++ ) >= qstack_size )
-	      error( "Qstack overflow." );
-	  }
-          
-	  /* Recurse on the left? */
-	  if ( jj > i  && pivot > min ) {
-	    j = jj;
-	    max = pivot;
-	  }
-	  else
-	    break;
-	  
-	}
-        
-      } /* loop over sub-intervals. */
-    
-      waiting--;
-
-    } /* main loop. */
-    
-    
-   
-    /* Verify sort. */
-    /* for ( i = 1 ; i < N ; i++ )
-        if ( ind[i-1] > ind[i] )
-            error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i , ind[i] ); */
-            
-    /* Clean up. */
-    free( qstack );
+        /* Recurse on the right? */
+        if (jj + 1 < j && pivot + 1 < max) {
+          i = jj + 1;
+          min = pivot + 1;
+        } else
+          break;
+
+      } else {
+
+        /* Recurse on the right? */
+        if (jj + 1 < j && pivot + 1 < max) {
+          qid = (last++) % qstack_size;
+          qstack[qid].i = jj + 1;
+          qstack[qid].j = j;
+          qstack[qid].min = pivot + 1;
+          qstack[qid].max = max;
+          qstack[qid].ready = 1;
+          if ((waiting++) >= qstack_size) error("Qstack overflow.");
+        }
 
-    }
+        /* Recurse on the left? */
+        if (jj > i && pivot > min) {
+          j = jj;
+          max = pivot;
+        } else
+          break;
+      }
 
+    } /* loop over sub-intervals. */
+
+    waiting--;
+
+  } /* main loop. */
+
+  /* Verify sort. */
+  /* for ( i = 1 ; i < N ; i++ )
+      if ( ind[i-1] > ind[i] )
+          error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i
+     , ind[i] ); */
+
+  /* Clean up. */
+  free(qstack);
+}
+
+void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max) {
+
+  struct qstack {
+    volatile int i, j, min, max;
+    volatile int ready;
+  };
+  struct qstack *qstack;
+  int qstack_size = 2 * (max - min) + 10;
+  volatile unsigned int first, last, waiting;
+
+  int pivot;
+  int i, ii, j, jj, temp_i, qid;
+  struct gpart temp_p;
+
+  /* for ( int k = 0 ; k < N ; k++ )
+      if ( ind[k] > max || ind[k] < min )
+          error( "ind[%i]=%i is not in [%i,%i]." , k , ind[k] , min , max ); */
+
+  /* Allocate the stack. */
+  if ((qstack = malloc(sizeof(struct qstack) * qstack_size)) == NULL)
+    error("Failed to allocate qstack.");
+
+  /* Init the interval stack. */
+  qstack[0].i = 0;
+  qstack[0].j = N - 1;
+  qstack[0].min = min;
+  qstack[0].max = max;
+  qstack[0].ready = 1;
+  for (i = 1; i < qstack_size; i++) qstack[i].ready = 0;
+  first = 0;
+  last = 1;
+  waiting = 1;
+
+  /* Main loop. */
+  while (waiting > 0) {
+
+    /* Grab an interval off the queue. */
+    qid = (first++) % qstack_size;
+
+    /* Get the stack entry. */
+    i = qstack[qid].i;
+    j = qstack[qid].j;
+    min = qstack[qid].min;
+    max = qstack[qid].max;
+    qstack[qid].ready = 0;
+
+    /* Loop over sub-intervals. */
+    while (1) {
+
+      /* Pick the pivot as the midpoint of the current index range. */
+      pivot = (min + max) / 2;
+
+      /* One pass of QuickSort's partitioning. */
+      ii = i;
+      jj = j;
+      while (ii < jj) {
+        while (ii <= j && ind[ii] <= pivot) ii++;
+        while (jj >= i && ind[jj] > pivot) jj--;
+        if (ii < jj) {
+          temp_i = ind[ii];
+          ind[ii] = ind[jj];
+          ind[jj] = temp_i;
+          temp_p = gparts[ii];
+          gparts[ii] = gparts[jj];
+          gparts[jj] = temp_p;
+        }
+      }
+
+      /* Verify sort. */
+      /* for ( int k = i ; k <= jj ; k++ )
+         if ( ind[k] > pivot ) {
+         message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i,
+         N=%i." , k , ind[k] , pivot , i , j , N );
+         error( "Partition failed (<=pivot)." );
+         }
+         for ( int k = jj+1 ; k <= j ; k++ )
+         if ( ind[k] <= pivot ) {
+         message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i,
+         N=%i." , k , ind[k] , pivot , i , j , N );
+         error( "Partition failed (>pivot)." );
+         } */
+
+      /* Queue the larger interval and keep working on the smaller one. */
+      if (jj - i > j - jj + 1) {
+
+        /* Recurse on the left? */
+        if (jj > i && pivot > min) {
+          qid = (last++) % qstack_size;
+          qstack[qid].i = i;
+          qstack[qid].j = jj;
+          qstack[qid].min = min;
+          qstack[qid].max = pivot;
+          qstack[qid].ready = 1;
+          if ((waiting++) >= qstack_size) error("Qstack overflow.");
+        }
+
+        /* Recurse on the right? */
+        if (jj + 1 < j && pivot + 1 < max) {
+          i = jj + 1;
+          min = pivot + 1;
+        } else
+          break;
+
+      } else {
+
+        /* Recurse on the right? */
+        if (jj + 1 < j && pivot + 1 < max) {
+          qid = (last++) % qstack_size;
+          qstack[qid].i = jj + 1;
+          qstack[qid].j = j;
+          qstack[qid].min = pivot + 1;
+          qstack[qid].max = max;
+          qstack[qid].ready = 1;
+          if ((waiting++) >= qstack_size) error("Qstack overflow.");
+        }
+
+        /* Recurse on the left? */
+        if (jj > i && pivot > min) {
+          j = jj;
+          max = pivot;
+        } else
+          break;
+      }
+
+    } /* loop over sub-intervals. */
+
+    waiting--;
+
+  } /* main loop. */
+
+  /* Verify sort. */
+  /* for ( i = 1 ; i < N ; i++ )
+      if ( ind[i-1] > ind[i] )
+          error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i
+     , ind[i] ); */
+
+  /* Clean up. */
+  free(qstack);
+}
 
 /**
  * @brief Mapping function to free the sorted indices buffers.
  */
 
-void space_map_clearsort ( struct cell *c , void *data ) {
-
-    if ( c->sort != NULL ) {
-        free( c->sort );
-        c->sort = NULL;
-        }
-
-    }
+void space_map_clearsort(struct cell *c, void *data) {
 
+  if (c->sort != NULL) {
+    free(c->sort);
+    c->sort = NULL;
+  }
+}
 
 /**
  * @brief Map a function to all particles in a aspace.
@@ -793,34 +784,30 @@ void space_map_clearsort ( struct cell *c , void *data ) {
  * @param fun Function pointer to apply on the cells.
  * @param data Data passed to the function fun.
  */
- 
-void space_map_parts ( struct space *s , void (*fun)( struct part *p , struct cell *c , void *data ) , void *data ) {
-
-    int cid = 0;
-
-    void rec_map ( struct cell *c ) {
-    
-        int k;
-        
-        /* No progeny? */
-        if ( !c->split )
-            for ( k = 0 ; k < c->count ; k++ )
-                fun( &c->parts[k] , c , data );
-                
-        /* Otherwise, recurse. */
-        else
-            for ( k = 0 ; k < 8 ; k++ )
-                if ( c->progeny[k] != NULL )
-                    rec_map( c->progeny[k] );
-                
-        }
-        
-    /* Call the recursive function on all higher-level cells. */
-    for( cid = 0; cid < s->nr_cells; cid++ )
-      rec_map( &s->cells[cid] );
-        
-    }
 
+void space_map_parts(struct space *s,
+                     void (*fun)(struct part *p, struct cell *c, void *data),
+                     void *data) {
+
+  int cid = 0;
+
+  void rec_map(struct cell * c) {
+
+    int k;
+
+    /* No progeny? */
+    if (!c->split)
+      for (k = 0; k < c->count; k++) fun(&c->parts[k], c, data);
+
+    /* Otherwise, recurse. */
+    else
+      for (k = 0; k < 8; k++)
+        if (c->progeny[k] != NULL) rec_map(c->progeny[k]);
+  }
+
+  /* Call the recursive function on all higher-level cells. */
+  for (cid = 0; cid < s->nr_cells; cid++) rec_map(&s->cells[cid]);
+}
 
 /**
  * @brief Map a function to all particles in a aspace.
@@ -830,61 +817,50 @@ void space_map_parts ( struct space *s , void (*fun)( struct part *p , struct ce
  * @param fun Function pointer to apply on the cells.
  * @param data Data passed to the function fun.
  */
- 
-void space_map_cells_post ( struct space *s , int full , void (*fun)( struct cell *c , void *data ) , void *data ) {
-
-    int cid = 0;
-
-    void rec_map ( struct cell *c ) {
-    
-        int k;
-        
-        /* Recurse. */
-        if ( c->split )
-            for ( k = 0 ; k < 8 ; k++ )
-                if ( c->progeny[k] != NULL )
-                    rec_map( c->progeny[k] );
-                
-        /* No progeny? */
-        if ( full || !c->split )
-            fun( c , data );
-                
-        }
-        
-    /* Call the recursive function on all higher-level cells. */
-    for ( cid = 0; cid < s->nr_cells; cid++ )
-	 rec_map( &s->cells[cid] );
 
-    }
+void space_map_cells_post(struct space *s, int full,
+                          void (*fun)(struct cell *c, void *data), void *data) {
 
+  int cid = 0;
 
-void space_map_cells_pre ( struct space *s , int full , void (*fun)( struct cell *c , void *data ) , void *data ) {
-
-    int cid = 0;
-
-    void rec_map ( struct cell *c ) {
-    
-        int k;
-        
-        /* No progeny? */
-        if ( full || !c->split )
-            fun( c , data );
-                
-        /* Recurse. */
-        if ( c->split )
-            for ( k = 0 ; k < 8 ; k++ )
-                if ( c->progeny[k] != NULL )
-                    rec_map( c->progeny[k] );
-                
-        }
-        
-    /* Call the recursive function on all higher-level cells. */
-   for (cid = 0; cid < s->nr_cells; cid++ )
-      rec_map( &s->cells[cid] );
-    
+  void rec_map(struct cell * c) {
 
-    }
+    int k;
+
+    /* Recurse. */
+    if (c->split)
+      for (k = 0; k < 8; k++)
+        if (c->progeny[k] != NULL) rec_map(c->progeny[k]);
+
+    /* Apply fun to leaf cells, or to every cell if full is set. */
+    if (full || !c->split) fun(c, data);
+  }
+
+  /* Call the recursive function on all higher-level cells. */
+  for (cid = 0; cid < s->nr_cells; cid++) rec_map(&s->cells[cid]);
+}
 
+void space_map_cells_pre(struct space *s, int full,
+                         void (*fun)(struct cell *c, void *data), void *data) {
+
+  int cid = 0;
+
+  void rec_map(struct cell * c) {
+
+    int k;
+
+    /* No progeny? */
+    if (full || !c->split) fun(c, data);
+
+    /* Recurse. */
+    if (c->split)
+      for (k = 0; k < 8; k++)
+        if (c->progeny[k] != NULL) rec_map(c->progeny[k]);
+  }
+
+  /* Call the recursive function on all higher-level cells. */
+  for (cid = 0; cid < s->nr_cells; cid++) rec_map(&s->cells[cid]);
+}
 
 /**
  * @brief Split cells that contain too many particles.
@@ -892,114 +868,103 @@ void space_map_cells_pre ( struct space *s , int full , void (*fun)( struct cell
  * @param s The #space we are working in.
  * @param c The #cell under consideration.
  */
- 
-void space_split ( struct space *s , struct cell *c ) {
-
-    int k, count = c->count, gcount = c->gcount, maxdepth = 0;
-    float h, h_max = 0.0f, dt, dt_min = c->parts[0].dt, dt_max = dt_min;
-    struct cell *temp;
-    struct part *p, *parts = c->parts;
-    struct xpart *xp, *xparts = c->xparts;
-    
-    /* Check the depth. */
-    if ( c->depth > s->maxdepth )
-        s->maxdepth = c->depth;
-    
-    /* Split or let it be? */
-    if ( count > space_splitsize || gcount > space_splitsize ) {
-    
-        /* No longer just a leaf. */
-        c->split = 1;
-        
-        /* Create the cell's progeny. */
-        for ( k = 0 ; k < 8 ; k++ ) {
-            temp = space_getcell( s );
-            temp->count = 0;
-            temp->gcount = 0;
-            temp->loc[0] = c->loc[0];
-            temp->loc[1] = c->loc[1];
-            temp->loc[2] = c->loc[2];
-            temp->h[0] = c->h[0]/2;
-            temp->h[1] = c->h[1]/2;
-            temp->h[2] = c->h[2]/2;
-            temp->dmin = c->dmin/2;
-            if ( k & 4 )
-                temp->loc[0] += temp->h[0];
-            if ( k & 2 )
-                temp->loc[1] += temp->h[1];
-            if ( k & 1 )
-                temp->loc[2] += temp->h[2];
-            temp->depth = c->depth + 1;
-            temp->split = 0;
-            temp->h_max = 0.0;
-            temp->dx_max = 0.0;
-            temp->nodeID = c->nodeID;
-            temp->parent = c;
-            c->progeny[k] = temp;
-            }
-            
-        /* Split the cell data. */
-        cell_split( c );
-            
-        /* Remove any progeny with zero parts. */
-        for ( k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0 ) {
-                space_recycle( s , c->progeny[k] );
-                c->progeny[k] = NULL;
-                }
-            else {
-                space_split( s , c->progeny[k] );
-                h_max = fmaxf( h_max , c->progeny[k]->h_max );
-                dt_min = fminf( dt_min , c->progeny[k]->dt_min );
-                dt_max = fmaxf( dt_max , c->progeny[k]->dt_max );
-                if ( c->progeny[k]->maxdepth > maxdepth )
-                    maxdepth = c->progeny[k]->maxdepth;
-                }
-                
-        /* Set the values for this cell. */
-        c->h_max = h_max;
-        c->dt_min = dt_min;
-        c->dt_max = dt_max;
-        c->maxdepth = maxdepth;
-                
-        }
-        
-    /* Otherwise, collect the data for this cell. */
-    else {
-    
-        /* Clear the progeny. */
-        bzero( c->progeny , sizeof(struct cell *) * 8 );
-        c->split = 0;
-        c->maxdepth = c->depth;
-        
-        /* Get dt_min/dt_max. */
-        
-        for ( k = 0 ; k < count ; k++ ) {
-            p = &parts[k];
-            xp = &xparts[k];
-            xp->x_old[0] = p->x[0];
-            xp->x_old[1] = p->x[1];
-            xp->x_old[2] = p->x[2];
-            dt = p->dt;
-            h = p->h;
-            if ( h > h_max )
-                h_max = h;
-            if ( dt < dt_min )
-                dt_min = dt;
-            if ( dt > dt_max )
-                dt_max = dt;
-            }
-        c->h_max = h_max;
-        c->dt_min = dt_min;
-        c->dt_max = dt_max;
-            
-        }
-        
-    /* Set ownership accorind to the start of the parts array. */
-    c->owner = ( ( c->parts - s->parts ) % s->nr_parts ) * s->nr_queues / s->nr_parts;
 
+void space_split(struct space *s, struct cell *c) {
+  /* NOTE(review): parts[0] is read below even if count is 0; confirm callers. */
+  int k, count = c->count, gcount = c->gcount, maxdepth = 0;
+  float h, h_max = 0.0f, dt, dt_min = c->parts[0].dt, dt_max = dt_min;
+  struct cell *temp;
+  struct part *p, *parts = c->parts;
+  struct xpart *xp, *xparts = c->xparts;
+
+  /* Record the deepest cell depth seen in the space so far. */
+  if (c->depth > s->maxdepth) s->maxdepth = c->depth;
+
+  /* Split when either particle count exceeds space_splitsize. */
+  if (count > space_splitsize || gcount > space_splitsize) {
+
+    /* No longer just a leaf. */
+    c->split = 1;
+
+    /* Create eight progeny, each half as wide as this cell per axis. */
+    for (k = 0; k < 8; k++) {
+      temp = space_getcell(s);
+      temp->count = 0;
+      temp->gcount = 0;
+      temp->loc[0] = c->loc[0];
+      temp->loc[1] = c->loc[1];
+      temp->loc[2] = c->loc[2];
+      temp->h[0] = c->h[0] / 2;
+      temp->h[1] = c->h[1] / 2;
+      temp->h[2] = c->h[2] / 2;
+      temp->dmin = c->dmin / 2;
+      if (k & 4) temp->loc[0] += temp->h[0];
+      if (k & 2) temp->loc[1] += temp->h[1];
+      if (k & 1) temp->loc[2] += temp->h[2];
+      temp->depth = c->depth + 1;
+      temp->split = 0;
+      temp->h_max = 0.0;
+      temp->dx_max = 0.0;
+      temp->nodeID = c->nodeID;
+      temp->parent = c;
+      c->progeny[k] = temp;
     }
 
+    /* Split the cell data. */
+    cell_split(c);
+
+    /* Recycle empty progeny; recurse into the rest and merge their extrema. */
+    for (k = 0; k < 8; k++)
+      if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) {
+        space_recycle(s, c->progeny[k]);
+        c->progeny[k] = NULL;
+      } else {
+        space_split(s, c->progeny[k]);
+        h_max = fmaxf(h_max, c->progeny[k]->h_max);
+        dt_min = fminf(dt_min, c->progeny[k]->dt_min);
+        dt_max = fmaxf(dt_max, c->progeny[k]->dt_max);
+        if (c->progeny[k]->maxdepth > maxdepth)
+          maxdepth = c->progeny[k]->maxdepth;
+      }
+
+    /* Set the values for this cell. */
+    c->h_max = h_max;
+    c->dt_min = dt_min;
+    c->dt_max = dt_max;
+    c->maxdepth = maxdepth;
+
+  }
+
+  /* Otherwise, collect the data for this cell. */
+  else {
+
+    /* Clear the progeny. */
+    bzero(c->progeny, sizeof(struct cell *) * 8);
+    c->split = 0;
+    c->maxdepth = c->depth;
+
+    /* Leaf: copy each position into x_old and track the largest h and the */
+    /* smallest and largest dt over the first count parts of this cell. */
+    for (k = 0; k < count; k++) {
+      p = &parts[k];
+      xp = &xparts[k];
+      xp->x_old[0] = p->x[0];
+      xp->x_old[1] = p->x[1];
+      xp->x_old[2] = p->x[2];
+      dt = p->dt;
+      h = p->h;
+      if (h > h_max) h_max = h;
+      if (dt < dt_min) dt_min = dt;
+      if (dt > dt_max) dt_max = dt;
+    }
+    c->h_max = h_max;
+    c->dt_min = dt_min;
+    c->dt_max = dt_max;
+  }
+
+  /* Set ownership according to the start of the parts array. */
+  c->owner = ((c->parts - s->parts) % s->nr_parts) * s->nr_queues / s->nr_parts;
+}
 
 /**
  * @brief Return a used cell to the cell buffer.
@@ -1007,77 +972,71 @@ void space_split ( struct space *s , struct cell *c ) {
  * @param s The #space.
  * @param c The #cell.
  */
- 
-void space_recycle ( struct space *s , struct cell *c ) {
-
-    /* Lock the space. */
-    lock_lock( &s->lock );
-    
-    /* Clear the cell. */
-    if ( lock_destroy( &c->lock ) != 0 )
-        error( "Failed to destroy spinlock." );
-        
-    /* Clear this cell's sort arrays. */
-    if ( c->sort != NULL )
-        free( c->sort );
-        
-    /* Clear the cell data. */
-    bzero( c , sizeof(struct cell) );
-    
-    /* Hook this cell into the buffer. */
-    c->next = s->cells_new;
-    s->cells_new = c;
-    s->tot_cells -= 1;
-    
-    /* Unlock the space. */
-    lock_unlock_blind( &s->lock );
-    
-    }
 
+void space_recycle(struct space *s, struct cell *c) {
+  /* Take the space lock: the free list and tot_cells are updated below. */
+  lock_lock(&s->lock);
+
+  /* space_getcell() initialises both c->lock and c->glock, so release both. */
+  if (lock_destroy(&c->lock) != 0 || lock_destroy(&c->glock) != 0)
+    error("Failed to destroy spinlock.");
+
+  /* Release the sort array; free(NULL) is a no-op so no guard is needed. */
+  free(c->sort);
+
+  /* Zero the whole cell so nothing from its previous use is left behind. */
+  bzero(c, sizeof(struct cell));
+
+  /* Push the cell onto the head of the free list s->cells_new. */
+  c->next = s->cells_new;
+  s->cells_new = c;
+  s->tot_cells -= 1;
+
+  /* Unlock the space. */
+  lock_unlock_blind(&s->lock);
+}
 
 /**
  * @brief Get a new empty cell.
  *
  * @param s The #space.
  */
- 
-struct cell *space_getcell ( struct space *s ) {
 
-    struct cell *c;
-    int k;
-    
-    /* Lock the space. */
-    lock_lock( &s->lock );
-    
-    /* Is the buffer empty? */
-    if ( s->cells_new == NULL ) {
-        if ( posix_memalign( (void *)&s->cells_new , 64 , space_cellallocchunk * sizeof(struct cell) ) != 0 )
-            error( "Failed to allocate more cells." );
-        bzero( s->cells_new , space_cellallocchunk * sizeof(struct cell) );
-        for ( k = 0 ; k < space_cellallocchunk-1 ; k++ )
-            s->cells_new[k].next = &s->cells_new[k+1];
-        s->cells_new[ space_cellallocchunk-1 ].next = NULL;
-        }
+struct cell *space_getcell(struct space *s) {
 
-    /* Pick off the next cell. */
-    c = s->cells_new;
-    s->cells_new = c->next;
-    s->tot_cells += 1;
-    
-    /* Unlock the space. */
-    lock_unlock_blind( &s->lock );
-    
-    /* Init some things in the cell. */
-    bzero( c , sizeof(struct cell) );
-    c->nodeID = -1;
-    if ( lock_init( &c->lock ) != 0 ||
-         lock_init( &c->glock ) != 0 )
-        error( "Failed to initialize cell spinlocks." );
-        
-    return c;
+  struct cell *c;
+  int k;
 
-    }
+  /* Take the space lock before touching the free list s->cells_new. */
+  lock_lock(&s->lock);
+
+  /* Free list empty? Allocate a 64-byte-aligned chunk and chain it up. */
+  if (s->cells_new == NULL) {
+    if (posix_memalign((void *)&s->cells_new, 64,
+                       space_cellallocchunk * sizeof(struct cell)) != 0)
+      error("Failed to allocate more cells.");
+    bzero(s->cells_new, space_cellallocchunk * sizeof(struct cell));
+    for (k = 0; k < space_cellallocchunk - 1; k++)
+      s->cells_new[k].next = &s->cells_new[k + 1];
+    s->cells_new[space_cellallocchunk - 1].next = NULL;
+  }
+
+  /* Pop the head of the free list and count the cell as in use. */
+  c = s->cells_new;
+  s->cells_new = c->next;
+  s->tot_cells += 1;
 
+  /* Unlock the space. */
+  lock_unlock_blind(&s->lock);
+
+  /* Outside the lock: zero the cell, set nodeID to -1, init both locks. */
+  bzero(c, sizeof(struct cell));
+  c->nodeID = -1;
+  if (lock_init(&c->lock) != 0 || lock_init(&c->glock) != 0)
+    error("Failed to initialize cell spinlocks.");
+
+  return c;
+}
 
 /**
  * @brief Split the space into cells given the array of particles.
@@ -1095,78 +1054,76 @@ struct cell *space_getcell ( struct space *s ) {
  * recursively.
  */
 
-
-void space_init ( struct space *s , double dim[3] , struct part *parts , int N , int periodic , double h_max ) {
-
-    /* Store eveything in the space. */
-    s->dim[0] = dim[0]; s->dim[1] = dim[1]; s->dim[2] = dim[2];
-    s->periodic = periodic;
-    s->nr_parts = N;
-    s->size_parts = N;
-    s->parts = parts;
-    s->cell_min = h_max;
-    s->nr_queues = 1;
-    s->size_parts_foreign = 0;
-    
-    /* Check that all the particle positions are reasonable, wrap if periodic. */
-    if ( periodic ) {
-      for ( int k = 0 ; k < N ; k++ )
-        for ( int j = 0 ; j < 3 ; j++ ) {
-          while ( parts[k].x[j] < 0 ) parts[k].x[j] += dim[j];
-          while ( parts[k].x[j] >= dim[j] ) parts[k].x[j] -= dim[j];
-          }
+void space_init(struct space *s, double dim[3], struct part *parts, int N,
+                int periodic, double h_max) {
+
+  /* Store everything in the space. */
+  s->dim[0] = dim[0];
+  s->dim[1] = dim[1];
+  s->dim[2] = dim[2];
+  s->periodic = periodic;
+  s->nr_parts = N;
+  s->size_parts = N;
+  s->parts = parts;
+  s->cell_min = h_max;
+  s->nr_queues = 1;
+  s->size_parts_foreign = 0;
+
+  /* Check that all the particle positions are reasonable, wrap if periodic. */
+  if (periodic) {
+    for (int k = 0; k < N; k++)
+      for (int j = 0; j < 3; j++) {
+        while (parts[k].x[j] < 0) parts[k].x[j] += dim[j];
+        while (parts[k].x[j] >= dim[j]) parts[k].x[j] -= dim[j];
       }
-    else {
-      for ( int k = 0 ; k < N ; k++ )
-        for ( int j = 0 ; j < 3 ; j++ )
-          if ( parts[k].x[j] < 0 || parts[k].x[j] >= dim[j] )
-            error( "Not all particles are within the specified domain." );
+  } else {
+    for (int k = 0; k < N; k++)
+      for (int j = 0; j < 3; j++)
+        if (parts[k].x[j] < 0 || parts[k].x[j] >= dim[j])
+          error("Not all particles are within the specified domain.");
+  }
+
+  /* Allocate the xtra parts array. */
+  if (posix_memalign((void *)&s->xparts, part_align,
+                     N * sizeof(struct xpart)) != 0)
+    error("Failed to allocate xparts.");
+  bzero(s->xparts, N * sizeof(struct xpart));
+
+  /* Initialize the velocities and internal energies. */
+  for (int k = 0; k < N; k++) {
+    struct part *p = &parts[k];
+    struct xpart *xp = &s->xparts[k];
+    xp->v_hdt[0] = p->v[0];
+    xp->v_hdt[1] = p->v[1];
+    xp->v_hdt[2] = p->v[2];
+    xp->u_hdt = p->u;
+  }
+
+  /* For now, clone the parts to make gparts. */
+  if (posix_memalign((void *)&s->gparts, part_align,
+                     N * sizeof(struct gpart)) != 0)
+    error("Failed to allocate gparts.");
+  bzero(s->gparts, N * sizeof(struct gpart));
+  /* for ( int k = 0 ; k < N ; k++ ) {
+      s->gparts[k].x[0] = s->parts[k].x[0];
+      s->gparts[k].x[1] = s->parts[k].x[1];
+      s->gparts[k].x[2] = s->parts[k].x[2];
+      s->gparts[k].v[0] = s->parts[k].v[0];
+      s->gparts[k].v[1] = s->parts[k].v[1];
+      s->gparts[k].v[2] = s->parts[k].v[2];
+      s->gparts[k].mass = s->parts[k].mass;
+      s->gparts[k].dt = s->parts[k].dt;
+      s->gparts[k].id = s->parts[k].id;
+      s->gparts[k].part = &s->parts[k];
+      s->parts[k].gpart = &s->gparts[k];
       }
-    
-    /* Allocate the xtra parts array. */
-    if ( posix_memalign( (void *)&s->xparts , part_align , N * sizeof(struct xpart) ) != 0 )
-        error( "Failed to allocate xparts." );
-    bzero( s->xparts , N * sizeof(struct xpart) );
-    
-    /* Initialize the velocities and internal energies. */
-    for ( int k = 0 ; k < N ; k++ ) {
-        struct part *p = &parts[k];
-        struct xpart *xp = &s->xparts[k];
-        xp->v_hdt[0] = p->v[0];
-        xp->v_hdt[1] = p->v[1];
-        xp->v_hdt[2] = p->v[2];
-        xp->u_hdt = p->u;
-        }
-        
-        
-    /* For now, clone the parts to make gparts. */
-    if ( posix_memalign( (void *)&s->gparts , part_align , N * sizeof(struct gpart) ) != 0 )
-        error( "Failed to allocate gparts." );
-    bzero( s->gparts , N * sizeof(struct gpart) );
-    /* for ( int k = 0 ; k < N ; k++ ) {
-        s->gparts[k].x[0] = s->parts[k].x[0];
-        s->gparts[k].x[1] = s->parts[k].x[1];
-        s->gparts[k].x[2] = s->parts[k].x[2];
-        s->gparts[k].v[0] = s->parts[k].v[0];
-        s->gparts[k].v[1] = s->parts[k].v[1];
-        s->gparts[k].v[2] = s->parts[k].v[2];
-        s->gparts[k].mass = s->parts[k].mass;
-        s->gparts[k].dt = s->parts[k].dt;
-        s->gparts[k].id = s->parts[k].id;
-        s->gparts[k].part = &s->parts[k];
-        s->parts[k].gpart = &s->gparts[k];
-        }
-    s->nr_gparts = s->nr_parts; */
-    s->nr_gparts = 0;
-    s->size_gparts = s->size_parts;
-    
-        
-    /* Init the space lock. */
-    if ( lock_init( &s->lock ) != 0 )
-        error( "Failed to create space spin-lock." );
-    
-    /* Build the cells and the tasks. */
-    space_regrid( s , h_max );
-        
-    }
+  s->nr_gparts = s->nr_parts; */
+  s->nr_gparts = 0;
+  s->size_gparts = s->size_parts;
+
+  /* Init the space lock. */
+  if (lock_init(&s->lock) != 0) error("Failed to create space spin-lock.");
 
+  /* Build the cells and the tasks. */
+  space_regrid(s, h_max);
+}
diff --git a/src/space.h b/src/space.h
index 9d1f849d3b29b26d80b12a9767d6505040a1c74c..e0bad6773547f813d70943c2ca2703529a0306a8 100644
--- a/src/space.h
+++ b/src/space.h
@@ -1,39 +1,45 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_SPACE_H
+#define SWIFT_SPACE_H
 
+/* Includes. */
+#include "cell.h"
+#include "part.h"
 
-
+/* Forward-declare the engine to avoid cyclic includes. */
+struct engine;
 
 /* Some constants. */
-#define space_maxdepth                  10
-#define space_cellallocchunk            1000
-#define space_splitratio                0.875f
-#define space_splitsize_default         400
-#define space_maxsize_default           8000000
-#define space_subsize_default           8000000
-#define space_stretch                   1.10f
-#define space_maxreldx                  0.25f
-#define space_qstack                    2048
-
+#define space_maxdepth 10
+#define space_cellallocchunk 1000
+#define space_splitratio 0.875f
+#define space_splitsize_default 400
+#define space_maxsize_default 8000000
+#define space_subsize_default 8000000
+#define space_stretch 1.10f
+#define space_maxreldx 0.25f
+#define space_qstack 2048
 
 /* Convert cell location to ID. */
-#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) )
+#define cell_getid(cdim, i, j, k) \
+  ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
 
 /* Split size. */
 extern int space_splitsize;
@@ -42,83 +48,87 @@ extern int space_subsize;
 
 /* Map shift vector to sortlist. */
 extern const int sortlistID[27];
-    
-    
+
 /* Entry in a list of sorted indices. */
 struct entry {
-    float d;
-    int i;
-    };
-    
-    
+  float d;
+  int i;
+};
+
 /* The space in which the cells reside. */
 struct space {
 
-    /* Spatial extent. */
-    double dim[3];
-    
-    /* Cell widths. */
-    double h[3], ih[3];
-    
-    /* The minimum and maximum cutoff radii. */
-    double h_max, cell_min;
-    
-    /* Current time step for particles. */
-    float dt_step;
-    
-    /* Current maximum displacement for particles. */
-    float dx_max;
-    
-    /* Number of cells. */
-    int nr_cells, tot_cells;
-    
-    /* Space dimensions in number of cells. */
-    int maxdepth, cdim[3];
-    
-    /* The (level 0) cells themselves. */
-    struct cell *cells;
-    
-    /* Buffer of unused cells. */
-    struct cell *cells_new;
-    
-    /* The particle data (cells have pointers to this). */
-    struct part *parts;
-    struct xpart *xparts;
-    struct gpart *gparts;
-    
-    /* The total number of parts in the space. */
-    int nr_parts, size_parts;
-    int nr_gparts, size_gparts;
-    
-    /* Is the space periodic? */
-    int periodic;
-    
-    /* General-purpose lock for this space. */
-    lock_type lock;
-    
-    /* Number of queues in the system. */
-    int nr_queues;
-    
-    /* The associated engine. */
-    struct engine *e;
-    
-    /* Buffers for parts that we will receive from foreign cells. */
-    struct part *parts_foreign;
-    int nr_parts_foreign, size_parts_foreign;
-    
-    };
+  /* Spatial extent. */
+  double dim[3];
+
+  /* Cell widths. */
+  double h[3], ih[3];
+
+  /* The minimum and maximum cutoff radii. */
+  double h_max, cell_min;
+
+  /* Current time step for particles. */
+  float dt_step;
+
+  /* Current maximum displacement for particles. */
+  float dx_max;
+
+  /* Number of cells. */
+  int nr_cells, tot_cells;
+
+  /* Space dimensions in number of cells. */
+  int maxdepth, cdim[3];
+
+  /* The (level 0) cells themselves. */
+  struct cell *cells;
+
+  /* Buffer of unused cells. */
+  struct cell *cells_new;
+
+  /* The particle data (cells have pointers to this). */
+  struct part *parts;
+  struct xpart *xparts;
+  struct gpart *gparts;
+
+  /* The total number of parts in the space. */
+  int nr_parts, size_parts;
+  int nr_gparts, size_gparts;
+
+  /* Is the space periodic? */
+  int periodic;
+
+  /* General-purpose lock for this space. */
+  lock_type lock;
+
+  /* Number of queues in the system. */
+  int nr_queues;
+
+  /* The associated engine. */
+  struct engine *e;
 
+  /* Buffers for parts that we will receive from foreign cells. */
+  struct part *parts_foreign;
+  int nr_parts_foreign, size_parts_foreign;
+};
 
 /* function prototypes. */
-void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N , int min , int max );
-void gparts_sort ( struct gpart *gparts , int *ind , int N , int min , int max );
-struct cell *space_getcell ( struct space *s );
-int space_getsid ( struct space *s , struct cell **ci , struct cell **cj , double *shift );
-void space_init ( struct space *s , double dim[3] , struct part *parts , int N , int periodic , double h_max );
-void space_map_cells_pre ( struct space *s , int full , void (*fun)( struct cell *c , void *data ) , void *data );
-void space_map_parts ( struct space *s , void (*fun)( struct part *p , struct cell *c , void *data ) , void *data );
-void space_map_cells_post ( struct space *s , int full , void (*fun)( struct cell *c , void *data ) , void *data );
-void space_rebuild ( struct space *s , double h_max );
-void space_recycle ( struct space *s , struct cell *c );
-void space_split ( struct space *s , struct cell *c );
+void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N,
+                int min, int max);
+void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max);
+struct cell *space_getcell(struct space *s);
+int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
+                 double *shift);
+void space_init(struct space *s, double dim[3], struct part *parts, int N,
+                int periodic, double h_max);
+void space_map_cells_pre(struct space *s, int full,
+                         void (*fun)(struct cell *c, void *data), void *data);
+void space_map_parts(struct space *s,
+                     void (*fun)(struct part *p, struct cell *c, void *data),
+                     void *data);
+void space_map_cells_post(struct space *s, int full,
+                          void (*fun)(struct cell *c, void *data), void *data);
+void space_rebuild(struct space *s, double h_max);
+void space_recycle(struct space *s, struct cell *c);
+void space_split(struct space *s, struct cell *c);
 
+#endif /* SWIFT_SPACE_H */
diff --git a/src/swift.h b/src/swift.h
index 7652191b2e9cfb864cc64e157eeda98627cdccdc..b302bca9b007cec47c96e1ab07770a3a3dc84966 100644
--- a/src/swift.h
+++ b/src/swift.h
@@ -1,47 +1,49 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_SWIFT_H
+#define SWIFT_SWIFT_H
 
 /* Config parameters. */
 #include "../config.h"
 
 /* Local headers. */
+#include "atomic.h"
+#include "cell.h"
 #include "const.h"
-#include "error.h"
-#include "cycle.h"
-#include "timers.h"
 #include "const.h"
-#include "atomic.h"
+#include "cycle.h"
+#include "debug.h"
+#include "engine.h"
+#include "error.h"
 #include "lock.h"
-#include "task.h"
-#include "scheduler.h"
-#include "part.h"
 #include "multipole.h"
-#include "cell.h"
-#include "space.h"
+#include "parallel_io.h"
+#include "part.h"
 #include "queue.h"
 #include "runner.h"
-#include "engine.h"
-#include "units.h"
-#include "single_io.h"
+#include "scheduler.h"
 #include "serial_io.h"
-#include "parallel_io.h"
-#include "debug.h"
+#include "single_io.h"
+#include "space.h"
+#include "task.h"
+#include "timers.h"
+#include "units.h"
 #include "version.h"
 
 #ifdef LEGACY_GADGET2_SPH
@@ -50,3 +52,5 @@
 #include "runner_iact.h"
 #endif
 #include "runner_iact_grav.h"
+
+#endif /* SWIFT_SWIFT_H */
diff --git a/src/task.c b/src/task.c
index 94bacd3766d33865da8a6cbf64a0eb2f3aa2bad2..949caab56c4c4d8a0e3c73d05014ebc5ad68657a 100644
--- a/src/task.c
+++ b/src/task.c
@@ -1,165 +1,144 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
 
 /* Some standard headers. */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <pthread.h>
-#include <math.h>
 #include <float.h>
 #include <limits.h>
 #include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
-    #include <mpi.h>
+#include <mpi.h>
 #endif
 
+/* This object's header. */
+#include "task.h"
+
 /* Local headers. */
-#include "const.h"
-#include "cycle.h"
 #include "atomic.h"
-#include "lock.h"
-#include "space.h"
-#include "part.h"
-#include "multipole.h"
-#include "cell.h"
-#include "task.h"
 #include "error.h"
+#include "lock.h"
 
 /* Task type names. */
-const char *taskID_names[task_type_count] = {   
-    "none" , "sort" , "self" , "pair" , "sub" , "ghost" , 
-    "kick1" , "kick2" , "send" , "recv" , "link" , "grav_pp" ,
-    "grav_mm" , "grav_up" , "grav_down" };
-
+const char *taskID_names[task_type_count] = {
+    "none",  "sort",    "self",    "pair",    "sub",
+    "ghost", "kick1",   "kick2",   "send",    "recv",
+    "link",  "grav_pp", "grav_mm", "grav_up", "grav_down"};
 
 /**
  * @brief Unlock the cell held by this task.
- * 
+ *
  * @param t The #task.
  */
- 
-void task_unlock ( struct task *t ) {
-
-    /* Act based on task type. */
-    switch ( t->type ) {
-        case task_type_self:
-        case task_type_sort:
-            cell_unlocktree( t->ci );
-            break;
-        case task_type_pair:
-        case task_type_sub:
-            cell_unlocktree( t->ci );
-            if ( t->cj != NULL )
-                cell_unlocktree( t->cj );
-            break;
-        case task_type_grav_pp:
-        case task_type_grav_mm:
-        case task_type_grav_down:
-            cell_gunlocktree( t->ci );
-            if ( t->cj != NULL )
-                cell_gunlocktree( t->cj );
-            break;
-        default:
-            break;
-        }
-        
-    }
 
+void task_unlock(struct task *t) {
+
+  /* The task type decides which cells are released and by which routine. */
+  switch (t->type) {
+    case task_type_self:
+    case task_type_sort:
+      cell_unlocktree(t->ci); /* Only ci is released for these types. */
+      break;
+    case task_type_pair:
+    case task_type_sub:
+      cell_unlocktree(t->ci);
+      if (t->cj != NULL) cell_unlocktree(t->cj); /* cj may be NULL. */
+      break;
+    case task_type_grav_pp:
+    case task_type_grav_mm:
+    case task_type_grav_down:
+      cell_gunlocktree(t->ci); /* Gravity types use cell_gunlocktree(). */
+      if (t->cj != NULL) cell_gunlocktree(t->cj);
+      break;
+    default: /* Every other task type releases nothing here. */
+      break;
+  }
+}
 
 /**
  * @brief Try to lock the cells associated with this task.
  *
  * @param t the #task.
  */
- 
-int task_lock ( struct task *t ) {
-
-    int type = t->type;
-    struct cell *ci = t->ci, *cj = t->cj;
-
-    /* Communication task? */
-    if ( type == task_type_recv ||
-         type == task_type_send ) {
-    
-        #ifdef WITH_MPI
-            /* Check the status of the MPI request. */
-            int res, err;
-            MPI_Status stat;
-            if ( ( err = MPI_Test( &t->req , &res , &stat ) ) != MPI_SUCCESS ) {
-                char buff[ MPI_MAX_ERROR_STRING ];
-                int len;
-                MPI_Error_string( err , buff , &len );
-                error( "Failed to test request on send/recv task (tag=%i, %s)." , t->flags , buff );
-                }
-            return res;
-        #else
-            error( "SWIFT was not compiled with MPI support." );
-        #endif
-    
-        }
-
-    /* Unary lock? */
-    else if ( type == task_type_self || 
-         type == task_type_sort || 
-         (type == task_type_sub && cj == NULL) ) {
-        if ( cell_locktree( ci ) != 0 )
-            return 0;
-        }
-        
-    /* Otherwise, binary lock. */
-    else if ( type == task_type_pair || 
-              ( type == task_type_sub && cj != NULL ) ) {
-        if ( ci->hold || cj->hold )
-            return 0;
-        if ( cell_locktree( ci ) != 0 )
-            return 0;
-        if ( cell_locktree( cj ) != 0 ) {
-            cell_unlocktree( ci );
-            return 0;
-            }
-        }
-        
-    /* Gravity tasks? */
-    else if ( type == task_type_grav_mm ||
-              type == task_type_grav_pp ||
-              type == task_type_grav_down ) {
-        if ( ci->ghold || ( cj != NULL && cj->ghold ) )
-            return 0;
-        if ( cell_glocktree( ci ) != 0 )
-            return 0;
-        if ( cj != NULL && cell_glocktree( cj ) != 0 ) {
-            cell_gunlocktree( ci );
-            return 0;
-            }
-        }
-        
-    /* If we made it this far, we've got a lock. */
-    return 1;
-            
+
+int task_lock(struct task *t) {
+
+  int type = t->type;
+  struct cell *ci = t->ci, *cj = t->cj;
+
+  /* Send/recv tasks take no cell lock; the MPI_Test flag is returned. */
+  if (type == task_type_recv || type == task_type_send) {
+
+#ifdef WITH_MPI
+    /* Check the status of the MPI request. */
+    int res, err;
+    MPI_Status stat;
+    if ((err = MPI_Test(&t->req, &res, &stat)) != MPI_SUCCESS) {
+      char buff[MPI_MAX_ERROR_STRING];
+      int len;
+      MPI_Error_string(err, buff, &len);
+      error("Failed to test request on send/recv task (tag=%i, %s).", t->flags,
+            buff);
     }
+    return res;
+#else
+    error("SWIFT was not compiled with MPI support.");
+#endif
+    /* NOTE(review): the non-MPI path has no return; assumes error() is fatal. */
+  }
+
+  /* Single-cell tasks: non-zero from cell_locktree() means no lock. */
+  else if (type == task_type_self || type == task_type_sort ||
+           (type == task_type_sub && cj == NULL)) {
+    if (cell_locktree(ci) != 0) return 0;
+  }
+
+  /* Two-cell tasks: bail if either hold flag is set; drop ci if cj fails. */
+  else if (type == task_type_pair || (type == task_type_sub && cj != NULL)) {
+    if (ci->hold || cj->hold) return 0;
+    if (cell_locktree(ci) != 0) return 0;
+    if (cell_locktree(cj) != 0) {
+      cell_unlocktree(ci);
+      return 0;
+    }
+  }
+
+  /* Gravity tasks use the ghold flags and g-lock routines; cj may be NULL. */
+  else if (type == task_type_grav_mm || type == task_type_grav_pp ||
+           type == task_type_grav_down) {
+    if (ci->ghold || (cj != NULL && cj->ghold)) return 0;
+    if (cell_glocktree(ci) != 0) return 0;
+    if (cj != NULL && cell_glocktree(cj) != 0) {
+      cell_gunlocktree(ci);
+      return 0;
+    }
+  }
 
+  /* Reached on success, and for task types matched by no branch above. */
+  return 1;
+}
 
 /**
  * @brief Remove all unlocks to tasks that are of the given type.
@@ -167,23 +146,21 @@ int task_lock ( struct task *t ) {
  * @param t The #task.
  * @param type The task type ID to remove.
  */
- 
-void task_cleanunlock ( struct task *t , int type ) {
-
-    int k;
-    
-    lock_lock( &t->lock );
-    
-    for ( k = 0 ; k < t->nr_unlock_tasks ; k++ )
-        if ( t->unlock_tasks[k]->type == type ) {
-            t->nr_unlock_tasks -= 1;
-            t->unlock_tasks[k] = t->unlock_tasks[ t->nr_unlock_tasks ];
-            }
-    
-    lock_unlock_blind( &t->lock );
-    
+
+void task_cleanunlock(struct task *t, int type) {
+
+  int k;
+
+  lock_lock(&t->lock);
+
+  for (k = 0; k < t->nr_unlock_tasks; k++)
+    if (t->unlock_tasks[k]->type == type) {
+      t->nr_unlock_tasks -= 1;
+      t->unlock_tasks[k] = t->unlock_tasks[t->nr_unlock_tasks];
     }
 
+  lock_unlock_blind(&t->lock);
+}
 
 /**
  * @brief Remove an unlock_task from the given task.
@@ -191,24 +168,22 @@ void task_cleanunlock ( struct task *t , int type ) {
  * @param ta The unlocking #task.
  * @param tb The #task that will be unlocked.
  */
- 
-void task_rmunlock ( struct task *ta , struct task *tb ) {
-
-    int k;
-    
-    lock_lock( &ta->lock );
-    
-    for ( k = 0 ; k < ta->nr_unlock_tasks ; k++ )
-        if ( ta->unlock_tasks[k] == tb ) {
-            ta->nr_unlock_tasks -= 1;
-            ta->unlock_tasks[k] = ta->unlock_tasks[ ta->nr_unlock_tasks ];
-            lock_unlock_blind( &ta->lock );
-            return;
-            }
-    error( "Task not found." );
 
+void task_rmunlock(struct task *ta, struct task *tb) {
+
+  int k;
+
+  lock_lock(&ta->lock);
+
+  for (k = 0; k < ta->nr_unlock_tasks; k++)
+    if (ta->unlock_tasks[k] == tb) {
+      ta->nr_unlock_tasks -= 1;
+      ta->unlock_tasks[k] = ta->unlock_tasks[ta->nr_unlock_tasks];
+      lock_unlock_blind(&ta->lock);
+      return;
     }
-    
+  error("Task not found.");
+}
 
 /**
  * @brief Remove an unlock_task from the given task.
@@ -219,24 +194,22 @@ void task_rmunlock ( struct task *ta , struct task *tb ) {
  * Differs from #task_rmunlock in that it will not fail if
  * the task @c tb is not in the unlocks of @c ta.
  */
- 
-void task_rmunlock_blind ( struct task *ta , struct task *tb ) {
-
-    int k;
-    
-    lock_lock( &ta->lock );
-    
-    for ( k = 0 ; k < ta->nr_unlock_tasks ; k++ )
-        if ( ta->unlock_tasks[k] == tb ) {
-            ta->nr_unlock_tasks -= 1;
-            ta->unlock_tasks[k] = ta->unlock_tasks[ ta->nr_unlock_tasks ];
-            break;
-            }
-            
-    lock_unlock_blind( &ta->lock );
 
+void task_rmunlock_blind(struct task *ta, struct task *tb) {
+
+  int k;
+
+  lock_lock(&ta->lock);
+
+  for (k = 0; k < ta->nr_unlock_tasks; k++)
+    if (ta->unlock_tasks[k] == tb) {
+      ta->nr_unlock_tasks -= 1;
+      ta->unlock_tasks[k] = ta->unlock_tasks[ta->nr_unlock_tasks];
+      break;
     }
-    
+
+  lock_unlock_blind(&ta->lock);
+}
 
 /**
  * @brief Add an unlock_task to the given task.
@@ -244,43 +217,38 @@ void task_rmunlock_blind ( struct task *ta , struct task *tb ) {
  * @param ta The unlocking #task.
  * @param tb The #task that will be unlocked.
  */
- 
-void task_addunlock ( struct task *ta , struct task *tb ) {
 
-    error( "Use sched_addunlock instead." );
+void task_addunlock(struct task *ta, struct task *tb) {
 
-    /* Add the lock atomically. */
-    ta->unlock_tasks[ atomic_inc( &ta->nr_unlock_tasks ) ] = tb;
+  error("Use sched_addunlock instead.");
 
-    /* Check a posteriori if we did not overshoot. */
-    if ( ta->nr_unlock_tasks > task_maxunlock )
-        error( "Too many unlock_tasks in task." );
-        
-    }
-    
-
-void task_addunlock_old ( struct task *ta , struct task *tb ) {
-
-    int k;
-    
-    lock_lock( &ta->lock );
-    
-    /* Check if ta already unlocks tb. */
-    for ( k = 0 ; k < ta->nr_unlock_tasks ; k++ )
-        if ( ta->unlock_tasks[k] == tb ) {
-            error( "Duplicate unlock." );
-            lock_unlock_blind( &ta->lock );
-            return;
-            }
-
-    if ( ta->nr_unlock_tasks == task_maxunlock )
-        error( "Too many unlock_tasks in task." );
-        
-    ta->unlock_tasks[ ta->nr_unlock_tasks] = tb;
-    ta->nr_unlock_tasks += 1;
-
-    lock_unlock_blind( &ta->lock );
-    
+  /* Add the lock atomically. */
+  ta->unlock_tasks[atomic_inc(&ta->nr_unlock_tasks)] = tb;
+
+  /* Check a posteriori if we did not overshoot. */
+  if (ta->nr_unlock_tasks > task_maxunlock)
+    error("Too many unlock_tasks in task.");
+}
+
+void task_addunlock_old(struct task *ta, struct task *tb) {
+
+  int k;
+
+  lock_lock(&ta->lock);
+
+  /* Check if ta already unlocks tb. */
+  for (k = 0; k < ta->nr_unlock_tasks; k++)
+    if (ta->unlock_tasks[k] == tb) {
+      error("Duplicate unlock.");
+      lock_unlock_blind(&ta->lock);
+      return;
     }
-    
 
+  if (ta->nr_unlock_tasks == task_maxunlock)
+    error("Too many unlock_tasks in task.");
+
+  ta->unlock_tasks[ta->nr_unlock_tasks] = tb;
+  ta->nr_unlock_tasks += 1;
+
+  lock_unlock_blind(&ta->lock);
+}
diff --git a/src/task.h b/src/task.h
index 0505815ff2d5dcc186b30011a906458947589bd8..0d3a68e1e8a892d554f8fb83f8f16d7030d5a54c 100644
--- a/src/task.h
+++ b/src/task.h
@@ -1,92 +1,94 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_TASK_H
+#define SWIFT_TASK_H
 
+/* Includes. */
+#include "cell.h"
+#include "cycle.h"
 
 /* Some constants. */
-#define task_maxwait                    3
-#define task_maxunlock                  15
-
+#define task_maxwait 3
+#define task_maxunlock 15
 
 /* The different task types. */
 enum task_types {
-    task_type_none = 0,
-    task_type_sort,
-    task_type_self,
-    task_type_pair,
-    task_type_sub,
-    task_type_ghost,
-    task_type_kick1,
-    task_type_kick2,
-    task_type_send,
-    task_type_recv,
-    task_type_link,
-    task_type_grav_pp,
-    task_type_grav_mm,
-    task_type_grav_up,
-    task_type_grav_down,
-    task_type_count
-    };
-    
+  task_type_none = 0,
+  task_type_sort,
+  task_type_self,
+  task_type_pair,
+  task_type_sub,
+  task_type_ghost,
+  task_type_kick1,
+  task_type_kick2,
+  task_type_send,
+  task_type_recv,
+  task_type_link,
+  task_type_grav_pp,
+  task_type_grav_mm,
+  task_type_grav_up,
+  task_type_grav_down,
+  task_type_count
+};
+
 extern const char *taskID_names[];
-    
-    
+
 /* The different task sub-types. */
 enum task_subtypes {
-    task_subtype_none = 0,
-    task_subtype_density,
-    task_subtype_force,
-    task_subtype_grav,
-    task_subtype_count
-    };
-    
+  task_subtype_none = 0,
+  task_subtype_density,
+  task_subtype_force,
+  task_subtype_grav,
+  task_subtype_count
+};
+
 extern const char *taskID_names[];
-    
-    
+
 /* Data of a task. */
 struct task {
 
-    enum task_types type;
-    enum task_subtypes subtype;
-    char skip, tight, implicit;
-    int flags, wait, rank, weight;
-    
-    lock_type lock;
-    
-    struct cell *ci, *cj;
-    
-    #ifdef WITH_MPI
-        MPI_Request req;
-    #endif
-    
-    int rid;
-    ticks tic, toc;
-    
-    int nr_unlock_tasks;
-    struct task *unlock_tasks[ task_maxunlock + 1 ];
+  enum task_types type;
+  enum task_subtypes subtype;
+  char skip, tight, implicit;
+  int flags, wait, rank, weight;
 
-    };
+  lock_type lock;
 
+  struct cell *ci, *cj;
+
+#ifdef WITH_MPI
+  MPI_Request req;
+#endif
+
+  int rid;
+  ticks tic, toc;
+
+  int nr_unlock_tasks;
+  struct task *unlock_tasks[task_maxunlock + 1];
+};
 
 /* Function prototypes. */
-void task_rmunlock( struct task *ta , struct task *tb );
-void task_rmunlock_blind( struct task *ta , struct task *tb );
-void task_cleanunlock ( struct task *t , int type );
-void task_addunlock( struct task *ta , struct task *tb );
-void task_unlock ( struct task *t );
-int task_lock ( struct task *t );
+void task_rmunlock(struct task *ta, struct task *tb);
+void task_rmunlock_blind(struct task *ta, struct task *tb);
+void task_cleanunlock(struct task *t, int type);
+void task_addunlock(struct task *ta, struct task *tb);
+void task_unlock(struct task *t);
+int task_lock(struct task *t);
+
+#endif /* SWIFT_TASK_H */
diff --git a/src/timers.c b/src/timers.c
index 0bc1a85d92cacaa79837f59ad3fc17c7d66f3259..01a77d7804241f108b092f7d6857c90be3861cd0 100644
--- a/src/timers.c
+++ b/src/timers.c
@@ -1,33 +1,30 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
 
-/* Local headers. */
-#include "cycle.h"
+/* This object's header. */
 #include "timers.h"
 
-
 /* The timers. */
-ticks timers[ timer_count ];
-
+ticks timers[timer_count];
 
 /**
  * @brief Re-set the timers.
@@ -36,14 +33,12 @@ ticks timers[ timer_count ];
  *
  * To reset all timers, use the mask #timers_mask_all.
  */
- 
-void timers_reset ( unsigned int mask ) {
 
-    int k;
-    
-    /* Loop over the timers and set the masked ones to zero. */
-    for ( k = 0 ; k < timer_count ; k++ )
-        if ( mask & ( 1 << k ) )
-            timers[ k ] = 0;
+void timers_reset(unsigned int mask) {
+
+  int k;
 
-    }
+  /* Loop over the timers and set the masked ones to zero. */
+  for (k = 0; k < timer_count; k++)
+    if (mask & (1 << k)) timers[k] = 0;
+}
diff --git a/src/timers.h b/src/timers.h
index 58c48ac2444e2cd615f711213474d729b4bbbe70..38ca81222ffb33b1558dcc4d7ee3a0cc1a71cd20 100644
--- a/src/timers.h
+++ b/src/timers.h
@@ -1,82 +1,86 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_TIMERS_H
+#define SWIFT_TIMERS_H
 
+/* Includes. */
+#include "cycle.h"
 #include "inline.h"
 
 /* The timers themselves. */
 enum {
-    timer_none = 0,
-    timer_prepare,
-    timer_kick1,
-    timer_kick2,
-    timer_dosort,
-    timer_doself_density,
-    timer_doself_force,
-    timer_doself_grav,
-    timer_dopair_density,
-    timer_dopair_force,
-    timer_dopair_grav,
-    timer_dosub_density,
-    timer_dosub_force,
-    timer_dosub_grav,
-    timer_dopair_subset,
-    timer_doghost,
-    timer_gettask,
-    timer_qget,
-    timer_qsteal,
-    timer_runners,
-    timer_step,
-    timer_count,
-    };
-    
+  timer_none = 0,
+  timer_prepare,
+  timer_kick1,
+  timer_kick2,
+  timer_dosort,
+  timer_doself_density,
+  timer_doself_force,
+  timer_doself_grav,
+  timer_dopair_density,
+  timer_dopair_force,
+  timer_dopair_grav,
+  timer_dosub_density,
+  timer_dosub_force,
+  timer_dosub_grav,
+  timer_dopair_subset,
+  timer_doghost,
+  timer_gettask,
+  timer_qget,
+  timer_qsteal,
+  timer_runners,
+  timer_step,
+  timer_count,
+};
+
 /* The timers. */
-extern ticks timers[ timer_count ];
+extern ticks timers[timer_count];
 
 /* Mask for all timers. */
-#define timers_mask_all ( (1 << timer_count) - 1 )
-
+#define timers_mask_all ((1 << timer_count) - 1)
 
 /* Define the timer macros. */
 #ifdef TIMER_VERBOSE
-    #ifndef TIMER
-        #define TIMER
-    #endif
+#ifndef TIMER
+#define TIMER
+#endif
 #endif
 #ifdef TIMER
-    #define TIMER_TIC_ND tic = getticks();
-    #define TIMER_TIC2_ND ticks tic2 = getticks();
-    #define TIMER_TIC ticks tic = getticks();
-    #define TIMER_TOC(t) timers_toc( t , tic )
-    #define TIMER_TIC2 ticks tic2 = getticks();
-    #define TIMER_TOC2(t) timers_toc( t , tic2 )
-    INLINE static ticks timers_toc ( int t , ticks tic ) {
-        ticks d = (getticks() - tic);
-        __sync_add_and_fetch( &timers[t] , d );
-        return d;
-        }
+#define TIMER_TIC_ND tic = getticks();
+#define TIMER_TIC2_ND ticks tic2 = getticks();
+#define TIMER_TIC ticks tic = getticks();
+#define TIMER_TOC(t) timers_toc(t, tic)
+#define TIMER_TIC2 ticks tic2 = getticks();
+#define TIMER_TOC2(t) timers_toc(t, tic2)
+INLINE static ticks timers_toc(int t, ticks tic) {
+  ticks d = (getticks() - tic);
+  __sync_add_and_fetch(&timers[t], d);
+  return d;
+}
 #else
-    #define TIMER_TIC
-    #define TIMER_TOC(t)
-    #define TIMER_TIC2
-    #define TIMER_TOC2(t)
+#define TIMER_TIC
+#define TIMER_TOC(t)
+#define TIMER_TIC2
+#define TIMER_TOC2(t)
 #endif
 
-
 /* Function prototypes. */
-void timers_reset ( unsigned int mask );
+void timers_reset(unsigned int mask);
+
+#endif /* SWIFT_TIMERS_H */
diff --git a/src/units.c b/src/units.c
index ffca1974205936fe50dc770ba7eaa73895273737..af705323bdd8089c5ae22f11c49975bfe01c5f83 100644
--- a/src/units.c
+++ b/src/units.c
@@ -2,52 +2,56 @@
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
 
-
+/* Some standard headers. */
+#include <math.h>
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <stddef.h>
-#include <math.h>
+
+/* MPI headers. */
 #ifdef WITH_MPI
 #include <mpi.h>
 #endif
 
+/* This object's header. */
+#include "units.h"
+
+/* Includes. */
 #include "const.h"
-#include "cycle.h"
-#include "part.h"
 #include "error.h"
 #include "units.h"
 
-
 /**
- * @brief Initialises the UnitSystem structure with the constants given in const.h
+ * @brief Initialises the UnitSystem structure with the constants given in
+ * const.h
  * @param us The UnitSystem to initialize
  */
 
-void initUnitSystem(struct UnitSystem* us)
-{
+void initUnitSystem(struct UnitSystem* us) {
   us->UnitMass_in_cgs = const_unit_mass_in_cgs;
   us->UnitLength_in_cgs = const_unit_length_in_cgs;
-  us->UnitTime_in_cgs = 1. / ((double) const_unit_velocity_in_cgs / ( (double)const_unit_length_in_cgs ));
+  us->UnitTime_in_cgs = 1. / ((double)const_unit_velocity_in_cgs /
+                              ((double)const_unit_length_in_cgs));
   us->UnitCurrent_in_cgs = 1.;
   us->UnitTemperature_in_cgs = 1.;
 }
@@ -55,147 +59,214 @@ void initUnitSystem(struct UnitSystem* us)
 /**
  * @brief Returns the base unit conversion factor for a given unit system
  * @param us The UnitSystem used
- * @param baseUnit The base unit 
+ * @param baseUnit The base unit
  */
-double getBaseUnit(struct UnitSystem* us, enum BaseUnits baseUnit)
-{
-  switch(baseUnit)
-    {
-    case UNIT_MASS: return us->UnitMass_in_cgs;
-    case UNIT_LENGTH: return us->UnitLength_in_cgs;
-    case UNIT_TIME: return us->UnitTime_in_cgs;
-    case UNIT_CURRENT: return us->UnitCurrent_in_cgs;
-    case UNIT_TEMPERATURE: return us->UnitTemperature_in_cgs;
-    default: error( "Invalid base Unit" );
-    }
+double getBaseUnit(struct UnitSystem* us, enum BaseUnits baseUnit) {
+  switch (baseUnit) {
+    case UNIT_MASS:
+      return us->UnitMass_in_cgs;
+    case UNIT_LENGTH:
+      return us->UnitLength_in_cgs;
+    case UNIT_TIME:
+      return us->UnitTime_in_cgs;
+    case UNIT_CURRENT:
+      return us->UnitCurrent_in_cgs;
+    case UNIT_TEMPERATURE:
+      return us->UnitTemperature_in_cgs;
+    default:
+      error("Invalid base Unit");
+  }
 }
 
 /**
  * @brief Returns the base unit symbol
- * @param baseUnit The base unit 
+ * @param baseUnit The base unit
  */
-const char* getBaseUnitSymbol(enum BaseUnits baseUnit)
-{
- switch(baseUnit)
-    {
-    case UNIT_MASS: return "U_M";
-    case UNIT_LENGTH: return "U_L";
-    case UNIT_TIME: return "U_t";
-    case UNIT_CURRENT: return "U_I";
-    case UNIT_TEMPERATURE: return "U_T";
-    default: error( "Invalid base Unit" );
-    }
+const char* getBaseUnitSymbol(enum BaseUnits baseUnit) {
+  switch (baseUnit) {
+    case UNIT_MASS:
+      return "U_M";
+    case UNIT_LENGTH:
+      return "U_L";
+    case UNIT_TIME:
+      return "U_t";
+    case UNIT_CURRENT:
+      return "U_I";
+    case UNIT_TEMPERATURE:
+      return "U_T";
+    default:
+      error("Invalid base Unit");
+  }
 }
 
-
 /**
  * @brief Returns the base unit symbol in the cgs system
- * @param baseUnit The base unit 
+ * @param baseUnit The base unit
  */
-const char* getBaseUnitCGSSymbol(enum BaseUnits baseUnit)
-{
- switch(baseUnit)
-    {
-    case UNIT_MASS: return "g";
-    case UNIT_LENGTH: return "cm";
-    case UNIT_TIME: return "s";
-    case UNIT_CURRENT: return "A";
-    case UNIT_TEMPERATURE: return "K";
-    default: error( "Invalid base Unit" );
-    }
+const char* getBaseUnitCGSSymbol(enum BaseUnits baseUnit) {
+  switch (baseUnit) {
+    case UNIT_MASS:
+      return "g";
+    case UNIT_LENGTH:
+      return "cm";
+    case UNIT_TIME:
+      return "s";
+    case UNIT_CURRENT:
+      return "A";
+    case UNIT_TEMPERATURE:
+      return "K";
+    default:
+      error("Invalid base Unit");
+  }
 }
 
-
-void getBaseUnitExponantsArray(float baseUnitsExp[5], enum UnitConversionFactor unit)
-{
-  switch( unit )
-    {
+void getBaseUnitExponantsArray(float baseUnitsExp[5],
+                               enum UnitConversionFactor unit) {
+  switch (unit) {
     case UNIT_CONV_NO_UNITS:
       break;
 
-    case UNIT_CONV_MASS: 
-      baseUnitsExp[UNIT_MASS] = 1.f; break;
+    case UNIT_CONV_MASS:
+      baseUnitsExp[UNIT_MASS] = 1.f;
+      break;
 
-    case UNIT_CONV_LENGTH: 
-      baseUnitsExp[UNIT_LENGTH] = 1.f; break;
+    case UNIT_CONV_LENGTH:
+      baseUnitsExp[UNIT_LENGTH] = 1.f;
+      break;
 
-    case UNIT_CONV_TIME: 
-      baseUnitsExp[UNIT_TIME] = 1.f; break;
+    case UNIT_CONV_TIME:
+      baseUnitsExp[UNIT_TIME] = 1.f;
+      break;
 
-    case UNIT_CONV_FREQUENCY: 
-       baseUnitsExp[UNIT_TIME] = -1.f;  break;
+    case UNIT_CONV_FREQUENCY:
+      baseUnitsExp[UNIT_TIME] = -1.f;
+      break;
 
-    case UNIT_CONV_DENSITY: 
-      baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = -3.f;  break;
+    case UNIT_CONV_DENSITY:
+      baseUnitsExp[UNIT_MASS] = 1.f;
+      baseUnitsExp[UNIT_LENGTH] = -3.f;
+      break;
 
-    case UNIT_CONV_SPEED: 
-      baseUnitsExp[UNIT_LENGTH] = 1.f; baseUnitsExp[UNIT_TIME] = -1.f;  break;
+    case UNIT_CONV_SPEED:
+      baseUnitsExp[UNIT_LENGTH] = 1.f;
+      baseUnitsExp[UNIT_TIME] = -1.f;
+      break;
 
-    case UNIT_CONV_ACCELERATION: 
-      baseUnitsExp[UNIT_LENGTH] = 1.f; baseUnitsExp[UNIT_TIME] = -2.f;  break;
+    case UNIT_CONV_ACCELERATION:
+      baseUnitsExp[UNIT_LENGTH] = 1.f;
+      baseUnitsExp[UNIT_TIME] = -2.f;
+      break;
 
-    case UNIT_CONV_FORCE: 
-       baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 1.f; baseUnitsExp[UNIT_TIME] = -2.f;  break;
+    case UNIT_CONV_FORCE:
+      baseUnitsExp[UNIT_MASS] = 1.f;
+      baseUnitsExp[UNIT_LENGTH] = 1.f;
+      baseUnitsExp[UNIT_TIME] = -2.f;
+      break;
 
-    case UNIT_CONV_ENERGY: 
-       baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -2.f;  break;
+    case UNIT_CONV_ENERGY:
+      baseUnitsExp[UNIT_MASS] = 1.f;
+      baseUnitsExp[UNIT_LENGTH] = 2.f;
+      baseUnitsExp[UNIT_TIME] = -2.f;
+      break;
 
-    case UNIT_CONV_ENERGY_PER_UNIT_MASS: 
-      baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -2.f;  break;
+    case UNIT_CONV_ENERGY_PER_UNIT_MASS:
+      baseUnitsExp[UNIT_LENGTH] = 2.f;
+      baseUnitsExp[UNIT_TIME] = -2.f;
+      break;
 
-    case UNIT_CONV_ENTROPY: 
-      baseUnitsExp[UNIT_MASS] = 1.f - const_hydro_gamma; baseUnitsExp[UNIT_LENGTH] = 3.f * const_hydro_gamma - 1.f; baseUnitsExp[UNIT_TIME] = -2.f;  break;
+    case UNIT_CONV_ENTROPY:
+      baseUnitsExp[UNIT_MASS] = 1.f - const_hydro_gamma;
+      baseUnitsExp[UNIT_LENGTH] = 3.f * const_hydro_gamma - 1.f;
+      baseUnitsExp[UNIT_TIME] = -2.f;
+      break;
 
-    case UNIT_CONV_ENTROPY_PER_UNIT_MASS: 
-      baseUnitsExp[UNIT_MASS] = -const_hydro_gamma; baseUnitsExp[UNIT_LENGTH] = 3.f * const_hydro_gamma - 1.f; baseUnitsExp[UNIT_TIME] = -2.f;  break;
+    case UNIT_CONV_ENTROPY_PER_UNIT_MASS:
+      baseUnitsExp[UNIT_MASS] = -const_hydro_gamma;
+      baseUnitsExp[UNIT_LENGTH] = 3.f * const_hydro_gamma - 1.f;
+      baseUnitsExp[UNIT_TIME] = -2.f;
+      break;
 
-    case UNIT_CONV_POWER: 
-      baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -3.f;  break;
+    case UNIT_CONV_POWER:
+      baseUnitsExp[UNIT_MASS] = 1.f;
+      baseUnitsExp[UNIT_LENGTH] = 2.f;
+      baseUnitsExp[UNIT_TIME] = -3.f;
+      break;
 
-    case UNIT_CONV_PRESSURE: 
-      baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = -1.f; baseUnitsExp[UNIT_TIME] = -2.f;  break;
+    case UNIT_CONV_PRESSURE:
+      baseUnitsExp[UNIT_MASS] = 1.f;
+      baseUnitsExp[UNIT_LENGTH] = -1.f;
+      baseUnitsExp[UNIT_TIME] = -2.f;
+      break;
 
     case UNIT_CONV_ELECTRIC_CHARGE:
-      baseUnitsExp[UNIT_TIME] = 1.f; baseUnitsExp[UNIT_CURRENT] = 1.f; break;
+      baseUnitsExp[UNIT_TIME] = 1.f;
+      baseUnitsExp[UNIT_CURRENT] = 1.f;
+      break;
 
     case UNIT_CONV_ELECTRIC_VOLTAGE:
-      baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -3.f; baseUnitsExp[UNIT_CURRENT] = -1.f; break;
-      
+      baseUnitsExp[UNIT_MASS] = 1.f;
+      baseUnitsExp[UNIT_LENGTH] = 2.f;
+      baseUnitsExp[UNIT_TIME] = -3.f;
+      baseUnitsExp[UNIT_CURRENT] = -1.f;
+      break;
+
     case UNIT_CONV_ELECTRIC_CAPACITANCE:
-      baseUnitsExp[UNIT_MASS] = -1.f; baseUnitsExp[UNIT_LENGTH] = -2.f; baseUnitsExp[UNIT_TIME] = 4; baseUnitsExp[UNIT_CURRENT] = 2.f; break;
+      baseUnitsExp[UNIT_MASS] = -1.f;
+      baseUnitsExp[UNIT_LENGTH] = -2.f;
+      baseUnitsExp[UNIT_TIME] = 4;
+      baseUnitsExp[UNIT_CURRENT] = 2.f;
+      break;
 
     case UNIT_CONV_ELECTRIC_RESISTANCE:
-      baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -3.f; baseUnitsExp[UNIT_CURRENT] = -2.f; break;
+      baseUnitsExp[UNIT_MASS] = 1.f;
+      baseUnitsExp[UNIT_LENGTH] = 2.f;
+      baseUnitsExp[UNIT_TIME] = -3.f;
+      baseUnitsExp[UNIT_CURRENT] = -2.f;
+      break;
 
     case UNIT_CONV_ELECTRIC_CONDUCTANCE:
-      baseUnitsExp[UNIT_MASS] = -1.f; baseUnitsExp[UNIT_LENGTH] = -2.f; baseUnitsExp[UNIT_TIME] = 3.f; baseUnitsExp[UNIT_CURRENT] = 2.f; break;
-      
+      baseUnitsExp[UNIT_MASS] = -1.f;
+      baseUnitsExp[UNIT_LENGTH] = -2.f;
+      baseUnitsExp[UNIT_TIME] = 3.f;
+      baseUnitsExp[UNIT_CURRENT] = 2.f;
+      break;
+
     case UNIT_CONV_MAGNETIC_FLUX:
-      baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -2.f; baseUnitsExp[UNIT_CURRENT] = -1.f; break;
-      
+      baseUnitsExp[UNIT_MASS] = 1.f;
+      baseUnitsExp[UNIT_LENGTH] = 2.f;
+      baseUnitsExp[UNIT_TIME] = -2.f;
+      baseUnitsExp[UNIT_CURRENT] = -1.f;
+      break;
+
     case UNIT_CONV_MAGNETIC_FIELD:
-      baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_TIME] = -2.f; baseUnitsExp[UNIT_CURRENT] = -1.f; break;
+      baseUnitsExp[UNIT_MASS] = 1.f;
+      baseUnitsExp[UNIT_TIME] = -2.f;
+      baseUnitsExp[UNIT_CURRENT] = -1.f;
+      break;
 
     case UNIT_CONV_MAGNETIC_INDUCTANCE:
-      baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -2.f; baseUnitsExp[UNIT_CURRENT] = -2.f; break;
+      baseUnitsExp[UNIT_MASS] = 1.f;
+      baseUnitsExp[UNIT_LENGTH] = 2.f;
+      baseUnitsExp[UNIT_TIME] = -2.f;
+      baseUnitsExp[UNIT_CURRENT] = -2.f;
+      break;
 
     case UNIT_CONV_TEMPERATURE:
       baseUnitsExp[UNIT_TEMPERATURE] = 1.f;
-    }
+  }
 }
 
-
 /**
- * @brief Returns the conversion factor for a given unit in the chosen unit system
+ * @brief Returns the conversion factor for a given unit in the chosen unit
+ * system
  * @param us The system of units in use
  * @param unit The unit to convert
  */
-double conversionFactor(struct UnitSystem* us, enum UnitConversionFactor unit)
-{
-  float baseUnitsExp[5] = { 0.f };
+double conversionFactor(struct UnitSystem* us, enum UnitConversionFactor unit) {
+  float baseUnitsExp[5] = {0.f};
 
   getBaseUnitExponantsArray(baseUnitsExp, unit);
-  
+
   return generalConversionFactor(us, baseUnitsExp);
 }
 
@@ -204,102 +275,101 @@ double conversionFactor(struct UnitSystem* us, enum UnitConversionFactor unit)
  * @param us The system of units in use
  * @param unit The unit to convert
  */
-float hFactor(struct UnitSystem* us, enum UnitConversionFactor unit)
-{
-  float baseUnitsExp[5] = { 0.f };
+float hFactor(struct UnitSystem* us, enum UnitConversionFactor unit) {
+  float baseUnitsExp[5] = {0.f};
 
   getBaseUnitExponantsArray(baseUnitsExp, unit);
-  
-  return generalhFactor(us, baseUnitsExp);
 
+  return generalhFactor(us, baseUnitsExp);
 }
 
-
 /**
  * @brief Returns the scaling factor exponentiation for a given unit
  * @param us The system of units in use
  * @param unit The unit to convert
  */
-float aFactor(struct UnitSystem* us, enum UnitConversionFactor unit)
-{
-  float baseUnitsExp[5] = { 0.f };
+float aFactor(struct UnitSystem* us, enum UnitConversionFactor unit) {
+  float baseUnitsExp[5] = {0.f};
 
   getBaseUnitExponantsArray(baseUnitsExp, unit);
-  
-  return generalaFactor(us, baseUnitsExp);
 
+  return generalaFactor(us, baseUnitsExp);
 }
 
-
 /**
- * @brief Returns a string containg the exponants of the base units making up the conversion factors
+ * @brief Returns a string containing the exponents of the base units making up
+ * the conversion factors
  */
-void conversionString(char * buffer, struct UnitSystem* us, enum UnitConversionFactor unit)
-{
-  float baseUnitsExp[5] = { 0.f };
+void conversionString(char* buffer, struct UnitSystem* us,
+                      enum UnitConversionFactor unit) {
+  float baseUnitsExp[5] = {0.f};
 
   getBaseUnitExponantsArray(baseUnitsExp, unit);
- 
+
   generalConversionString(buffer, us, baseUnitsExp);
 }
 
-
-
 /**
- * @brief Returns the conversion factor for a given unit (expressed in terms of the 5 fundamental units) in the chosen unit system
+ * @brief Returns the conversion factor for a given unit (expressed in terms of
+ * the 5 fundamental units) in the chosen unit system
  * @param us The unit system used
- * @param baseUnitsExponants The exponant of each base units required to form the desired quantity. See conversionFactor() for a working example
+ * @param baseUnitsExponants The exponent of each base unit required to form
+ * the desired quantity. See conversionFactor() for a working example
  */
-double generalConversionFactor(struct UnitSystem* us, float baseUnitsExponants[5])
-{
+double generalConversionFactor(struct UnitSystem* us,
+                               float baseUnitsExponants[5]) {
   double factor = 1.;
   int i;
 
-  for(i = 0 ; i < 5 ; ++i )
-    if(baseUnitsExponants[i] != 0)
-      factor *= pow( getBaseUnit( us, i )  , baseUnitsExponants[i] );
-  return factor;	
+  for (i = 0; i < 5; ++i)
+    if (baseUnitsExponants[i] != 0)
+      factor *= pow(getBaseUnit(us, i), baseUnitsExponants[i]);
+  return factor;
 }
 
-
 /**
- * @brief Returns the h factor exponentiation for a given unit (expressed in terms of the 5 fundamental units)
+ * @brief Returns the h factor exponentiation for a given unit (expressed in
+ * terms of the 5 fundamental units)
  * @param us The unit system used
- * @param baseUnitsExponants The exponant of each base units required to form the desired quantity. See conversionFactor() for a working example
+ * @param baseUnitsExponants The exponent of each base unit required to form
+ * the desired quantity. See conversionFactor() for a working example
  */
-float generalhFactor(struct UnitSystem* us, float baseUnitsExponants[5])
-{
+float generalhFactor(struct UnitSystem* us, float baseUnitsExponants[5]) {
   float factor_exp = 0.f;
-  
+
   factor_exp += -baseUnitsExponants[UNIT_MASS];
   factor_exp += -baseUnitsExponants[UNIT_LENGTH];
   factor_exp += -baseUnitsExponants[UNIT_TIME];
-  
+
   return factor_exp;
 }
 
 /**
- * @brief Returns the scaling factor exponentiation for a given unit (expressed in terms of the 5 fundamental units)
+ * @brief Returns the scaling factor exponentiation for a given unit (expressed
+ * in terms of the 5 fundamental units)
  * @param us The unit system used
- * @param baseUnitsExponants The exponant of each base units required to form the desired quantity. See conversionFactor() for a working example
+ * @param baseUnitsExponants The exponent of each base unit required to form
+ * the desired quantity. See conversionFactor() for a working example
  */
-float generalaFactor(struct UnitSystem* us, float baseUnitsExponants[5])
-{
+float generalaFactor(struct UnitSystem* us, float baseUnitsExponants[5]) {
   float factor_exp = 0.f;
-  
+
   factor_exp += baseUnitsExponants[UNIT_LENGTH];
-  
-  return  factor_exp;
+
+  return factor_exp;
 }
 
 /**
- * @brief Returns a string containg the exponants of the base units making up the conversion factors (expressed in terms of the 5 fundamental units)
- * @param buffer The buffer in which to write (The buffer must be long enough, 140 chars at most)
+ * @brief Returns a string containing the exponents of the base units making up
+ * the conversion factors (expressed in terms of the 5 fundamental units)
+ * @param buffer The buffer in which to write (The buffer must be long enough,
+ * 140 chars at most)
  * @param us The UnistSystem in use.
+ * @param baseUnitsExponants The exponent of each base unit required to form
+ * @param baseUnitsExponants The exponant of each base units required to form
+ * the desired quantity. See conversionFactor() for a working example
  */
-void generalConversionString(char * buffer, struct UnitSystem* us, float baseUnitsExponants[5])
-{
+void generalConversionString(char* buffer, struct UnitSystem* us,
+                             float baseUnitsExponants[5]) {
   char temp[14];
   double a_exp = generalaFactor(us, baseUnitsExponants);
   double h_exp = generalhFactor(us, baseUnitsExponants);
@@ -307,72 +377,68 @@ void generalConversionString(char * buffer, struct UnitSystem* us, float baseUni
 
   /* Check whether we are unitless or not */
   char isAllNonZero = 1;
-  for(i = 0 ; i < 5 ; ++i )
-    if( baseUnitsExponants[i] != 0.)
-      isAllNonZero = 0;
-
-  if( isAllNonZero )
-    {
-      sprintf(buffer, "[ - ] ");
-      return;
-    }
+  for (i = 0; i < 5; ++i)
+    if (baseUnitsExponants[i] != 0.) isAllNonZero = 0;
 
+  if (isAllNonZero) {
+    sprintf(buffer, "[ - ] ");
+    return;
+  }
 
   /* Add a-factor */
-  if(a_exp == 0)
+  if (a_exp == 0)
     sprintf(buffer, " ");
-  else if(a_exp == 1)
+  else if (a_exp == 1)
     sprintf(buffer, "a ");
-  else  if(remainder(a_exp, 1.) == 0)
-    sprintf(buffer, "a^%d ", (int) a_exp);
+  else if (remainder(a_exp, 1.) == 0)
+    sprintf(buffer, "a^%d ", (int)a_exp);
   else
     sprintf(buffer, "a^%7.4f ", a_exp);
 
   /* Add h-factor */
-  if(h_exp == 0)
+  if (h_exp == 0)
     sprintf(temp, " ");
-  else if(h_exp == 1)
+  else if (h_exp == 1)
     sprintf(temp, "h ");
-  else if(remainder(h_exp, 1.) == 0)
-    sprintf(temp, "h^%d ", (int) h_exp);
+  else if (remainder(h_exp, 1.) == 0)
+    sprintf(temp, "h^%d ", (int)h_exp);
   else
     sprintf(temp, "h^%7.4f ", h_exp);
   strncat(buffer, temp, 12);
 
   /* Add conversion units */
-  for(i = 0 ; i < 5 ; ++i )
-    if(baseUnitsExponants[i] != 0)
-      {
-	if(baseUnitsExponants[i] == 0.)
-	  sprintf(temp, " ");
-	else if(baseUnitsExponants[i] == 1.)
-	  sprintf(temp, "%s ", getBaseUnitSymbol(i));
-	else if(remainder(baseUnitsExponants[i], 1.) == 0)
-	  sprintf(temp, "%s^%d ", getBaseUnitSymbol(i), (int)  baseUnitsExponants[i]);
-	else
-	  sprintf(temp, "%s^%7.4f ", getBaseUnitSymbol(i), baseUnitsExponants[i]);
-	strncat(buffer, temp, 12);
-      }
-
+  for (i = 0; i < 5; ++i)
+    if (baseUnitsExponants[i] != 0) {
+      if (baseUnitsExponants[i] == 0.)
+        sprintf(temp, " ");
+      else if (baseUnitsExponants[i] == 1.)
+        sprintf(temp, "%s ", getBaseUnitSymbol(i));
+      else if (remainder(baseUnitsExponants[i], 1.) == 0)
+        sprintf(temp, "%s^%d ", getBaseUnitSymbol(i),
+                (int)baseUnitsExponants[i]);
+      else
+        sprintf(temp, "%s^%7.4f ", getBaseUnitSymbol(i), baseUnitsExponants[i]);
+      strncat(buffer, temp, 12);
+    }
 
   /* Add CGS units */
   strncat(buffer, " [ ", 3);
-  
-  for(i = 0 ; i < 5 ; ++i )
-    {
-      if(baseUnitsExponants[i] != 0)
-	{
-	  if(baseUnitsExponants[i] == 0.)
-	    continue;
-	  else if(baseUnitsExponants[i] == 1.)
-	    sprintf(temp, "%s ", getBaseUnitCGSSymbol(i));
-	  else if(remainder(baseUnitsExponants[i], 1.) == 0)
-	    sprintf(temp, "%s^%d ", getBaseUnitCGSSymbol(i), (int)  baseUnitsExponants[i]);
-	  else
-	    sprintf(temp, "%s^%7.4f ", getBaseUnitCGSSymbol(i), baseUnitsExponants[i]);
-	  strncat(buffer, temp, 12);
-	}
+
+  for (i = 0; i < 5; ++i) {
+    if (baseUnitsExponants[i] != 0) {
+      if (baseUnitsExponants[i] == 0.)
+        continue;
+      else if (baseUnitsExponants[i] == 1.)
+        sprintf(temp, "%s ", getBaseUnitCGSSymbol(i));
+      else if (remainder(baseUnitsExponants[i], 1.) == 0)
+        sprintf(temp, "%s^%d ", getBaseUnitCGSSymbol(i),
+                (int)baseUnitsExponants[i]);
+      else
+        sprintf(temp, "%s^%7.4f ", getBaseUnitCGSSymbol(i),
+                baseUnitsExponants[i]);
+      strncat(buffer, temp, 12);
     }
-  
+  }
+
   strncat(buffer, "]", 2);
 }
diff --git a/src/units.h b/src/units.h
index 40eb88f4fa0255849a5bdab3d6bebd59c5d9dad7..ba69443ed883446e34894e4f6a47dfc28694ea31 100644
--- a/src/units.h
+++ b/src/units.h
@@ -1,90 +1,95 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
-
+#ifndef SWIFT_UNITS_H
+#define SWIFT_UNITS_H
 
 /**
  * @brief The unit system used internally.
  *
- * This structure contains the conversion factors to the 7 cgs base units to the internal units.
+ * This structure contains the conversion factors from the 5 cgs base units to
+ * the internal units.
  * It is used everytime a conversion is performed or an i/o function is called.
  *
  **/
-struct UnitSystem
-{
-  double UnitMass_in_cgs;           /*< Conversion factor from grams to internal mass units */
+struct UnitSystem {
+  double UnitMass_in_cgs; /*< Conversion factor from grams to internal mass
+                             units */
 
-  double UnitLength_in_cgs;         /*< Conversion factor from centimeters to internal length units. */
+  double UnitLength_in_cgs; /*< Conversion factor from centimeters to internal
+                               length units. */
 
-  double UnitTime_in_cgs;           /*< Conversion factor from seconds to internal time units. */
+  double UnitTime_in_cgs; /*< Conversion factor from seconds to internal time
+                             units. */
 
-  double UnitCurrent_in_cgs;        /*< Conversion factor from Ampere to internal current units. */
+  double UnitCurrent_in_cgs; /*< Conversion factor from Ampere to internal
+                                current units. */
 
-  double UnitTemperature_in_cgs;    /*< Conversion factor from Kelvins to internal temperature units. */
+  double
+      UnitTemperature_in_cgs; /*< Conversion factor from Kelvins to internal
+                                 temperature units. */
 };
 
 /**
- * @brief The base units used in the cgs (and internal) system. All units are derived from those.
+ * @brief The base units used in the cgs (and internal) system. All units are
+ * derived from those.
  */
-enum BaseUnits
-  {
-    UNIT_MASS = 0,
-    UNIT_LENGTH = 1,
-    UNIT_TIME = 2,
-    UNIT_CURRENT = 3,
-    UNIT_TEMPERATURE = 4
-  };
-
+enum BaseUnits {
+  UNIT_MASS = 0,
+  UNIT_LENGTH = 1,
+  UNIT_TIME = 2,
+  UNIT_CURRENT = 3,
+  UNIT_TEMPERATURE = 4
+};
 
 /**
  * @brief  The different conversion factors supported by default
  */
-enum UnitConversionFactor
-  {
-    UNIT_CONV_NO_UNITS,
-    UNIT_CONV_MASS,
-    UNIT_CONV_LENGTH,
-    UNIT_CONV_TIME,
-    UNIT_CONV_DENSITY,
-    UNIT_CONV_SPEED,
-    UNIT_CONV_ACCELERATION,
-    UNIT_CONV_FORCE,
-    UNIT_CONV_ENERGY,
-    UNIT_CONV_ENERGY_PER_UNIT_MASS,
-    UNIT_CONV_ENTROPY,
-    UNIT_CONV_ENTROPY_PER_UNIT_MASS,
-    UNIT_CONV_POWER,
-    UNIT_CONV_PRESSURE,
-    UNIT_CONV_FREQUENCY,
-    UNIT_CONV_ELECTRIC_CHARGE,
-    UNIT_CONV_ELECTRIC_VOLTAGE,
-    UNIT_CONV_ELECTRIC_CAPACITANCE,
-    UNIT_CONV_ELECTRIC_RESISTANCE,
-    UNIT_CONV_ELECTRIC_CONDUCTANCE,
-    UNIT_CONV_MAGNETIC_FLUX,
-    UNIT_CONV_MAGNETIC_FIELD,
-    UNIT_CONV_MAGNETIC_INDUCTANCE,
-    UNIT_CONV_TEMPERATURE
-  };
-
+enum UnitConversionFactor {
+  UNIT_CONV_NO_UNITS,
+  UNIT_CONV_MASS,
+  UNIT_CONV_LENGTH,
+  UNIT_CONV_TIME,
+  UNIT_CONV_DENSITY,
+  UNIT_CONV_SPEED,
+  UNIT_CONV_ACCELERATION,
+  UNIT_CONV_FORCE,
+  UNIT_CONV_ENERGY,
+  UNIT_CONV_ENERGY_PER_UNIT_MASS,
+  UNIT_CONV_ENTROPY,
+  UNIT_CONV_ENTROPY_PER_UNIT_MASS,
+  UNIT_CONV_POWER,
+  UNIT_CONV_PRESSURE,
+  UNIT_CONV_FREQUENCY,
+  UNIT_CONV_ELECTRIC_CHARGE,
+  UNIT_CONV_ELECTRIC_VOLTAGE,
+  UNIT_CONV_ELECTRIC_CAPACITANCE,
+  UNIT_CONV_ELECTRIC_RESISTANCE,
+  UNIT_CONV_ELECTRIC_CONDUCTANCE,
+  UNIT_CONV_MAGNETIC_FLUX,
+  UNIT_CONV_MAGNETIC_FIELD,
+  UNIT_CONV_MAGNETIC_INDUCTANCE,
+  UNIT_CONV_TEMPERATURE
+};
 
 /**
- * @brief Initialises the UnitSystem structure with the constants given in const.h
+ * @brief Initialises the UnitSystem structure with the constants given in
+ * const.h
  */
 void initUnitSystem(struct UnitSystem*);
 
@@ -103,50 +108,53 @@ const char* getBaseUnitSymbol(enum BaseUnits);
  */
 const char* getBaseUnitCGSSymbol(enum BaseUnits);
 
-
 /**
- * @brief Returns the conversion factor for a given unit (expressed in terms of the 5 fundamental units) in the chosen unit system
+ * @brief Returns the conversion factor for a given unit (expressed in terms of
+ * the 5 fundamental units) in the chosen unit system
  */
-double generalConversionFactor(struct UnitSystem* us, float baseUnitsExponants[5]);
-
+double generalConversionFactor(struct UnitSystem* us,
+                               float baseUnitsExponants[5]);
 
 /**
- * @brief Returns the conversion factor for a given unit in the chosen unit system
+ * @brief Returns the conversion factor for a given unit in the chosen unit
+ * system
  */
 double conversionFactor(struct UnitSystem* us, enum UnitConversionFactor unit);
 
-
 /**
- * @brief Returns the h factor for a given unit (expressed in terms of the 5 fundamental units) in the chosen unit system
+ * @brief Returns the h factor for a given unit (expressed in terms of the 5
+ * fundamental units) in the chosen unit system
  */
 float generalhFactor(struct UnitSystem* us, float baseUnitsExponants[5]);
 
-
 /**
  * @brief Returns the h factor for a given unit in the chosen unit system
  */
 float hFactor(struct UnitSystem* us, enum UnitConversionFactor unit);
 
-
 /**
- * @brief Returns the scaling factor for a given unit (expressed in terms of the 5 fundamental units) in the chosen unit system
+ * @brief Returns the scaling factor for a given unit (expressed in terms of the
+ * 5 fundamental units) in the chosen unit system
  */
 float generalaFactor(struct UnitSystem* us, float baseUnitsExponants[5]);
 
-
 /**
  * @brief Returns the scaling factor for a given unit in the chosen unit system
  */
 float aFactor(struct UnitSystem* us, enum UnitConversionFactor unit);
 
-
 /**
- * @brief Returns a string containg the exponants of the base units making up the conversion factors (expressed in terms of the 5 fundamental units)
+ * @brief Returns a string containing the exponents of the base units making up
+ * the conversion factors (expressed in terms of the 5 fundamental units)
  */
-void generalConversionString(char * buffer, struct UnitSystem* us, float baseUnitsExponants[5]);
-
+void generalConversionString(char* buffer, struct UnitSystem* us,
+                             float baseUnitsExponants[5]);
 
 /**
- * @brief Returns a string containg the exponants of the base units making up the conversion factors
+ * @brief Returns a string containing the exponents of the base units making up
+ * the conversion factors
  */
-void conversionString(char * buffer, struct UnitSystem* us, enum UnitConversionFactor unit);
+void conversionString(char* buffer, struct UnitSystem* us,
+                      enum UnitConversionFactor unit);
+
+#endif /* SWIFT_UNITS_H */
diff --git a/src/vector.h b/src/vector.h
index 81efefe9f6218a17149869f3ec5535a4361f6b5c..34eb41eea31f821e03ddf24e871faec98095c920 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -1,137 +1,152 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
+#ifndef SWIFT_VECTOR_H
+#define SWIFT_VECTOR_H
 
 /* Have I already read this file? */
 #ifndef VEC_MACRO
 
-    /* Include the header file with the intrinsics. */
-    #include <immintrin.h>
-    
-    /* Define the vector macro. */
-    #define VEC_MACRO(elcount, type)  __attribute__((vector_size((elcount)*sizeof(type)))) type
+/* Include the header file with the intrinsics. */
+#include <immintrin.h>
 
-    /* So what will the vector size be? */
-    #ifdef __MIC__
-        #define VECTORIZE
-        #define VEC_HAVE_GATHER
-        #define VEC_SIZE 16
-        #define VEC_FLOAT __m512
-        #define VEC_DBL __m512d
-        #define VEC_INT __m512i
-        #define vec_load(a) _mm512_load_ps(a)
-        #define vec_set1(a) _mm512_set1_ps(a)
-        #define vec_set(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) _mm512_set_ps(p,o,n,m,l,k,j,i,h,g,f,e,d,c,b,a)
-        #define vec_dbl_set(a,b,c,d,e,f,g,h) _mm512_set_pd(h,g,f,e,d,c,b,a)
-        #define vec_sqrt(a) _mm512_sqrt_ps(a)
-        #define vec_rcp(a) _mm512_rcp_ps(a)
-        #define vec_rsqrt(a) _mm512_rsqrt_ps(a)
-        #define vec_ftoi(a) _mm512_cvttps_epi32(a)
-        #define vec_fmin(a,b) _mm512_min_ps(a,b)
-        #define vec_fmax(a,b) _mm512_max_ps(a,b)
-        #define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a)
-        #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a,0))
-        #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a,1))
-        #define vec_dbl_tofloat(a,b) _mm512_insertf128( _mm512_castps128_ps512(a) , b , 1 )
-        #define vec_dbl_load(a) _mm512_load_pd(a)
-        #define vec_dbl_set1(a) _mm512_set1_pd(a)
-        #define vec_dbl_sqrt(a) _mm512_sqrt_pd(a)
-        #define vec_dbl_rcp(a) _mm512_rcp_pd(a)
-        #define vec_dbl_rsqrt(a) _mm512_rsqrt_pd(a)
-        #define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a)
-        #define vec_dbl_fmin(a,b) _mm512_min_pd(a,b)
-        #define vec_dbl_fmax(a,b) _mm512_max_pd(a,b)
-        #define vec_getoffsets(ptrs) _mm512_insertf64x4( _mm512_insertf64x4( _mm512_setzero_pd() , _mm512_cvtepi64_epi32( _mm512_load_epi64(ptrs) - _mm512_set1_epi64(ptrs[0]) ) , 0 ) , _mm512_cvtepi64_epi32( _mm512_load_epi64(&ptrs[4]) - _mm512_set1_epi64(ptrs[0]) ) , 1 ) 
-        #define vec_gather(base,offsets) _mm512_i32gather_ps( offsets.m , base , 1 )
-    #elif defined( NO__AVX__ )
-        #define VECTORIZE
-        #define VEC_SIZE 8
-        #define VEC_FLOAT __m256
-        #define VEC_DBL __m256d
-        #define VEC_INT __m256i
-        #define vec_load(a) _mm256_load_ps(a)
-        #define vec_set1(a) _mm256_set1_ps(a)
-        #define vec_set(a,b,c,d,e,f,g,h) _mm256_set_ps(h,g,f,e,d,c,b,a)
-        #define vec_dbl_set(a,b,c,d) _mm256_set_pd(d,c,b,a)
-        #define vec_sqrt(a) _mm256_sqrt_ps(a)
-        #define vec_rcp(a) _mm256_rcp_ps(a)
-        #define vec_rsqrt(a) _mm256_rsqrt_ps(a)
-        #define vec_ftoi(a) _mm256_cvttps_epi32(a)
-        #define vec_fmin(a,b) _mm256_min_ps(a,b)
-        #define vec_fmax(a,b) _mm256_max_ps(a,b)
-        #define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)
-        #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a,0))
-        #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a,1))
-        #define vec_dbl_tofloat(a,b) _mm256_insertf128( _mm256_castps128_ps256(a) , b , 1 )
-        #define vec_dbl_load(a) _mm256_load_pd(a)
-        #define vec_dbl_set1(a) _mm256_set1_pd(a)
-        #define vec_dbl_sqrt(a) _mm256_sqrt_pd(a)
-        #define vec_dbl_rcp(a) _mm256_rcp_pd(a)
-        #define vec_dbl_rsqrt(a) _mm256_rsqrt_pd(a)
-        #define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
-        #define vec_dbl_fmin(a,b) _mm256_min_pd(a,b)
-        #define vec_dbl_fmax(a,b) _mm256_max_pd(a,b)
-        #ifdef __AVX2__
-            #define VEC_HAVE_GATHER
-            #define vec_gather(base,offsets) _mm256_i32gather_ps( base , offsets.m , 1 )
-        #endif
-    #elif defined( NO__SSE2__ )
-        #define VECTORIZE
-        #define VEC_SIZE 4
-        #define VEC_FLOAT __m128
-        #define VEC_DBL __m128d
-        #define VEC_INT __m128i
-        #define vec_load(a) _mm_load_ps(a)
-        #define vec_set1(a) _mm_set1_ps(a)
-        #define vec_set(a,b,c,d) _mm_set_ps(d,c,b,a)
-        #define vec_dbl_set(a,b) _mm_set_pd(b,a)
-        #define vec_sqrt(a) _mm_sqrt_ps(a)
-        #define vec_rcp(a) _mm_rcp_ps(a)
-        #define vec_rsqrt(a) _mm_rsqrt_ps(a)
-        #define vec_ftoi(a) _mm_cvttps_epi32(a)
-        #define vec_fmin(a,b) _mm_min_ps(a,b)
-        #define vec_fmax(a,b) _mm_max_ps(a,b)
-        #define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
-        #define vec_todbl_lo(a) _mm_cvtps_pd(a)
-        #define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a,a))
-        #define vec_dbl_tofloat(a,b) _mm_movelh_ps( _mm_cvtpd_ps(a) , _mm_cvtpd_ps(b) )
-        #define vec_dbl_load(a) _mm_load_pd(a)
-        #define vec_dbl_set1(a) _mm_set1_pd(a)
-        #define vec_dbl_sqrt(a) _mm_sqrt_pd(a)
-        #define vec_dbl_rcp(a) _mm_rcp_pd(a)
-        #define vec_dbl_rsqrt(a) _mm_rsqrt_pd(a)
-        #define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
-        #define vec_dbl_fmin(a,b) _mm_min_pd(a,b)
-        #define vec_dbl_fmax(a,b) _mm_max_pd(a,b)
-    #else
-        #define VEC_SIZE 4
-    #endif
+/* Define the vector macro. */
+#define VEC_MACRO(elcount, type) \
+  __attribute__((vector_size((elcount) * sizeof(type)))) type
 
-    /* Define the composite types for element access. */
-    #ifdef VECTORIZE
-    typedef union {
-        VEC_FLOAT v;
-        VEC_DBL vd;
-        VEC_INT m;
-        float f[VEC_SIZE];
-        double d[VEC_SIZE/2];
-        int i[VEC_SIZE];
-        } vector;
-    #endif
+/* So what will the vector size be? */
+#ifdef __MIC__
+#define VECTORIZE
+#define VEC_HAVE_GATHER
+#define VEC_SIZE 16
+#define VEC_FLOAT __m512
+#define VEC_DBL __m512d
+#define VEC_INT __m512i
+#define vec_load(a) _mm512_load_ps(a)
+#define vec_set1(a) _mm512_set1_ps(a)
+#define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+  _mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)
+#define vec_dbl_set(a, b, c, d, e, f, g, h) \
+  _mm512_set_pd(h, g, f, e, d, c, b, a)
+#define vec_sqrt(a) _mm512_sqrt_ps(a)
+#define vec_rcp(a) _mm512_rcp_ps(a)
+#define vec_rsqrt(a) _mm512_rsqrt_ps(a)
+#define vec_ftoi(a) _mm512_cvttps_epi32(a)
+#define vec_fmin(a, b) _mm512_min_ps(a, b)
+#define vec_fmax(a, b) _mm512_max_ps(a, b)
+#define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a)
+#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
+#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
+#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
+#define vec_dbl_load(a) _mm512_load_pd(a)
+#define vec_dbl_set1(a) _mm512_set1_pd(a)
+#define vec_dbl_sqrt(a) _mm512_sqrt_pd(a)
+#define vec_dbl_rcp(a) _mm512_rcp_pd(a)
+#define vec_dbl_rsqrt(a) _mm512_rsqrt_pd(a)
+#define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a)
+#define vec_dbl_fmin(a, b) _mm512_min_pd(a, b)
+#define vec_dbl_fmax(a, b) _mm512_max_pd(a, b)
+#define vec_getoffsets(ptrs)                                                \
+  _mm512_insertf64x4(                                                       \
+      _mm512_insertf64x4(_mm512_setzero_pd(),                               \
+                         _mm512_cvtepi64_epi32(_mm512_load_epi64(ptrs) -    \
+                                               _mm512_set1_epi64(ptrs[0])), \
+                         0),                                                \
+      _mm512_cvtepi64_epi32(_mm512_load_epi64(&ptrs[4]) -                   \
+                            _mm512_set1_epi64(ptrs[0])),                    \
+      1)
+#define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
+#elif defined(NO__AVX__)
+#define VECTORIZE
+#define VEC_SIZE 8
+#define VEC_FLOAT __m256
+#define VEC_DBL __m256d
+#define VEC_INT __m256i
+#define vec_load(a) _mm256_load_ps(a)
+#define vec_set1(a) _mm256_set1_ps(a)
+#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
+#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
+#define vec_sqrt(a) _mm256_sqrt_ps(a)
+#define vec_rcp(a) _mm256_rcp_ps(a)
+#define vec_rsqrt(a) _mm256_rsqrt_ps(a)
+#define vec_ftoi(a) _mm256_cvttps_epi32(a)
+#define vec_fmin(a, b) _mm256_min_ps(a, b)
+#define vec_fmax(a, b) _mm256_max_ps(a, b)
+#define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)
+#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
+#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
+#define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
+#define vec_dbl_load(a) _mm256_load_pd(a)
+#define vec_dbl_set1(a) _mm256_set1_pd(a)
+#define vec_dbl_sqrt(a) _mm256_sqrt_pd(a)
+#define vec_dbl_rcp(a) _mm256_rcp_pd(a)
+#define vec_dbl_rsqrt(a) _mm256_rsqrt_pd(a)
+#define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
+#define vec_dbl_fmin(a, b) _mm256_min_pd(a, b)
+#define vec_dbl_fmax(a, b) _mm256_max_pd(a, b)
+#ifdef __AVX2__
+#define VEC_HAVE_GATHER
+#define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
+#endif
+#elif defined(NO__SSE2__)
+#define VECTORIZE
+#define VEC_SIZE 4
+#define VEC_FLOAT __m128
+#define VEC_DBL __m128d
+#define VEC_INT __m128i
+#define vec_load(a) _mm_load_ps(a)
+#define vec_set1(a) _mm_set1_ps(a)
+#define vec_set(a, b, c, d) _mm_set_ps(d, c, b, a)
+#define vec_dbl_set(a, b) _mm_set_pd(b, a)
+#define vec_sqrt(a) _mm_sqrt_ps(a)
+#define vec_rcp(a) _mm_rcp_ps(a)
+#define vec_rsqrt(a) _mm_rsqrt_ps(a)
+#define vec_ftoi(a) _mm_cvttps_epi32(a)
+#define vec_fmin(a, b) _mm_min_ps(a, b)
+#define vec_fmax(a, b) _mm_max_ps(a, b)
+#define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
+#define vec_todbl_lo(a) _mm_cvtps_pd(a)
+#define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a, a))
+#define vec_dbl_tofloat(a, b) _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b))
+#define vec_dbl_load(a) _mm_load_pd(a)
+#define vec_dbl_set1(a) _mm_set1_pd(a)
+#define vec_dbl_sqrt(a) _mm_sqrt_pd(a)
+#define vec_dbl_rcp(a) _mm_rcp_pd(a)
+#define vec_dbl_rsqrt(a) _mm_rsqrt_pd(a)
+#define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
+#define vec_dbl_fmin(a, b) _mm_min_pd(a, b)
+#define vec_dbl_fmax(a, b) _mm_max_pd(a, b)
+#else
+#define VEC_SIZE 4
+#endif
 
+/* Define the composite types for element access. */
+#ifdef VECTORIZE
+typedef union {
+  VEC_FLOAT v;
+  VEC_DBL vd;
+  VEC_INT m;
+  float f[VEC_SIZE];
+  double d[VEC_SIZE / 2];
+  int i[VEC_SIZE];
+} vector;
 #endif
+
+#endif
+
+#endif /* SWIFT_VECTOR_H */
diff --git a/src/version.c b/src/version.c
index 705018b8726605e214cb02468f3baee628d8cf54..eb622b85571786c6827dcdff0a869e9dc833b4e5 100644
--- a/src/version.c
+++ b/src/version.c
@@ -2,57 +2,57 @@
  * This file is part of SWIFT.
  * Copyright (C) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
  * Copyright (C) 2015 Peter W. Draper (p.w.draper@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 
+/* Some standard headers. */
 #include <stdio.h>
+
+/* This object's header. */
 #include "version.h"
 
 /**
  * @brief Return the source code git revision
  *
- * @details The SHA of the code checked out when the library was last built. 
+ * @details The SHA of the code checked out when the library was last built.
  * Will include -dirty if they are local modifications.
  */
-const char *git_revision( void )
-{
-    static const char *revision = GIT_REVISION;
-    return revision;
+const char *git_revision(void) {
+  static const char *revision = GIT_REVISION;
+  return revision;
 }
 
 /**
  * @brief The version of SWIFT
  */
-const char *package_version( void )
-{
-    static const char *version = PACKAGE_VERSION;
-    return version;
+const char *package_version(void) {
+  static const char *version = PACKAGE_VERSION;
+  return version;
 }
 
 /**
  * @brief A description of the package version and code status.
  */
-const char *package_description( void )
-{
-    static char buf[256];
-    static int initialised = 0;
-    if ( ! initialised ) {
-        sprintf( buf, "SWIFT version: %s, at revision: %s", 
-                 PACKAGE_VERSION, GIT_REVISION );
-        initialised = 1;
-    }
-    return buf;
+const char *package_description(void) {
+  static char buf[256];
+  static int initialised = 0;
+  if (!initialised) {
+    sprintf(buf, "SWIFT version: %s, at revision: %s", PACKAGE_VERSION,
+            GIT_REVISION);
+    initialised = 1;
+  }
+  return buf;
 }