diff --git a/src/cell.c b/src/cell.c
index 23895acbbf7b914a1057c2c1928344698668c632..d614941ff9456c442722612915f4e6b92402439e 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -80,12 +80,11 @@ int cell_getsize ( struct cell *c ) {
  * @param pc An array of packed #pcell.
  * @param c The #cell in which to unpack the #pcell.
  * @param s The #space in which the cells are created.
- * @param parts The #part array holding the particle data.
  *
  * @return The number of cells created.
  */
  
-int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s , struct part *parts ) {
+int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s ) {
 
     int k, count = 1;
     struct cell *temp;
@@ -95,7 +94,6 @@ int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s , struct p
     c->dt_min = pc->dt_min;
     c->dt_max = pc->dt_max;
     c->count = pc->count;
-    c->parts = parts;
     
     /* Fill the progeny recursively, depth-first. */
     for ( k = 0 ; k < 8 ; k++ )
@@ -122,8 +120,7 @@ int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s , struct p
             temp->parent = c;
             c->progeny[k] = temp;
             c->split = 1;
-            count += cell_unpack( &pc[ pc->progeny[k] ] , temp , s , parts );
-            parts = &parts[ temp->count ];
+            count += cell_unpack( &pc[ pc->progeny[k] ] , temp , s );
             }
             
     /* Return the total number of unpacked cells. */
@@ -132,6 +129,33 @@ int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s , struct p
     }
 
 
+/**
+ * @brief Link the cells recursively to the given part array.
+ *
+ * @param c The #cell.
+ * @param parts The #part array.
+ *
+ * @return The number of particles linked.
+ */
+
+int cell_link ( struct cell *c , struct part *parts ) {
+
+    int k, ind = 0;
+    
+    c->parts = parts;
+    
+    /* Fill the progeny recursively, depth-first. */
+    if ( c->split )
+        for ( k = 0 ; k < 8 ; k++ )
+            if ( c->progeny[k] != NULL )
+                ind += cell_link( c->progeny[k] , &parts[ind] );
+            
+    /* Return the total number of linked particles. */
+    return c->count;
+
+    }
+
+
 /**
  * @brief Pack the data of the given cell and all it's sub-cells.
  *
diff --git a/src/cell.h b/src/cell.h
index 793dc9b3572b7c4ebf86c9b2c008f0b2ca3b94b4..548da065c1ce6f2c0d1e09c8ca4caea5241b0faa 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -147,5 +147,6 @@ void cell_split ( struct cell *c  );
 int cell_locktree( struct cell *c );
 void cell_unlocktree( struct cell *c );
 int cell_pack ( struct cell *c , struct pcell *pc );
-int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s , struct part *parts );
+int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s );
 int cell_getsize ( struct cell *c );
+int cell_link ( struct cell *c , struct part *parts );
diff --git a/src/engine.c b/src/engine.c
index c1591b0d9cbe769e86028f57f94fead9f9b5fd97..2c46aa768f5186719658159b90ccca329dc926c8 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -37,6 +37,11 @@
     #include <mpi.h>
 #endif
 
+/* METIS headers. */
+#ifdef HAVE_METIS
+    #include <metis.h>
+#endif
+
 /* Local headers. */
 #include "const.h"
 #include "cycle.h"
@@ -72,6 +77,298 @@
 int engine_rank;
 
 
+/**
+ * @brief Repartition the cells amongst the nodes.
+ *
+ * @param e The #engine.
+ */
+ 
+void engine_repartition ( struct engine *e ) {
+
+#if defined(WITH_MPI) && defined(HAVE_METIS)
+
+    int i, j, k, l, cid, cjd, ii, jj, kk, res;
+    idx_t *inds;
+    idx_t *weights_v, *weights_e;
+    struct space *s = e->s;
+    int nr_cells = s->nr_cells;
+    struct cell *cells = s->cells;
+    int ind[3], *cdim = s->cdim;
+    struct task *t, *tasks = e->sched.tasks;
+    struct cell *ci, *cj;
+    int nr_nodes = e->nr_nodes, nodeID = e->nodeID, *nodeIDs;
+    float sid_scale[13] = { 0.1897 , 0.4025 , 0.1897 , 0.4025 , 0.5788 , 0.4025 ,
+                            0.1897 , 0.4025 , 0.1897 , 0.4025 , 0.5788 , 0.4025 , 
+                            0.5788 };
+    float wscale = 0.001;
+    
+    /* Clear the repartition flag. */
+    e->forcerepart = 0;
+    
+    /* Allocate the inds and weights. */
+    if ( ( inds = (idx_t *)malloc( sizeof(idx_t) * 26*nr_cells ) ) == NULL ||
+         ( weights_v = (idx_t *)malloc( sizeof(idx_t) * nr_cells ) ) == NULL ||
+         ( weights_e = (idx_t *)malloc( sizeof(idx_t) * 26*nr_cells ) ) == NULL ||
+         ( nodeIDs = (idx_t *)malloc( sizeof(idx_t) * nr_cells ) ) == NULL )
+        error( "Failed to allocate inds and weights arrays." );
+        
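+    /* The cells form a periodic 3D grid, so every cell has exactly 26
+       neighbours and the adjacency list can use a fixed stride of 26. */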
+    /* Fill the inds array. */
+    for ( cid = 0 ; cid < nr_cells ; cid++ ) {
+        ind[0] = cells[cid].loc[0] / s->cells[cid].h[0] + 0.5;
+        ind[1] = cells[cid].loc[1] / s->cells[cid].h[1] + 0.5;
+        ind[2] = cells[cid].loc[2] / s->cells[cid].h[2] + 0.5;
+        l = 0;
+        for ( i = -1 ; i <= 1 ; i++ ) {
+            ii = ind[0] + i;
+            if ( ii < 0 ) ii += cdim[0];
+            else if ( ii >= cdim[0] ) ii -= cdim[0];
+            for ( j = -1 ; j <= 1 ; j++ ) {
+                jj = ind[1] + j;
+                if ( jj < 0 ) jj += cdim[1];
+                else if ( jj >= cdim[1] ) jj -= cdim[1];
+                for ( k = -1 ; k <= 1 ; k++ ) {
+                    kk = ind[2] + k;
+                    if ( kk < 0 ) kk += cdim[2];
+                    else if ( kk >= cdim[2] ) kk -= cdim[2];
+                    if ( i || j || k ) {
+                        inds[ cid*26 + l ] = cell_getid( cdim , ii , jj , kk );
+                        l += 1;
+                        }
+                    }
+                }
+            }
+        }
+        
+    /* Init the weights arrays. */
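+    /* Vertex weights are seeded with the local particle counts; edge weights
+       start at zero and are accumulated from the task list below. */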
+    bzero( weights_e , sizeof(idx_t) * 26*nr_cells );
+    for ( k = 0 ; k < nr_cells ; k++ )
+        if ( cells[k].nodeID == nodeID )
+            weights_v[k] = 10.0 * cells[k].count;
+        else
+            weights_v[k] = 0.0;
+    
+    /* Loop over the tasks... */
+    for ( j = 0 ; j < e->sched.nr_tasks ; j++ ) {
+    
+        /* Get a pointer to the kth task. */
+        t = &tasks[j];
+        
+        /* Skip un-interesting tasks. */
+        if ( t->type != task_type_self &&
+             t->type != task_type_pair &&
+             t->type != task_type_sub )
+            continue;
+        
+        /* Get the top-level cells involved. */
+        for ( ci = t->ci ; ci->parent != NULL ; ci = ci->parent );
+        if ( t->cj != NULL )
+            for ( cj = t->cj ; cj->parent != NULL ; cj = cj->parent );
+        else
+            cj = NULL;
+            
+        /* Get the cell IDs. */
+        cid = ci - cells;
+            
+        /* Different weights for different tasks. */
+        if ( ( t->type == task_type_self && ci->nodeID == nodeID ) ||
+             ( t->type == task_type_sub && cj == NULL && ci->nodeID == nodeID ) ) {
+        
+            /* Self interactions add only to vertex weight. */
+            weights_v[cid] += t->ci->count * t->ci->count * wscale;
+            
+            }
+            
+        /* Pair? */
+        else if ( t->type == task_type_pair ||
+                  ( t->type == task_type_sub && cj != NULL ) ) {
+                  
+            /* In-cell pair? */
+            if ( ci == cj ) {
+            
+                /* Add weight to vertex for ci. */
+                weights_v[cid] += t->ci->count * t->cj->count * sid_scale[ t->flags ] * wscale;
+            
+                }
+                
+            /* Distinct cells with local ci? */
+            else if ( ci->nodeID == nodeID ) {
+            
+                /* Index of the jth cell. */
+                cjd = cj - cells;
+                
+                /* Add the pair weight to each cell if it is local. */
+                if ( ci->nodeID == nodeID )
+                    weights_v[cid] += t->ci->count * t->cj->count * sid_scale[ t->flags ] * wscale;
+                if ( cj->nodeID == nodeID )
+                    weights_v[cjd] += t->ci->count * t->cj->count * sid_scale[ t->flags ] * wscale;
+                    
+                /* Add the weight to the edge, in both directions. */
+                for ( k = 26*cid ; inds[k] != cjd ; k++ );
+                weights_e[ k ] += t->ci->count * t->cj->count * sid_scale[ t->flags ] * wscale;
+                for ( k = 26*cjd ; inds[k] != cid ; k++ );
+                weights_e[ k ] += t->ci->count * t->cj->count * sid_scale[ t->flags ] * wscale;
+            
+                }
+                  
+            }
+    
+        }
+        
+    /* Merge the weights arrays across all nodes. */
+    if ( ( res = MPI_Reduce( ( nodeID == 0 ) ? MPI_IN_PLACE : weights_v , weights_v , nr_cells , MPI_INT , MPI_SUM , 0 , MPI_COMM_WORLD ) ) != MPI_SUCCESS ) {
+        char buff[ MPI_MAX_ERROR_STRING ];
+        MPI_Error_string( res , buff , &i );
+        error( "Failed to reduce vertex weights (%s)." , buff );
+        }
+    if ( MPI_Reduce( ( nodeID == 0 ) ? MPI_IN_PLACE : weights_e , weights_e , 26*nr_cells , MPI_INT , MPI_SUM , 0 , MPI_COMM_WORLD ) != MPI_SUCCESS )
+        error( "Failed to reduce edge weights." );
+        
+    /* From here on, only node 0 needs to compute the partition. */
+    if ( nodeID == 0 ) {
+    
+        /* Allocate and fill the connection array. */
+        idx_t *offsets;
+        if ( ( offsets = (idx_t *)malloc( sizeof(idx_t) * (nr_cells + 1) ) ) == NULL )
+            error( "Failed to allocate offsets buffer." );
+        offsets[0] = 0;
+        for ( k = 0 ; k < nr_cells ; k++ )
+            offsets[k+1] = offsets[k] + 26;
+            
+        /* Set the METIS options. */
+        idx_t options[METIS_NOPTIONS];
+        METIS_SetDefaultOptions( options );
+        options[ METIS_OPTION_OBJTYPE ] = METIS_OBJTYPE_CUT;
+        options[ METIS_OPTION_NUMBERING ] = 0;
+        options[ METIS_OPTION_CONTIG ] = 1;
+        
+        /* Call METIS. */
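+        /* Partition the weighted cell graph into nr_nodes parts, minimizing
+           the edge cut; the node ID of each cell is written to nodeIDs. */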
+        int one = 1;
+        idx_t objval;
+        if ( METIS_PartGraphKway( &nr_cells , &one , offsets , inds , weights_v , NULL , weights_e , &nr_nodes , NULL , NULL , options , &objval , nodeIDs ) != METIS_OK )
+            error( "Call to METIS_PartGraphKway failed." );
+    
+        }
+        
+    /* Broadcast the result of the partition. */
+    if ( MPI_Bcast( nodeIDs , nr_cells , MPI_INT , 0 , MPI_COMM_WORLD ) != MPI_SUCCESS )
+        error( "Failed to bcast the node IDs." );
+        
+    /* Set the cell nodeIDs from the partition. */
+    for ( k = 0 ; k < nr_cells ; k++ )
+        cells[k].nodeID = nodeIDs[k];
+        
+    /* Clean up. */
+    free( inds );
+    free( weights_v );
+    free( weights_e );
+    free( nodeIDs );
+        
+    /* Now comes the tricky part: Exchange particles between all nodes.
+       This is done in two steps, first allreducing a matrix of 
+       how many particles go from where to where, then re-allocating
+       the parts array, and emitting the sends and receives.
+       Finally, the space, tasks, and proxies need to be rebuilt. */
+       
+    /* Start by sorting the particles according to their nodes and
+       getting the counts. */
+    int *counts, *dest;
+    struct part *parts = s->parts;
+    float ih[3];
+    ih[0] = s->ih[0]; ih[1] = s->ih[1]; ih[2] = s->ih[2];
+    if ( ( counts = (int *)malloc( sizeof(int) * nr_nodes * nr_nodes ) ) == NULL ||
+         ( dest = (int *)malloc( sizeof(int) * s->nr_parts ) ) == NULL )
+        error( "Failed to allocate count and dest buffers." );
+    bzero( counts , sizeof(int) * nr_nodes * nr_nodes );
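+    /* counts[ i*nr_nodes + j ] holds the number of parts going from node i
+       to node j; each node fills only its own row here and the full matrix
+       is assembled with the allreduce below. */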
+    for ( k = 0 ; k < s->nr_parts ; k++ ) {
+        cid = cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] );
+        dest[k] = cells[ cid ].nodeID;
+        counts[ nodeID*nr_nodes + dest[k] ] += 1;
+        }
+    parts_sort( s->parts , s->xparts , dest , s->nr_parts , 0 , nr_nodes-1 );
+    
+    /* Get all the counts from all the nodes. */
+    if ( MPI_Allreduce( MPI_IN_PLACE , counts , nr_nodes * nr_nodes , MPI_INT , MPI_SUM , MPI_COMM_WORLD ) != MPI_SUCCESS )
+        error( "Failed to allreduce particle transfer counts." );
+        
+    /* Get the new number of parts for this node, be generous in allocating. */
+    int nr_parts = 0;
+    for ( k = 0 ; k < nr_nodes ; k++ )
+        nr_parts += counts[ k*nr_nodes + nodeID ];
+    struct part *parts_new;
+    struct xpart *xparts_new, *xparts = s->xparts;
+    if ( posix_memalign( (void **)&parts_new , part_align , sizeof(struct part) * nr_parts * 2 ) != 0 ||
+         posix_memalign( (void **)&xparts_new , part_align , sizeof(struct xpart) * nr_parts * 2 ) != 0 )
+        error( "Failed to allocate new part data." );
+        
+    /* Emit the sends and recvs for the particle data. */
+    MPI_Request *reqs;
+    if ( ( reqs = (MPI_Request *)malloc( sizeof(MPI_Request) * 4 * nr_nodes ) ) == NULL )
+        error( "Failed to allocate MPI request list." );
+    for ( k = 0 ; k < 4*nr_nodes ; k++ )
+        reqs[k] = MPI_REQUEST_NULL;
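+    /* Each node k gets four request slots: send parts, send xparts,
+       recv parts, and recv xparts. */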
+    for ( i = 0 , j = 0 , k = 0 ; k < nr_nodes ; k++ ) {
+        if ( k == nodeID && counts[ nodeID*nr_nodes + k ] > 0 ) {
+            memcpy( &parts_new[j] , &parts[i] , sizeof(struct part) * counts[ k*nr_nodes + nodeID ] );
+            memcpy( &xparts_new[j] , &xparts[i] , sizeof(struct xpart) * counts[ k*nr_nodes + nodeID ] );
+            i += counts[ nodeID*nr_nodes + k ];
+            j += counts[ k*nr_nodes + nodeID ];
+            }
+        if ( k != nodeID && counts[ nodeID*nr_nodes + k ] > 0 ) {
+            if ( MPI_Isend( &parts[i] , sizeof(struct part) * counts[ nodeID*nr_nodes + k ] , MPI_BYTE , k , 0 , MPI_COMM_WORLD , &reqs[4*k] ) != MPI_SUCCESS )
+                error( "Failed to isend parts to node %i." , k );
+            if ( MPI_Isend( &xparts[i] , sizeof(struct xpart) * counts[ nodeID*nr_nodes + k ] , MPI_BYTE , k , 1 , MPI_COMM_WORLD , &reqs[4*k+1] ) != MPI_SUCCESS )
+                error( "Failed to isend xparts to node %i." , k );
+            i += counts[ nodeID*nr_nodes + k ];
+            }
+        if ( k != nodeID && counts[ k*nr_nodes + nodeID ] > 0 ) {
+            if ( MPI_Irecv( &parts_new[j] , sizeof(struct part) * counts[ k*nr_nodes + nodeID ] , MPI_BYTE , k , 0 , MPI_COMM_WORLD , &reqs[4*k+2] ) != MPI_SUCCESS )
+                error( "Failed to emit irecv of parts from node %i." , k );
+            if ( MPI_Irecv( &xparts_new[j] , sizeof(struct xpart) * counts[ k*nr_nodes + nodeID ] , MPI_BYTE , k , 1 , MPI_COMM_WORLD , &reqs[4*k+3] ) != MPI_SUCCESS )
+                error( "Failed to emit irecv of xparts from node %i." , k );
+            j += counts[ k*nr_nodes + nodeID ];
+            }
+        }
+        
+    /* Wait for all the recvs to tumble in. */
+    if ( MPI_Waitall( 4*nr_nodes , reqs , MPI_STATUSES_IGNORE ) != MPI_SUCCESS )
+        error( "Failed during waitall for part data." );
+        
+    /* Verify that all parts are in the right place. */
+    /* for ( k = 0 ; k < nr_parts ; k++ ) {
+        cid = cell_getid( cdim , parts_new[k].x[0]*ih[0] , parts_new[k].x[1]*ih[1] , parts_new[k].x[2]*ih[2] );
+        if ( cells[ cid ].nodeID != nodeID )
+            error( "Received particle (%i) that does not belong here (nodeID=%i)." , k , cells[ cid ].nodeID );
+        } */
+        
+    /* Set the new part data, free the old. */
+    free( parts );
+    free( xparts );
+    s->parts = parts_new;
+    s->xparts = xparts_new;
+    s->nr_parts = nr_parts;
+    s->size_parts = 2*nr_parts;
+    
+    /* Be verbose about what just happened. */
+    message( "node %i now has %i parts." , nodeID , nr_parts );
+    
+    /* Clean up other stuff. */
+    free( reqs );
+    free( counts );
+    free( dest );
+        
+    /* Make the proxies. */
+    engine_makeproxies( e );
+        
+    /* Tell the engine it should re-build whenever possible. */
+    e->forcerebuild = 1;
+    
+#else
+    error( "SWIFT was not compiled with MPI and METIS support." );
+#endif
+
+    }
+
+
 /**
  * @brief Add send tasks to a hierarchy of cells.
  *
@@ -196,12 +493,15 @@ void engine_exchange_cells ( struct engine *e ) {
 
     int j, k, pid, count = 0;
     struct pcell *pcells;
-    struct cell *cells = e->s->cells;
-    int nr_cells = e->s->nr_cells;
+    struct space *s = e->s;
+    struct cell *cells = s->cells;
+    int nr_cells = s->nr_cells;
+    int nr_proxies = e->nr_proxies;
     int offset[ nr_cells ];
-    MPI_Request reqs[27];
+    MPI_Request reqs_in[ engine_maxproxies ];
+    MPI_Request reqs_out[ engine_maxproxies ];
     MPI_Status status;
-    struct part *parts = &e->s->parts[ e->s->nr_parts ];
+    struct part *parts = &s->parts[ s->nr_parts ];
     
     /* Run through the cells and get the size of the ones that will be sent off. */
     for ( k = 0 ; k < nr_cells ; k++ ) {
@@ -222,37 +522,66 @@ void engine_exchange_cells ( struct engine *e ) {
             }
 
     /* Launch the proxies. */
-    for ( k = 0 ; k < e->nr_proxies ; k++ ) {
+    for ( k = 0 ; k < nr_proxies ; k++ ) {
         proxy_cells_exch1( &e->proxies[k] );
-        reqs[k] = e->proxies[k].req_cells_count_in;
+        reqs_in[k] = e->proxies[k].req_cells_count_in;
         }
         
     /* Wait for each count to come in and start the recv. */
-    for ( k = 0 ; k < e->nr_proxies ; k++ ) {
-        if ( MPI_Waitany( e->nr_proxies , reqs , &pid , &status ) != MPI_SUCCESS ||
+    for ( k = 0 ; k < nr_proxies ; k++ ) {
+        if ( MPI_Waitany( nr_proxies , reqs_in , &pid , &status ) != MPI_SUCCESS ||
              pid == MPI_UNDEFINED )
             error( "MPI_Waitany failed." );
         // message( "request from proxy %i has arrived." , pid );
-        reqs[pid] = MPI_REQUEST_NULL;
         proxy_cells_exch2( &e->proxies[pid] );
         }
         
     /* Set the requests for the cells. */
-    for ( k = 0 ; k < e->nr_proxies ; k++ )
-        reqs[k] = e->proxies[k].req_cells_in;
+    for ( k = 0 ; k < nr_proxies ; k++ ) {
+        reqs_in[k] = e->proxies[k].req_cells_in;
+        reqs_out[k] = e->proxies[k].req_cells_out;
+        }
     
     /* Wait for each pcell array to come in from the proxies. */
-    for ( k = 0 ; k < e->nr_proxies ; k++ ) {
-        if ( MPI_Waitany( e->nr_proxies , reqs , &pid , &status ) != MPI_SUCCESS ||
+    for ( k = 0 ; k < nr_proxies ; k++ ) {
+        if ( MPI_Waitany( nr_proxies , reqs_in , &pid , &status ) != MPI_SUCCESS ||
              pid == MPI_UNDEFINED )
             error( "MPI_Waitany failed." );
-        // message( "request from proxy %i has arrived." , pid );
-        reqs[pid] = MPI_REQUEST_NULL;
-        for ( count = 0 , j = 0 ; j < e->proxies[pid].nr_cells_in ; j++ ) {
-            count += cell_unpack( &e->proxies[pid].pcells_in[count] , e->proxies[pid].cells_in[j] , e->s , parts );
-            parts = &parts[ e->proxies[pid].cells_in[j]->count ];
+        // message( "cell data from proxy %i has arrived." , pid );
+        for ( count = 0 , j = 0 ; j < e->proxies[pid].nr_cells_in ; j++ )
+            count += cell_unpack( &e->proxies[pid].pcells_in[count] , e->proxies[pid].cells_in[j] , e->s );
+        }
+        
+    /* Wait for all the sends to have finished too. */
+    if ( MPI_Waitall( nr_proxies , reqs_out , MPI_STATUSES_IGNORE ) != MPI_SUCCESS )
+        error( "MPI_Waitall on sends failed." );
+        
+    /* Count the number of particles we need to import and re-allocate
+       the buffer if needed. */
+    for ( count = 0 , k = 0 ; k < nr_proxies ; k++ )
+        for ( j = 0 ; j < e->proxies[k].nr_cells_in ; j++ )
+            count += e->proxies[k].cells_in[j]->count;
+    if ( count > s->size_parts_foreign ) {
+        if ( s->parts_foreign != NULL )
+            free( s->parts_foreign );
+        s->size_parts_foreign = 1.1 * count;
+        if ( posix_memalign( (void **)&s->parts_foreign , part_align , sizeof(struct part) * s->size_parts_foreign ) != 0 )
+            error( "Failed to allocate foreign part data." );
+        }
+        
+    /* Unpack the cells and link to the particle data. */
+    parts = s->parts_foreign;
+    for ( k = 0 ; k < nr_proxies ; k++ ) {
+        for ( count = 0 , j = 0 ; j < e->proxies[k].nr_cells_in ; j++ ) {
+            count += cell_link( e->proxies[k].cells_in[j] , parts );
+            parts = &parts[ e->proxies[k].cells_in[j]->count ];
             }
         }
+    s->nr_parts_foreign = parts - s->parts_foreign;
+        
+    /* Is the parts buffer large enough? */
+    if ( s->nr_parts_foreign > s->size_parts_foreign )
+        error( "Foreign parts buffer too small." );
         
     /* Free the pcell buffer. */
     free( pcells );
@@ -281,7 +610,8 @@ int engine_exchange_strays ( struct engine *e , struct part *parts , struct xpar
 #ifdef WITH_MPI
 
     int k, pid, count = 0;
-    MPI_Request reqs[27];
+    MPI_Request reqs_in[ engine_maxproxies ];
+    MPI_Request reqs_out[ engine_maxproxies ];
     MPI_Status status;
     struct proxy *p;
 
@@ -300,33 +630,32 @@ int engine_exchange_strays ( struct engine *e , struct part *parts , struct xpar
     /* Launch the proxies. */
     for ( k = 0 ; k < e->nr_proxies ; k++ ) {
         proxy_parts_exch1( &e->proxies[k] );
-        reqs[k] = e->proxies[k].req_parts_count_in;
+        reqs_in[k] = e->proxies[k].req_parts_count_in;
         }
         
     /* Wait for each count to come in and start the recv. */
     for ( k = 0 ; k < e->nr_proxies ; k++ ) {
-        if ( MPI_Waitany( e->nr_proxies , reqs , &pid , &status ) != MPI_SUCCESS ||
+        if ( MPI_Waitany( e->nr_proxies , reqs_in , &pid , &status ) != MPI_SUCCESS ||
              pid == MPI_UNDEFINED )
             error( "MPI_Waitany failed." );
         // message( "request from proxy %i has arrived." , pid );
-        reqs[pid] = MPI_REQUEST_NULL;
         proxy_parts_exch2( &e->proxies[pid] );
         }
         
     /* Set the requests for the particle data. */
-    for ( k = 0 ; k < e->nr_proxies ; k++ )
-        reqs[k] = e->proxies[k].req_xparts_in;
+    for ( k = 0 ; k < e->nr_proxies ; k++ ) {
+        reqs_in[k] = e->proxies[k].req_xparts_in;
+        reqs_out[k] = e->proxies[k].req_xparts_out;
+        }
     
     /* Wait for each part array to come in and collect the new
        parts from the proxies. */
     for ( k = 0 ; k < e->nr_proxies ; k++ ) {
-        if ( MPI_Waitany( e->nr_proxies , reqs , &pid , &status ) != MPI_SUCCESS ||
+        if ( MPI_Waitany( e->nr_proxies , reqs_in , &pid , &status ) != MPI_SUCCESS ||
              pid == MPI_UNDEFINED )
             error( "MPI_Waitany failed." );
         // message( "request from proxy %i has arrived." , pid );
         p = &e->proxies[pid];
-        reqs[pid] = MPI_REQUEST_NULL;
-        MPI_Request_free( &p->req_parts_in );
         memcpy( &parts[count] , p->parts_in , sizeof(struct part) * p->nr_parts_in );
         memcpy( &xparts[count] , p->xparts_in , sizeof(struct xpart) * p->nr_parts_in );
         count += p->nr_parts_in;
@@ -336,6 +665,10 @@ int engine_exchange_strays ( struct engine *e , struct part *parts , struct xpar
                 p->parts_in[k].h , p->nodeID ); */
         }
     
+    /* Wait for all the sends to have finished too. */
+    if ( MPI_Waitall( e->nr_proxies , reqs_out , MPI_STATUSES_IGNORE ) != MPI_SUCCESS )
+        error( "MPI_Waitall on sends failed." );
+        
     /* Return the number of harvested parts. */
     return count;
     
@@ -533,7 +866,7 @@ void engine_maketasks ( struct engine *e ) {
             
             /* Loop through the proxy's outgoing cells and add the
                send tasks. */
-            for ( k = 0 ; k < p->nr_cells_in ; k++ )
+            for ( k = 0 ; k < p->nr_cells_out ; k++ )
                 engine_addtasks_send( e , p->cells_out[k] , p->cells_in[0] );
             
             }
@@ -678,6 +1011,65 @@ int engine_marktasks ( struct engine *e ) {
     return 0;
     
     }
+    
+
+/**
+ * @brief Rebuild the space and tasks.
+ *
+ * @param e The #engine.
+ */
+ 
+void engine_rebuild ( struct engine *e ) {
+
+    int k;
+    struct scheduler *sched = &e->sched;
+    
+    /* Clear the forcerebuild flag, whatever it was. */
+    e->forcerebuild = 0;
+
+    /* Re-build the space. */
+    // tic = getticks();
+    space_rebuild( e->s , 0.0 );
+    // message( "space_rebuild took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
+
+    /* If in parallel, exchange the cell structure. */
+    #ifdef WITH_MPI
+        // tic = getticks();
+        engine_exchange_cells( e );
+        // message( "engine_exchange_cells took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
+    #endif
+
+    /* Re-build the tasks. */
+    // tic = getticks();
+    engine_maketasks( e );
+    // message( "engine_maketasks took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
+
+    /* Run through the tasks and mark as skip or not. */
+    // tic = getticks();
+    if ( engine_marktasks( e ) )
+        error( "engine_marktasks failed after space_rebuild." );
+    // message( "engine_marktasks took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
+
+    /* Count and print the number of each task type. */
+    int counts[ task_type_count+1 ];
+    for ( k = 0 ; k <= task_type_count ; k++ )
+        counts[k] = 0;
+    for ( k = 0 ; k < sched->nr_tasks ; k++ )
+        if ( !sched->tasks[k].skip )
+            counts[ (int)sched->tasks[k].type ] += 1;
+        else
+            counts[ task_type_count ] += 1;
+    #ifdef WITH_MPI
+        printf( "[%03i] engine_prepare: task counts are [ %s=%i" , e->nodeID , taskID_names[0] , counts[0] );
+    #else
+        printf( "engine_prepare: task counts are [ %s=%i" , taskID_names[0] , counts[0] );
+    #endif
+    for ( k = 1 ; k < task_type_count ; k++ )
+        printf( " %s=%i" , taskID_names[k] , counts[k] );
+    printf( " skipped=%i ]\n" , counts[ task_type_count ] ); fflush(stdout);
+    message( "nr_parts = %i." , e->s->nr_parts );
+    
+    }
 
 
 /**
@@ -688,14 +1080,14 @@ int engine_marktasks ( struct engine *e ) {
  
 void engine_prepare ( struct engine *e ) {
     
-    int k, rebuild;
+    int rebuild;
     struct scheduler *sched = &e->sched;
     
     TIMER_TIC
 
     /* Run through the tasks and mark as skip or not. */
     // tic = getticks();
-    rebuild = ( e->step == 0 || engine_marktasks( e ) );
+    rebuild = ( e->forcerebuild || engine_marktasks( e ) );
     // message( "space_marktasks took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
         
     /* Collect the values of rebuild from all nodes. */
@@ -707,51 +1099,8 @@ void engine_prepare ( struct engine *e ) {
     #endif
     
     /* Did this not go through? */
-    if ( rebuild ) {
-    
-        /* Re-build the space. */
-        // tic = getticks();
-        space_rebuild( e->s , 0.0 );
-        // message( "space_rebuild took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
-    
-        /* If in parallel, exchange the cell structure. */
-        #ifdef WITH_MPI
-            // tic = getticks();
-            engine_exchange_cells( e );
-            // message( "engine_exchange_cells took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
-        #endif
-    
-        /* Re-build the tasks. */
-        // tic = getticks();
-        engine_maketasks( e );
-        // message( "engine_maketasks took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
-    
-        /* Run through the tasks and mark as skip or not. */
-        // tic = getticks();
-        if ( engine_marktasks( e ) )
-            error( "engine_marktasks failed after space_rebuild." );
-        // message( "engine_marktasks took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 );
-        
-        /* Count the number of each task type. */
-        int counts[ task_type_count+1 ];
-        for ( k = 0 ; k <= task_type_count ; k++ )
-            counts[k] = 0;
-        for ( k = 0 ; k < sched->nr_tasks ; k++ )
-            if ( !sched->tasks[k].skip )
-                counts[ (int)sched->tasks[k].type ] += 1;
-            else
-                counts[ task_type_count ] += 1;
-        #ifdef WITH_MPI
-            printf( "engine_prepare[%03i]: task counts are [ %s=%i" , e->nodeID , taskID_names[0] , counts[0] );
-        #else
-            printf( "engine_prepare: task counts are [ %s=%i" , taskID_names[0] , counts[0] );
-        #endif
-        for ( k = 1 ; k < task_type_count ; k++ )
-            printf( " %s=%i" , taskID_names[k] , counts[k] );
-        printf( " skipped=%i ]\n" , counts[ task_type_count ] ); fflush(stdout);
-        message( "nr_parts = %i." , e->s->nr_parts );
-    
-        }
+    if ( rebuild )
+        engine_rebuild( e );
 
     /* Start the scheduler. */
     // ticks tic2 = getticks();
@@ -984,6 +1333,19 @@ void engine_launch ( struct engine *e , int nr_runners ) {
             error( "Error while waiting for barrier." );
             
     }
+    
+    
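+/**
+ * @brief Recursively check that no cell in a hierarchy has its sorted flag set.
+ *
+ * @param c The #cell to check.
+ */
+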
+void hassorted ( struct cell *c ) {
+
+    if ( c->sorted )
+        error( "Spurious sorted flags." );
+        
+    if ( c->split )
+        for ( int k = 0 ; k < 8 ; k++ )
+            if ( c->progeny[k] != NULL )
+                hassorted( c->progeny[k] );
+                    
+    }
 
 
 /**
@@ -1040,6 +1402,10 @@ void engine_step ( struct engine *e ) {
     //   printParticle(parts, k);
     // printParticle( e->s->parts , 3392063069037 , e->s->nr_parts );
  
+    /* Re-distribute the particles amongst the nodes? */
+    if ( e->forcerepart )
+        engine_repartition( e );
+    
     /* Prepare the space. */
     engine_prepare( e );
     
@@ -1162,26 +1528,20 @@ void engine_step ( struct engine *e ) {
     }
     
     
-/** 
- * @brief Split the underlying space according to the given grid.
+/**
+ * @brief Create and fill the proxies.
  *
  * @param e The #engine.
- * @param grid The grid.
  */
  
-void engine_split ( struct engine *e , int *grid ) {
+void engine_makeproxies ( struct engine *e ) {
 
-    int i, j, k, ii, jj, kk, jd;
-    float scale[3];
-    int cid, cjd, pid, ind[3], jnd[3], *cdim = e->s->cdim;
+    int i, j, k, ii, jj, kk;
+    int cid, cjd, pid, ind[3], *cdim = e->s->cdim;
     struct space *s = e->s;
-    struct cell *c;
-    struct part *p;
+    struct cell *cells = s->cells;
+    struct proxy *proxies = e->proxies;
     
-    /* If we've got the wrong number of nodes, fail. */
-    if ( e->nr_nodes != grid[0]*grid[1]*grid[2] )
-        error( "Grid size does not match number of nodes." );
-        
     /* Prepare the proxies and the proxy index. */
     if ( e->proxy_ind != NULL )
         free( e->proxy_ind );
@@ -1191,59 +1551,15 @@ void engine_split ( struct engine *e , int *grid ) {
         e->proxy_ind[k] = -1;
     e->nr_proxies = 0;
     
-    /* Get the scale. */
-    for ( j = 0 ; j < 3 ; j++ )
-        scale[j] = ((float)grid[j]) / s->cdim[j];
-    
-    /* Run through the cells and set their nodeID. */
-    for ( k = 0 ; k < s->nr_cells ; k++ ) {
-        c = &s->cells[k];
-        for ( j = 0 ; j < 3 ; j++ )
-            ind[j] = c->loc[j] * s->ih[j] * scale[j];
-        c->nodeID = ind[0] + grid[0]*( ind[1] + grid[1]*ind[2] );
-        }
-        
-    /* Identify the neighbours of this proxy. */
-    ind[0] = e->nodeID % grid[0];
-    ind[1] = ( e->nodeID / grid[0] ) % grid[1];
-    ind[2] = e->nodeID / ( grid[0]*grid[1] );
-    message( "node %i is [ %i %i %i ] on grid [ %i %i %i ]." ,
-        e->nodeID , ind[0] , ind[1] , ind[2] , grid[0] , grid[1] , grid[2] );
-    for ( i = -1 ; i <= 1 ; i++ ) {
-        jnd[0] = ind[0] + i;
-        if ( jnd[0] < 0 ) jnd[0] += grid[0];
-        if ( jnd[0] >= grid[0] ) jnd[0] -= grid[0];
-        for ( j = -1 ; j <= 1 ; j++ ) {
-            jnd[1] = ind[1] + j;
-            if ( jnd[1] < 0 ) jnd[1] += grid[1];
-            if ( jnd[1] >= grid[1] ) jnd[1] -= grid[1];
-            for ( k = -1 ; k <= 1 ; k++ ) {
-                jnd[2] = ind[2] + k;
-                if ( jnd[2] < 0 ) jnd[2] += grid[2];
-                if ( jnd[2] >= grid[2] ) jnd[2] -= grid[2];
-                
-                /* Are ind and jnd the same node? */
-                jd = jnd[0] + grid[0]*( jnd[1] + grid[1]*jnd[2] );
-                if ( jd == e->nodeID )
-                    continue;
-                
-                /* Add jnd? */
-                if ( e->proxy_ind[jd] < 0 ) {
-                    proxy_init( &e->proxies[ e->nr_proxies ] , e->nodeID , jd );
-                    e->proxy_ind[jd] = e->nr_proxies;
-                    e->nr_proxies += 1;
-                    }
-                
-                }
-            }
-        }
-        
-    /* Identify the neighbouring highest-level cells and add them to
-       the respective proxies. */
+    /* Loop over each cell in the space. */
     for ( ind[0] = 0 ; ind[0] < cdim[0] ; ind[0]++ )
         for ( ind[1] = 0 ; ind[1] < cdim[1] ; ind[1]++ )
             for ( ind[2] = 0 ; ind[2] < cdim[2] ; ind[2]++ ) {
+            
+                /* Get the cell ID. */
                 cid = cell_getid( cdim , ind[0] , ind[1] , ind[2] );
+                
+                /* Loop over all its neighbours (periodic). */
                 for ( i = -1 ; i <= 1 ; i++ ) {
                     ii = ind[0] + i;
                     if ( ii >= cdim[0] )
@@ -1262,24 +1578,79 @@ void engine_split ( struct engine *e , int *grid ) {
                                 kk -= cdim[2];
                             else if ( kk < 0 )
                                 kk += cdim[2];
+                            
+                            /* Get the cell ID. */
                             cjd = cell_getid( cdim , ii , jj , kk );
-                            if ( s->cells[cid].nodeID == e->nodeID && s->cells[cjd].nodeID != e->nodeID ) {
-                                pid = e->proxy_ind[ s->cells[cjd].nodeID ];
-                                proxy_addcell_in( &e->proxies[pid] , &s->cells[cjd] );
-                                proxy_addcell_out( &e->proxies[pid] , &s->cells[cid] );
-                                s->cells[cid].sendto |= ( 1 << pid );
+                            
+                            /* Add to proxies? */
+                            if ( cells[cid].nodeID == e->nodeID && cells[cjd].nodeID != e->nodeID ) {
+                                pid = e->proxy_ind[ cells[cjd].nodeID ];
+                                if ( pid < 0 ) {
+                                    proxy_init( &proxies[ e->nr_proxies ] , e->nodeID , cells[cjd].nodeID );
+                                    e->proxy_ind[ cells[cjd].nodeID ] = e->nr_proxies;
+                                    pid = e->nr_proxies;
+                                    e->nr_proxies += 1;
+                                    }
+                                proxy_addcell_in( &proxies[pid] , &cells[cjd] );
+                                proxy_addcell_out( &proxies[pid] , &cells[cid] );
+                                cells[cid].sendto |= ( 1 << pid );
                                 }
-                            if ( s->cells[cjd].nodeID == e->nodeID && s->cells[cid].nodeID != e->nodeID ) {
-                                pid = e->proxy_ind[ s->cells[cid].nodeID ];
-                                proxy_addcell_in( &e->proxies[pid] , &s->cells[cid] );
-                                proxy_addcell_out( &e->proxies[pid] , &s->cells[cjd] );
-                                s->cells[cjd].sendto |= ( 1 << pid );
+                                
+                            if ( cells[cjd].nodeID == e->nodeID && cells[cid].nodeID != e->nodeID ) {
+                                pid = e->proxy_ind[ cells[cid].nodeID ];
+                                if ( pid < 0 ) {
+                                    proxy_init( &proxies[ e->nr_proxies ] , e->nodeID , cells[cid].nodeID );
+                                    e->proxy_ind[ cells[cid].nodeID ] = e->nr_proxies;
+                                    pid = e->nr_proxies;
+                                    e->nr_proxies += 1;
+                                    }
+                                proxy_addcell_in( &proxies[pid] , &cells[cid] );
+                                proxy_addcell_out( &proxies[pid] , &cells[cjd] );
+                                cells[cjd].sendto |= ( 1 << pid );
                                 }
                             }
                         }
                     }
                 }
         
+    }
+    
+    
+/** 
+ * @brief Split the underlying space according to the given grid.
+ *
+ * @param e The #engine.
+ * @param grid The grid.
+ */
+ 
+void engine_split ( struct engine *e , int *grid ) {
+
+    int j, k;
+    float scale[3];
+    int ind[3];
+    struct space *s = e->s;
+    struct cell *c;
+    struct part *p;
+    
+    /* If we've got the wrong number of nodes, fail. */
+    if ( e->nr_nodes != grid[0]*grid[1]*grid[2] )
+        error( "Grid size does not match number of nodes." );
+        
+    /* Get the scale. */
+    for ( j = 0 ; j < 3 ; j++ )
+        scale[j] = ((float)grid[j]) / s->cdim[j];
+    
+    /* Run through the cells and set their nodeID. */
+    for ( k = 0 ; k < s->nr_cells ; k++ ) {
+        c = &s->cells[k];
+        for ( j = 0 ; j < 3 ; j++ )
+            ind[j] = c->loc[j] * s->ih[j] * scale[j];
+        c->nodeID = ind[0] + grid[0]*( ind[1] + grid[1]*ind[2] );
+        }
+        
+    /* Make the proxies. */
+    engine_makeproxies( e );
+        
     /* For now, just kill any particle outside of our grid. */
     for ( k = 0 ; k < s->nr_parts ; k++ ) {
         p = &s->parts[k];
@@ -1327,7 +1698,7 @@ void engine_init ( struct engine *e , struct space *s , float dt , int nr_thread
             #ifdef WITHMPI
                 printf( "engine_init: cpu map is [ " );
             #else
-                printf( "engine_init[%03i]: cpu map is [ " , nodeID );
+                printf( "[%03i] engine_init: cpu map is [ " , nodeID );
             #endif
             for ( i = 0 ; i < nr_cores ; i++ )
                 printf( "%i " , cpuid[i] );
@@ -1346,6 +1717,8 @@ void engine_init ( struct engine *e , struct space *s , float dt , int nr_thread
     e->nodeID = nodeID;
     e->proxy_ind = NULL;
     e->nr_proxies = 0;
+    e->forcerebuild = 1;
+    e->forcerepart = 0;
     engine_rank = nodeID;
     
     /* Make the space link back to the engine. */
@@ -1353,14 +1726,15 @@ void engine_init ( struct engine *e , struct space *s , float dt , int nr_thread
     
     /* Are we doing stuff in parallel? */
     if ( nr_nodes > 1 ) {
-        #if !defined(HAVE_MPI) || !defined(WITH_MPI)
+        #ifndef HAVE_MPI
             error( "SWIFT was not compiled with MPI support." );
+        #else
+            e->policy |= engine_policy_mpi;
+            if ( ( e->proxies = (struct proxy *)malloc( sizeof(struct proxy) * engine_maxproxies ) ) == NULL )
+                error( "Failed to allocate memory for proxies." );
+            bzero( e->proxies , sizeof(struct proxy) * engine_maxproxies );
+            e->nr_proxies = 0;
         #endif
-        e->policy |= engine_policy_mpi;
-        if ( ( e->proxies = (struct proxy *)malloc( sizeof(struct proxy) * 26 ) ) == NULL )
-            error( "Failed to allocate memory for proxies." );
-        bzero( e->proxies , sizeof(struct proxy) * 26 );
-        e->nr_proxies = 0;
         }
     
     /* First of all, init the barrier and lock it. */
diff --git a/src/engine.h b/src/engine.h
index 401f943792aed147e73e13a1f11f1c283a23f81e..c16d47a184ce0830a2c9ec7179915fae2109589e 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -33,6 +33,7 @@
 
 #define engine_queue_scale          1.2
 #define engine_maxtaskspercell      32
+#define engine_maxproxies           36
 
 
 /* The rank of the engine as a global variable (for messages). */
@@ -90,6 +91,9 @@ struct engine {
     struct proxy *proxies;
     int nr_proxies, *proxy_ind;
     
+    /* Force the engine to rebuild? */
+    int forcerebuild, forcerepart;
+    
     };
 
 
@@ -101,3 +105,6 @@ void engine_step ( struct engine *e );
 void engine_maketasks ( struct engine *e );
 void engine_split ( struct engine *e , int *grid );
 int engine_exchange_strays ( struct engine *e , struct part *parts , struct xpart *xparts , int *ind , int N );
+void engine_rebuild ( struct engine *e );
+void engine_repartition ( struct engine *e );
+void engine_makeproxies ( struct engine *e );
diff --git a/src/error.h b/src/error.h
index 4b7abe0740290fa65e0b6896b8dca8d0660960f9..57622721b42440b1551ae8fbe63f7fcf389d258a 100644
--- a/src/error.h
+++ b/src/error.h
@@ -39,7 +39,7 @@
  */
 #ifdef WITH_MPI
     extern int engine_rank;
-    #define message(s, ...) printf( "%s[%03i]: " s "\n" , __FUNCTION__ , engine_rank , ##__VA_ARGS__ )
+    #define message(s, ...) printf( "[%03i] %s: " s "\n" , engine_rank , __FUNCTION__ , ##__VA_ARGS__ )
 #else
     #define message(s, ...) printf( "%s: " s "\n" , __FUNCTION__ , ##__VA_ARGS__ )
 #endif
diff --git a/src/io.c b/src/io.c
index 99eaf1127b8a0485231ddd2ce5014975e50852ec..6cb9cebef55564af1fa1b20cd0ccf150e842c8c9 100644
--- a/src/io.c
+++ b/src/io.c
@@ -584,7 +584,7 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu
   h_err = H5Dwrite(h_data, hdf5Type(type), h_space, H5S_ALL, H5P_DEFAULT, temp);
   if(h_err < 0)
     {
-      error( "Error while reading data array '%s'." , name );
+      error( "Error while writing data array '%s'." , name );
     }
 
   /* Write XMF description for this data set */
diff --git a/src/proxy.c b/src/proxy.c
index ed383bf02f491ed726e1da88a9b3f825b599042f..8191b3d0b7c060e5d2a950fb208726957fedbfb8 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -90,7 +90,6 @@ void proxy_cells_exch1 ( struct proxy *p ) {
     /* Send the pcell buffer. */
     if ( MPI_Isend( p->pcells_out , sizeof(struct pcell)*p->size_pcells_out , MPI_BYTE , p->nodeID , p->mynodeID*proxy_tag_shift + proxy_tag_cells , MPI_COMM_WORLD , &p->req_cells_out ) != MPI_SUCCESS )
         error( "Failed to pcell_out buffer." );
-    MPI_Request_free( &p->req_cells_out );
     // message( "isent pcells (%i) from node %i to node %i." , p->size_pcells_out , p->mynodeID , p->nodeID ); fflush(stdout);
 
     /* Receive the number of pcells. */
@@ -216,7 +215,6 @@ void proxy_parts_exch1 ( struct proxy *p ) {
          MPI_Isend( p->xparts_out , sizeof(struct xpart)*p->nr_parts_out , MPI_BYTE , p->nodeID , p->mynodeID*proxy_tag_shift + proxy_tag_xparts , MPI_COMM_WORLD , &p->req_xparts_out ) != MPI_SUCCESS )
         error( "Failed to isend part data." );
     MPI_Request_free( &p->req_parts_out );
-    MPI_Request_free( &p->req_xparts_out );
     // message( "isent particle data (%i) to node %i." , p->nr_parts_out , p->nodeID ); fflush(stdout);
     /* for ( int k = 0 ; k < p->nr_parts_out ; k++ )
         message( "sending particle %lli, x=[%.3e %.3e %.3e], h=%.3e, to node %i." ,
diff --git a/src/scheduler.c b/src/scheduler.c
index b6759b41768ce574d153634e27e3c020a0d54971..cf7dc046a707ca01204bbb0d35024d569d3fe1cc 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -764,7 +764,7 @@ void scheduler_enqueue ( struct scheduler *s , struct task *t ) {
             case task_type_pair:
             case task_type_sub:
                 qid = t->ci->super->owner;
-                if ( t->cj != NULL &&
+                if ( t->cj != NULL && 
                      ( qid < 0 || s->queues[qid].count > s->queues[t->cj->super->owner].count ) )
                     qid = t->cj->super->owner;
                 break;
diff --git a/src/space.c b/src/space.c
index 95910efe320977b4823768d941d875ff37c8095f..3cfad92a2d7380110c86f354d30bf2eeec110359 100644
--- a/src/space.c
+++ b/src/space.c
@@ -163,7 +163,7 @@ void space_rebuild_recycle ( struct space *s , struct cell *c ) {
  
 void space_regrid ( struct space *s , double cell_max ) {
 
-    float h_max = s->cell_min / kernel_gamma, dmin;
+    float h_max = s->cell_min / kernel_gamma / space_stretch, dmin;
     int i, j, k, cdim[3], nr_parts = s->nr_parts;
     struct cell *restrict c;
     // ticks tic;
@@ -281,7 +281,7 @@ void space_regrid ( struct space *s , double cell_max ) {
  
 void space_rebuild ( struct space *s , double cell_max ) {
 
-    float h_max = s->cell_min / kernel_gamma, dmin;
+    float h_max = s->cell_min / kernel_gamma / space_stretch, dmin;
     int i, j, k, cdim[3], nr_parts = s->nr_parts;
     struct cell *restrict c, *restrict cells = s->cells;
     struct part *restrict finger, *restrict p, *parts = s->parts;
@@ -437,8 +437,8 @@ void space_rebuild ( struct space *s , double cell_max ) {
             p = &parts[k];
             ind[k] = cell_getid( cdim , p->x[0]*ih[0] , p->x[1]*ih[1] , p->x[2]*ih[2] );
             cells[ ind[k] ].count += 1;
-            if ( cells[ ind[k] ].nodeID != nodeID )
-                error( "Received part that does not belong to me (nodeID=%i)." , cells[ ind[k] ].nodeID );
+            /* if ( cells[ ind[k] ].nodeID != nodeID )
+                error( "Received part that does not belong to me (nodeID=%i)." , cells[ ind[k] ].nodeID ); */
             }
         nr_parts = s->nr_parts;
     #endif
@@ -506,10 +506,12 @@ void space_rebuild ( struct space *s , double cell_max ) {
  
 void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N , int min , int max ) {
 
-    struct {
+    struct qstack {
         int i, j, min, max;
         volatile int ready;
-        } qstack[space_qstack];
+        };
+    struct qstack *qstack;
+    int qstack_size = (max-min)/2 + 1;
     volatile unsigned int first, last, waiting;
     
     int pivot;
@@ -517,18 +519,22 @@ void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N ,
     struct part temp_p;
     struct xpart temp_xp;
     
+    /* Allocate the stack. */
+    if ( ( qstack = malloc( sizeof(struct qstack) * qstack_size ) ) == NULL )
+        error( "Failed to allocate qstack." );
+    
     /* Init the interval stack. */
     qstack[0].i = 0;
     qstack[0].j = N-1;
     qstack[0].min = min;
     qstack[0].max = max;
     qstack[0].ready = 1;
-    for ( i = 1 ; i < space_qstack ; i++ )
+    for ( i = 1 ; i < qstack_size ; i++ )
         qstack[i].ready = 0;
     first = 0; last = 1; waiting = 1;
     
     /* Parallel bit. */
-    #pragma omp parallel default(none) shared(first,last,waiting,qstack,parts,xparts,ind) private(pivot,i,ii,j,jj,min,max,temp_i,qid,temp_xp,temp_p)
+    #pragma omp parallel default(none) shared(first,last,waiting,qstack,parts,xparts,ind,qstack_size,stderr,engine_rank) private(pivot,i,ii,j,jj,min,max,temp_i,qid,temp_xp,temp_p)
     {
     
         /* Main loop. */
@@ -536,7 +542,7 @@ void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N ,
         while ( waiting > 0 ) {
         
             /* Grab an interval off the queue. */
-            qid = atomic_inc( &first ) % space_qstack;
+            qid = atomic_inc( &first ) % qstack_size;
             
             /* Wait for the interval to be ready. */
             while ( waiting > 0 && atomic_cas( &qstack[qid].ready , 1 , 1 ) != 1 );
@@ -590,14 +596,15 @@ void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N ,
 
                     /* Recurse on the left? */
                     if ( jj > i  && pivot > min ) {
-                        qid = atomic_inc( &last ) % space_qstack;
+                        qid = atomic_inc( &last ) % qstack_size;
                         while ( atomic_cas( &qstack[qid].ready , 0 , 0 ) != 0 );
                         qstack[qid].i = i;
                         qstack[qid].j = jj;
                         qstack[qid].min = min;
                         qstack[qid].max = pivot;
                         qstack[qid].ready = 1;
-                        atomic_inc( &waiting );
+                        if ( atomic_inc( &waiting ) >= qstack_size )
+                            error( "Qstack overflow." );
                         }
 
                     /* Recurse on the right? */
@@ -614,14 +621,15 @@ void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N ,
                 
                     /* Recurse on the right? */
                     if ( jj+1 < j && pivot+1 < max ) {
-                        qid = atomic_inc( &last ) % space_qstack;
+                        qid = atomic_inc( &last ) % qstack_size;
                         while ( atomic_cas( &qstack[qid].ready , 0 , 0 ) != 0 );
                         qstack[qid].i = jj+1;
                         qstack[qid].j = j;
                         qstack[qid].min = pivot+1;
                         qstack[qid].max = max;
                         qstack[qid].ready = 1;
-                        atomic_inc( &waiting );
+                        if ( atomic_inc( &waiting ) >= qstack_size )
+                            error( "Qstack overflow." );
                         }
                         
                     /* Recurse on the left? */
@@ -646,6 +654,9 @@ void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N ,
     /* for ( i = 1 ; i < N ; i++ )
         if ( ind[i-1] > ind[i] )
             error( "Sorting failed!" ); */
+            
+    /* Clean up. */
+    free( qstack );
 
     }
 
@@ -976,7 +987,6 @@ struct cell *space_getcell ( struct space *s ) {
     /* Init some things in the cell. */
     bzero( c , sizeof(struct cell) );
     c->nodeID = -1;
-    c->owner = -1;
     if ( lock_init( &c->lock ) != 0 )
         error( "Failed to initialize cell spinlock." );
         
@@ -1012,6 +1022,7 @@ void space_init ( struct space *s , double dim[3] , struct part *parts , int N ,
     s->parts = parts;
     s->cell_min = h_max;
     s->nr_queues = 1;
+    s->size_parts_foreign = 0;
     
     /* Allocate the xtra parts array. */
     if ( posix_memalign( (void *)&s->xparts , 32 , N * sizeof(struct xpart) ) != 0 )
diff --git a/src/space.h b/src/space.h
index 8fb66d564c24236d02675646a25a7f32e356aa37..1cad653701a8af1f71272f5c39526d3e2e666f6b 100644
--- a/src/space.h
+++ b/src/space.h
@@ -99,6 +99,10 @@ struct space {
     /* The associated engine. */
     struct engine *e;
     
+    /* Buffers for parts that we will receive from foreign cells. */
+    struct part *parts_foreign;
+    int nr_parts_foreign, size_parts_foreign;
+    
     };