diff --git a/src/debug.c b/src/debug.c
index f245e881d718c27c075a6ae13d62a49d43b5f7e9..7376e4558b48fa0414cf1df02d4a7c1de400d1d9 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -1,7 +1,9 @@
 /*******************************************************************************
  * This file is part of SWIFT.
- * Copyright (c) 2013 Matthieu Schaller (matthieu.schaller@durham.ac.uk),
- *                    Pedro Gonnet (pedro.gonnet@durham.ac.uk).
+ * Copyright (c) 2013- 2015:
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk),
+ *                    Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+ *                    Peter W. Draper (p.w.draper@durham.ac.uk).
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -20,8 +22,10 @@
 
 #include <stdio.h>
 
+#include "config.h"
 #include "const.h"
 #include "part.h"
+#include "debug.h"
 
 /**
  * @brief Looks for the particle with the given id and prints its information to
@@ -98,3 +102,140 @@ void printParticle_single(struct part *p) {
       p->rho_dh, p->density.div_v, p->u, p->force.u_dt, p->force.balsara,
       p->force.POrho2, p->force.v_sig, p->dt);
 }
+
+#ifdef HAVE_METIS
+
+/**
+ * @brief Dump the METIS graph in standard format, simple format and weights
+ * only, to a file.
+ *
+ * @description The standard format output can be read into the METIS
+ * command-line tools. The simple format is just the cell connectivity (this
+ * should not change between calls).  The weights format is the standard one,
+ * minus the cell connectivity.
+ *
+ * The output filenames are generated from the prefix and the sequence number
+ * of calls. So the first is called <prefix>_std_001.dat, <prefix>_simple_001.dat,
+ * <prefix>_weights_001.dat, etc.
+ *
+ * @param prefix base output filename
+ * @param nvertices the number of vertices
+ * @param nvertexweights the number vertex weights
+ * @param cellconruns first part of cell connectivity info (CSR)
+ * @param cellcon second part of cell connectivity info (CSR)
+ * @param vertexweights weights of vertices
+ * @param vertexsizes size of vertices
+ * @param edgeweights weights of edges
+ */
+void dumpMETISGraph(const char *prefix, idx_t nvertices, idx_t nvertexweights,
+                    idx_t *cellconruns, idx_t *cellcon, idx_t *vertexweights, 
+                    idx_t *vertexsizes, idx_t *edgeweights) {
+  FILE *stdfile = NULL;
+  FILE *simplefile = NULL;
+  FILE *weightfile = NULL;
+  char fname[200];
+  idx_t i;
+  idx_t j;
+  int haveedgeweight = 0;
+  int havevertexsize = 0;
+  int havevertexweight = 0;
+  static int nseq = 0;
+  nseq++;
+
+  if (vertexweights != NULL) {
+    for (i = 0; i < nvertices * nvertexweights; i++) {
+      if (vertexweights[i] != 1) {
+        havevertexweight = 1;
+        break;
+      }
+    }
+  }
+
+  if (vertexsizes != NULL) {
+    for (i = 0; i < nvertices; i++) {
+      if (vertexsizes[i] != 1) {
+        havevertexsize = 1;
+        break;
+      }
+    }
+  }
+
+  if (edgeweights != NULL) {
+    for (i = 0; i < cellconruns[nvertices]; i++) {
+      if (edgeweights[i] != 1) {
+        haveedgeweight = 1;
+        break;
+      }
+    }
+  }
+
+  /*  Open output files. */
+  sprintf(fname, "%s_std_%03d.dat", prefix, nseq);
+  stdfile = fopen( fname, "w" );
+
+  sprintf(fname, "%s_simple_%03d.dat", prefix, nseq);
+  simplefile = fopen( fname, "w" );
+
+  if (havevertexweight || havevertexsize || haveedgeweight) {
+    sprintf(fname, "%s_weights_%03d.dat", prefix, nseq);
+    weightfile = fopen( fname, "w" );
+  }
+
+  /*  Write the header lines. */
+  fprintf(stdfile, "%" PRIDX " %" PRIDX, nvertices, cellconruns[nvertices] / 2);
+  fprintf(simplefile, "%" PRIDX " %" PRIDX, nvertices, cellconruns[nvertices] / 2);
+  if (havevertexweight || havevertexsize || haveedgeweight) {
+    fprintf(weightfile, "%" PRIDX " %" PRIDX, nvertices, cellconruns[nvertices] / 2);
+
+    fprintf(stdfile, " %d%d%d", havevertexsize, havevertexweight, haveedgeweight);
+    fprintf(weightfile, " %d%d%d", havevertexsize, havevertexweight, haveedgeweight);
+
+    if (havevertexweight) {
+      fprintf(stdfile, " %d", (int)nvertexweights);
+      fprintf(weightfile, " %d", (int)nvertexweights);
+    }
+  }
+
+  /*  Write the rest of the graph. */
+  for (i = 0; i < nvertices; i++) {
+    fprintf(stdfile, "\n");
+    fprintf(simplefile, "\n");
+    if (weightfile != NULL) {
+        fprintf(weightfile, "\n");
+    }
+
+    if (havevertexsize) {
+      fprintf(stdfile, " %" PRIDX, vertexsizes[i]);
+      fprintf(weightfile, " %" PRIDX, vertexsizes[i]);
+    }
+
+    if (havevertexweight) {
+      for (j = 0; j < nvertexweights; j++) {
+        fprintf(stdfile, " %" PRIDX, vertexweights[i * nvertexweights + j]);
+        fprintf(weightfile, " %" PRIDX, vertexweights[i * nvertexweights + j]);
+      }
+    }
+
+    for (j = cellconruns[i]; j < cellconruns[i + 1]; j++) {
+      fprintf(stdfile, " %" PRIDX, cellcon[j] + 1);
+      fprintf(simplefile, " %" PRIDX, cellcon[j] + 1);
+      if (haveedgeweight) {
+        fprintf(stdfile, " %" PRIDX, edgeweights[j]);
+        fprintf(weightfile, " %" PRIDX, edgeweights[j]);
+      }
+    }
+  }
+  fprintf(stdfile, "\n");
+  fprintf(simplefile, "\n");
+  if (weightfile != NULL) {
+      fprintf(weightfile, "\n");
+  }
+
+  fclose(stdfile);
+  fclose(simplefile);
+  if (weightfile != NULL) {
+    fclose(weightfile);
+  }
+}
+
+#endif
diff --git a/src/debug.h b/src/debug.h
index 83461df45e3c0fb137557fba5fdf68cac9d4915a..27b2f94eff28c0d2fd0bc76f548d5d775414d2c2 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -27,4 +27,11 @@ void printParticle(struct part *parts, long long int i, int N);
 void printgParticle(struct gpart *parts, long long int i, int N);
 void printParticle_single(struct part *p);
 
+#ifdef HAVE_METIS
+#include "metis.h"
+void dumpMETISGraph(const char *prefix, idx_t nvtxs, idx_t ncon,
+                    idx_t *xadj, idx_t *adjncy, idx_t *vwgt, idx_t *vsize,
+                    idx_t *adjwgt);
+
+#endif
 #endif /* SWIFT_DEBUG_H */
diff --git a/src/engine.c b/src/engine.c
index 4e355a66da6520113af6fad3bc3b2942f1672d94..354fa42af9c955f09b64ea2bb6af84e010f3537b 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -169,6 +169,9 @@ void engine_redistribute(struct engine *e) {
     }
     const int cid = cell_getid(cdim, parts[k].x[0] * ih[0],
                                parts[k].x[1] * ih[1], parts[k].x[2] * ih[2]);
+    /* if (cid < 0 || cid >= s->nr_cells)
+      error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].",
+            cid, k, parts[k].x[0], parts[k].x[1], parts[k].x[2]); */
     dest[k] = cells[cid].nodeID;
     counts[nodeID * nr_nodes + dest[k]] += 1;
   }
@@ -303,7 +306,8 @@ void engine_repartition(struct engine *e) {
   int nr_nodes = e->nr_nodes, nodeID = e->nodeID;
   float wscale = 1e-3, vscale = 1e-3, wscale_buff;
   idx_t wtot = 0;
-  const idx_t wmax = 1e9 / e->nr_nodes;
+  idx_t wmax = 1e9 / e->nr_nodes;
+  idx_t wmin;
 
   /* Clear the repartition flag. */
   e->forcerepart = 0;
@@ -486,6 +490,24 @@ void engine_repartition(struct engine *e) {
   /* As of here, only one node needs to compute the partition. */
   if (nodeID == 0) {
 
+    /* Final rescale of all weights to avoid a large range. Large ranges have
+     * been seen to cause an incomplete graph. */
+    wmin = wmax;
+    wmax = 0.0;
+    for (k = 0; k < 26 * nr_cells; k++) {
+      wmax = weights_e[k] > wmax ? weights_e[k] : wmax;
+      wmin = weights_e[k] < wmin ? weights_e[k] : wmin;
+    }
+    if ((wmax - wmin) > engine_maxmetisweight) {
+      wscale = engine_maxmetisweight / (wmax - wmin);
+      for (k = 0; k < 26 * nr_cells; k++) {
+        weights_e[k] = (weights_e[k] - wmin) * wscale + 1;
+      }
+      for (k = 0; k < nr_cells; k++) {
+        weights_v[k] = (weights_v[k] - wmin) * wscale + 1;
+      }
+    }
+
     /* Check that the edge weights are fully symmetric. */
     /* for ( cid = 0 ; cid < nr_cells ; cid++ )
         for ( k = 0 ; k < 26 ; k++ ) {
@@ -544,16 +566,47 @@ void engine_repartition(struct engine *e) {
     /* Call METIS. */
     idx_t one = 1, idx_nr_cells = nr_cells, idx_nr_nodes = nr_nodes;
     idx_t objval;
+
+    /* Dump graph in METIS format */
+    /*dumpMETISGraph("metis_graph", idx_nr_cells, one, offsets, inds,
+                   weights_v, NULL, weights_e);*/
+
     if (METIS_PartGraphRecursive(&idx_nr_cells, &one, offsets, inds, weights_v,
                                  NULL, weights_e, &idx_nr_nodes, NULL, NULL,
                                  options, &objval, nodeIDs) != METIS_OK)
-      error("Call to METIS_PartGraphKway failed.");
+      error("Call to METIS_PartGraphRecursive failed.");
 
     /* Dump the 3d array of cell IDs. */
     /* printf( "engine_repartition: nodeIDs = reshape( [" );
     for ( i = 0 ; i < cdim[0]*cdim[1]*cdim[2] ; i++ )
         printf( "%i " , (int)nodeIDs[ i ] );
     printf("] ,%i,%i,%i);\n",cdim[0],cdim[1],cdim[2]); */
+
+    /* Check that the nodeIDs are ok. */
+    for (k = 0; k < nr_cells; k++)
+      if (nodeIDs[k] < 0 || nodeIDs[k] >= nr_nodes)
+        error("Got bad nodeID %"PRIDX" for cell %i.", nodeIDs[k], k);
+
+    /* Check that the partition is complete and all nodes have some work. */
+    int present[nr_nodes];
+    int failed = 0;
+    for (i = 0; i < nr_nodes; i++) present[i] = 0;
+    for (i = 0; i < nr_cells; i++) present[nodeIDs[i]]++;
+    for (i = 0; i < nr_nodes; i++) {
+      if (! present[i]) {
+        failed = 1;
+        message("Node %d is not present after repartition", i);
+      }
+    }
+
+    /* If partition failed continue with the current one, but make this
+     * clear. */
+    if (failed) {
+      message("WARNING: METIS repartition has failed, continuing with "
+              "the current partition, load balance will not be optimal");
+      for (k = 0; k < nr_cells; k++) nodeIDs[k] = cells[k].nodeID;
+    }
+    
   }
 
 /* Broadcast the result of the partition. */
@@ -2168,7 +2221,7 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
   s->nr_queues = nr_queues;
 
   /* Append a kick1 task to each cell. */
-  scheduler_reset(&e->sched, s->tot_cells + e->nr_threads);
+  scheduler_reset(&e->sched, 2 * s->tot_cells + e->nr_threads);
   for (k = 0; k < s->nr_cells; k++)
     s->cells[k].kick1 =
         scheduler_addtask(&e->sched, task_type_kick1, task_subtype_none, 0, 0,
diff --git a/src/engine.h b/src/engine.h
index c450532909557f2374ec76af6b58050fd9483bb9..b2beeaa1b4a1b4aebe650209739fec91c74b1ba7 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -47,6 +47,8 @@
 #define engine_maxproxies 64
 #define engine_tasksreweight 10
 
+#define engine_maxmetisweight 10000.0f
+
 /* The rank of the engine as a global variable (for messages). */
 extern int engine_rank;
 
diff --git a/src/runner.c b/src/runner.c
index 1e160a3d7ea502db35bab6a662b8b787a53d2055..66e6eb5a22b42ae4c20ed4ced82fd7b6ccd2fa8b 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -1291,6 +1291,9 @@ void *runner_main(void *data) {
         case task_type_psort:
           space_do_parts_sort();
           break;
+        case task_type_split_cell:
+          space_split(e->s, t->ci);
+          break;
          case task_type_rewait:
           for (struct task *t2 = (struct task *)t->ci;
                t2 != (struct task *)t->cj; t2++) {
diff --git a/src/scheduler.c b/src/scheduler.c
index 37cef93950bda0d89b51e2d63504aeffcb2ce33a..b790768b04f76be2892e8246b0fba8b39817e585 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -130,6 +130,9 @@ void scheduler_splittasks(struct scheduler *s) {
         break;
     }
 
+    /* Skip sorting tasks. */
+    if (t->type == task_type_psort) continue;
+
     /* Empty task? */
     if (t->ci == NULL || (t->type == task_type_pair && t->cj == NULL)) {
       t->type = task_type_none;
@@ -1069,7 +1072,7 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) {
     }
   }
 
-  /* Task definitely done. */
+  /* Task definitely done, signal any sleeping runners. */
   if (!t->implicit) {
     t->toc = getticks();
     pthread_mutex_lock(&s->sleep_mutex);
diff --git a/src/space.c b/src/space.c
index 5d032a99fcea1e99d3bd00b9015cd5f70b442517..1b9784d4d717a896ef2a83330492140d2ddf8309 100644
--- a/src/space.c
+++ b/src/space.c
@@ -466,7 +466,11 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   /* At this point, we have the upper-level cells, old or new. Now make
      sure that the parts in each cell are ok. */
   // tic = getticks();
-  for (k = 0; k < s->nr_cells; k++) space_split(s, &cells[k]);
+  // for (k = 0; k < s->nr_cells; k++) space_split(s, &cells[k]);
+  for (k = 0; k < s->nr_cells; k++)
+    scheduler_addtask(&s->e->sched, task_type_split_cell, task_subtype_none,
+                      k, 0, &cells[k], NULL, 0);
+  engine_launch(s->e, s->e->nr_threads, 1 << task_type_split_cell);
 
   // message( "space_split took %.3f ms." , (double)(getticks() - tic) / CPU_TPS
   // * 1000 );
@@ -476,6 +480,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
  * @brief Sort the particles and condensed particles according to the given
  *indices.
  *
+ * @param s The #space.
  * @param ind The indices with respect to which the parts are sorted.
  * @param N The number of parts
  * @param min Lowest index.
@@ -483,11 +488,11 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
  */
 
 void space_parts_sort(struct space *s, int *ind, int N, int min, int max) {
-  // Populate a parallel_sort structure with the input data.
+  // Populate the global parallel_sort structure with the input data.
   space_sort_struct.parts = s->parts;
   space_sort_struct.xparts = s->xparts;
   space_sort_struct.ind = ind;
-  space_sort_struct.stack_size = 2 * (max - min) + 10;
+  space_sort_struct.stack_size = 2 * (max - min + 1) + 10 + s->e->nr_threads;
   if ((space_sort_struct.stack = malloc(sizeof(struct qstack) *
                                         space_sort_struct.stack_size)) == NULL)
     error("Failed to allocate sorting stack.");
@@ -510,8 +515,9 @@ void space_parts_sort(struct space *s, int *ind, int N, int min, int max) {
   /* Verify space_sort_struct. */
   /* for (int i = 1; i < N; i++)
     if (ind[i - 1] > ind[i])
-      error("Sorting failed (ind[%i]=%i,ind[%i]=%i).", i - 1, ind[i - 1], i,
-            ind[i]); */
+      error("Sorting failed (ind[%i]=%i,ind[%i]=%i), min=%i, max=%i.", i - 1, ind[i - 1], i,
+            ind[i], min, max);
+  message("Sorting succeeded."); */
 
   // Clean up.
   free(space_sort_struct.stack);
@@ -525,15 +531,17 @@ void space_do_parts_sort() {
   struct xpart *xparts = space_sort_struct.xparts;
 
   /* Main loop. */
-  while (space_sort_struct.waiting > 0) {
+  while (space_sort_struct.waiting) {
 
     /* Grab an interval off the queue. */
     int qid =
         atomic_inc(&space_sort_struct.first) % space_sort_struct.stack_size;
 
-    /* Get the stack entry. */
+    /* Wait for the entry to be ready, or for the sorting do be done. */
     while (!space_sort_struct.stack[qid].ready)
       if (!space_sort_struct.waiting) return;
+      
+    /* Get the stack entry. */
     int i = space_sort_struct.stack[qid].i;
     int j = space_sort_struct.stack[qid].j;
     int min = space_sort_struct.stack[qid].min;
@@ -545,6 +553,8 @@ void space_do_parts_sort() {
 
       /* Bring beer. */
       const int pivot = (min + max) / 2;
+      /* message("Working on interval [%i,%i] with min=%i, max=%i, pivot=%i.",
+              i, j, min, max, pivot); */
 
       /* One pass of QuickSort's partitioning. */
       int ii = i;
@@ -566,18 +576,18 @@ void space_do_parts_sort() {
       }
 
       /* Verify space_sort_struct. */
-      /* for ( int k = i ; k <= jj ; k++ )
-         if ( ind[k] > pivot ) {
-         message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i,
-         N=%i." , k , ind[k] , pivot , i , j , N );
-         error( "Partition failed (<=pivot)." );
-         }
-         for ( int k = jj+1 ; k <= j ; k++ )
-         if ( ind[k] <= pivot ) {
-         message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i,
-         N=%i." , k , ind[k] , pivot , i , j , N );
-         error( "Partition failed (>pivot)." );
-         } */
+      /* for (int k = i; k <= jj; k++)
+        if (ind[k] > pivot) {
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
+                  ind[k], pivot, i, j);
+          error("Partition failed (<=pivot).");
+        }
+      for (int k = jj + 1; k <= j; k++)
+        if (ind[k] <= pivot) {
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
+                  ind[k], pivot, i, j);
+          error("Partition failed (>pivot).");
+        } */
 
       /* Split-off largest interval. */
       if (jj - i > j - jj + 1) {
@@ -586,14 +596,15 @@ void space_do_parts_sort() {
         if (jj > i && pivot > min) {
           qid = atomic_inc(&space_sort_struct.last) %
                 space_sort_struct.stack_size;
+          while (space_sort_struct.stack[qid].ready);
           space_sort_struct.stack[qid].i = i;
           space_sort_struct.stack[qid].j = jj;
           space_sort_struct.stack[qid].min = min;
           space_sort_struct.stack[qid].max = pivot;
-          space_sort_struct.stack[qid].ready = 1;
           if (atomic_inc(&space_sort_struct.waiting) >=
               space_sort_struct.stack_size)
             error("Qstack overflow.");
+          space_sort_struct.stack[qid].ready = 1;
         }
 
         /* Recurse on the right? */
@@ -606,17 +617,18 @@ void space_do_parts_sort() {
       } else {
 
         /* Recurse on the right? */
-        if (jj + 1 < j && pivot + 1 < max) {
+        if (pivot + 1 < max) {
           qid = atomic_inc(&space_sort_struct.last) %
                 space_sort_struct.stack_size;
+          while (space_sort_struct.stack[qid].ready);
           space_sort_struct.stack[qid].i = jj + 1;
           space_sort_struct.stack[qid].j = j;
           space_sort_struct.stack[qid].min = pivot + 1;
           space_sort_struct.stack[qid].max = max;
-          space_sort_struct.stack[qid].ready = 1;
           if (atomic_inc(&space_sort_struct.waiting) >=
               space_sort_struct.stack_size)
             error("Qstack overflow.");
+          space_sort_struct.stack[qid].ready = 1;
         }
 
         /* Recurse on the left? */
diff --git a/src/task.c b/src/task.c
index 4baa7d281eabb40d6db3b4e5dadf8dc301ae816c..7077818c6f6b9ac50ba13385877e28f966fe551b 100644
--- a/src/task.c
+++ b/src/task.c
@@ -46,7 +46,7 @@ const char *taskID_names[task_type_count] = {
     "none",  "sort",    "self",    "pair",    "sub",
     "ghost", "kick1",   "kick2",   "send",    "recv",
     "link",  "grav_pp", "grav_mm", "grav_up", "grav_down",
-    "psort", "rewait"};
+    "psort", "split_cell", "rewait"};
 
 /**
  * @brief Unlock the cell held by this task.
diff --git a/src/task.h b/src/task.h
index a4ab761b0e04833de53aa756a0f755be66853430..398e4dd82bf87ae52ee32d891c25f6d2e948268c 100644
--- a/src/task.h
+++ b/src/task.h
@@ -45,6 +45,7 @@ enum task_types {
   task_type_grav_up,
   task_type_grav_down,
   task_type_psort,
+  task_type_split_cell,
   task_type_rewait,
   task_type_count
 };