diff --git a/examples/MultiTypes/multiTypes.yml b/examples/MultiTypes/multiTypes.yml
index 05d45595e9ec43b0849ed574f6b9922c19021a33..c1b37ab02698c71f75e548a6801e059c3e1741a9 100644
--- a/examples/MultiTypes/multiTypes.yml
+++ b/examples/MultiTypes/multiTypes.yml
@@ -28,7 +28,7 @@ SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  
+
 # Parameters related to the initial conditions
 InitialConditions:
   file_name:  ./multiTypes.hdf5     # The file to read
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 9c3cee7630edf1be1e161a3e70547f06e6108ebd..6327540c27753ea61a79d7fb4c16d60c5f00635d 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -8,12 +8,13 @@ InternalUnitSystem:
 
 # Parameters for the task scheduling
 Scheduler:
-  nr_queues:             0         # (Optional) The number of task queues to use. Use 0  to let the system decide.
-  cell_max_size:         8000000   # (Optional) Maximal number of interactions per task if we force the split (this is the default value).
-  cell_sub_size_pair:    256000000 # (Optional) Maximal number of interactions per sub-pair task  (this is the default value).
-  cell_sub_size_self:    32000     # (Optional) Maximal number of interactions per sub-self task  (this is the default value).
-  cell_split_size:       400       # (Optional) Maximal number of particles per cell (this is the default value).
-  max_top_level_cells:   12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
+  nr_queues:              0         # (Optional) The number of task queues to use. Use 0  to let the system decide.
+  cell_max_size:          8000000   # (Optional) Maximal number of interactions per task if we force the split (this is the default value).
+  cell_sub_size_pair:     256000000 # (Optional) Maximal number of interactions per sub-pair task  (this is the default value).
+  cell_sub_size_self:     32000     # (Optional) Maximal number of interactions per sub-self task  (this is the default value).
+  cell_split_size:        400       # (Optional) Maximal number of particles per cell (this is the default value).
+  max_top_level_cells:    12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
+  tasks_per_cell:         0         # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (0 means guess using a heuristic).
 
 # Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.)
 TimeIntegration:
diff --git a/m4/ax_gcc_archflag.m4 b/m4/ax_gcc_archflag.m4
index bba53a4c8a8cb363a017c55c4e4ebbb4c6528dae..298db809e82130b77758f5e66c29c276ceb8266a 100644
--- a/m4/ax_gcc_archflag.m4
+++ b/m4/ax_gcc_archflag.m4
@@ -108,6 +108,7 @@ case $host_cpu in
 	    *3?6[[ae]]?:*:*:*) ax_gcc_arch="ivybridge core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
 	    *3?6[[cf]]?:*:*:*|*4?6[[56]]?:*:*:*) ax_gcc_arch="haswell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
 	    *3?6d?:*:*:*|*4?6f?:*:*:*) ax_gcc_arch="broadwell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
+	    *9?6[[de]]?:*:*:*) ax_gcc_arch="kabylake core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;	
 	    *1?6c?:*:*:*|*2?6[[67]]?:*:*:*|*3?6[[56]]?:*:*:*) ax_gcc_arch="bonnell atom core2 pentium-m pentium3 pentiumpro" ;;
 	    *3?67?:*:*:*|*[[45]]?6[[ad]]?:*:*:*) ax_gcc_arch="silvermont atom core2 pentium-m pentium3 pentiumpro" ;;
 	    *000?f[[012]]?:*:*:*|?f[[012]]?:*:*:*|f[[012]]?:*:*:*) ax_gcc_arch="pentium4 pentiumpro" ;;
diff --git a/src/debug.c b/src/debug.c
index 903d7e5a2e30bca8980078991c5155830f5e4c43..c9f12370f0e59f812afc72eb55b0bdaa032edfe1 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -32,6 +32,7 @@
 #include "debug.h"
 
 /* Local includes. */
+#include "active.h"
 #include "cell.h"
 #include "engine.h"
 #include "hydro.h"
@@ -269,6 +270,79 @@ int checkCellhdxmax(const struct cell *c, int *depth) {
   return result;
 }
 
+/**
+ * @brief Map function for dumping cells. Only cells that contain particles
+ * are written; the rank column identifies which node owns each cell.
+ */
+static void dumpCells_map(struct cell *c, void *data) {
+  uintptr_t *ldata = (uintptr_t *)data;
+  FILE *file = (FILE *)ldata[0];
+  struct engine *e = (struct engine *)ldata[1];
+  float ntasks = c->nr_tasks;
+
+#if SWIFT_DEBUG_CHECKS
+  /* The c->nr_tasks field does not include all the tasks. So let's check this
+   * the hard way. Note pairs share the task 50/50 with the other cell. */
+  ntasks = 0.0f;
+  struct task *tasks = e->sched.tasks;
+  int nr_tasks = e->sched.nr_tasks;
+  for (int k = 0; k < nr_tasks; k++) {
+    if (tasks[k].cj == NULL) {
+      if (c == tasks[k].ci) {
+        ntasks = ntasks + 1.0f;
+      }
+    } else {
+      if (c == tasks[k].ci || c == tasks[k].cj) {
+        ntasks = ntasks + 0.5f;
+      }
+    }
+  }
+#endif
+
+  /* Only cells that contain particles are dumped. */
+  if (c->count > 0 || c->gcount > 0 || c->scount > 0)
+    fprintf(file,
+            "  %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d "
+            "%6.1f %20lld %6d %6d %6d %6d\n",
+            c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1],
+            c->width[2], c->count, c->gcount, c->scount, c->depth, ntasks,
+            c->ti_end_min, get_time_bin(c->ti_end_min), (c->super == c),
+            cell_is_active(c, e), c->nodeID);
+}
+
+/**
+ * @brief Dump the location, depth, task count, timebin and active state
+ * of all cells to a simple text file.
+ *
+ * @param prefix Base name of the output file.
+ * @param s The #space holding the cells to dump.
+ */
+void dumpCells(const char *prefix, struct space *s) {
+
+  FILE *file = NULL;
+
+  /* Name of output file. */
+  static int nseq = 0;
+  char fname[200];
+  int uniq = atomic_inc(&nseq);
+  snprintf(fname, sizeof(fname), "%s_%03d.dat", prefix, uniq);
+
+  file = fopen(fname, "w");
+
+  /* Header. */
+  fprintf(file,
+          "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %20s "
+          "%6s %6s %6s %6s\n",
+          "x", "y", "z", "xw", "yw", "zw", "count", "gcount", "scount", "depth",
+          "tasks", "ti_end_min", "timebin", "issuper", "active", "rank");
+
+  uintptr_t data[2];
+  data[0] = (uintptr_t)file;
+  data[1] = (uintptr_t)s->e;
+  space_map_cells_pre(s, 1, dumpCells_map, &data);
+  fclose(file);
+}
+
 #ifdef HAVE_METIS
 
 /**
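
For reference, a minimal usage sketch of the new dumpCells() helper. The call site below is hypothetical (this patch only adds the function and its declaration); each call writes a sequentially numbered text file whose columns match the header printed above.

    #include "debug.h"
    #include "engine.h"

    /* Hypothetical debugging hook: dump the cell structure of the current
     * step. Repeated calls produce cells_000.dat, cells_001.dat, ... */
    static void debug_dump_cells_of_step(struct engine *e) {
      dumpCells("cells", e->s);
    }
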
diff --git a/src/debug.h b/src/debug.h
index 7dca848b6bf4e44de5f40fa8e1c0849e8ee3d0b4..7bea11de6ac3fe82ae9b3f1be84249f75742ecaf 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -33,6 +33,7 @@ void printgParticle_single(struct gpart *gp);
 
 int checkSpacehmax(struct space *s);
 int checkCellhdxmax(const struct cell *c, int *depth);
+void dumpCells(const char *prefix, struct space *s);
 
 #ifdef HAVE_METIS
 #include "metis.h"
diff --git a/src/engine.c b/src/engine.c
index 93481aca3d25fd9755b7c7f69ef25ddb4d9d9d06..8f2ce85a970233178eebaa9ffd0f6598f7c2b467 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -2507,7 +2507,7 @@ void engine_maketasks(struct engine *e) {
   const ticks tic = getticks();
 
   /* Re-set the scheduler. */
-  scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell);
+  scheduler_reset(sched, engine_estimate_nr_tasks(e));
 
   /* Construct the firt hydro loop over neighbours */
   if (e->policy & engine_policy_hydro) {
@@ -2902,7 +2902,8 @@ void engine_print_task_counts(struct engine *e) {
     else
       counts[(int)tasks[k].type] += 1;
   }
-  message("Total = %d", nr_tasks);
+  message("Total = %d  (per cell = %d)", nr_tasks,
+          (int)ceil((double)nr_tasks / e->s->tot_cells));
 #ifdef WITH_MPI
   printf("[%04i] %s engine_print_task_counts: task counts are [ %s=%i",
          e->nodeID, clocks_get_timesincestart(), taskID_names[0], counts[0]);
@@ -2923,6 +2924,117 @@ void engine_print_task_counts(struct engine *e) {
             clocks_getunit());
 }
 
+/**
+ * @brief Estimate the number of tasks required, given the engine
+ *        policies in use and the number of cells.
+ *
+ * If e->tasks_per_cell is greater than 0 then that value is used as the
+ * average number of tasks per cell, otherwise an estimate is made from
+ * the active engine policies and the cell counts.
+ *
+ * @param e the #engine
+ *
+ * @return the estimated total number of tasks
+ */
+int engine_estimate_nr_tasks(struct engine *e) {
+
+  int tasks_per_cell = e->tasks_per_cell;
+  if (tasks_per_cell > 0) return e->s->tot_cells * tasks_per_cell;
+
+  /* Our guess differs depending on the types of tasks we are using, but we
+   * basically use the formula <n1>*ntopcells + <n2>*(totcells - ntopcells),
+   * where <n1> is the expected maximum number of tasks per top-level/super
+   * cell and <n2> the expected maximum for all other cells. This should
+   * give a safe upper limit.
+   */
+  int n1 = 0;
+  int n2 = 0;
+  if (e->policy & engine_policy_hydro) {
+    n1 += 36;
+    n2 += 2;
+#ifdef WITH_MPI
+    n1 += 6;
+#endif
+
+#ifdef EXTRA_HYDRO_LOOP
+    n1 += 15;
+#ifdef WITH_MPI
+    n1 += 2;
+#endif
+#endif
+  }
+  if (e->policy & engine_policy_self_gravity) {
+    n1 += 24;
+    n2 += 1;
+#ifdef WITH_MPI
+    n2 += 2;
+#endif
+  }
+  if (e->policy & engine_policy_external_gravity) {
+    n1 += 2;
+  }
+  if (e->policy & engine_policy_cosmology) {
+    n1 += 2;
+  }
+  if (e->policy & engine_policy_cooling) {
+    n1 += 2;
+  }
+  if (e->policy & engine_policy_sourceterms) {
+    n1 += 2;
+  }
+  if (e->policy & engine_policy_stars) {
+    n1 += 2;
+  }
+
+#ifdef WITH_MPI
+
+  /* We need fewer tasks per rank when using MPI, but we could have
+   * imbalances, so we base the estimate on the cells that hold particles
+   * locally, not on an equipartition amongst the nodes. We don't want to
+   * recurse the whole cell tree, so we guess a maximum cell count instead. */
+  int ntop = 0;
+  int ncells = 0;
+  for (int k = 0; k < e->s->nr_cells; k++) {
+    struct cell *c = &e->s->cells_top[k];
+
+    /* Any cells with particles will have tasks (local & foreign). */
+    int nparts = c->count + c->gcount + c->scount;
+    if (nparts > 0) {
+      ntop++;
+      ncells++;
+
+      /* Count cell depth until we get below the parts per cell threshold. */
+      int depth = 0;
+      while (nparts > space_splitsize) {
+        depth++;
+        nparts /= 8;
+        ncells += (1 << (depth * 3));
+      }
+    }
+  }
+
+  /* If no local cells, we are probably still initialising, so just keep
+   * room for the top-level. */
+  if (ncells == 0) {
+    ntop = e->s->nr_cells;
+    ncells = ntop;
+  }
+#else
+  int ntop = e->s->nr_cells;
+  int ncells = e->s->tot_cells;
+#endif
+
+  double ntasks = n1 * ntop + n2 * (ncells - ntop);
+  tasks_per_cell = ceil(ntasks / ncells);
+
+  if (tasks_per_cell < 1) tasks_per_cell = 1;
+  if (e->verbose)
+    message("tasks per cell estimated as: %d, maximum tasks: %d",
+            tasks_per_cell, ncells * tasks_per_cell);
+
+  return ncells * tasks_per_cell;
+}
+
 /**
  * @brief Rebuild the space and tasks.
  *
@@ -4503,9 +4615,14 @@ void engine_init(struct engine *e, struct space *s,
       pthread_barrier_init(&e->run_barrier, NULL, e->nr_threads + 1) != 0)
     error("Failed to initialize barrier.");
 
-  /* Init the scheduler with enough tasks for the initial sorting tasks. */
-  const int nr_tasks = 2 * s->tot_cells + 2 * e->nr_threads;
-  scheduler_init(&e->sched, e->s, nr_tasks, nr_queues,
+  /* Expected average number of tasks per cell. If set to zero we use a
+   * heuristic guess based on the cell counts and the active engine policies.
+   */
+  e->tasks_per_cell =
+      parser_get_opt_param_int(params, "Scheduler:tasks_per_cell", 0);
+
+  /* Init the scheduler. */
+  scheduler_init(&e->sched, e->s, engine_estimate_nr_tasks(e), nr_queues,
                  (policy & scheduler_flag_steal), e->nodeID, &e->threadpool);
 
   /* Allocate and init the threads. */
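
As a sanity check on the heuristic in engine_estimate_nr_tasks() above, here is a standalone sketch of the non-MPI branch with hypothetical cell counts; the n1/n2 values are the hydro-only, non-MPI ones from the patch, everything else is made up for illustration.

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const int ntop = 12 * 12 * 12; /* hypothetical top-level cells         */
      const int ncells = 8 * ntop;   /* hypothetical total cells in the tree */

      /* Hydro-only run without MPI (values taken from the patch). */
      const int n1 = 36, n2 = 2;

      const double ntasks = (double)n1 * ntop + (double)n2 * (ncells - ntop);
      int tasks_per_cell = (int)ceil(ntasks / ncells);
      if (tasks_per_cell < 1) tasks_per_cell = 1;

      /* Prints: tasks per cell = 7, scheduler size = 96768 */
      printf("tasks per cell = %d, scheduler size = %d\n", tasks_per_cell,
             ncells * tasks_per_cell);
      return 0;
    }
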
diff --git a/src/engine.h b/src/engine.h
index 47a30a99b696304365a0ddf31d4499628a649a37..90645b308529e49a370bf91f2b3d05195b2b08d8 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -75,7 +75,6 @@ enum engine_policy {
 extern const char *engine_policy_names[];
 
 #define engine_queue_scale 1.2
-#define engine_maxtaskspercell 96
 #define engine_maxproxies 64
 #define engine_tasksreweight 1
 #define engine_parts_size_grow 1.05
@@ -222,6 +221,10 @@ struct engine {
   struct link *links;
   int nr_links, size_links;
 
+  /* Average number of tasks per cell. Used to estimate the sizes
+   * of the various task arrays. */
+  int tasks_per_cell;
+
   /* Are we talkative ? */
   int verbose;
 
@@ -292,5 +295,6 @@ int engine_is_done(struct engine *e);
 void engine_pin();
 void engine_unpin();
 void engine_clean(struct engine *e);
+int engine_estimate_nr_tasks(struct engine *e);
 
 #endif /* SWIFT_ENGINE_H */
diff --git a/src/gravity_cache.h b/src/gravity_cache.h
index 14b672233aa9958ec39af32a87baead98c0bae04..b9fde71dd887d704fac5c730aee33767de8a28f7 100644
--- a/src/gravity_cache.h
+++ b/src/gravity_cache.h
@@ -131,7 +131,7 @@ static INLINE void gravity_cache_init(struct gravity_cache *c, int count) {
  */
 __attribute__((always_inline)) INLINE void gravity_cache_populate(
     struct gravity_cache *c, const struct gpart *restrict gparts, int gcount,
-    int gcount_padded, const double shift[3]) {
+    int gcount_padded, const double shift[3], const struct cell *cell) {
 
   /* Make the compiler understand we are in happy vectorization land */
   float *restrict x = c->x;
@@ -161,9 +161,9 @@ __attribute__((always_inline)) INLINE void gravity_cache_populate(
 
   /* Pad the caches */
   for (int i = gcount; i < gcount_padded; ++i) {
-    x[i] = 0.f;
-    y[i] = 0.f;
-    z[i] = 0.f;
+    x[i] = -3.f * cell->width[0];
+    y[i] = -3.f * cell->width[0];
+    z[i] = -3.f * cell->width[0];
     epsilon[i] = 0.f;
     m[i] = 0.f;
   }
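
The gravity_cache.h change above moves the padded cache entries from the origin to minus three cell widths. The padding entries carry zero mass, and placing them well outside the region occupied by the (shifted) real particles presumably guarantees they can never coincide with a real particle, so the pair kernels never see a zero separation. A simplified, hypothetical sketch of that padding step:

    #include <stdio.h>

    /* Ghost entries get zero mass and a position far outside the cell, so
     * they contribute nothing and can never sit on top of a real particle. */
    static void pad_cache(float *x, float *m, int count, int padded,
                          float width) {
      for (int i = count; i < padded; ++i) {
        x[i] = -3.f * width;
        m[i] = 0.f;
      }
    }

    int main(void) {
      float x[8] = {0.1f, 0.4f, 0.7f}, m[8] = {1.f, 1.f, 1.f};
      pad_cache(x, m, 3, 8, 1.0f); /* pad 3 real entries up to 8 slots */
      for (int i = 0; i < 8; ++i) printf("x = %5.1f  m = %3.1f\n", x[i], m[i]);
      return 0;
    }
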
diff --git a/src/runner.c b/src/runner.c
index ec08b743452508364a7f1900963aae73061a944d..dd3f3c8e4e59af15485ece16665ffdae85703117 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -1749,7 +1749,8 @@ void *runner_main(void *data) {
   struct runner *r = (struct runner *)data;
   struct engine *e = r->e;
   struct scheduler *sched = &e->sched;
-
+  unsigned int seed = r->id;
+  pthread_setspecific(sched->local_seed_pointer, &seed);
   /* Main loop. */
   while (1) {
 
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index c07d70f3e48bb6f1c9e7e343a50cdbba71da0785..14eecaa2c0c126b721a400e63298e1d55a9c98da 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -1039,7 +1039,7 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
       const float hj = pj->h;
       const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift;
       if (dj - rshift > di_max) continue;
-      
+
       double pjx[3];
       for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
       const float hjg2 = hj * hj * kernel_gamma2;
@@ -1451,7 +1451,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     const float hj = pj->h;
     const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift;
     if (dj - rshift > di_max) continue;
-    
+
     double pjx[3];
     for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
     const float hjg2 = hj * hj * kernel_gamma2;
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
index 01ea6a073211a08430e77721f4c2e60ef7adfd04..69f821f0a991cb797a6ae6b2002ed83986759d86 100644
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -194,9 +194,9 @@ void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci,
 
   /* Fill the caches */
   gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i,
-                         loc_mean);
+                         loc_mean, ci);
   gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j,
-                         loc_mean);
+                         loc_mean, cj);
 
   /* Ok... Here we go ! */
 
@@ -542,9 +542,9 @@ void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci,
 
   /* Fill the caches */
   gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i,
-                         loc_mean);
+                         loc_mean, ci);
   gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j,
-                         loc_mean);
+                         loc_mean, cj);
 
   /* Ok... Here we go ! */
 
@@ -941,7 +941,7 @@ void runner_doself_grav_pp_full(struct runner *r, struct cell *c) {
   /* Computed the padded counts */
   const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE;
 
-  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc);
+  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc, c);
 
   /* Ok... Here we go ! */
 
@@ -1155,7 +1155,7 @@ void runner_doself_grav_pp_truncated(struct runner *r, struct cell *c) {
   /* Computed the padded counts */
   const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE;
 
-  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc);
+  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc, c);
 
   /* Ok... Here we go ! */
 
diff --git a/src/scheduler.c b/src/scheduler.c
index 4081cde0489b1b439ceb46fc9b4e191541f15bef..d0eeb8cb726cf53321d1b4e6a028f2914246cbf2 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -772,7 +772,11 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
   const int ind = atomic_inc(&s->tasks_next);
 
   /* Overflow? */
-  if (ind >= s->size) error("Task list overflow.");
+  if (ind >= s->size)
+    error(
+        "Task list overflow (%d). Need to increase "
+        "Scheduler:tasks_per_cell.",
+        ind);
 
   /* Get a pointer to the new task. */
   struct task *t = &s->tasks[ind];
@@ -1377,7 +1381,8 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
     if (qid >= s->nr_queues) error("Bad computed qid.");
 
     /* If no previous owner, pick a random queue. */
-    if (qid < 0) qid = rand() % s->nr_queues;
+    /* Note that getticks() is random enough for this purpose. */
+    if (qid < 0) qid = getticks() % s->nr_queues;
 
     /* Increase the waiting counter. */
     atomic_inc(&s->waiting);
@@ -1609,6 +1614,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
   s->size = 0;
   s->tasks = NULL;
   s->tasks_ind = NULL;
+  pthread_key_create(&s->local_seed_pointer, NULL);
   scheduler_reset(s, nr_tasks);
 }
 
diff --git a/src/scheduler.h b/src/scheduler.h
index ac654580b2af2ffb506dc3fd9f0b988b89effbd0..0ef9426ed3248742de619d64f53bdd6b9954ed2c 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -102,6 +102,9 @@ struct scheduler {
 
   /* The node we are working on. */
   int nodeID;
+
+  /* Key used to retrieve each thread's seed for the random number generator. */
+  pthread_key_t local_seed_pointer;
 };
 
 /* Inlined functions (for speed). */
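
The runner.c and scheduler.c hunks above create a pthread key in scheduler_init() and register each runner's id as a per-thread seed; the code that actually consumes the seed is not part of this diff. For reference, the usual pattern these pieces enable is pthread_getspecific() combined with the re-entrant rand_r(); the sketch below is standalone and the names in it are illustrative, not SWIFT's.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_key_t local_seed_pointer;

    static void *worker(void *data) {
      /* Each thread registers its own seed once... */
      unsigned int seed = (unsigned int)(size_t)data;
      pthread_setspecific(local_seed_pointer, &seed);

      /* ...and later code on the same thread draws numbers without any
       * shared state (rand_r is re-entrant, unlike rand). */
      unsigned int *my_seed = pthread_getspecific(local_seed_pointer);
      printf("thread %u drew %d\n", seed, rand_r(my_seed) % 8);
      return NULL;
    }

    int main(void) {
      pthread_key_create(&local_seed_pointer, NULL);
      pthread_t threads[4];
      for (size_t k = 0; k < 4; k++)
        pthread_create(&threads[k], NULL, worker, (void *)k);
      for (size_t k = 0; k < 4; k++) pthread_join(threads[k], NULL);
      pthread_key_delete(&local_seed_pointer);
      return 0;
    }
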