diff --git a/examples/MultiTypes/multiTypes.yml b/examples/MultiTypes/multiTypes.yml
index 05d45595e9ec43b0849ed574f6b9922c19021a33..c1b37ab02698c71f75e548a6801e059c3e1741a9 100644
--- a/examples/MultiTypes/multiTypes.yml
+++ b/examples/MultiTypes/multiTypes.yml
@@ -28,7 +28,7 @@ SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targeted number of neighbours.
   CFL_condition:         0.1      # Courant-Friedrichs-Lewy condition for time integration.
-  
+
 # Parameters related to the initial conditions
 InitialConditions:
   file_name:  ./multiTypes.hdf5     # The file to read
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 9c3cee7630edf1be1e161a3e70547f06e6108ebd..6327540c27753ea61a79d7fb4c16d60c5f00635d 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -8,12 +8,13 @@ InternalUnitSystem:
 
 # Parameters for the task scheduling
 Scheduler:
-  nr_queues:             0         # (Optional) The number of task queues to use. Use 0  to let the system decide.
-  cell_max_size:         8000000   # (Optional) Maximal number of interactions per task if we force the split (this is the default value).
-  cell_sub_size_pair:    256000000 # (Optional) Maximal number of interactions per sub-pair task  (this is the default value).
-  cell_sub_size_self:    32000     # (Optional) Maximal number of interactions per sub-self task  (this is the default value).
-  cell_split_size:       400       # (Optional) Maximal number of particles per cell (this is the default value).
-  max_top_level_cells:   12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
+  nr_queues:              0         # (Optional) The number of task queues to use. Use 0 to let the system decide.
+  cell_max_size:          8000000   # (Optional) Maximal number of interactions per task if we force the split (this is the default value).
+  cell_sub_size_pair:     256000000 # (Optional) Maximal number of interactions per sub-pair task (this is the default value).
+  cell_sub_size_self:     32000     # (Optional) Maximal number of interactions per sub-self task (this is the default value).
+  cell_split_size:        400       # (Optional) Maximal number of particles per cell (this is the default value).
+  max_top_level_cells:    12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
+  tasks_per_cell:         0         # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (0 means guess).
 
 # Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.)
 TimeIntegration:
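
The new tasks_per_cell parameter is optional: the default of 0 asks the code
to size its task arrays itself. A minimal sketch of how the value is
consumed, compressing the engine.c changes further down this patch
(sketch_task_array_size is a hypothetical helper, not part of the patch):

    #include "engine.h"
    #include "parser.h"

    /* Returns the number of task slots to allocate: the user's override when
     * positive, otherwise the heuristic estimate introduced in this patch. */
    static int sketch_task_array_size(struct engine *e,
                                      struct swift_params *params) {
      e->tasks_per_cell =
          parser_get_opt_param_int(params, "Scheduler:tasks_per_cell", 0);
      return engine_estimate_nr_tasks(e);
    }

If the guess is ever too small, scheduler_addtask() now aborts with a
message pointing back at this parameter.
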
diff --git a/m4/ax_gcc_archflag.m4 b/m4/ax_gcc_archflag.m4
index bba53a4c8a8cb363a017c55c4e4ebbb4c6528dae..298db809e82130b77758f5e66c29c276ceb8266a 100644
--- a/m4/ax_gcc_archflag.m4
+++ b/m4/ax_gcc_archflag.m4
@@ -108,6 +108,7 @@ case $host_cpu in
 	    *3?6[[ae]]?:*:*:*) ax_gcc_arch="ivybridge core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
 	    *3?6[[cf]]?:*:*:*|*4?6[[56]]?:*:*:*) ax_gcc_arch="haswell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
 	    *3?6d?:*:*:*|*4?6f?:*:*:*) ax_gcc_arch="broadwell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
+	    *9?6[[de]]?:*:*:*) ax_gcc_arch="kabylake core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
 	    *1?6c?:*:*:*|*2?6[[67]]?:*:*:*|*3?6[[56]]?:*:*:*) ax_gcc_arch="bonnell atom core2 pentium-m pentium3 pentiumpro" ;;
 	    *3?67?:*:*:*|*[[45]]?6[[ad]]?:*:*:*) ax_gcc_arch="silvermont atom core2 pentium-m pentium3 pentiumpro" ;;
 	    *000?f[[012]]?:*:*:*|?f[[012]]?:*:*:*|f[[012]]?:*:*:*) ax_gcc_arch="pentium4 pentiumpro" ;;
diff --git a/src/debug.c b/src/debug.c
index 903d7e5a2e30bca8980078991c5155830f5e4c43..c9f12370f0e59f812afc72eb55b0bdaa032edfe1 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -32,6 +32,7 @@
 #include "debug.h"
 
 /* Local includes. */
+#include "active.h"
 #include "cell.h"
 #include "engine.h"
 #include "hydro.h"
@@ -269,6 +270,80 @@ int checkCellhdxmax(const struct cell *c, int *depth) {
   return result;
 }
 
+/**
+ * @brief Map function for dumping cells. In MPI mode only cells that
+ * contain particles are dumped.
+ */
+static void dumpCells_map(struct cell *c, void *data) {
+  uintptr_t *ldata = (uintptr_t *)data;
+  FILE *file = (FILE *)ldata[0];
+  struct engine *e = (struct engine *)ldata[1];
+  float ntasks = c->nr_tasks;
+
+#ifdef SWIFT_DEBUG_CHECKS
+  /* The c->nr_tasks field does not include all the tasks, so let's count them
+   * the hard way. Note that pairs share the task 50/50 with the other cell. */
+  ntasks = 0.0f;
+  struct task *tasks = e->sched.tasks;
+  int nr_tasks = e->sched.nr_tasks;
+  for (int k = 0; k < nr_tasks; k++) {
+    if (tasks[k].cj == NULL) {
+      if (c == tasks[k].ci) {
+        ntasks = ntasks + 1.0f;
+      }
+    } else {
+      if (c == tasks[k].ci || c == tasks[k].cj) {
+        ntasks = ntasks + 0.5f;
+      }
+    }
+  }
+#endif
+
+  /* Only cells with particles are dumped. */
+  if (c->count > 0 || c->gcount > 0 || c->scount > 0)
+    fprintf(file,
+            "  %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d "
+            "%6.1f %20lld %6d %6d %6d %6d\n",
+            c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1],
+            c->width[2], c->count, c->gcount, c->scount, c->depth, ntasks,
+            c->ti_end_min, get_time_bin(c->ti_end_min), (c->super == c),
+            cell_is_active(c, e), c->nodeID);
+}
+
+/**
+ * @brief Dump the location, depth, task counts, timebins and active state
+ * of all cells to a simple text file.
+ *
+ * @param prefix base name of the output file
+ * @param s the space holding the cells to dump.
+ */
+void dumpCells(const char *prefix, struct space *s) {
+
+  FILE *file = NULL;
+
+  /* Name of output file. */
+  static int nseq = 0;
+  char fname[200];
+  int uniq = atomic_inc(&nseq);
+  snprintf(fname, sizeof(fname), "%s_%03d.dat", prefix, uniq);
+
+  file = fopen(fname, "w");
+  if (file == NULL) error("Could not create file '%s'.", fname);
+
+  /* Header. */
+  fprintf(file,
+          "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s "
+          "%20s %6s %6s %6s\n",
+          "x", "y", "z", "xw", "yw", "zw", "count", "gcount", "scount", "depth",
+          "tasks", "ti_end_min", "timebin", "issuper", "active", "rank");
+
+  uintptr_t data[2];
+  data[0] = (uintptr_t)file;
+  data[1] = (uintptr_t)s->e;
+  space_map_cells_pre(s, 1, dumpCells_map, &data);
+  fclose(file);
+}
+
 #ifdef HAVE_METIS
 
 /**
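
dumpCells() above is meant as a debugging aid. A hypothetical call site,
assuming an engine pointer is to hand (the helper, the prefix and the step
test are illustrative, not part of the patch):

    #include "debug.h"
    #include "engine.h"

    /* Hypothetical helper: dump the cell structure every 10th step while
     * chasing a problem. Each call writes a new numbered file:
     * cells_000.dat, cells_001.dat, ... */
    static void maybe_dump_cells(struct engine *e) {
      if (e->step % 10 == 0) dumpCells("cells", e->s);
    }

Each row is one non-empty cell, so the output can be filtered by rank,
depth or activity with standard text tools.
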
diff --git a/src/debug.h b/src/debug.h
index 7dca848b6bf4e44de5f40fa8e1c0849e8ee3d0b4..7bea11de6ac3fe82ae9b3f1be84249f75742ecaf 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -33,6 +33,7 @@ void printgParticle_single(struct gpart *gp);
 
 int checkSpacehmax(struct space *s);
 int checkCellhdxmax(const struct cell *c, int *depth);
+void dumpCells(const char *prefix, struct space *s);
 
 #ifdef HAVE_METIS
 #include "metis.h"
diff --git a/src/engine.c b/src/engine.c
index 6dbbd3f6fb0733adc89c698d5c1f81119745151a..1230abf848b3b63465bd5e7d4065dec36d46e46f 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -2514,7 +2514,7 @@ void engine_maketasks(struct engine *e) {
   const ticks tic = getticks();
 
   /* Re-set the scheduler. */
-  scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell);
+  scheduler_reset(sched, engine_estimate_nr_tasks(e));
 
   /* Construct the first hydro loop over neighbours */
   if (e->policy & engine_policy_hydro) {
@@ -2943,7 +2943,8 @@ void engine_print_task_counts(struct engine *e) {
     else
       counts[(int)tasks[k].type] += 1;
   }
-  message("Total = %d", nr_tasks);
+  message("Total = %d  (per cell = %d)", nr_tasks,
+          (int)ceil((double)nr_tasks / e->s->tot_cells));
 #ifdef WITH_MPI
   printf("[%04i] %s engine_print_task_counts: task counts are [ %s=%i",
          e->nodeID, clocks_get_timesincestart(), taskID_names[0], counts[0]);
@@ -2964,6 +2965,117 @@ void engine_print_task_counts(struct engine *e) {
             clocks_getunit());
 }
 
+/**
+ * @brief Estimate the number of tasks required given the current
+ *        engine policies and the number of cells.
+ *
+ * If e->tasks_per_cell is set to a value greater than 0 then that value
+ * is used as the average number of tasks per cell, otherwise we attempt
+ * an estimate.
+ *
+ * @param e the #engine
+ *
+ * @return the estimated total number of tasks
+ */
+int engine_estimate_nr_tasks(struct engine *e) {
+
+  int tasks_per_cell = e->tasks_per_cell;
+  if (tasks_per_cell > 0) return e->s->tot_cells * tasks_per_cell;
+
+  /* Our guess differs depending on the types of tasks we are using, but we
+   * basically use the formula <n1>*ntopcells + <n2>*(totcells - ntopcells),
+   * where <n1> is the expected maximum number of tasks per top-level/super
+   * cell and <n2> is the expected maximum for all other cells. These should
+   * give a safe upper limit.
+   */
+  int n1 = 0;
+  int n2 = 0;
+  if (e->policy & engine_policy_hydro) {
+    n1 += 36;
+    n2 += 2;
+#ifdef WITH_MPI
+    n1 += 6;
+#endif
+
+#ifdef EXTRA_HYDRO_LOOP
+    n1 += 15;
+#ifdef WITH_MPI
+    n1 += 2;
+#endif
+#endif
+  }
+  if (e->policy & engine_policy_self_gravity) {
+    n1 += 24;
+    n2 += 1;
+#ifdef WITH_MPI
+    n2 += 2;
+#endif
+  }
+  if (e->policy & engine_policy_external_gravity) {
+    n1 += 2;
+  }
+  if (e->policy & engine_policy_cosmology) {
+    n1 += 2;
+  }
+  if (e->policy & engine_policy_cooling) {
+    n1 += 2;
+  }
+  if (e->policy & engine_policy_sourceterms) {
+    n1 += 2;
+  }
+  if (e->policy & engine_policy_stars) {
+    n1 += 2;
+  }
+
+#ifdef WITH_MPI
+
+  /* We need fewer tasks per rank when using MPI, but we could have
+   * imbalances, so we need to work using the locally active cells, not just
+   * some equipartition amongst the nodes. Don't want to recurse the whole
+   * cell tree, so just make a guess of the maximum possible total cells. */
+  int ntop = 0;
+  int ncells = 0;
+  for (int k = 0; k < e->s->nr_cells; k++) {
+    struct cell *c = &e->s->cells_top[k];
+
+    /* Any cells with particles will have tasks (local & foreign). */
+    int nparts = c->count + c->gcount + c->scount;
+    if (nparts > 0) {
+      ntop++;
+      ncells++;
+
+      /* Count cell depth until we get below the parts per cell threshold. */
+      int depth = 0;
+      while (nparts > space_splitsize) {
+        depth++;
+        nparts /= 8;
+        ncells += (1 << (depth * 3));
+      }
+    }
+  }
+
+  /* If no local cells, we are probably still initialising, so just keep
+   * room for the top-level. */
+  if (ncells == 0) {
+    ntop = e->s->nr_cells;
+    ncells = ntop;
+  }
+#else
+  int ntop = e->s->nr_cells;
+  int ncells = e->s->tot_cells;
+#endif
+
+  double ntasks = n1 * ntop + n2 * (ncells - ntop);
+  tasks_per_cell = (int)ceil(ntasks / ncells);
+
+  if (tasks_per_cell < 1) tasks_per_cell = 1;
+  if (e->verbose)
+    message("tasks per cell estimated as: %d, maximum tasks: %d",
+            tasks_per_cell, ncells * tasks_per_cell);
+
+  return ncells * tasks_per_cell;
+}
+
 /**
  * @brief Rebuild the space and tasks.
  *
@@ -4545,9 +4657,14 @@ void engine_init(struct engine *e, struct space *s,
       pthread_barrier_init(&e->run_barrier, NULL, e->nr_threads + 1) != 0)
     error("Failed to initialize barrier.");
 
-  /* Init the scheduler with enough tasks for the initial sorting tasks. */
-  const int nr_tasks = 2 * s->tot_cells + 2 * e->nr_threads;
-  scheduler_init(&e->sched, e->s, nr_tasks, nr_queues,
+  /* Expected average number of tasks per cell. If set to zero we use a
+   * heuristic guess based on the number of cells and how many tasks per
+   * cell we expect. */
+  e->tasks_per_cell =
+      parser_get_opt_param_int(params, "Scheduler:tasks_per_cell", 0);
+
+  /* Init the scheduler. */
+  scheduler_init(&e->sched, e->s, engine_estimate_nr_tasks(e), nr_queues,
                  (policy & scheduler_flag_steal), e->nodeID, &e->threadpool);
 
   /* Allocate and init the threads. */
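
To make the estimate concrete, here is a self-contained worked example of
the formula above for a hydro-only, non-MPI run. The constants n1 = 36 and
n2 = 2 are the ones used in engine_estimate_nr_tasks(); the cell counts are
invented for illustration:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const int n1 = 36;        /* max tasks per top-level/super cell */
      const int n2 = 2;         /* max tasks per lower-level cell */
      const int ntop = 1728;    /* e.g. a 12^3 grid of top-level cells */
      const int ncells = 10000; /* total cells after splitting */

      const double ntasks = (double)n1 * ntop + (double)n2 * (ncells - ntop);
      int tasks_per_cell = (int)ceil(ntasks / ncells); /* 78752/10000 -> 8 */
      if (tasks_per_cell < 1) tasks_per_cell = 1;

      printf("tasks per cell: %d, task array size: %d\n", tasks_per_cell,
             ncells * tasks_per_cell);
      return 0;
    }

This prints "tasks per cell: 8, task array size: 80000"; the old fixed
engine_maxtaskspercell = 96 would have reserved 960000 slots for the same
10000 cells.
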
diff --git a/src/engine.h b/src/engine.h
index 47a30a99b696304365a0ddf31d4499628a649a37..90645b308529e49a370bf91f2b3d05195b2b08d8 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -75,7 +75,6 @@ enum engine_policy {
 extern const char *engine_policy_names[];
 
 #define engine_queue_scale 1.2
-#define engine_maxtaskspercell 96
 #define engine_maxproxies 64
 #define engine_tasksreweight 1
 #define engine_parts_size_grow 1.05
@@ -222,6 +221,10 @@ struct engine {
   struct link *links;
   int nr_links, size_links;
 
+  /* Average number of tasks per cell. Used to estimate the sizes
+   * of the various task arrays. */
+  int tasks_per_cell;
+
   /* Are we talkative ? */
   int verbose;
 
@@ -292,5 +295,6 @@ int engine_is_done(struct engine *e);
 void engine_pin();
 void engine_unpin();
 void engine_clean(struct engine *e);
+int engine_estimate_nr_tasks(struct engine *e);
 
 #endif /* SWIFT_ENGINE_H */
diff --git a/src/runner.c b/src/runner.c
index 10c09372012872e924e287bb8da7d1d9363c5c58..bb5f2685555504c24a40a32e5cbf635a7d4a7f1f 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -1751,7 +1751,8 @@ void *runner_main(void *data) {
   struct runner *r = (struct runner *)data;
   struct engine *e = r->e;
   struct scheduler *sched = &e->sched;
-
+  unsigned int seed = r->id; /* seed for the scheduler's thread-local RNG */
+  pthread_setspecific(sched->local_seed_pointer, &seed);
   /* Main loop. */
   while (1) {
 
diff --git a/src/scheduler.c b/src/scheduler.c
index af0f5e3900d66c6d70a08cfcfbda149662317f23..b1cc1a572d3344e7b1e2338c7594da0edff58919 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -740,7 +740,11 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
   const int ind = atomic_inc(&s->tasks_next);
 
   /* Overflow? */
-  if (ind >= s->size) error("Task list overflow.");
+  if (ind >= s->size)
+    error(
+        "Task list overflow (%d). Need to increase "
+        "Scheduler:tasks_per_cell.",
+        ind);
 
   /* Get a pointer to the new task. */
   struct task *t = &s->tasks[ind];
@@ -1345,7 +1349,8 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
     if (qid >= s->nr_queues) error("Bad computed qid.");
 
     /* If no previous owner, pick a random queue. */
-    if (qid < 0) qid = rand() % s->nr_queues;
+    /* Note that getticks() is random enough for this purpose. */
+    if (qid < 0) qid = getticks() % s->nr_queues;
 
     /* Increase the waiting counter. */
     atomic_inc(&s->waiting);
@@ -1577,6 +1582,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
   s->size = 0;
   s->tasks = NULL;
   s->tasks_ind = NULL;
+  pthread_key_create(&s->local_seed_pointer, NULL);
   scheduler_reset(s, nr_tasks);
 }
 
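
The two hunks above only wire up the thread-local seed (the key is created
in scheduler_init() and each runner points it at its own seed in
runner_main()); the consumer of the seed is not shown in this patch. A
self-contained sketch of the pattern being set up, assuming the consumer
draws numbers with rand_r() on the thread-local seed:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_key_t seed_key;

    static void *worker(void *arg) {
      /* Each thread keeps its own seed, as each runner does in runner_main().
       * The variable must outlive every use of the key on this thread. */
      unsigned int seed = (unsigned int)(uintptr_t)arg;
      pthread_setspecific(seed_key, &seed);

      /* Code on this thread can now draw random numbers without sharing
       * state, i.e. without locks or rand()'s global seed. */
      unsigned int *my_seed = pthread_getspecific(seed_key);
      printf("thread %u drew %d\n", seed, rand_r(my_seed) % 8);
      return NULL;
    }

    int main(void) {
      pthread_key_create(&seed_key, NULL);
      pthread_t threads[4];
      for (uintptr_t i = 0; i < 4; i++)
        pthread_create(&threads[i], NULL, worker, (void *)i);
      for (int i = 0; i < 4; i++) pthread_join(threads[i], NULL);
      return 0;
    }

The same motivation applies to the enqueue change above: picking a queue
from getticks() instead of rand() avoids touching the C library's shared
random state from many runners at once.
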
diff --git a/src/scheduler.h b/src/scheduler.h
index ac654580b2af2ffb506dc3fd9f0b988b89effbd0..0ef9426ed3248742de619d64f53bdd6b9954ed2c 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -102,6 +102,9 @@ struct scheduler {
 
   /* The node we are working on. */
   int nodeID;
+
+  /* Key for the thread-local seed of the random number generator. */
+  pthread_key_t local_seed_pointer;
 };
 
 /* Inlined functions (for speed). */