diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 4c8d3b504853e35eabd0675984e463ace897be33..2660a72f22cdd865bc107c7b5f1190b139bf78c1 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -72,7 +72,7 @@ Scheduler:
   cell_extra_gparts:         0         # (Optional) Number of spare gparts per top-level allocated at rebuild time for on-the-fly creation.
   cell_extra_sparts:         400       # (Optional) Number of spare sparts per top-level allocated at rebuild time for on-the-fly creation.
   max_top_level_cells:       12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
-  tasks_per_cell:            0.0         # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (means guess...).
+  tasks_per_cell:            0.0       # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (means guess...).
   links_per_tasks:           10        # (Optional) The average number of links per tasks (before adding the communication tasks). If not large enough the simulation will fail (means guess...). Defaults to 10.
   mpi_message_limit:         4096      # (Optional) Maximum MPI task message size to send non-buffered, KB.
 
diff --git a/src/collectgroup.c b/src/collectgroup.c
index 0b7b419b565612149fd2b295116b37aa65aa01e9..ddf3e35d945fd8b07cc927d8ba383963c7558cd2 100644
--- a/src/collectgroup.c
+++ b/src/collectgroup.c
@@ -41,6 +41,9 @@ struct mpicollectgroup1 {
   integertime_t ti_hydro_end_min;
   integertime_t ti_gravity_end_min;
   int forcerebuild;
+  long long total_nr_cells;
+  long long total_nr_tasks;
+  float tasks_per_cell_max;
 };
 
 /* Forward declarations. */
@@ -93,6 +96,9 @@ void collectgroup1_apply(struct collectgroup1 *grp1, struct engine *e) {
   e->nr_inhibited_gparts = grp1->g_inhibited;
   e->nr_inhibited_sparts = grp1->s_inhibited;
   e->forcerebuild = grp1->forcerebuild;
+  e->total_nr_cells = grp1->total_nr_cells;
+  e->total_nr_tasks = grp1->total_nr_tasks;
+  e->tasks_per_cell_max = grp1->tasks_per_cell_max;
 }
 
 /**
@@ -101,37 +107,39 @@ void collectgroup1_apply(struct collectgroup1 *grp1, struct engine *e) {
  * @param grp1 The #collectgroup1 to initialise
  * @param updated the number of updated hydro particles on this node this step.
  * @param g_updated the number of updated gravity particles on this node this
- * step.
+ *                  step.
  * @param s_updated the number of updated star particles on this node this step.
  * @param inhibited the number of inhibited hydro particles on this node this
- * step.
+ *                  step.
  * @param g_inhibited the number of inhibited gravity particles on this node
- * this step.
+ *                    this step.
  * @param s_inhibited the number of inhibited star particles on this node this
- * step.
+ *                    step.
  * @param ti_hydro_end_min the minimum end time for next hydro time step after
- * this step.
+ *                         this step.
  * @param ti_hydro_end_max the maximum end time for next hydro time step after
- * this step.
+ *                         this step.
  * @param ti_hydro_beg_max the maximum begin time for next hydro time step after
- * this step.
+ *                         this step.
  * @param ti_gravity_end_min the minimum end time for next gravity time step
- * after this step.
+ *                           after this step.
  * @param ti_gravity_end_max the maximum end time for next gravity time step
- * after this step.
+ *                           after this step.
  * @param ti_gravity_beg_max the maximum begin time for next gravity time step
- * after this step.
+ *                           after this step.
  * @param forcerebuild whether a rebuild is required after this step.
+ * @param total_nr_cells total number of all cells on rank.
+ * @param total_nr_tasks total number of tasks on rank.
+ * @param tasks_per_cell the used number of tasks per cell.
  */
-void collectgroup1_init(struct collectgroup1 *grp1, size_t updated,
-                        size_t g_updated, size_t s_updated, size_t inhibited,
-                        size_t g_inhibited, size_t s_inhibited,
-                        integertime_t ti_hydro_end_min,
-                        integertime_t ti_hydro_end_max,
-                        integertime_t ti_hydro_beg_max,
-                        integertime_t ti_gravity_end_min,
-                        integertime_t ti_gravity_end_max,
-                        integertime_t ti_gravity_beg_max, int forcerebuild) {
+void collectgroup1_init(
+    struct collectgroup1 *grp1, size_t updated, size_t g_updated,
+    size_t s_updated, size_t inhibited, size_t g_inhibited, size_t s_inhibited,
+    integertime_t ti_hydro_end_min, integertime_t ti_hydro_end_max,
+    integertime_t ti_hydro_beg_max, integertime_t ti_gravity_end_min,
+    integertime_t ti_gravity_end_max, integertime_t ti_gravity_beg_max,
+    int forcerebuild, long long total_nr_cells, long long total_nr_tasks,
+    float tasks_per_cell) {
 
   grp1->updated = updated;
   grp1->g_updated = g_updated;
@@ -146,6 +154,9 @@ void collectgroup1_init(struct collectgroup1 *grp1, size_t updated,
   grp1->ti_gravity_end_max = ti_gravity_end_max;
   grp1->ti_gravity_beg_max = ti_gravity_beg_max;
   grp1->forcerebuild = forcerebuild;
+  grp1->total_nr_cells = total_nr_cells;
+  grp1->total_nr_tasks = total_nr_tasks;
+  grp1->tasks_per_cell_max = tasks_per_cell;
 }
 
 /**
@@ -171,6 +182,9 @@ void collectgroup1_reduce(struct collectgroup1 *grp1) {
   mpigrp11.ti_hydro_end_min = grp1->ti_hydro_end_min;
   mpigrp11.ti_gravity_end_min = grp1->ti_gravity_end_min;
   mpigrp11.forcerebuild = grp1->forcerebuild;
+  mpigrp11.total_nr_cells = grp1->total_nr_cells;
+  mpigrp11.total_nr_tasks = grp1->total_nr_tasks;
+  mpigrp11.tasks_per_cell_max = grp1->tasks_per_cell_max;
 
   struct mpicollectgroup1 mpigrp12;
   if (MPI_Allreduce(&mpigrp11, &mpigrp12, 1, mpicollectgroup1_type,
@@ -187,6 +201,9 @@ void collectgroup1_reduce(struct collectgroup1 *grp1) {
   grp1->ti_hydro_end_min = mpigrp12.ti_hydro_end_min;
   grp1->ti_gravity_end_min = mpigrp12.ti_gravity_end_min;
   grp1->forcerebuild = mpigrp12.forcerebuild;
+  grp1->total_nr_cells = mpigrp12.total_nr_cells;
+  grp1->total_nr_tasks = mpigrp12.total_nr_tasks;
+  grp1->tasks_per_cell_max = mpigrp12.tasks_per_cell_max;
 
 #endif
 }
@@ -221,6 +238,14 @@ static void doreduce1(struct mpicollectgroup1 *mpigrp11,
   /* Everyone must agree to not rebuild. */
   if (mpigrp11->forcerebuild || mpigrp12->forcerebuild)
     mpigrp11->forcerebuild = 1;
+
+  /* Totals of all tasks and cells. */
+  mpigrp11->total_nr_cells += mpigrp12->total_nr_cells;
+  mpigrp11->total_nr_tasks += mpigrp12->total_nr_tasks;
+
+  /* Maximum value of tasks_per_cell. */
+  mpigrp11->tasks_per_cell_max =
+      max(mpigrp11->tasks_per_cell_max, mpigrp12->tasks_per_cell_max);
 }
 
 /**
diff --git a/src/collectgroup.h b/src/collectgroup.h
index b6e8769ac993cc023ae402cdfc4b0169406f6181..3e430b58db05b563f96149d1ae21039444a03640 100644
--- a/src/collectgroup.h
+++ b/src/collectgroup.h
@@ -46,19 +46,25 @@ struct collectgroup1 {
 
   /* Force the engine to rebuild? */
   int forcerebuild;
+
+  /* Totals of cells and tasks. */
+  long long total_nr_cells;
+  long long total_nr_tasks;
+
+  /* Maximum value of actual tasks per cell across all ranks. */
+  float tasks_per_cell_max;
 };
 
 void collectgroup_init(void);
 void collectgroup1_apply(struct collectgroup1 *grp1, struct engine *e);
-void collectgroup1_init(struct collectgroup1 *grp1, size_t updated,
-                        size_t g_updated, size_t s_updated, size_t inhibited,
-                        size_t g_inhibited, size_t s_inhibited,
-                        integertime_t ti_hydro_end_min,
-                        integertime_t ti_hydro_end_max,
-                        integertime_t ti_hydro_beg_max,
-                        integertime_t ti_gravity_end_min,
-                        integertime_t ti_gravity_end_max,
-                        integertime_t ti_gravity_beg_max, int forcerebuild);
+void collectgroup1_init(
+    struct collectgroup1 *grp1, size_t updated, size_t g_updated,
+    size_t s_updated, size_t inhibited, size_t g_inhibited, size_t s_inhibited,
+    integertime_t ti_hydro_end_min, integertime_t ti_hydro_end_max,
+    integertime_t ti_hydro_beg_max, integertime_t ti_gravity_end_min,
+    integertime_t ti_gravity_end_max, integertime_t ti_gravity_beg_max,
+    int forcerebuild, long long total_nr_cells, long long total_nr_tasks,
+    float tasks_per_cell);
 void collectgroup1_reduce(struct collectgroup1 *grp1);
 
 #endif /* SWIFT_COLLECTGROUP_H */
diff --git a/src/engine.c b/src/engine.c
index 9b4b72436819e18f26cbfd316d43fbea732a9c50..21a14889fe489cb8c9f080f48e7b597186823ab7 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -1902,6 +1902,35 @@ void engine_print_task_counts(const struct engine *e) {
   const int nr_tasks = sched->nr_tasks;
   const struct task *const tasks = sched->tasks;
 
+  /* Global tasks and cells when using MPI. */
+#ifdef WITH_MPI
+  if (e->nodeID == 0 && e->total_nr_tasks > 0)
+    printf(
+        "[%04i] %s engine_print_task_counts: System total: %lld,"
+        " no. cells: %lld\n",
+        e->nodeID, clocks_get_timesincestart(), e->total_nr_tasks,
+        e->total_nr_cells);
+  fflush(stdout);
+#endif
+
+  /* Report value that can be used to estimate the task_per_cells parameter. */
+  float tasks_per_cell = (float)nr_tasks / (float)e->s->tot_cells;
+
+#ifdef WITH_MPI
+  message("Total = %d (per cell = %.2f)", nr_tasks, tasks_per_cell);
+
+  /* And the system maximum on rank 0, only after first step, increase by our
+   * margin to allow for some variation in repartitioning. */
+  if (e->nodeID == 0 && e->total_nr_tasks > 0) {
+    message("Total = %d (maximum per cell = %.2f)", nr_tasks,
+            e->tasks_per_cell_max * engine_tasks_per_cell_margin);
+  }
+
+#else
+  message("Total = %d (per cell = %.2f)", nr_tasks, tasks_per_cell);
+#endif
+  fflush(stdout);
+
   /* Count and print the number of each task type. */
   int counts[task_type_count + 1];
   for (int k = 0; k <= task_type_count; k++) counts[k] = 0;
@@ -1911,8 +1940,7 @@ void engine_print_task_counts(const struct engine *e) {
     else
       counts[(int)tasks[k].type] += 1;
   }
-  message("Total = %d  (per cell = %.2f)", nr_tasks,
-          (float)nr_tasks / e->s->tot_cells);
+
 #ifdef WITH_MPI
   printf("[%04i] %s engine_print_task_counts: task counts are [ %s=%i",
          e->nodeID, clocks_get_timesincestart(), taskID_names[0], counts[0]);
@@ -1949,10 +1977,10 @@ int engine_estimate_nr_tasks(const struct engine *e) {
 
   float tasks_per_cell = e->tasks_per_cell;
   if (tasks_per_cell > 0.0f) {
-      if (e->verbose)
-          message("tasks per cell given as: %.2f, so maximum tasks: %d",
-                  e->tasks_per_cell, (int)(e->s->tot_cells * tasks_per_cell));
-      return (int)(e->s->tot_cells * tasks_per_cell);
+    if (e->verbose)
+      message("tasks per cell given as: %.2f, so maximum tasks: %d",
+              e->tasks_per_cell, (int)(e->s->tot_cells * tasks_per_cell));
+    return (int)(e->s->tot_cells * tasks_per_cell);
   }
 
   /* Our guess differs depending on the types of tasks we are using, but we
@@ -2064,9 +2092,9 @@ int engine_estimate_nr_tasks(const struct engine *e) {
   if (tasks_per_cell < 1.0f) tasks_per_cell = 1.0f;
   if (e->verbose)
     message("tasks per cell estimated as: %.2f, maximum tasks: %d",
-            tasks_per_cell, (int) (ncells * tasks_per_cell));
+            tasks_per_cell, (int)(ncells * tasks_per_cell));
 
-  return (int) (ncells * tasks_per_cell);
+  return (int)(ncells * tasks_per_cell);
 }
 
 /**
@@ -2516,7 +2544,9 @@ void engine_collect_end_of_step(struct engine *e, int apply) {
       &e->collect_group1, data.updated, data.g_updated, data.s_updated,
       data.inhibited, data.g_inhibited, data.s_inhibited, data.ti_hydro_end_min,
       data.ti_hydro_end_max, data.ti_hydro_beg_max, data.ti_gravity_end_min,
-      data.ti_gravity_end_max, data.ti_gravity_beg_max, e->forcerebuild);
+      data.ti_gravity_end_max, data.ti_gravity_beg_max, e->forcerebuild,
+      e->s->tot_cells, e->sched.nr_tasks,
+      (float)e->sched.nr_tasks / (float)e->s->tot_cells);
 
 /* Aggregate collective data from the different nodes for this step. */
 #ifdef WITH_MPI
@@ -4164,6 +4194,8 @@ void engine_init(struct engine *e, struct space *s, struct swift_params *params,
   e->cputime_last_step = 0;
   e->last_repartition = 0;
 #endif
+  e->total_nr_cells = 0;
+  e->total_nr_tasks = 0;
 
 #if defined(WITH_LOGGER)
   e->logger = (struct logger *)malloc(sizeof(struct logger));
@@ -4695,6 +4727,8 @@ void engine_config(int restart, struct engine *e, struct swift_params *params,
    * from the end of the dumped run. Can be changed on restart. */
   e->tasks_per_cell =
       parser_get_opt_param_float(params, "Scheduler:tasks_per_cell", 0.0);
+  e->tasks_per_cell_max = 0.0f;
+
   float maxtasks = 0;
   if (restart)
     maxtasks = e->restart_max_tasks;
diff --git a/src/engine.h b/src/engine.h
index ce583243e59baea9effdf9d235bd95f2221ad216..e73a2eda85127d149262341696073726c51427e6 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -107,6 +107,7 @@ enum engine_step_properties {
 #define engine_default_timesteps_file_name "timesteps"
 #define engine_max_parts_per_ghost 1000
 #define engine_max_sparts_per_ghost 1000
+#define engine_tasks_per_cell_margin 1.2
 
 /**
  * @brief The rank of the engine as a global variable (for messages).
@@ -211,7 +212,13 @@ struct engine {
   /* Total numbers of particles in the system. */
   long long total_nr_parts, total_nr_gparts, total_nr_sparts;
 
-  /* The total number of inhibted particles in the system. */
+  /* Total numbers of cells (top-level and sub-cells) in the system. */
+  long long total_nr_cells;
+
+  /* Total numbers of tasks in the system. */
+  long long total_nr_tasks;
+
+  /* The total number of inhibited particles in the system. */
   long long nr_inhibited_parts, nr_inhibited_gparts, nr_inhibited_sparts;
 
 #ifdef SWIFT_DEBUG_CHECKS
@@ -326,8 +333,9 @@ struct engine {
   size_t nr_links, size_links;
 
   /* Average number of tasks per cell. Used to estimate the sizes
-   * of the various task arrays. */
+   * of the various task arrays. Also the maximum from all ranks. */
   float tasks_per_cell;
+  float tasks_per_cell_max;
 
   /* Average number of links per tasks. This number is used before
      the creation of communication tasks so needs to be large enough. */