diff --git a/doc/RTD/source/ParameterFiles/parameter_description.rst b/doc/RTD/source/ParameterFiles/parameter_description.rst index 711dfb7b99938460038eb154456135bb9ad78f99..207c173001437f283688699fb507999c0225ec21 100644 --- a/doc/RTD/source/ParameterFiles/parameter_description.rst +++ b/doc/RTD/source/ParameterFiles/parameter_description.rst @@ -13,7 +13,7 @@ name followed by a column and the value of the parameter: .. code:: YAML - ICs: santa_barbara.hdf5 + ICs: santa_barbara.hdf5 dt_max: 1.5 shift: [2., 4., 5.] @@ -46,7 +46,7 @@ will be raised. The code can also read an array of values: .. code:: YAML shift: [2., 4., 5.] - + Some options in the parameter file are optional and when not provided, SWIFT will run with the default value. However, if a compulsory parameter is missing an error will be raised at @@ -104,8 +104,8 @@ speed, we would use: UnitLength_in_cgs: 3.08567758e24 # 1 Mpc in centimeters UnitVelocity_in_cgs: 1e5 # 1 km/s in centimeters per second UnitCurrent_in_cgs: 1 # 1 Ampere - UnitTemp_in_cgs: 1 # 1 Kelvin - + UnitTemp_in_cgs: 1 # 1 Kelvin + Note that there are currently no variables in any of the SWIFT physics schemes that make use of the unit of electric current. There is also no incentive to use anything else than Kelvin but that makes the whole @@ -122,7 +122,7 @@ system <https://en.wikipedia.org/wiki/FFF_system>`_ one would use UnitLength_in_cgs: 20116.8 # 1 Furlong (fur) in cm UnitVelocity_in_cgs: 0.01663095 # 1 Furlong (fur) per Fortnight (ftn) in cm/s UnitCurrent_in_cgs: 1 # 1 Ampere - UnitTemp_in_cgs: 1 # 1 Kelvin + UnitTemp_in_cgs: 1 # 1 Kelvin The value of the physical constants in this system is left as an exercise for the reader [#f1]_. @@ -171,10 +171,10 @@ use the following parameters: Cosmology: a_begin: 0.0078125 # z = 127 a_end: 1.0 # z = 0 - h: 0.6777 - Omega_m: 0.307 - Omega_lambda: 0.693 - Omega_b: 0.0455 + h: 0.6777 + Omega_m: 0.307 + Omega_lambda: 0.693 + Omega_b: 0.0455 Omega_r: 0. # (Optional) w_0: -1.0 # (Optional) w_a: 0. # (Optional) @@ -192,7 +192,7 @@ provided in the ``Gravity`` section. The theory document puts these parameters i context of the equations being solved. We give a brief overview here. * The Plummer-equivalent co-moving softening length used for all particles :math:`\epsilon_{com}`: ``comoving_softening``, -* The Plummer-equivalent maximal physical softening length used for all particles :math:`\epsilon_{max}`: ``comoving_softening``, +* The Plummer-equivalent maximal physical softening length used for all particles :math:`\epsilon_{max}`: ``comoving_softening``, At any redshift :math:`z`, the Plummer-equivalent softening length used by the code will be :math:`\epsilon=\min(\epsilon_{max}, @@ -200,7 +200,7 @@ code will be :math:`\epsilon=\min(\epsilon_{max}, * The opening angle (multipole acceptance criterion) used in the FMM :math:`\theta`: ``theta``, * The time-step size pre-factor :math:`\eta`: ``eta``, - + The time-step of a given particle is given by :math:`\Delta t = \eta\sqrt{\frac{\epsilon}{|\overrightarrow{a}|}}`, where :math:`\overrightarrow{a}` is the particle's acceleration. `Power et al. (2003) <http://adsabs.harvard.edu/abs/2003MNRAS.338...14P>`_ recommend using :math:`\eta=0.025`. @@ -224,31 +224,31 @@ Particle-Mesh part of the calculation. 
The last three are optional: * The scale below which the short-range forces are assumed to be exactly Newtonian (in units of the mesh cell-size multiplied by :math:`a_{\rm smooth}`) :math:`r_{\rm cut,min}`: ``r_cut_min`` (default: ``0.1``), - + For most runs, the default values can be used. Only the number of cells along each axis needs to be specified. The remaining three values are best described in the context of the full set of equations in the theory documents. - + As a summary, here are the values used for the EAGLE :math:`100^3~{\rm Mpc}^3` simulation: .. code:: YAML - + # Parameters for the self-gravity scheme for the EAGLE-100 box Gravity: - eta: 0.025 - theta: 0.7 + eta: 0.025 + theta: 0.7 comoving_softening: 0.0026994 # 0.7 proper kpc at z=2.8. max_physical_softening: 0.0007 # 0.7 proper kpc rebuild_frequency: 0.01 # Default optional value - mesh_side_length: 512 + mesh_side_length: 512 a_smooth: 1.25 # Default optional value r_cut_max: 4.5 # Default optional value r_cut_min: 0.1 # Default optional value .. _Parameters_SPH: - + SPH --- @@ -312,7 +312,7 @@ Whilst for a cosmological run, one would need: max_dt_RMS_factor: 0.25 # Default optional value .. _Parameters_ICs: - + Initial Conditions ------------------ @@ -368,21 +368,21 @@ be: InitialConditions: file_name: my_ics.hdf5 periodic: 1 - cleanup_h_factors: 1 - cleanup_velocity_factors: 1 - generate_gas_in_ics: 1 - cleanup_smoothing_lengths: 1 + cleanup_h_factors: 1 + cleanup_velocity_factors: 1 + generate_gas_in_ics: 1 + cleanup_smoothing_lengths: 1 .. _Parameters_constants: - + Physical Constants ------------------ For some idealised test it can be useful to overwrite the value of some physical constants; in particular the value of the gravitational constant. SWIFT offers an optional parameter to overwrite the value of -:math:`G_N`. +:math:`G_N`. .. code:: YAML @@ -475,7 +475,7 @@ one described above (See the :ref:`Parameters_units` section) and read: When un-specified, these all take the same value as assumed by the internal system of units. These are rarely used but can offer a practical alternative to -converting data in the post-processing of the simulations. +converting data in the post-processing of the simulations. For a standard cosmological run with structure finding activated, the full section would be: @@ -487,7 +487,7 @@ full section would be: scale_factor_first: 0.02 # z = 49 delta_time: 1.02 invoke_stf: 1 - + Showing all the parameters for a basic hydro test-case, one would have: .. code:: YAML @@ -513,7 +513,7 @@ following pages: .. _Parameters_statistics: - + Statistics ---------- @@ -523,28 +523,28 @@ following page: * :ref:`Output_list_label` (to have statistics outputs not evenly spaced in time). .. _Parameters_restarts: - + Restarts -------- SWIFT can write check-pointing files and restart from them. The behaviour of this mechanism is driven by the options in the ``Restarts`` section of the YAML parameter file. All the parameters are optional but default to values that -ensure a reasonable behaviour. +ensure a reasonable behaviour. * Whether or not to enable the dump of restart files: ``enable`` (default: ``1``). This parameter acts a master-switch for the check-pointing capabilities. All the other options require the ``enable`` parameter to be set to ``1``. 
-
+
 * Whether or not to save a copy of the previous set of check-pointing files:
   ``save`` (default: ``1``),
 
 * Whether or not to dump a set of restart file on regular exit: ``onexit``
   (default: ``0``),
 
 * The wall-clock time in hours between two sets of restart files:
   ``delta_hours`` (default: ``6.0``).
-
+
 Note that there is no buffer time added to the ``delta_hours`` value. If the
 system's batch queue run time limit is set to 6 hours, the user must specify a
 smaller value to allow for enough time to safely dump the check-point files.
@@ -591,24 +591,120 @@ To run SWIFT, dumping check-pointing files every 6 hours and running for 24
 hours after which a shell command will be run, one would use:
 
 .. code:: YAML
-
+
   Restarts:
-    enable: 1
+    enable: 1
     save: 1             # Keep copies
-    onexit: 0
+    onexit: 0
     subdir: restart     # Sub-directory of the directory where SWIFT is run
-    basename: swift
-    delta_hours: 6.0
-    stop_steps: 100
-    max_run_time: 24.0  # In hours
-    resubmit_on_exit: 1
-    resubmit_command: ./resub.sh
+    basename: swift
+    delta_hours: 6.0
+    stop_steps: 100
+    max_run_time: 24.0  # In hours
+    resubmit_on_exit: 1
+    resubmit_command: ./resub.sh
 
 .. _Parameters_scheduler:
 
 Scheduler
 ---------
 
+The ``Scheduler`` section contains various parameters that control how the
+cell tree is configured and how the related tasks are created. In general
+these should be considered tuning parameters, affecting both speed and
+memory use.
+
+.. code:: YAML
+
+   nr_queues: 0
+
+This defines the number of task queues used; a value of ``0`` lets the code
+decide, which normally results in one queue per thread. If set explicitly,
+it should be at least the number of threads.
+
+A number of parameters decide how the cell tree will be split into sub-cells,
+according to the number of particles and their expected interaction count,
+and the type of interaction. These are:
+
+.. code:: YAML
+
+   cell_max_size:            8000000
+   cell_sub_size_pair_hydro: 256000000
+   cell_sub_size_self_hydro: 32000
+   cell_sub_size_pair_grav:  256000000
+   cell_sub_size_self_grav:  32000
+   cell_sub_size_pair_stars: 256000000
+   cell_sub_size_self_stars: 32000
+   cell_split_size:          400
+
+When possible, cells that exceed these constraints will be split into a
+further level of sub-cells. So, for instance, a sub-cell should not contain
+more than 400 particles; this number defines the scale of most ``N*N``
+interactions.
+
+To control the number of self-gravity tasks we have the parameter:
+
+.. code:: YAML
+
+   cell_subdepth_diff_grav: 4
+
+which stops these tasks from being created at the scale of the leaf cells,
+of which there can be a large number. With this value, cells with gravity
+tasks must be at least 4 levels above the leaf cells (when possible).
+
+Extra space is required when particles are created in the system (up to the
+time of the next rebuild). The amounts reserved are controlled by:
+
+.. code:: YAML
+
+   cell_extra_parts:  0
+   cell_extra_gparts: 0
+   cell_extra_sparts: 400
+
+
+The number of top-level cells is controlled by the parameter:
+
+.. code:: YAML
+
+   max_top_level_cells: 12
+
+This is the number per dimension: a value of 12 means there will be 12x12x12
+top-level cells. There must be at least 3 top-level cells per dimension.
+
+The number of top-level cells should be set so that the number of particles
+per cell is not too large. This is particularly important when using MPI, as
+it defines the maximum size of a cell exchange and also the size of the
+non-local cells (these are used for cell interactions with local cells),
+both of which can have a large influence on memory use. The best advice is
+to at least scale this up when running on additional nodes.
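+
+As a rough illustration (the numbers here are an example only, not a
+recommendation): a periodic box containing :math:`512^3` particles run with
+``max_top_level_cells: 16`` has :math:`16^3 = 4096` top-level cells, i.e. an
+average of :math:`512^3 / 16^3 = 32^3 = 32768` particles per top-level cell;
+doubling the parameter to 32 reduces that average by a factor of 8.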
+
+The memory used for holding the task and task-link lists needs to be
+pre-allocated, but cannot be pre-calculated, so we have the two parameters:
+
+.. code:: YAML
+
+   tasks_per_cell: 0.0
+   links_per_tasks: 10
+
+which are guesses at the mean number of tasks per cell and the mean number
+of links per task. When set to ``0.0``, the ``tasks_per_cell`` value will be
+guessed conservatively; setting an explicit value can save memory. The way
+to get a better estimate is to run SWIFT with verbose reporting on
+(``--verbose=1``) and check for the lines that report the ``per cell`` or,
+with MPI, the ``maximum per cell`` values. This number can vary as the
+balance between MPI ranks does, so it is probably best to leave some
+headroom.
+
+If these limits are exceeded you should get an obvious error message.
+
+Finally, the parameter:
+
+.. code:: YAML
+
+   mpi_message_limit: 4096
+
+defines the size (in bytes) below which MPI communications will be sent
+using non-buffered calls. These should have lower latency, but whether and
+how that is honoured is an implementation question.
+
+
 .. _Parameters_domain_decomposition:
 
 Domain Decomposition:
 ---------------------
@@ -766,7 +862,7 @@ large numbers of particles can be exchanged between MPI ranks, so is best
 avoided.
 
 If you are using ParMETIS there additional ways that you can tune the
-repartition process.
+repartition process.
 
 METIS only offers the ability to create a partition from a graph, which means
 that each solution is independent of those that have already been made, that
@@ -774,12 +870,12 @@ can make the exchange of particles very large (although SWIFT attempts to
 minimize this), however, using ParMETIS we can use the existing partition to
 inform the new partition, this has two algorithms that are controlled using::
 
-   adaptive: 1
+   adaptive: 1
 
 which means use adaptive repartition, otherwise simple refinement. The
 adaptive algorithm is further controlled by the::
 
-   itr: 100
+   itr: 100
 
 parameter, which defines the ratio of inter node communication time to data
 redistribution time, in the range 0.00001 to 10000000.0. Lower values give
@@ -807,7 +903,7 @@ enabled). This file can then be used to replace the same file found in the
 `src/` directory and SWIFT should then be recompiled. Once you have that, you
 can use the parameter::
 
-   use_fixed_costs: 1
+   use_fixed_costs: 1
 
 to control whether they are used or not. If enabled these will be used to
 repartition after the second step, which will generally give as good a
@@ -835,4 +931,4 @@ Structure finding (VELOCIraptor)
    compare runs over the same physical time but with different numbers of
    snapshots. Snapshots at a given time would always have the same set of
    digits irrespective of the number of snapshots produced before.
-
+
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 39abf52c571bd65fb9904d7d66ffa662df8e0eec..43d6f4818f88d2451e639c60407dbb8a36f5a80a 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -73,7 +73,7 @@ Scheduler:
   cell_extra_gparts:   0         # (Optional) Number of spare gparts per top-level allocated at rebuild time for on-the-fly creation.
   cell_extra_sparts:   400       # (Optional) Number of spare sparts per top-level allocated at rebuild time for on-the-fly creation.
   max_top_level_cells: 12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
-  tasks_per_cell:      0         # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (means guess...).
+  tasks_per_cell:      0.0       # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (0.0 means guess).
   links_per_tasks:     10        # (Optional) The average number of links per tasks (before adding the communication tasks). If not large enough the simulation will fail (means guess...). Defaults to 10.
   mpi_message_limit:   4096      # (Optional) Maximum MPI task message size to send non-buffered, KB.
diff --git a/src/collectgroup.c b/src/collectgroup.c
index 0b7b419b565612149fd2b295116b37aa65aa01e9..ddf3e35d945fd8b07cc927d8ba383963c7558cd2 100644
--- a/src/collectgroup.c
+++ b/src/collectgroup.c
@@ -41,6 +41,9 @@ struct mpicollectgroup1 {
   integertime_t ti_hydro_end_min;
   integertime_t ti_gravity_end_min;
   int forcerebuild;
+  long long total_nr_cells;
+  long long total_nr_tasks;
+  float tasks_per_cell_max;
 };
 
 /* Forward declarations. */
@@ -93,6 +96,9 @@ void collectgroup1_apply(struct collectgroup1 *grp1, struct engine *e) {
   e->nr_inhibited_gparts = grp1->g_inhibited;
   e->nr_inhibited_sparts = grp1->s_inhibited;
   e->forcerebuild = grp1->forcerebuild;
+  e->total_nr_cells = grp1->total_nr_cells;
+  e->total_nr_tasks = grp1->total_nr_tasks;
+  e->tasks_per_cell_max = grp1->tasks_per_cell_max;
 }
 
 /**
@@ -101,37 +107,39 @@ void collectgroup1_apply(struct collectgroup1 *grp1, struct engine *e) {
  * @param grp1 The #collectgroup1 to initialise
  * @param updated the number of updated hydro particles on this node this step.
  * @param g_updated the number of updated gravity particles on this node this
- * step.
+ * step.
  * @param s_updated the number of updated star particles on this node this step.
  * @param inhibited the number of inhibited hydro particles on this node this
- * step.
+ * step.
  * @param g_inhibited the number of inhibited gravity particles on this node
- * this step.
+ * this step.
  * @param s_inhibited the number of inhibited star particles on this node this
- * step.
+ * step.
  * @param ti_hydro_end_min the minimum end time for next hydro time step after
- * this step.
+ * this step.
  * @param ti_hydro_end_max the maximum end time for next hydro time step after
- * this step.
+ * this step.
  * @param ti_hydro_beg_max the maximum begin time for next hydro time step after
- * this step.
+ * this step.
  * @param ti_gravity_end_min the minimum end time for next gravity time step
- * after this step.
+ * after this step.
  * @param ti_gravity_end_max the maximum end time for next gravity time step
- * after this step.
+ * after this step.
  * @param ti_gravity_beg_max the maximum begin time for next gravity time step
- * after this step.
+ * after this step.
  * @param forcerebuild whether a rebuild is required after this step.
+ * @param total_nr_cells the total number of cells on this rank.
+ * @param total_nr_tasks the total number of tasks on this rank.
+ * @param tasks_per_cell the actual number of tasks per cell on this rank.
*/ -void collectgroup1_init(struct collectgroup1 *grp1, size_t updated, - size_t g_updated, size_t s_updated, size_t inhibited, - size_t g_inhibited, size_t s_inhibited, - integertime_t ti_hydro_end_min, - integertime_t ti_hydro_end_max, - integertime_t ti_hydro_beg_max, - integertime_t ti_gravity_end_min, - integertime_t ti_gravity_end_max, - integertime_t ti_gravity_beg_max, int forcerebuild) { +void collectgroup1_init( + struct collectgroup1 *grp1, size_t updated, size_t g_updated, + size_t s_updated, size_t inhibited, size_t g_inhibited, size_t s_inhibited, + integertime_t ti_hydro_end_min, integertime_t ti_hydro_end_max, + integertime_t ti_hydro_beg_max, integertime_t ti_gravity_end_min, + integertime_t ti_gravity_end_max, integertime_t ti_gravity_beg_max, + int forcerebuild, long long total_nr_cells, long long total_nr_tasks, + float tasks_per_cell) { grp1->updated = updated; grp1->g_updated = g_updated; @@ -146,6 +154,9 @@ void collectgroup1_init(struct collectgroup1 *grp1, size_t updated, grp1->ti_gravity_end_max = ti_gravity_end_max; grp1->ti_gravity_beg_max = ti_gravity_beg_max; grp1->forcerebuild = forcerebuild; + grp1->total_nr_cells = total_nr_cells; + grp1->total_nr_tasks = total_nr_tasks; + grp1->tasks_per_cell_max = tasks_per_cell; } /** @@ -171,6 +182,9 @@ void collectgroup1_reduce(struct collectgroup1 *grp1) { mpigrp11.ti_hydro_end_min = grp1->ti_hydro_end_min; mpigrp11.ti_gravity_end_min = grp1->ti_gravity_end_min; mpigrp11.forcerebuild = grp1->forcerebuild; + mpigrp11.total_nr_cells = grp1->total_nr_cells; + mpigrp11.total_nr_tasks = grp1->total_nr_tasks; + mpigrp11.tasks_per_cell_max = grp1->tasks_per_cell_max; struct mpicollectgroup1 mpigrp12; if (MPI_Allreduce(&mpigrp11, &mpigrp12, 1, mpicollectgroup1_type, @@ -187,6 +201,9 @@ void collectgroup1_reduce(struct collectgroup1 *grp1) { grp1->ti_hydro_end_min = mpigrp12.ti_hydro_end_min; grp1->ti_gravity_end_min = mpigrp12.ti_gravity_end_min; grp1->forcerebuild = mpigrp12.forcerebuild; + grp1->total_nr_cells = mpigrp12.total_nr_cells; + grp1->total_nr_tasks = mpigrp12.total_nr_tasks; + grp1->tasks_per_cell_max = mpigrp12.tasks_per_cell_max; #endif } @@ -221,6 +238,14 @@ static void doreduce1(struct mpicollectgroup1 *mpigrp11, /* Everyone must agree to not rebuild. */ if (mpigrp11->forcerebuild || mpigrp12->forcerebuild) mpigrp11->forcerebuild = 1; + + /* Totals of all tasks and cells. */ + mpigrp11->total_nr_cells += mpigrp12->total_nr_cells; + mpigrp11->total_nr_tasks += mpigrp12->total_nr_tasks; + + /* Maximum value of tasks_per_cell. */ + mpigrp11->tasks_per_cell_max = + max(mpigrp11->tasks_per_cell_max, mpigrp12->tasks_per_cell_max); } /** diff --git a/src/collectgroup.h b/src/collectgroup.h index b6e8769ac993cc023ae402cdfc4b0169406f6181..3e430b58db05b563f96149d1ae21039444a03640 100644 --- a/src/collectgroup.h +++ b/src/collectgroup.h @@ -46,19 +46,25 @@ struct collectgroup1 { /* Force the engine to rebuild? */ int forcerebuild; + + /* Totals of cells and tasks. */ + long long total_nr_cells; + long long total_nr_tasks; + + /* Maximum value of actual tasks per cell across all ranks. 
 */
+  float tasks_per_cell_max;
 };
 
 void collectgroup_init(void);
 void collectgroup1_apply(struct collectgroup1 *grp1, struct engine *e);
-void collectgroup1_init(struct collectgroup1 *grp1, size_t updated,
-                        size_t g_updated, size_t s_updated, size_t inhibited,
-                        size_t g_inhibited, size_t s_inhibited,
-                        integertime_t ti_hydro_end_min,
-                        integertime_t ti_hydro_end_max,
-                        integertime_t ti_hydro_beg_max,
-                        integertime_t ti_gravity_end_min,
-                        integertime_t ti_gravity_end_max,
-                        integertime_t ti_gravity_beg_max, int forcerebuild);
+void collectgroup1_init(
+    struct collectgroup1 *grp1, size_t updated, size_t g_updated,
+    size_t s_updated, size_t inhibited, size_t g_inhibited, size_t s_inhibited,
+    integertime_t ti_hydro_end_min, integertime_t ti_hydro_end_max,
+    integertime_t ti_hydro_beg_max, integertime_t ti_gravity_end_min,
+    integertime_t ti_gravity_end_max, integertime_t ti_gravity_beg_max,
+    int forcerebuild, long long total_nr_cells, long long total_nr_tasks,
+    float tasks_per_cell);
 void collectgroup1_reduce(struct collectgroup1 *grp1);
 
 #endif /* SWIFT_COLLECTGROUP_H */
diff --git a/src/engine.c b/src/engine.c
index 179162cf18e1e4c5a3bc58e25d7617642644bd1f..bc243bac7bee9d862071c8487134e2a3b7902902 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -1904,6 +1904,35 @@ void engine_print_task_counts(const struct engine *e) {
   const int nr_tasks = sched->nr_tasks;
   const struct task *const tasks = sched->tasks;
 
+  /* Global tasks and cells when using MPI. */
+#ifdef WITH_MPI
+  if (e->nodeID == 0 && e->total_nr_tasks > 0)
+    printf(
+        "[%04i] %s engine_print_task_counts: System total: %lld,"
+        " no. cells: %lld\n",
+        e->nodeID, clocks_get_timesincestart(), e->total_nr_tasks,
+        e->total_nr_cells);
+  fflush(stdout);
+#endif
+
+  /* Report the value that can be used to estimate Scheduler:tasks_per_cell. */
+  float tasks_per_cell = (float)nr_tasks / (float)e->s->tot_cells;
+
+#ifdef WITH_MPI
+  message("Total = %d (per cell = %.2f)", nr_tasks, tasks_per_cell);
+
+  /* And the system maximum on rank 0, only after the first step, increased
+   * by our margin to allow for some variation in repartitioning. */
+  if (e->nodeID == 0 && e->total_nr_tasks > 0) {
+    message("Total = %d (maximum per cell = %.2f)", nr_tasks,
+            e->tasks_per_cell_max * engine_tasks_per_cell_margin);
+  }
+
+#else
+  message("Total = %d (per cell = %.2f)", nr_tasks, tasks_per_cell);
+#endif
+  fflush(stdout);
+
   /* Count and print the number of each task type. */
   int counts[task_type_count + 1];
   for (int k = 0; k <= task_type_count; k++) counts[k] = 0;
@@ -1913,8 +1942,7 @@
     else
       counts[(int)tasks[k].type] += 1;
   }
-  message("Total = %d (per cell = %d)", nr_tasks,
-          (int)ceil((double)nr_tasks / e->s->tot_cells));
+
 #ifdef WITH_MPI
   printf("[%04i] %s engine_print_task_counts: task counts are [ %s=%i",
          e->nodeID, clocks_get_timesincestart(), taskID_names[0], counts[0]);
@@ -1939,7 +1967,7 @@
 * @brief if necessary, estimate the number of tasks required given
 * the current tasks in use and the numbers of cells.
 *
- * If e->tasks_per_cell is set greater than 0 then that value is used
+ * If e->tasks_per_cell is set greater than 0.0 then that value is used
 * as the estimate of the average number of tasks per cell,
 * otherwise we attempt an estimate.
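+ *
+ * For example (illustrative values, not SWIFT defaults): with 1000 cells
+ * and tasks_per_cell set to 9.5, the explicit-value branch below returns
+ * (int)(1000 * 9.5f) = 9500 tasks.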
* @@ -1949,8 +1977,13 @@ void engine_print_task_counts(const struct engine *e) { */ int engine_estimate_nr_tasks(const struct engine *e) { - int tasks_per_cell = e->tasks_per_cell; - if (tasks_per_cell > 0) return e->s->tot_cells * tasks_per_cell; + float tasks_per_cell = e->tasks_per_cell; + if (tasks_per_cell > 0.0f) { + if (e->verbose) + message("tasks per cell given as: %.2f, so maximum tasks: %d", + e->tasks_per_cell, (int)(e->s->tot_cells * tasks_per_cell)); + return (int)(e->s->tot_cells * tasks_per_cell); + } /* Our guess differs depending on the types of tasks we are using, but we * basically use a formula <n1>*ntopcells + <n2>*(totcells - ntopcells). @@ -2055,15 +2088,15 @@ int engine_estimate_nr_tasks(const struct engine *e) { int ncells = e->s->tot_cells; #endif - double ntasks = n1 * ntop + n2 * (ncells - ntop); + float ntasks = n1 * ntop + n2 * (ncells - ntop); if (ncells > 0) tasks_per_cell = ceil(ntasks / ncells); - if (tasks_per_cell < 1.0) tasks_per_cell = 1.0; + if (tasks_per_cell < 1.0f) tasks_per_cell = 1.0f; if (e->verbose) - message("tasks per cell estimated as: %d, maximum tasks: %d", - tasks_per_cell, ncells * tasks_per_cell); + message("tasks per cell estimated as: %.2f, maximum tasks: %d", + tasks_per_cell, (int)(ncells * tasks_per_cell)); - return ncells * tasks_per_cell; + return (int)(ncells * tasks_per_cell); } /** @@ -2513,7 +2546,9 @@ void engine_collect_end_of_step(struct engine *e, int apply) { &e->collect_group1, data.updated, data.g_updated, data.s_updated, data.inhibited, data.g_inhibited, data.s_inhibited, data.ti_hydro_end_min, data.ti_hydro_end_max, data.ti_hydro_beg_max, data.ti_gravity_end_min, - data.ti_gravity_end_max, data.ti_gravity_beg_max, e->forcerebuild); + data.ti_gravity_end_max, data.ti_gravity_beg_max, e->forcerebuild, + e->s->tot_cells, e->sched.nr_tasks, + (float)e->sched.nr_tasks / (float)e->s->tot_cells); /* Aggregate collective data from the different nodes for this step. */ #ifdef WITH_MPI @@ -4182,6 +4217,8 @@ void engine_init(struct engine *e, struct space *s, struct swift_params *params, e->cputime_last_step = 0; e->last_repartition = 0; #endif + e->total_nr_cells = 0; + e->total_nr_tasks = 0; #if defined(WITH_LOGGER) e->logger = (struct logger *)malloc(sizeof(struct logger)); @@ -4714,8 +4751,10 @@ void engine_config(int restart, struct engine *e, struct swift_params *params, * On restart this number cannot be estimated (no cells yet), so we recover * from the end of the dumped run. Can be changed on restart. */ e->tasks_per_cell = - parser_get_opt_param_int(params, "Scheduler:tasks_per_cell", 0); - int maxtasks = 0; + parser_get_opt_param_float(params, "Scheduler:tasks_per_cell", 0.0); + e->tasks_per_cell_max = 0.0f; + + float maxtasks = 0; if (restart) maxtasks = e->restart_max_tasks; else diff --git a/src/engine.h b/src/engine.h index 938213d8c5b8889266e74e38f6fb2c00baa1d256..0e0e9895a8b0d1928e48c52ad760d2303447c24d 100644 --- a/src/engine.h +++ b/src/engine.h @@ -107,6 +107,7 @@ enum engine_step_properties { #define engine_default_timesteps_file_name "timesteps" #define engine_max_parts_per_ghost 1000 #define engine_max_sparts_per_ghost 1000 +#define engine_tasks_per_cell_margin 1.2 /** * @brief The rank of the engine as a global variable (for messages). @@ -211,7 +212,13 @@ struct engine { /* Total numbers of particles in the system. */ long long total_nr_parts, total_nr_gparts, total_nr_sparts; - /* The total number of inhibted particles in the system. 
 */
+  /* Total number of cells (top-level and sub-cells) in the system. */
+  long long total_nr_cells;
+
+  /* Total number of tasks in the system. */
+  long long total_nr_tasks;
+
+  /* The total number of inhibited particles in the system. */
   long long nr_inhibited_parts, nr_inhibited_gparts, nr_inhibited_sparts;
 
 #ifdef SWIFT_DEBUG_CHECKS
@@ -326,8 +333,9 @@ struct engine {
   size_t nr_links, size_links;
 
   /* Average number of tasks per cell. Used to estimate the sizes
-   * of the various task arrays. */
-  size_t tasks_per_cell;
+   * of the various task arrays. Also the maximum seen across all ranks. */
+  float tasks_per_cell;
+  float tasks_per_cell_max;
 
   /* Average number of links per tasks. This number is used before
      the creation of communication tasks so needs to be large enough. */