diff --git a/examples/main.c b/examples/main.c
index e884ea5b6bba907819e189b842f558302e4ca9af..8bd591ccbd1c2fd860fa8ae2321d2e34132b287f 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -120,6 +120,10 @@ int main(int argc, char *argv[]) {
     error("MPI_Comm_size failed with error %i.", res);
   if ((res = MPI_Comm_rank(MPI_COMM_WORLD, &myrank)) != MPI_SUCCESS)
     error("Call to MPI_Comm_rank failed with error %i.", res);
+
+  /* Make sure messages are stamped with the correct rank. */
+  engine_rank = myrank;
+
   if ((res = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN)) !=
       MPI_SUCCESS)
     error("Call to MPI_Comm_set_errhandler failed with error %i.", res);
@@ -131,6 +135,7 @@ int main(int argc, char *argv[]) {
     message("WARNING: you should use the non-MPI version of this program.");
   }
   fflush(stdout);
+
 #endif
 
 /* Let's pin the main thread */
@@ -572,8 +577,8 @@ int main(int argc, char *argv[]) {
           fprintf(file_thread, " %03i 0 0 0 0 %lli %lli 0 0 0 0 %lli\n", myrank,
                   e.tic_step, e.toc_step, cpufreq);
           int count = 0;
-          for (int l = 0; l < e.sched.nr_tasks; l++)
-            if (!e.sched.tasks[l].skip && !e.sched.tasks[l].implicit) {
+          for (int l = 0; l < e.sched.nr_tasks; l++) {
+            if (!e.sched.tasks[l].implicit && e.sched.tasks[l].toc != 0) {
               fprintf(
                   file_thread, " %03i %i %i %i %i %lli %lli %i %i %i %i %i\n",
                   myrank, e.sched.tasks[l].rid, e.sched.tasks[l].type,
@@ -588,11 +593,10 @@ int main(int argc, char *argv[]) {
                   (e.sched.tasks[l].cj != NULL) ? e.sched.tasks[l].cj->gcount
                                                 : 0,
                   e.sched.tasks[l].flags);
-              fflush(stdout);
-              count++;
             }
-          message("rank %d counted %d tasks", myrank, count);
-
+            fflush(stdout);
+            count++;
+          }
           fclose(file_thread);
         }
 
@@ -608,8 +612,8 @@ int main(int argc, char *argv[]) {
       /* Add some information to help with the plots */
       fprintf(file_thread, " %i %i %i %i %lli %lli %i %i %i %lli\n", -2, -1, -1,
               1, e.tic_step, e.toc_step, 0, 0, 0, cpufreq);
-      for (int l = 0; l < e.sched.nr_tasks; l++)
-        if (!e.sched.tasks[l].skip && !e.sched.tasks[l].implicit)
+      for (int l = 0; l < e.sched.nr_tasks; l++) {
+        if (!e.sched.tasks[l].implicit && e.sched.tasks[l].toc != 0) {
           fprintf(
               file_thread, " %i %i %i %i %lli %lli %i %i %i %i\n",
               e.sched.tasks[l].rid, e.sched.tasks[l].type,
@@ -619,6 +623,8 @@ int main(int argc, char *argv[]) {
               (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->count,
               (e.sched.tasks[l].ci == NULL) ? 0 : e.sched.tasks[l].ci->gcount,
               (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->gcount);
+        }
+      }
       fclose(file_thread);
 #endif
     }
diff --git a/src/Makefile.am b/src/Makefile.am
index 37c88f1f9cc5facd61beb5f840fb83f7fcd92d8f..30afe6cf16960fe5b13f7a0bf6cb0cae86eaa43b 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -44,7 +44,7 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
     common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h parser.h \
     physical_constants.h physical_constants_cgs.h potential.h version.h \
     hydro_properties.h riemann.h threadpool.h cooling.h cooling_struct.h sourceterms.h \
-    sourceterms_struct.h
+    sourceterms_struct.h statistics.h
 
 
 # Common source files
@@ -53,7 +53,8 @@ AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
     units.c common_io.c single_io.c multipole.c version.c map.c \
     kernel_hydro.c tools.c part.c partition.c clocks.c parser.c \
     physical_constants.c potential.c hydro_properties.c \
-    runner_doiact_fft.c threadpool.c cooling.c sourceterms.c
+    runner_doiact_fft.c threadpool.c cooling.c sourceterms.c \
+    statistics.c
 
 # Include files for distribution, not installation.
 nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h \
diff --git a/src/cell.c b/src/cell.c
index c694ef9abff1dd183ec135765b3d227f55116ead..3e3a8e6cc537f948a809c1a3ebf3cb18194fedd1 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -52,6 +52,7 @@
 #include "gravity.h"
 #include "hydro.h"
 #include "hydro_properties.h"
+#include "scheduler.h"
 #include "space.h"
 #include "timers.h"
 
@@ -763,6 +764,9 @@ void cell_clean_links(struct cell *c, void *data) {
 
   c->force = NULL;
   c->nr_force = 0;
+
+  c->grav = NULL;
+  c->nr_grav = 0;
 }
 
 /**
@@ -893,6 +897,120 @@ int cell_is_drift_needed(struct cell *c, int ti_current) {
   return 0;
 }
 
+/**
+ * @brief Un-skips all the tasks associated with a given cell and checks
+ * if the space needs to be rebuilt.
+ *
+ * @param c The #cell.
+ * @param s The #scheduler.
+ *
+ * @return 1 if the space needs rebuilding, 0 otherwise.
+ */
+int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
+
+  /* Un-skip the density tasks involved with this cell. */
+  for (struct link *l = c->density; l != NULL; l = l->next) {
+    struct task *t = l->t;
+    const struct cell *ci = t->ci;
+    const struct cell *cj = t->cj;
+    scheduler_activate(s, t);
+
+    /* Set the correct sorting flags */
+    if (t->type == task_type_pair) {
+      if (!(ci->sorted & (1 << t->flags))) {
+        atomic_or(&ci->sorts->flags, (1 << t->flags));
+        scheduler_activate(s, ci->sorts);
+      }
+      if (!(cj->sorted & (1 << t->flags))) {
+        atomic_or(&cj->sorts->flags, (1 << t->flags));
+        scheduler_activate(s, cj->sorts);
+      }
+    }
+
+    /* Check whether there was too much particle motion */
+    if (t->type == task_type_pair || t->type == task_type_sub_pair) {
+      if (t->tight &&
+          (max(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin ||
+           ci->dx_max > space_maxreldx * ci->h_max ||
+           cj->dx_max > space_maxreldx * cj->h_max))
+        return 1;
+
+#ifdef WITH_MPI
+      /* Activate the send/recv flags. */
+      if (ci->nodeID != engine_rank) {
+
+        /* Activate the tasks to recv foreign cell ci's data. */
+        scheduler_activate(s, ci->recv_xv);
+        scheduler_activate(s, ci->recv_rho);
+        scheduler_activate(s, ci->recv_ti);
+
+        /* Look for the local cell cj's send tasks. */
+        struct link *l = NULL;
+        for (l = cj->send_xv; l != NULL && l->t->cj->nodeID != ci->nodeID;
+             l = l->next)
+          ;
+        if (l == NULL) error("Missing link to send_xv task.");
+        scheduler_activate(s, l->t);
+
+        for (l = cj->send_rho; l != NULL && l->t->cj->nodeID != ci->nodeID;
+             l = l->next)
+          ;
+        if (l == NULL) error("Missing link to send_rho task.");
+        scheduler_activate(s, l->t);
+
+        for (l = cj->send_ti; l != NULL && l->t->cj->nodeID != ci->nodeID;
+             l = l->next)
+          ;
+        if (l == NULL) error("Missing link to send_ti task.");
+        scheduler_activate(s, l->t);
+
+      } else if (cj->nodeID != engine_rank) {
+
+        /* Activate the tasks to recv foreign cell cj's data. */
+        scheduler_activate(s, cj->recv_xv);
+        scheduler_activate(s, cj->recv_rho);
+        scheduler_activate(s, cj->recv_ti);
+        /* Look for the local cell ci's send tasks. */
+        struct link *l = NULL;
+        for (l = ci->send_xv; l != NULL && l->t->cj->nodeID != cj->nodeID;
+             l = l->next)
+          ;
+        if (l == NULL) error("Missing link to send_xv task.");
+        scheduler_activate(s, l->t);
+
+        for (l = ci->send_rho; l != NULL && l->t->cj->nodeID != cj->nodeID;
+             l = l->next)
+          ;
+        if (l == NULL) error("Missing link to send_rho task.");
+        scheduler_activate(s, l->t);
+
+        for (l = ci->send_ti; l != NULL && l->t->cj->nodeID != cj->nodeID;
+             l = l->next)
+          ;
+        if (l == NULL) error("Missing link to send_ti task.");
+        scheduler_activate(s, l->t);
+      }
+#endif
+    }
+  }
+
+  /* Unskip all the other task types. */
+  for (struct link *l = c->gradient; l != NULL; l = l->next)
+    scheduler_activate(s, l->t);
+  for (struct link *l = c->force; l != NULL; l = l->next)
+    scheduler_activate(s, l->t);
+  for (struct link *l = c->grav; l != NULL; l = l->next)
+    scheduler_activate(s, l->t);
+  if (c->extra_ghost != NULL) scheduler_activate(s, c->extra_ghost);
+  if (c->ghost != NULL) scheduler_activate(s, c->ghost);
+  if (c->init != NULL) scheduler_activate(s, c->init);
+  if (c->kick != NULL) scheduler_activate(s, c->kick);
+  if (c->cooling != NULL) scheduler_activate(s, c->cooling);
+  if (c->sourceterms != NULL) scheduler_activate(s, c->sourceterms);
+
+  return 0;
+}
+
 /**
  * @brief Set the super-cell pointers for all cells in a hierarchy.
  *
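
A minimal sketch of how the new `cell_unskip_tasks()` is meant to be driven: loop over the top-level cells of the space and only touch cells that are active at the current integer time. The real call site is `runner_do_drift()` further down in this diff; the helper `activate_active_cells()` below is purely illustrative and assumes the usual SWIFT headers (cell.h, engine.h, space.h).

```c
/* Illustrative only: un-skip the tasks of every active top-level cell and
 * request a rebuild if too much particle motion was detected. */
void activate_active_cells(struct engine *e) {
  struct space *s = e->s;
  for (int k = 0; k < s->nr_cells; k++) {
    struct cell *c = &s->cells_top[k];
    /* Only cells whose earliest end-of-timestep is now are active. */
    if (c->ti_end_min == e->ti_current)
      if (cell_unskip_tasks(c, &e->sched)) e->forcerebuild = 1;
  }
}
```
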
diff --git a/src/cell.h b/src/cell.h
index 6bdadfb6e543882ee5cc98499f6ac1c4205b0873..dfe0261f628799980111f4bedb8fcfdad90ef043 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -39,6 +39,7 @@
 
 /* Avoid cyclic inclusions */
 struct space;
+struct scheduler;
 
 /* Max tag size set to 2^29 to take into account some MPI implementations
  * that use 2^31 as the upper bound on MPI tags and the fact that
@@ -158,9 +159,6 @@ struct cell {
   /* Tasks for gravity tree. */
   struct task *grav_up, *grav_down;
 
-  /* Task for external gravity */
-  struct task *grav_external;
-
   /* Task for cooling */
   struct task *cooling;
 
@@ -179,12 +177,6 @@ struct cell {
   /* ID of the previous owner, e.g. runner. */
   int owner;
 
-  /* Momentum of particles in cell. */
-  double mom[3], ang_mom[3];
-
-  /* Mass, potential, internal  and kinetic energy of particles in this cell. */
-  double mass, e_pot_self, e_pot_ext, e_pot, e_int, e_kin, e_rad, entropy;
-
   /* Number of particles updated in this cell. */
   int updated, g_updated;
 
@@ -236,6 +228,7 @@ int cell_are_neighbours(const struct cell *restrict ci,
 void cell_check_multipole(struct cell *c, void *data);
 void cell_clean(struct cell *c);
 int cell_is_drift_needed(struct cell *c, int ti_current);
+int cell_unskip_tasks(struct cell *c, struct scheduler *s);
 void cell_set_super(struct cell *c, struct cell *super);
 
 #endif /* SWIFT_CELL_H */
diff --git a/src/engine.c b/src/engine.c
index 1323cd7c7d30f5e24b5faee1f2fcb9c0ea85e592..b841cdd299dfb27d302c23ba786c43e6d3d26389 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -63,6 +63,7 @@
 #include "runner.h"
 #include "serial_io.h"
 #include "single_io.h"
+#include "statistics.h"
 #include "timers.h"
 #include "tools.h"
 #include "units.h"
@@ -1249,7 +1250,7 @@ void engine_make_external_gravity_tasks(struct engine *e) {
     /* Is that neighbour local ? */
     if (ci->nodeID != nodeID) continue;
 
-    /* If the cells is local build a self-interaction */
+    /* If the cell is local, build a self-interaction */
     scheduler_addtask(sched, task_type_self, task_subtype_external_grav, 0, 0,
                       ci, NULL, 0);
   }
@@ -1330,59 +1331,101 @@ void engine_make_hydroloop_tasks(struct engine *e) {
 /**
  * @brief Counts the tasks associated with one cell and constructs the links
  *
- * For each hydrodynamic task, construct the links with the corresponding cell.
- * Similarly, construct the dependencies for all the sorting tasks.
+ * For each hydrodynamic and gravity task, construct the links with
+ * the corresponding cells. Similarly, construct the dependencies for
+ * all the sorting tasks.
  *
  * @param e The #engine.
  */
 void engine_count_and_link_tasks(struct engine *e) {
 
-  struct scheduler *sched = &e->sched;
-
-  for (int ind = 0; ind < sched->nr_tasks; ind++) {
+  struct scheduler *const sched = &e->sched;
+  const int nr_tasks = sched->nr_tasks;
 
-    struct task *t = &sched->tasks[ind];
+  for (int ind = 0; ind < nr_tasks; ind++) {
 
-    if (t->skip) continue;
+    struct task *const t = &sched->tasks[ind];
+    struct cell *const ci = t->ci;
+    struct cell *const cj = t->cj;
 
     /* Link sort tasks together. */
-    if (t->type == task_type_sort && t->ci->split)
+    if (t->type == task_type_sort && ci->split)
       for (int j = 0; j < 8; j++)
-        if (t->ci->progeny[j] != NULL && t->ci->progeny[j]->sorts != NULL) {
-          t->ci->progeny[j]->sorts->skip = 0;
-          scheduler_addunlock(sched, t->ci->progeny[j]->sorts, t);
+        if (ci->progeny[j] != NULL && ci->progeny[j]->sorts != NULL) {
+          scheduler_addunlock(sched, ci->progeny[j]->sorts, t);
         }
 
-    /* Link density tasks to cells. */
+    /* Link self tasks to cells. */
     if (t->type == task_type_self) {
-      atomic_inc(&t->ci->nr_tasks);
+      atomic_inc(&ci->nr_tasks);
       if (t->subtype == task_subtype_density) {
-        engine_addlink(e, &t->ci->density, t);
-        atomic_inc(&t->ci->nr_density);
+        engine_addlink(e, &ci->density, t);
+        atomic_inc(&ci->nr_density);
       }
+      if (t->subtype == task_subtype_grav) {
+        engine_addlink(e, &ci->grav, t);
+        atomic_inc(&ci->nr_grav);
+      }
+      if (t->subtype == task_subtype_external_grav) {
+        engine_addlink(e, &ci->grav, t);
+        atomic_inc(&ci->nr_grav);
+      }
+
+      /* Link pair tasks to cells. */
     } else if (t->type == task_type_pair) {
-      atomic_inc(&t->ci->nr_tasks);
-      atomic_inc(&t->cj->nr_tasks);
+      atomic_inc(&ci->nr_tasks);
+      atomic_inc(&cj->nr_tasks);
       if (t->subtype == task_subtype_density) {
-        engine_addlink(e, &t->ci->density, t);
-        atomic_inc(&t->ci->nr_density);
-        engine_addlink(e, &t->cj->density, t);
-        atomic_inc(&t->cj->nr_density);
+        engine_addlink(e, &ci->density, t);
+        atomic_inc(&ci->nr_density);
+        engine_addlink(e, &cj->density, t);
+        atomic_inc(&cj->nr_density);
       }
+      if (t->subtype == task_subtype_grav) {
+        engine_addlink(e, &ci->grav, t);
+        atomic_inc(&ci->nr_grav);
+        engine_addlink(e, &cj->grav, t);
+        atomic_inc(&cj->nr_grav);
+      }
+
+      /* Link sub-self tasks to cells. */
     } else if (t->type == task_type_sub_self) {
-      atomic_inc(&t->ci->nr_tasks);
+      atomic_inc(&ci->nr_tasks);
       if (t->subtype == task_subtype_density) {
-        engine_addlink(e, &t->ci->density, t);
-        atomic_inc(&t->ci->nr_density);
+        engine_addlink(e, &ci->density, t);
+        atomic_inc(&ci->nr_density);
+      }
+      if (t->subtype == task_subtype_grav) {
+        engine_addlink(e, &ci->grav, t);
+        atomic_inc(&ci->nr_grav);
       }
+      if (t->subtype == task_subtype_external_grav) {
+        engine_addlink(e, &ci->grav, t);
+        atomic_inc(&ci->nr_grav);
+      }
+
+      /* Link sub-pair tasks to cells. */
     } else if (t->type == task_type_sub_pair) {
-      atomic_inc(&t->ci->nr_tasks);
-      atomic_inc(&t->cj->nr_tasks);
+      atomic_inc(&ci->nr_tasks);
+      atomic_inc(&cj->nr_tasks);
       if (t->subtype == task_subtype_density) {
-        engine_addlink(e, &t->ci->density, t);
-        atomic_inc(&t->ci->nr_density);
-        engine_addlink(e, &t->cj->density, t);
-        atomic_inc(&t->cj->nr_density);
+        engine_addlink(e, &ci->density, t);
+        atomic_inc(&ci->nr_density);
+        engine_addlink(e, &cj->density, t);
+        atomic_inc(&cj->nr_density);
+      }
+      if (t->subtype == task_subtype_grav) {
+        engine_addlink(e, &ci->grav, t);
+        atomic_inc(&ci->nr_grav);
+        engine_addlink(e, &cj->grav, t);
+        atomic_inc(&cj->nr_grav);
+      }
+      if (t->subtype == task_subtype_external_grav) {
+        error("Found a sub-pair/external-gravity task...");
+        engine_addlink(e, &ci->grav, t);
+        atomic_inc(&ci->nr_grav);
+        engine_addlink(e, &cj->grav, t);
+        atomic_inc(&cj->nr_grav);
       }
     }
   }
@@ -1449,9 +1492,6 @@ void engine_link_gravity_tasks(struct engine *e) {
     /* Get a pointer to the task. */
     struct task *t = &sched->tasks[k];
 
-    /* Skip? */
-    if (t->skip) continue;
-
     /* Multipole construction */
     if (t->type == task_type_grav_up) {
       scheduler_addunlock(sched, t, gather);
@@ -1599,9 +1639,6 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
   for (int ind = 0; ind < nr_tasks; ind++) {
     struct task *t = &sched->tasks[ind];
 
-    /* Skip? */
-    if (t->skip) continue;
-
     /* Self-interaction? */
     if (t->type == task_type_self && t->subtype == task_subtype_density) {
 
@@ -1874,14 +1911,16 @@ void engine_maketasks(struct engine *e) {
   /* Split the tasks. */
   scheduler_splittasks(sched);
 
-  /* Allocate the list of cell-task links. The maximum number of links
-     is the number of cells (s->tot_cells) times the number of neighbours (27)
-     times the number of interaction types (2, density and force). */
+  /* Allocate the list of cell-task links. The maximum number of links is the
+   * number of cells (s->tot_cells) times the number of links each cell needs:
+   * 26 neighbours times the number of pair interaction types (density, force,
+   * grav, plus gradient with EXTRA_HYDRO_LOOP), plus the 4 self interaction
+   * types (density, force, grav, ext_grav). */
   if (e->links != NULL) free(e->links);
 #ifdef EXTRA_HYDRO_LOOP
-  e->size_links = s->tot_cells * 27 * 3;
+  e->size_links = s->tot_cells * (26 * 4 + 4);
 #else
-  e->size_links = s->tot_cells * 27 * 2;
+  e->size_links = s->tot_cells * (26 * 3 + 4);
 #endif
   if ((e->links = malloc(sizeof(struct link) * e->size_links)) == NULL)
     error("Failed to allocate cell-task links.");
@@ -1956,7 +1995,7 @@ void engine_maketasks(struct engine *e) {
 }
 
 /**
- * @brief Mark tasks to be skipped and set the sort flags accordingly.
+ * @brief Mark tasks to be un-skipped and set the sort flags accordingly.
  *        Threadpool mapper function for fixdt version.
  *
  * @param map_data pointer to the tasks
@@ -1967,11 +2006,15 @@ void engine_marktasks_fixdt_mapper(void *map_data, int num_elements,
                                    void *extra_data) {
   /* Unpack the arguments. */
   struct task *tasks = (struct task *)map_data;
-  int *rebuild_space = (int *)extra_data;
+  size_t *rebuild_space = &((size_t *)extra_data)[0];
+  struct scheduler *s = (struct scheduler *)(((size_t *)extra_data)[1]);
 
   for (int ind = 0; ind < num_elements; ind++) {
     struct task *t = &tasks[ind];
 
+    /* Un-skip all tasks (they are skipped by default). */
+    scheduler_activate(s, t);
+
     /* Pair? */
     if (t->type == task_type_pair || t->type == task_type_sub_pair) {
 
@@ -1998,28 +2041,7 @@ void engine_marktasks_fixdt_mapper(void *map_data, int num_elements,
 }
 
 /**
- * @brief Mark any sort tasks as initially skipped.
- *        Threadpool mapper function.
- *
- * @param map_data pointer to the tasks
- * @param num_elements number of tasks
- * @param extra_data unused
- */
-void engine_marktasks_sorts_mapper(void *map_data, int num_elements,
-                                   void *extra_data) {
-  /* Unpack the arguments. */
-  struct task *tasks = (struct task *)map_data;
-  for (int ind = 0; ind < num_elements; ind++) {
-    struct task *t = &tasks[ind];
-    if (t->type == task_type_sort) {
-      t->flags = 0;
-      t->skip = 1;
-    }
-  }
-}
-
-/**
- * @brief Mark tasks to be skipped and set the sort flags accordingly.
+ * @brief Mark tasks to be un-skipped and set the sort flags accordingly.
  *        Threadpool mapper function.
  *
  * @param map_data pointer to the tasks
@@ -2030,18 +2052,20 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
                              void *extra_data) {
   /* Unpack the arguments. */
   struct task *tasks = (struct task *)map_data;
-  const int ti_end = ((int *)extra_data)[0];
-  int *rebuild_space = &((int *)extra_data)[1];
+  const int ti_end = ((size_t *)extra_data)[0];
+  size_t *rebuild_space = &((size_t *)extra_data)[1];
+  struct scheduler *s = (struct scheduler *)(((size_t *)extra_data)[2]);
 
   for (int ind = 0; ind < num_elements; ind++) {
     struct task *t = &tasks[ind];
 
     /* Single-cell task? */
     if (t->type == task_type_self || t->type == task_type_ghost ||
-        t->type == task_type_sub_self) {
+        t->type == task_type_extra_ghost || t->type == task_type_cooling ||
+        t->type == task_type_sourceterms || t->type == task_type_sub_self) {
 
       /* Set this task's skip. */
-      t->skip = (t->ci->ti_end_min > ti_end);
+      if (t->ci->ti_end_min <= ti_end) scheduler_activate(s, t);
     }
 
     /* Pair? */
@@ -2058,19 +2082,24 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
            cj->dx_max > space_maxreldx * cj->h_max))
         *rebuild_space = 1;
 
-      /* Set this task's skip. */
-      if ((t->skip = (ci->ti_end_min > ti_end && cj->ti_end_min > ti_end)) == 1)
+      /* Set this task's skip, otherwise nothing to do. */
+      if (ci->ti_end_min <= ti_end || cj->ti_end_min <= ti_end)
+        scheduler_activate(s, t);
+      else
         continue;
 
+      /* If this is not a density task, we don't have to do any of the below. */
+      if (t->subtype != task_subtype_density) continue;
+
       /* Set the sort flags. */
-      if (t->type == task_type_pair && t->subtype != task_subtype_grav) {
+      if (t->type == task_type_pair) {
         if (!(ci->sorted & (1 << t->flags))) {
           atomic_or(&ci->sorts->flags, (1 << t->flags));
-          ci->sorts->skip = 0;
+          scheduler_activate(s, ci->sorts);
         }
         if (!(cj->sorted & (1 << t->flags))) {
           atomic_or(&cj->sorts->flags, (1 << t->flags));
-          cj->sorts->skip = 0;
+          scheduler_activate(s, cj->sorts);
         }
       }
 
@@ -2080,9 +2109,9 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
       if (ci->nodeID != engine_rank) {
 
         /* Activate the tasks to recv foreign cell ci's data. */
-        ci->recv_xv->skip = 0;
-        ci->recv_rho->skip = 0;
-        ci->recv_ti->skip = 0;
+        scheduler_activate(s, ci->recv_xv);
+        scheduler_activate(s, ci->recv_rho);
+        scheduler_activate(s, ci->recv_ti);
 
         /* Look for the local cell cj's send tasks. */
         struct link *l = NULL;
@@ -2090,45 +2119,46 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
              l = l->next)
           ;
         if (l == NULL) error("Missing link to send_xv task.");
-        l->t->skip = 0;
+        scheduler_activate(s, l->t);
 
         for (l = cj->send_rho; l != NULL && l->t->cj->nodeID != ci->nodeID;
              l = l->next)
           ;
         if (l == NULL) error("Missing link to send_rho task.");
-        l->t->skip = 0;
+        scheduler_activate(s, l->t);
 
         for (l = cj->send_ti; l != NULL && l->t->cj->nodeID != ci->nodeID;
              l = l->next)
           ;
         if (l == NULL) error("Missing link to send_ti task.");
-        l->t->skip = 0;
+        scheduler_activate(s, l->t);
 
       } else if (cj->nodeID != engine_rank) {
 
         /* Activate the tasks to recv foreign cell cj's data. */
-        cj->recv_xv->skip = 0;
-        cj->recv_rho->skip = 0;
-        cj->recv_ti->skip = 0;
+        scheduler_activate(s, cj->recv_xv);
+        scheduler_activate(s, cj->recv_rho);
+        scheduler_activate(s, cj->recv_ti);
+
         /* Look for the local cell ci's send tasks. */
         struct link *l = NULL;
         for (l = ci->send_xv; l != NULL && l->t->cj->nodeID != cj->nodeID;
              l = l->next)
           ;
         if (l == NULL) error("Missing link to send_xv task.");
-        l->t->skip = 0;
+        scheduler_activate(s, l->t);
 
         for (l = ci->send_rho; l != NULL && l->t->cj->nodeID != cj->nodeID;
              l = l->next)
           ;
         if (l == NULL) error("Missing link to send_rho task.");
-        l->t->skip = 0;
+        scheduler_activate(s, l->t);
 
         for (l = ci->send_ti; l != NULL && l->t->cj->nodeID != cj->nodeID;
              l = l->next)
           ;
         if (l == NULL) error("Missing link to send_ti task.");
-        l->t->skip = 0;
+        scheduler_activate(s, l->t);
       }
 
 #endif
@@ -2136,25 +2166,26 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
 
     /* Kick? */
     else if (t->type == task_type_kick) {
-      t->skip = (t->ci->ti_end_min > ti_end);
       t->ci->updated = 0;
       t->ci->g_updated = 0;
+      if (t->ci->ti_end_min <= ti_end) scheduler_activate(s, t);
     }
 
     /* Init? */
     else if (t->type == task_type_init) {
-      /* Set this task's skip. */
-      t->skip = (t->ci->ti_end_min > ti_end);
+      if (t->ci->ti_end_min <= ti_end) scheduler_activate(s, t);
     }
 
-    /* None? */
-    else if (t->type == task_type_none)
-      t->skip = 1;
+    /* Tasks with no cells should not be skipped? */
+    else if (t->type == task_type_grav_gather_m ||
+             t->type == task_type_grav_fft) {
+      scheduler_activate(s, t);
+    }
   }
 }
 
 /**
- * @brief Mark tasks to be skipped and set the sort flags accordingly.
+ * @brief Mark tasks to be un-skipped and set the sort flags accordingly.
  *
  * @return 1 if the space has to be rebuilt, 0 otherwise.
  */
@@ -2168,31 +2199,16 @@ int engine_marktasks(struct engine *e) {
   if (e->policy & engine_policy_fixdt) {
 
     /* Run through the tasks and mark as skip or not. */
+    size_t extra_data[2] = {rebuild_space, (size_t)&e->sched};
     threadpool_map(&e->threadpool, engine_marktasks_fixdt_mapper, s->tasks,
-                   s->nr_tasks, sizeof(struct task), 1000, &rebuild_space);
+                   s->nr_tasks, sizeof(struct task), 1000, extra_data);
     return rebuild_space;
 
     /* Multiple-timestep case */
   } else {
 
     /* Run through the tasks and mark as skip or not. */
-    int extra_data[2] = {e->ti_current, rebuild_space};
-    threadpool_map(&e->threadpool, engine_marktasks_sorts_mapper, s->tasks,
-                   s->nr_tasks, sizeof(struct task), 10000, NULL);
-
-#ifdef WITH_MPI
-    if (e->policy & engine_policy_mpi) {
-
-      /* Skip all sends and recvs, we will unmark if needed. */
-      for (int k = 0; k < s->nr_tasks; k++) {
-        struct task *t = &s->tasks[k];
-        if (t->type == task_type_send || t->type == task_type_recv) {
-          t->skip = 1;
-        }
-      }
-    }
-#endif
-
+    size_t extra_data[3] = {e->ti_current, rebuild_space, (size_t)&e->sched};
     threadpool_map(&e->threadpool, engine_marktasks_mapper, s->tasks,
                    s->nr_tasks, sizeof(struct task), 10000, extra_data);
     rebuild_space = extra_data[1];
@@ -2213,16 +2229,21 @@ int engine_marktasks(struct engine *e) {
  */
 void engine_print_task_counts(struct engine *e) {
 
-  struct scheduler *sched = &e->sched;
+  const ticks tic = getticks();
+  struct scheduler *const sched = &e->sched;
+  const int nr_tasks = sched->nr_tasks;
+  const struct task *const tasks = sched->tasks;
 
   /* Count and print the number of each task type. */
   int counts[task_type_count + 1];
   for (int k = 0; k <= task_type_count; k++) counts[k] = 0;
-  for (int k = 0; k < sched->nr_tasks; k++)
-    if (!sched->tasks[k].skip)
-      counts[(int)sched->tasks[k].type] += 1;
-    else
+  for (int k = 0; k < nr_tasks; k++) {
+    if (tasks[k].skip)
       counts[task_type_count] += 1;
+    else
+      counts[(int)tasks[k].type] += 1;
+  }
+  message("Total = %d", nr_tasks);
 #ifdef WITH_MPI
   printf("[%04i] %s engine_print_task_counts: task counts are [ %s=%i",
          e->nodeID, clocks_get_timesincestart(), taskID_names[0], counts[0]);
@@ -2236,6 +2257,10 @@ void engine_print_task_counts(struct engine *e) {
   fflush(stdout);
   message("nr_parts = %zu.", e->s->nr_parts);
   message("nr_gparts = %zu.", e->s->nr_gparts);
+
+  if (e->verbose)
+    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
+            clocks_getunit());
 }
 
 /**
@@ -2288,7 +2313,7 @@ void engine_prepare(struct engine *e, int nodrift) {
   TIMER_TIC;
 
   /* Run through the tasks and mark as skip or not. */
-  int rebuild = (e->forcerebuild || engine_marktasks(e));
+  int rebuild = e->forcerebuild;
 
 /* Collect the values of rebuild from all nodes. */
 #ifdef WITH_MPI
@@ -2398,6 +2423,10 @@ void engine_collect_kick(struct cell *c) {
         ti_end_min = min(ti_end_min, cp->ti_end_min);
         updated += cp->updated;
         g_updated += cp->g_updated;
+
+        /* Collected, so clear for next time. */
+        cp->updated = 0;
+        cp->g_updated = 0;
       }
     }
   }
@@ -2433,6 +2462,10 @@ void engine_collect_timestep(struct engine *e) {
       ti_end_min = min(ti_end_min, c->ti_end_min);
       updates += c->updated;
       g_updates += c->g_updated;
+
+      /* Collected, so clear for next time. */
+      c->updated = 0;
+      c->g_updated = 0;
     }
 
 /* Aggregate the data from the different nodes. */
@@ -2475,80 +2508,28 @@ void engine_collect_timestep(struct engine *e) {
 void engine_print_stats(struct engine *e) {
 
   const ticks tic = getticks();
-  const struct space *s = e->s;
 
-  double e_kin = 0.0, e_int = 0.0, e_pot_self = 0.0, e_pot_ext = 0.0 ,e_rad = 0.0;
-  double entropy = 0.0, mass = 0.0;
-  double mom[3] = {0.0, 0.0, 0.0}, ang_mom[3] = {0.0, 0.0, 0.0};
+  struct statistics stats;
+  stats_init(&stats);
 
-  /* Collect the cell data. */
-  for (int k = 0; k < s->nr_cells; k++)
-    if (s->cells_top[k].nodeID == e->nodeID) {
-      struct cell *c = &s->cells_top[k];
-      mass += c->mass;
-      e_kin += c->e_kin;
-      e_int += c->e_int;
-      e_pot_self += c->e_pot_self;
-      e_pot_ext += c->e_pot_ext;
-      e_rad += c->e_rad;
-      entropy += c->entropy;
-      mom[0] += c->mom[0];
-      mom[1] += c->mom[1];
-      mom[2] += c->mom[2];
-      ang_mom[0] += c->ang_mom[0];
-      ang_mom[1] += c->ang_mom[1];
-      ang_mom[2] += c->ang_mom[2];
-    }
+  /* Collect the stats on this node */
+  stats_collect(e->s, &stats);
 
 /* Aggregate the data from the different nodes. */
 #ifdef WITH_MPI
-  {
-    double in[13] = {0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.};
-    double out[13];
-    out[0] = e_kin;
-    out[1] = e_int;
-    out[2] = e_pot_self;
-    out[3] = e_pot_ext;
-    out[4] = e_rad;
-    out[5] = mom[0];
-    out[6] = mom[1];
-    out[7] = mom[2];
-    out[8] = ang_mom[0];
-    out[9] = ang_mom[1];
-    out[10] = ang_mom[2];
-    out[11] = mass;
-    out[12] = entropy;
-    if (MPI_Reduce(out, in, 11, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD) !=
-        MPI_SUCCESS)
-      error("Failed to aggregate stats.");
-    e_kin = out[0];
-    e_int = out[1];
-    e_pot_self = out[2];
-    e_pot_ext = out[3];
-    e_rad = out[4];
-    mom[0] = out[5];
-    mom[1] = out[6];
-    mom[2] = out[7];
-    ang_mom[0] = out[8];
-    ang_mom[1] = out[9];
-    ang_mom[2] = out[10];
-    mass = out[11];
-    entropy = out[12];
-  }
-#endif
+  struct statistics global_stats;
+  stats_init(&global_stats);
 
-  const double e_pot = e_pot_self + e_pot_ext;
-  const double e_tot = e_kin + e_int + e_pot;
+  if (MPI_Reduce(&stats, &global_stats, 1, statistics_mpi_type,
+                 statistics_mpi_reduce_op, 0, MPI_COMM_WORLD) != MPI_SUCCESS)
+    error("Failed to aggregate stats.");
+#else
+  struct statistics global_stats = stats;
+#endif
 
   /* Print info */
-  if (e->nodeID == 0) {
-    fprintf(e->file_stats,
-            " %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e "
-            "%14e %14e %14e\n",
-            e->time, mass, e_tot, e_kin, e_int, e_pot, e_pot_self, e_pot_ext, e_rad, entropy, mom[0],
-            mom[1], mom[2], ang_mom[0], ang_mom[1], ang_mom[2]);
-    fflush(e->file_stats);
-  }
+  if (e->nodeID == 0)
+    stats_print_to_file(e->file_stats, &global_stats, e->time);
 
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -2759,13 +2740,7 @@ void engine_step(struct engine *e) {
     fflush(e->file_timesteps);
   }
 
-  /* Save some statistics */
-  if (e->time - e->timeLastStatistics >= e->deltaTimeStatistics) {
-    engine_print_stats(e);
-    e->timeLastStatistics += e->deltaTimeStatistics;
-  }
-
-  /* Drift only the necessary particles, that all means all particles
+  /* Drift only the necessary particles, that means all particles
    * if we are about to repartition. */
   int repart = (e->forcerepart != REPART_NONE);
   e->drift_all = repart || e->drift_all;
@@ -2861,6 +2836,12 @@ void engine_step(struct engine *e) {
   engine_launch(e, e->nr_threads, mask, submask);
   TIMER_TOC(timer_runners);
 
+  /* Save some statistics */
+  if (e->time - e->timeLastStatistics >= e->deltaTimeStatistics) {
+    engine_print_stats(e);
+    e->timeLastStatistics += e->deltaTimeStatistics;
+  }
+
   TIMER_TOC2(timer_step);
 
   clocks_gettime(&time2);
@@ -2886,9 +2867,10 @@ void engine_drift(struct engine *e) {
   const ticks tic = getticks();
   threadpool_map(&e->threadpool, runner_do_drift_mapper, e->s->cells_top,
                  e->s->nr_cells, sizeof(struct cell), 1, e);
+
   if (e->verbose)
-    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
-            clocks_getunit());
+    message("took %.3f %s (including task unskipping).",
+            clocks_from_ticks(getticks() - tic), clocks_getunit());
 }
 
 /**
@@ -3516,6 +3498,7 @@ void engine_init(struct engine *e, struct space *s,
 /* Construct types for MPI communications */
 #ifdef WITH_MPI
   part_create_mpi_types();
+  stats_create_MPI_type();
 #endif
 
   /* Initialize the threadpool. */
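
The `engine_marktasks()` changes above pack integers and a pointer into a single `size_t` array so that `threadpool_map()`'s lone `void *extra_data` argument can carry both. The following standalone toy (not SWIFT code, and assuming `size_t` is wide enough to hold a pointer, as the patch itself does) shows the same packing and unpacking pattern used by `engine_marktasks_mapper()`.

```c
#include <stdio.h>
#include <stddef.h>

struct toy_sched { int id; };  /* Stand-in for struct scheduler. */

/* Unpacks the array exactly like engine_marktasks_mapper() above. */
static void toy_mapper(void *extra_data) {
  const int ti_end = ((size_t *)extra_data)[0];
  size_t *rebuild_space = &((size_t *)extra_data)[1];
  struct toy_sched *s = (struct toy_sched *)(((size_t *)extra_data)[2]);
  printf("ti_end=%d, scheduler id=%d\n", ti_end, s->id);
  *rebuild_space = 1;  /* Written back for the caller to read. */
}

int main(void) {
  struct toy_sched sched = {42};
  size_t extra_data[3] = {128, 0, (size_t)&sched};
  toy_mapper(extra_data);
  printf("rebuild_space=%zu\n", extra_data[1]);
  return 0;
}
```
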
diff --git a/src/potential/disc_patch/potential.h b/src/potential/disc_patch/potential.h
index 3a2205c3bdb33d0e43badca8eb35175d3477871a..725a7a7634cc34291d6c9d9fdf29acf8c5a035af 100644
--- a/src/potential/disc_patch/potential.h
+++ b/src/potential/disc_patch/potential.h
@@ -163,7 +163,7 @@ __attribute__((always_inline)) INLINE static void external_gravity_acceleration(
 
  __attribute__((always_inline)) INLINE static float external_gravity_get_potential_energy(
     const struct external_potential* potential,
-    const struct phys_const* const phys_const, struct part* p) {
+    const struct phys_const* const phys_const, const struct part* p) {
 
   return 0.;
  }
diff --git a/src/potential/isothermal/potential.h b/src/potential/isothermal/potential.h
index 6ac51e3e77e3cd8151adfc6b30a81e14827f72ff..bb9206172d245b3bded709b4d04328523d417c47 100644
--- a/src/potential/isothermal/potential.h
+++ b/src/potential/isothermal/potential.h
@@ -122,16 +122,16 @@ __attribute__((always_inline)) INLINE static void external_gravity_acceleration(
  *
  * @param potential The #external_potential used in the run.
  * @param phys_const Physical constants in internal units. 
- * @param p Pointer to the particle data.
+ * @param g Pointer to the particle data.
  */
 
  __attribute__((always_inline)) INLINE static float external_gravity_get_potential_energy(
     const struct external_potential* potential,
-    const struct phys_const* const phys_const, struct part* p) {
+    const struct phys_const* const phys_const, const struct gpart* g) {
 
-  const float dx = p->x[0] - potential->x;
-  const float dy = p->x[1] - potential->y;
-  const float dz = p->x[2] - potential->z;
+  const float dx = g->x[0] - potential->x;
+  const float dy = g->x[1] - potential->y;
+  const float dz = g->x[2] - potential->z;
 
   return potential->vrot * potential->vrot * 0.5 * log(dx*dx + dy*dy + dz*dz);
  }
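
For reference, the value returned above is the isothermal potential per unit mass,

    \Phi(r) = v_{\rm rot}^2 \ln r = \tfrac{1}{2}\, v_{\rm rot}^2 \ln\!\left(dx^2 + dy^2 + dz^2\right),

hence the factor 0.5 together with the squared radius inside the logarithm; the softened variant further down adds `potential->epsilon2` to the argument of the log.
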
diff --git a/src/potential/none/potential.h b/src/potential/none/potential.h
index 3f075f8fdc9b10e107bb4663b26d10e37119c9bb..fb3b27636cd9840f6b0b8fe1b7c6a9e20e851e82 100644
--- a/src/potential/none/potential.h
+++ b/src/potential/none/potential.h
@@ -68,6 +68,21 @@ __attribute__((always_inline)) INLINE static void external_gravity_acceleration(
     double time, const struct external_potential* restrict potential,
     const struct phys_const* restrict phys_const, struct gpart* restrict g) {}
 
+/**
+ * @brief Computes the gravitational potential energy due to nothing.
+ *
+ * @param potential The #external_potential used in the run.
+ * @param phys_const Physical constants in internal units. 
+ * @param g Pointer to the particle data.
+ */
+
+ __attribute__((always_inline)) INLINE static float external_gravity_get_potential_energy(
+    const struct external_potential* potential,
+    const struct phys_const* const phys_const, const struct part* g) {
+
+  return 0.;
+ }
+
 /**
  * @brief Initialises the external potential properties in the internal system
  * of units.
diff --git a/src/potential/point_mass/potential.h b/src/potential/point_mass/potential.h
index eafb27f35a56900cf814c218f370a6821ef07230..27ed2bcd4be1d2d85800a9e296e327755e90dc4c 100644
--- a/src/potential/point_mass/potential.h
+++ b/src/potential/point_mass/potential.h
@@ -118,20 +118,20 @@ __attribute__((always_inline)) INLINE static void external_gravity_acceleration(
 
 
 /**
- * @brief Computes the gravitational potential energy of a particle in an isothermal potential.
+ * @brief Computes the gravitational potential energy of a particle in a point mass potential.
  *
  * @param potential The #external_potential used in the run.
  * @param phys_const Physical constants in internal units. 
- * @param p Pointer to the particle data.
+ * @param g Pointer to the particle data.
  */
 
  __attribute__((always_inline)) INLINE static float external_gravity_get_potential_energy(
     const struct external_potential* potential,
-    const struct phys_const* const phys_const, struct part* p) {
+    const struct phys_const* const phys_const, const struct part* g) {
 
-  const float dx = p->x[0] - potential->x;
-  const float dy = p->x[1] - potential->y;
-  const float dz = p->x[2] - potential->z;
+  const float dx = g->x[0] - potential->x;
+  const float dy = g->x[1] - potential->y;
+  const float dz = g->x[2] - potential->z;
   const float rinv = 1./sqrtf(dx * dx + dy * dy + dz * dz);
   return -phys_const->const_newton_G * potential->mass * rinv;
  }
diff --git a/src/potential/softened_isothermal/potential.h b/src/potential/softened_isothermal/potential.h
index 639c7530d2bee87567dcd35527f8bb0965da0295..9533986f3b6bb6e98f4aba40ae09065e967d71cb 100644
--- a/src/potential/softened_isothermal/potential.h
+++ b/src/potential/softened_isothermal/potential.h
@@ -125,16 +125,16 @@ __attribute__((always_inline)) INLINE static void external_gravity_acceleration(
  *
  * @param potential The #external_potential used in the run.
  * @param phys_const Physical constants in internal units. 
- * @param p Pointer to the particle data.
+ * @param g Pointer to the particle data.
  */
 
  __attribute__((always_inline)) INLINE static float external_gravity_get_potential_energy(
     const struct external_potential* potential,
-    const struct phys_const* const phys_const, struct part* p) {
+    const struct phys_const* const phys_const, const struct part* g) {
 
-  const float dx = p->x[0] - potential->x;
-  const float dy = p->x[1] - potential->y;
-  const float dz = p->x[2] - potential->z;
+  const float dx = g->x[0] - potential->x;
+  const float dy = g->x[1] - potential->y;
+  const float dz = g->x[2] - potential->z;
 
   return potential->vrot * potential->vrot * 0.5 * log(dx*dx + dy*dy + dz*dz + potential->epsilon2);
  }
diff --git a/src/runner.c b/src/runner.c
index 132bbd7feba6e7a708331a81a35855f9ace56970..2f9ca45d6d7d2d6ac01668acdc0ec74010800402 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -748,114 +748,99 @@ void runner_do_ghost(struct runner *r, struct cell *c) {
 }
 
 /**
- * @brief Drift particles and g-particles in a cell forward in time
+ * @brief Drift particles and g-particles in a cell forward in time,
+ *              unskipping any tasks associated with active cells.
  *
  * @param c The cell.
  * @param e The engine.
+ * @param drift Whether to actually drift the particles; drifting is not
+ *              needed for non-local cells.
  */
-static void runner_do_drift(struct cell *c, struct engine *e) {
+static void runner_do_drift(struct cell *c, struct engine *e, int drift) {
+
+  const int ti_current = e->ti_current;
+
+  /* Unskip any active tasks. */
+  if (c->ti_end_min == e->ti_current) {
+    const int forcerebuild = cell_unskip_tasks(c, &e->sched);
+    if (forcerebuild) atomic_inc(&e->forcerebuild);
+  }
+
+  /* Do we really need to drift? */
+  if (drift) {
+    if (!e->drift_all && !cell_is_drift_needed(c, ti_current)) return;
+  } else {
+
+    /* Not drifting, but we may still need to recurse to un-skip tasks. */
+    if (c->split) {
+      for (int k = 0; k < 8; k++) {
+        if (c->progeny[k] != NULL) {
+          struct cell *cp = c->progeny[k];
+          runner_do_drift(cp, e, 0);
+        }
+      }
+    }
+    return;
+  }
 
   const double timeBase = e->timeBase;
   const int ti_old = c->ti_old;
-  const int ti_current = e->ti_current;
   struct part *const parts = c->parts;
   struct xpart *const xparts = c->xparts;
   struct gpart *const gparts = c->gparts;
-  const struct phys_const *constants = e->physical_constants;
-  const struct external_potential *potential = e->external_potential;
-
-  /* Do we need to drift ? */
-  if (!e->drift_all && !cell_is_drift_needed(c, ti_current)) return;
-
-  /* Check that we are actually going to move forward. */
-  if (ti_current == ti_old) return;
 
   /* Drift from the last time the cell was drifted to the current time */
   const double dt = (ti_current - ti_old) * timeBase;
-
   float dx_max = 0.f, dx2_max = 0.f, h_max = 0.f;
-  double e_kin = 0.0, e_int = 0.0, e_pot_self = 0.0, e_pot_ext = 0.0, e_rad = 0.0;
-  double entropy = 0.0, mass = 0.0;
-  double mom[3] = {0.0, 0.0, 0.0};
-  double ang_mom[3] = {0.0, 0.0, 0.0};
 
   /* No children? */
   if (!c->split) {
 
-    /* Loop over all the g-particles in the cell */
-    const size_t nr_gparts = c->gcount;
-    for (size_t k = 0; k < nr_gparts; k++) {
+    /* Check that we are actually going to move forward. */
+    if (ti_current >= ti_old) {
 
-      /* Get a handle on the gpart. */
-      struct gpart *const gp = &gparts[k];
+      /* Loop over all the g-particles in the cell */
+      const size_t nr_gparts = c->gcount;
+      for (size_t k = 0; k < nr_gparts; k++) {
 
-      /* Drift... */
-      drift_gpart(gp, dt, timeBase, ti_old, ti_current);
+        /* Get a handle on the gpart. */
+        struct gpart *const gp = &gparts[k];
 
-      /* Compute (square of) motion since last cell construction */
-      const float dx2 = gp->x_diff[0] * gp->x_diff[0] +
-                        gp->x_diff[1] * gp->x_diff[1] +
-                        gp->x_diff[2] * gp->x_diff[2];
-      dx2_max = (dx2_max > dx2) ? dx2_max : dx2;
-    }
+        /* Drift... */
+        drift_gpart(gp, dt, timeBase, ti_old, ti_current);
 
-    /* Loop over all the particles in the cell (more work for these !) */
-    const size_t nr_parts = c->count;
-    for (size_t k = 0; k < nr_parts; k++) {
+        /* Compute (square of) motion since last cell construction */
+        const float dx2 = gp->x_diff[0] * gp->x_diff[0] +
+                          gp->x_diff[1] * gp->x_diff[1] +
+                          gp->x_diff[2] * gp->x_diff[2];
+        dx2_max = (dx2_max > dx2) ? dx2_max : dx2;
+      }
 
-      /* Get a handle on the part. */
-      struct part *const p = &parts[k];
-      struct xpart *const xp = &xparts[k];
-
-      /* Drift... */
-      drift_part(p, xp, dt, timeBase, ti_old, ti_current);
-
-      /* Compute (square of) motion since last cell construction */
-      const float dx2 = xp->x_diff[0] * xp->x_diff[0] +
-                        xp->x_diff[1] * xp->x_diff[1] +
-                        xp->x_diff[2] * xp->x_diff[2];
-      dx2_max = (dx2_max > dx2) ? dx2_max : dx2;
-
-      /* Maximal smoothing length */
-      h_max = (h_max > p->h) ? h_max : p->h;
-
-      /* Now collect quantities for statistics */
-
-      const float half_dt =
-          (ti_current - (p->ti_begin + p->ti_end) / 2) * timeBase;
-      const double x[3] = {p->x[0], p->x[1], p->x[2]};
-      const float v[3] = {xp->v_full[0] + p->a_hydro[0] * half_dt,
-                          xp->v_full[1] + p->a_hydro[1] * half_dt,
-                          xp->v_full[2] + p->a_hydro[2] * half_dt};
-
-      const float m = hydro_get_mass(p);
-
-      /* Collect mass */
-      mass += m;
-
-      /* Collect momentum */
-      mom[0] += m * v[0];
-      mom[1] += m * v[1];
-      mom[2] += m * v[2];
-
-      /* Collect angular momentum */
-      ang_mom[0] += m * (x[1] * v[2] - x[2] * v[1]);
-      ang_mom[1] += m * (x[2] * v[0] - x[0] * v[2]);
-      ang_mom[2] += m * (x[0] * v[1] - x[1] * v[0]);
-
-      /* Collect energies. */
-      e_kin += 0.5 * m * (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
-      e_pot_self += 0.;
-      e_pot_ext += m * external_gravity_get_potential_energy(potential,constants,p);
-      e_int += m * hydro_get_internal_energy(p, half_dt);
-      e_rad += cooling_get_radiated_energy(xp);
-
-      /* Collect entropy */
-      entropy += m * hydro_get_entropy(p, half_dt);
-    }
+      /* Loop over all the particles in the cell */
+      const size_t nr_parts = c->count;
+      for (size_t k = 0; k < nr_parts; k++) {
+
+        /* Get a handle on the part. */
+        struct part *const p = &parts[k];
+        struct xpart *const xp = &xparts[k];
+
+        /* Drift... */
+        drift_part(p, xp, dt, timeBase, ti_old, ti_current);
+
+        /* Compute (square of) motion since last cell construction */
+        const float dx2 = xp->x_diff[0] * xp->x_diff[0] +
+                          xp->x_diff[1] * xp->x_diff[1] +
+                          xp->x_diff[2] * xp->x_diff[2];
+        dx2_max = (dx2_max > dx2) ? dx2_max : dx2;
+
+        /* Maximal smoothing length */
+        h_max = (h_max > p->h) ? h_max : p->h;
+      }
+
+      /* Now, get the maximal particle motion from its square */
+      dx_max = sqrtf(dx2_max);
 
-    /* Now, get the maximal particle motion from its square */
-    dx_max = sqrtf(dx2_max);
+    } /* Check that we are actually going to move forward. */
   }
 
   /* Otherwise, aggregate data from children. */
@@ -867,42 +852,15 @@ static void runner_do_drift(struct cell *c, struct engine *e) {
         struct cell *cp = c->progeny[k];
 
         /* Recurse. */
-        runner_do_drift(cp, e);
-
+        runner_do_drift(cp, e, drift);
         dx_max = max(dx_max, cp->dx_max);
         h_max = max(h_max, cp->h_max);
-        mass += cp->mass;
-        e_kin += cp->e_kin;
-        e_int += cp->e_int;
-        e_pot_self += cp->e_pot_self;
-	e_pot_ext += cp->e_pot_ext;
-        e_rad += cp->e_rad;
-        entropy += cp->entropy;
-        mom[0] += cp->mom[0];
-        mom[1] += cp->mom[1];
-        mom[2] += cp->mom[2];
-        ang_mom[0] += cp->ang_mom[0];
-        ang_mom[1] += cp->ang_mom[1];
-        ang_mom[2] += cp->ang_mom[2];
       }
   }
 
   /* Store the values */
   c->h_max = h_max;
   c->dx_max = dx_max;
-  c->mass = mass;
-  c->e_kin = e_kin;
-  c->e_int = e_int;
-  c->e_pot_self = e_pot_self;
-  c->e_pot_ext = e_pot_ext;
-  c->e_rad = e_rad;
-  c->entropy = entropy;
-  c->mom[0] = mom[0];
-  c->mom[1] = mom[1];
-  c->mom[2] = mom[2];
-  c->ang_mom[0] = ang_mom[0];
-  c->ang_mom[1] = ang_mom[1];
-  c->ang_mom[2] = ang_mom[2];
 
   /* Update the time of the last drift */
   c->ti_old = ti_current;
@@ -924,9 +882,11 @@ void runner_do_drift_mapper(void *map_data, int num_elements,
 
   for (int ind = 0; ind < num_elements; ind++) {
     struct cell *c = &cells[ind];
-
-    /* Only drift local particles. */
-    if (c != NULL && c->nodeID == e->nodeID) runner_do_drift(c, e);
+#ifdef WITH_MPI
+    if (c != NULL) runner_do_drift(c, e, (c->nodeID == e->nodeID));
+#else
+    if (c != NULL) runner_do_drift(c, e, 1);
+#endif
   }
 }
 
@@ -1274,6 +1234,35 @@ void *runner_main(void *data) {
       struct cell *cj = t->cj;
       t->rid = r->cpuid;
 
+/* Check that we haven't scheduled an inactive task */
+#ifdef SWIFT_DEBUG_CHECKS
+      if (cj == NULL) { /* self */
+        if (ci->ti_end_min > e->ti_current && t->type != task_type_sort)
+          error(
+              "Task (type='%s/%s') should have been skipped ti_current=%d "
+              "c->ti_end_min=%d",
+              taskID_names[t->type], subtaskID_names[t->subtype], e->ti_current,
+              ci->ti_end_min);
+
+        /* Special case for sorts */
+        if (ci->ti_end_min > e->ti_current && t->type == task_type_sort &&
+            t->flags == 0)
+          error(
+              "Task (type='%s/%s') should have been skipped ti_current=%d "
+              "c->ti_end_min=%d t->flags=%d",
+              taskID_names[t->type], subtaskID_names[t->subtype], e->ti_current,
+              ci->ti_end_min, t->flags);
+
+      } else { /* pair */
+        if (ci->ti_end_min > e->ti_current && cj->ti_end_min > e->ti_current)
+          error(
+              "Task (type='%s/%s') should have been skipped ti_current=%d "
+              "ci->ti_end_min=%d cj->ti_end_min=%d",
+              taskID_names[t->type], subtaskID_names[t->subtype], e->ti_current,
+              ci->ti_end_min, cj->ti_end_min);
+      }
+#endif
+
       /* Different types of tasks... */
       switch (t->type) {
         case task_type_self:
diff --git a/src/scheduler.c b/src/scheduler.c
index 76e0ecb41b9068e471caa75db95d572265bdcb22..44790fcd2fa5f67e6f325ba5849da19e35ab285a 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -50,6 +50,12 @@
 #include "task.h"
 #include "timers.h"
 
+/**
+ * @brief Re-set the list of active tasks.
+ */
+
+void scheduler_clear_active(struct scheduler *s) { s->active_count = 0; }
+
 /**
  * @brief Add an unlock_task to the given task.
  *
@@ -697,7 +703,7 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
   t->wait = wait;
   t->ci = ci;
   t->cj = cj;
-  t->skip = 0;
+  t->skip = 1; /* Mark tasks as skipped by default. */
   t->tight = tight;
   t->implicit = 0;
   t->weight = 0;
@@ -863,6 +869,7 @@ void scheduler_reset(struct scheduler *s, int size) {
     /* Free existing task lists if necessary. */
     if (s->tasks != NULL) free(s->tasks);
     if (s->tasks_ind != NULL) free(s->tasks_ind);
+    if (s->tid_active != NULL) free(s->tid_active);
 
     /* Allocate the new lists. */
     if (posix_memalign((void *)&s->tasks, task_align,
@@ -871,6 +878,9 @@ void scheduler_reset(struct scheduler *s, int size) {
 
     if ((s->tasks_ind = (int *)malloc(sizeof(int) * size)) == NULL)
       error("Failed to allocate task lists.");
+
+    if ((s->tid_active = (int *)malloc(sizeof(int) * size)) == NULL)
+      error("Failed to allocate aactive task lists.");
   }
 
   /* Reset the counters. */
@@ -882,6 +892,7 @@ void scheduler_reset(struct scheduler *s, int size) {
   s->submask = 0;
   s->nr_unlocks = 0;
   s->completed_unlock_writes = 0;
+  s->active_count = 0;
 
   /* Set the task pointers in the queues. */
   for (int k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks;
@@ -891,11 +902,11 @@ void scheduler_reset(struct scheduler *s, int size) {
  * @brief Compute the task weights
  *
  * @param s The #scheduler.
- * @param verbose Are we talkative ?
+ * @param verbose Are we talkative?
  */
+
 void scheduler_reweight(struct scheduler *s, int verbose) {
 
-  const ticks tic = getticks();
   const int nr_tasks = s->nr_tasks;
   int *tid = s->tasks_ind;
   struct task *tasks = s->tasks;
@@ -904,6 +915,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
                                0.4025, 0.1897, 0.4025, 0.1897, 0.4025,
                                0.5788, 0.4025, 0.5788};
   const float wscale = 0.001;
+  const ticks tic = getticks();
 
   /* Run through the tasks backwards and set their weights. */
   for (int k = nr_tasks - 1; k >= 0; k--) {
@@ -1033,33 +1045,84 @@ void scheduler_enqueue_mapper(void *map_data, int num_elements,
 void scheduler_start(struct scheduler *s, unsigned int mask,
                      unsigned int submask) {
 
-  // ticks tic = getticks();
-
   /* Store the masks */
   s->mask = mask;
   s->submask = submask | (1 << task_subtype_none);
 
-  /* Clear all the waits and rids. */
+  /* Clear all the waits, rids, and times. */
   for (int k = 0; k < s->nr_tasks; k++) {
     s->tasks[k].wait = 1;
     s->tasks[k].rid = -1;
+    s->tasks[k].tic = 0;
+    s->tasks[k].toc = 0;
+    if (((1 << s->tasks[k].type) & mask) == 0 ||
+        ((1 << s->tasks[k].subtype) & s->submask) == 0)
+      s->tasks[k].skip = 1;
   }
 
   /* Re-wait the tasks. */
   threadpool_map(s->threadpool, scheduler_rewait_mapper, s->tasks, s->nr_tasks,
                  sizeof(struct task), 1000, s);
 
+/* Check we have not missed an active task */
+#ifdef SWIFT_DEBUG_CHECKS
+
+  const int ti_current = s->space->e->ti_current;
+  for (int k = 0; k < s->nr_tasks; k++) {
+
+    struct task *t = &s->tasks[k];
+    struct cell *ci = t->ci;
+    struct cell *cj = t->cj;
+
+    if (cj == NULL) { /* self */
+
+      if (ci->ti_end_min == ti_current && t->skip && t->type != task_type_sort)
+        error(
+            "Task (type='%s/%s') should not have been skipped ti_current=%d "
+            "c->ti_end_min=%d",
+            taskID_names[t->type], subtaskID_names[t->subtype], ti_current,
+            ci->ti_end_min);
+
+      /* Special treatment for sort tasks */
+      if (ci->ti_end_min == ti_current && t->skip &&
+          t->type == task_type_sort && t->flags == 0)
+        error(
+            "Task (type='%s/%s') should not have been skipped ti_current=%d "
+            "c->ti_end_min=%d t->flags=%d",
+            taskID_names[t->type], subtaskID_names[t->subtype], ti_current,
+            ci->ti_end_min, t->flags);
+
+    } else { /* pair */
+
+      if ((ci->ti_end_min == ti_current || cj->ti_end_min == ti_current) &&
+          t->skip)
+        error(
+            "Task (type='%s/%s') should not have been skipped ti_current=%d "
+            "ci->ti_end_min=%d cj->ti_end_min=%d",
+            taskID_names[t->type], subtaskID_names[t->subtype], ti_current,
+            ci->ti_end_min, cj->ti_end_min);
+    }
+  }
+
+#endif
+
   /* Loop over the tasks and enqueue whoever is ready. */
-  threadpool_map(s->threadpool, scheduler_enqueue_mapper, s->tasks_ind,
-                 s->nr_tasks, sizeof(int), 1000, s);
+  for (int k = 0; k < s->active_count; k++) {
+    struct task *t = &s->tasks[s->tid_active[k]];
+    if (atomic_dec(&t->wait) == 1 && !t->skip && ((1 << t->type) & s->mask) &&
+        ((1 << t->subtype) & s->submask)) {
+      scheduler_enqueue(s, t);
+      pthread_cond_signal(&s->sleep_cond);
+    }
+  }
+
+  /* Clear the list of active tasks. */
+  s->active_count = 0;
 
   /* To be safe, fire of one last sleep_cond in a safe way. */
   pthread_mutex_lock(&s->sleep_mutex);
   pthread_cond_broadcast(&s->sleep_cond);
   pthread_mutex_unlock(&s->sleep_mutex);
-
-  /* message("enqueueing tasks took %.3f %s." ,
-          clocks_from_ticks( getticks() - tic ), clocks_getunit()); */
 }
 
 /**
@@ -1212,6 +1275,9 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) {
     pthread_mutex_unlock(&s->sleep_mutex);
   }
 
+  /* Mark the task as skipped. */
+  t->skip = 1;
+
   /* Return the next best task. Note that we currently do not
      implement anything that does this, as getting it to respect
      priorities is too tricky and currently unnecessary. */
@@ -1427,6 +1493,7 @@ void scheduler_clean(struct scheduler *s) {
   free(s->tasks_ind);
   free(s->unlocks);
   free(s->unlock_ind);
+  free(s->tid_active);
   for (int i = 0; i < s->nr_queues; ++i) queue_clean(&s->queues[i]);
   free(s->queues);
 }
diff --git a/src/scheduler.h b/src/scheduler.h
index c4eb5e99447d623e5fb8e442efc1c254c00bfadd..22c83ae3c551d4dc7e601e23f7a5301241bd1fa4 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -33,6 +33,7 @@
 
 /* Includes. */
 #include "cell.h"
+#include "inline.h"
 #include "lock.h"
 #include "queue.h"
 #include "task.h"
@@ -83,6 +84,10 @@ struct scheduler {
   /* The task indices. */
   int *tasks_ind;
 
+  /* List of active tasks. */
+  int *tid_active;
+  int active_count;
+
   /* The task unlocks. */
   struct task **volatile unlocks;
   int *volatile unlock_ind;
@@ -105,7 +110,24 @@ struct scheduler {
   int nodeID;
 };
 
+/* Inlined functions (for speed). */
+/**
+ * @brief Add a task to the list of active tasks.
+ *
+ * @param s The #scheduler.
+ * @param t The task to be added.
+ */
+
+__attribute__((always_inline)) INLINE static void scheduler_activate(
+    struct scheduler *s, struct task *t) {
+  if (atomic_cas(&t->skip, 1, 0)) {
+    int ind = atomic_inc(&s->active_count);
+    s->tid_active[ind] = t - s->tasks;
+  }
+}
+
 /* Function prototypes. */
+void scheduler_clear_active(struct scheduler *s);
 void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
                     int nr_queues, unsigned int flags, int nodeID,
                     struct threadpool *tp);
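
The inline above replaces the old idiom of re-activating a task by writing to t->skip directly: the compare-and-swap guarantees that a task is appended to tid_active at most once even when several threads try to activate it at the same time, and scheduler_done() (see the scheduler.c hunk above) re-marks each task as skipped on completion, so scheduler_start() only has to drain the short active list instead of scanning the full task array. A minimal usage sketch follows; the helper name and the plain task array are illustrative only, not code from this patch.

#include "scheduler.h"

/* Hypothetical helper (illustration only): activate every task in an array.
 * Each call either un-skips the task and records its index in s->tid_active,
 * or does nothing if the task is already active. */
static void activate_tasks(struct scheduler *s, struct task *tasks, int n) {
  for (int k = 0; k < n; k++)
    scheduler_activate(s, &tasks[k]); /* instead of `tasks[k].skip = 0;` */
}
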
diff --git a/src/space.c b/src/space.c
index 7f6e2880304939c96397563be299158817e2dfff..d2dc6cf511a5c5d8cb9a83dc563f88826836f645 100644
--- a/src/space.c
+++ b/src/space.c
@@ -395,9 +395,11 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
       s->cells_top[k].nr_density = 0;
       s->cells_top[k].nr_gradient = 0;
       s->cells_top[k].nr_force = 0;
+      s->cells_top[k].nr_grav = 0;
       s->cells_top[k].density = NULL;
       s->cells_top[k].gradient = NULL;
       s->cells_top[k].force = NULL;
+      s->cells_top[k].grav = NULL;
       s->cells_top[k].dx_max = 0.0f;
       s->cells_top[k].sorted = 0;
       s->cells_top[k].count = 0;
@@ -406,7 +408,20 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
       s->cells_top[k].extra_ghost = NULL;
       s->cells_top[k].ghost = NULL;
       s->cells_top[k].kick = NULL;
+      s->cells_top[k].cooling = NULL;
+      s->cells_top[k].sourceterms = NULL;
       s->cells_top[k].super = &s->cells_top[k];
+#ifdef WITH_MPI
+      s->cells_top[k].recv_xv = NULL;
+      s->cells_top[k].recv_rho = NULL;
+      s->cells_top[k].recv_gradient = NULL;
+      s->cells_top[k].recv_ti = NULL;
+
+      s->cells_top[k].send_xv = NULL;
+      s->cells_top[k].send_rho = NULL;
+      s->cells_top[k].send_gradient = NULL;
+      s->cells_top[k].send_ti = NULL;
+#endif
     }
     s->maxdepth = 0;
   }
@@ -587,8 +602,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   for (size_t k = 1; k < nr_parts; k++) {
     if (ind[k - 1] > ind[k]) {
       error("Sort failed!");
-    } else if (ind[k] != cell_getid(s->cdim, 
-                                    s->parts[k].x[0] * s->iwidth[0],
+    } else if (ind[k] != cell_getid(s->cdim, s->parts[k].x[0] * s->iwidth[0],
                                     s->parts[k].x[1] * s->iwidth[1],
                                     s->parts[k].x[2] * s->iwidth[2])) {
       error("Incorrect indices!");
diff --git a/src/statistics.c b/src/statistics.c
new file mode 100644
index 0000000000000000000000000000000000000000..96d73b0e610d616dc05aff4dec76961221db1c1f
--- /dev/null
+++ b/src/statistics.c
@@ -0,0 +1,331 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
+#include <string.h>
+
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+/* This object's header. */
+#include "statistics.h"
+
+/* Local headers. */
+#include "cooling.h"
+#include "engine.h"
+#include "error.h"
+#include "hydro.h"
+#include "threadpool.h"
+
+/**
+ * @brief Information required to compute the statistics in the mapper
+ */
+struct index_data {
+  /*! The space we play with */
+  const struct space *s;
+
+  /*! The #statistics aggregator to fill */
+  struct statistics *stats;
+};
+
+/**
+ * @brief Adds the content of one #statistics aggregator to another one.
+ *
+ * Performs a += b;
+ *
+ * @param a The #statistics structure to update.
+ * @param b The #statistics structure to add to a.
+ */
+void stats_add(struct statistics *a, const struct statistics *b) {
+
+  /* Add everything */
+  a->E_kin += b->E_kin;
+  a->E_int += b->E_int;
+  a->E_pot_self += b->E_pot_self;
+  a->E_pot_ext += b->E_pot_ext;
+  a->E_rad += b->E_rad;
+  a->entropy += b->entropy;
+  a->mass += b->mass;
+  a->mom[0] += b->mom[0];
+  a->mom[1] += b->mom[1];
+  a->mom[2] += b->mom[2];
+  a->ang_mom[0] += b->ang_mom[0];
+  a->ang_mom[1] += b->ang_mom[1];
+  a->ang_mom[2] += b->ang_mom[2];
+}
+
+/**
+ * @brief Initialises a statistics aggregator to a valid state.
+ *
+ * @param s The #statistics aggregator to initialise
+ */
+void stats_init(struct statistics *s) {
+
+  /* Zero everything */
+  memset(s, 0, sizeof(struct statistics));
+
+  /* Set the lock */
+  lock_init(&s->lock);
+}
+
+/**
+ * @brief The #threadpool mapper function used to collect statistics for #part.
+ *
+ * @param map_data Pointer to the particles.
+ * @param nr_parts The number of particles in this chunk
+ * @param extra_data The #statistics aggregator.
+ */
+void stats_collect_part_mapper(void *map_data, int nr_parts, void *extra_data) {
+
+  /* Unpack the data */
+  const struct index_data *data = (struct index_data *)extra_data;
+  const struct space *s = data->s;
+  const struct part *restrict parts = (struct part *)map_data;
+  const struct xpart *restrict xparts =
+      s->xparts + (ptrdiff_t)(parts - s->parts);
+  const int ti_current = s->e->ti_current;
+  const double timeBase = s->e->timeBase;
+  struct statistics *const global_stats = data->stats;
+
+  /* Required for external potential energy */
+  const struct external_potential *potential = s->e->external_potential;
+  const struct phys_const *phys_const = s->e->physical_constants;
+
+  /* Local accumulator */
+  struct statistics stats;
+  stats_init(&stats);
+
+  /* Loop over particles */
+  for (int k = 0; k < nr_parts; k++) {
+
+    /* Get the particle */
+    const struct part *p = &parts[k];
+    const struct xpart *xp = &xparts[k];
+    struct gpart *gp = NULL;
+    if (p->gpart != NULL)
+      gp = p->gpart;
+
+    /* Get useful variables */
+    const float dt = (ti_current - (p->ti_begin + p->ti_end) / 2) * timeBase;
+    const double x[3] = {p->x[0], p->x[1], p->x[2]};
+    float a_tot[3] = {p->a_hydro[0], p->a_hydro[1], p->a_hydro[2]};
+    if (p->gpart != NULL) {
+      a_tot[0] += p->gpart->a_grav[0];
+      a_tot[1] += p->gpart->a_grav[1];
+      a_tot[2] += p->gpart->a_grav[2];
+    }
+    const float v[3] = {xp->v_full[0] + a_tot[0] * dt,
+                        xp->v_full[1] + a_tot[1] * dt,
+                        xp->v_full[2] + a_tot[2] * dt};
+
+    const float m = hydro_get_mass(p);
+
+    /* Collect mass */
+    stats.mass += m;
+
+    /* Collect momentum */
+    stats.mom[0] += m * v[0];
+    stats.mom[1] += m * v[1];
+    stats.mom[2] += m * v[2];
+
+    /* Collect angular momentum */
+    stats.ang_mom[0] += m * (x[1] * v[2] - x[2] * v[1]);
+    stats.ang_mom[1] += m * (x[2] * v[0] - x[0] * v[2]);
+    stats.ang_mom[2] += m * (x[0] * v[1] - x[1] * v[0]);
+
+    /* Collect energies. */
+    stats.E_kin += 0.5f * m * (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+    stats.E_pot_self += 0.;
+    if (gp != NULL)
+      stats.E_pot_ext += m * external_gravity_get_potential_energy(
+                                 potential, phys_const, gp);
+    stats.E_int += m * hydro_get_internal_energy(p, dt);
+    stats.E_rad += cooling_get_radiated_energy(xp);
+
+    /* Collect entropy */
+    stats.entropy += m * hydro_get_entropy(p, dt);
+  }
+
+  /* Now write back to memory */
+  if (lock_lock(&global_stats->lock) == 0) stats_add(global_stats, &stats);
+  if (lock_unlock(&global_stats->lock) != 0) error("Failed to unlock stats.");
+}
+
+/**
+ * @brief The #threadpool mapper function used to collect statistics for #gpart.
+ *
+ * @param map_data Pointer to the g-particles.
+ * @param nr_gparts The number of g-particles in this chunk
+ * @param extra_data The #statistics aggregator.
+ */
+void stats_collect_gpart_mapper(void *map_data, int nr_gparts,
+                                void *extra_data) {
+
+  /* Unpack the data */
+  const struct index_data *data = (struct index_data *)extra_data;
+  const struct space *s = data->s;
+  const struct gpart *restrict gparts = (struct gpart *)map_data;
+  const int ti_current = s->e->ti_current;
+  const double timeBase = s->e->timeBase;
+  struct statistics *const global_stats = data->stats;
+
+  /* Required for external potential energy */
+  const struct external_potential *potential = s->e->external_potential;
+  const struct phys_const *phys_const = s->e->physical_constants;
+
+  /* Local accumulator */
+  struct statistics stats;
+  stats_init(&stats);
+
+  /* Loop over particles */
+  for (int k = 0; k < nr_gparts; k++) {
+
+    /* Get the particle */
+    const struct gpart *gp = &gparts[k];
+
+    /* If the g-particle has a counterpart, ignore it */
+    if (gp->id_or_neg_offset < 0) continue;
+
+    /* Get useful variables */
+    const float dt = (ti_current - (gp->ti_begin + gp->ti_end) / 2) * timeBase;
+    const double x[3] = {gp->x[0], gp->x[1], gp->x[2]};
+    const float v[3] = {gp->v_full[0] + gp->a_grav[0] * dt,
+                        gp->v_full[1] + gp->a_grav[1] * dt,
+                        gp->v_full[2] + gp->a_grav[2] * dt};
+
+    const float m = gp->mass;
+
+    /* Collect mass */
+    stats.mass += m;
+
+    /* Collect momentum */
+    stats.mom[0] += m * v[0];
+    stats.mom[1] += m * v[1];
+    stats.mom[2] += m * v[2];
+
+    /* Collect angular momentum */
+    stats.ang_mom[0] += m * (x[1] * v[2] - x[2] * v[1]);
+    stats.ang_mom[1] += m * (x[2] * v[0] - x[0] * v[2]);
+    stats.ang_mom[2] += m * (x[0] * v[1] - x[1] * v[0]);
+
+    /* Collect energies. */
+    stats.E_kin += 0.5f * m * (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+    stats.E_pot_self += 0.;
+    stats.E_pot_ext +=
+        m * external_gravity_get_potential_energy(potential, phys_const, gp);
+  }
+
+  /* Now write back to memory */
+  if (lock_lock(&global_stats->lock) == 0) stats_add(global_stats, &stats);
+  if (lock_unlock(&global_stats->lock) != 0) error("Failed to unlock stats.");
+}
+
+/**
+ * @brief Collect physical statistics over all particles in a #space.
+ *
+ * @param s The #space to collect from.
+ * @param stats The #statistics aggregator to fill.
+ */
+void stats_collect(const struct space *s, struct statistics *stats) {
+
+  /* Prepare the data */
+  struct index_data extra_data;
+  extra_data.s = s;
+  extra_data.stats = stats;
+
+  /* Run parallel collection of statistics for parts */
+  if (s->nr_parts > 0)
+    threadpool_map(&s->e->threadpool, stats_collect_part_mapper, s->parts,
+                   s->nr_parts, sizeof(struct part), 10000, &extra_data);
+
+  /* Run parallel collection of statistics for gparts */
+  if (s->nr_gparts > 0)
+    threadpool_map(&s->e->threadpool, stats_collect_gpart_mapper, s->gparts,
+                   s->nr_gparts, sizeof(struct gpart), 10000, &extra_data);
+}
+
+/**
+ * @brief Prints the content of a #statistics aggregator to a file
+ *
+ * @param file File to write to.
+ * @param stats The #statistics object to write to the file
+ * @param time The current physical time.
+ */
+void stats_print_to_file(FILE *file, const struct statistics *stats,
+                         double time) {
+
+  const double E_pot = stats->E_pot_self + stats->E_pot_ext;
+  const double E_tot = stats->E_kin + stats->E_int + E_pot;
+
+  fprintf(file,
+          " %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e "
+          "%14e %14e %14e\n",
+          time, stats->mass, E_tot, stats->E_kin, stats->E_int, E_pot,
+          stats->E_pot_self, stats->E_pot_ext, stats->E_rad, stats->entropy,
+          stats->mom[0], stats->mom[1], stats->mom[2], stats->ang_mom[0],
+          stats->ang_mom[1], stats->ang_mom[2]);
+  fflush(file);
+}
+
+/* Extra stuff in MPI-land */
+#ifdef WITH_MPI
+
+/**
+ * @brief MPI datatype corresponding to the #statistics structure.
+ */
+MPI_Datatype statistics_mpi_type;
+
+/**
+ * @brief MPI operator used for the reduction of #statistics structure.
+ */
+MPI_Op statistics_mpi_reduce_op;
+
+/**
+ * @brief MPI reduce operator for #statistics structures.
+ */
+void stats_add_MPI(void *in, void *inout, int *len, MPI_Datatype *datatype) {
+
+  for (int i = 0; i < *len; ++i)
+    stats_add(&((struct statistics *)inout)[i],
+              &((const struct statistics *)in)[i]);
+}
+
+/**
+ * @brief Registers MPI #statistics type and reduction function.
+ */
+void stats_create_MPI_type(void) {
+
+  /* This is not the recommended way of doing this:
+     one should define the structure field by field.
+     But as long as we don't do serialization via MPI-IO
+     we don't really care.
+     Also, we would have to modify this function every time something
+     is added to the statistics structure. */
+  if (MPI_Type_contiguous(sizeof(struct statistics) / sizeof(unsigned char),
+                          MPI_BYTE, &statistics_mpi_type) != MPI_SUCCESS ||
+      MPI_Type_commit(&statistics_mpi_type) != MPI_SUCCESS) {
+    error("Failed to create MPI type for statistics.");
+  }
+
+  /* Create the reduction operation */
+  if (MPI_Op_create(stats_add_MPI, 1, &statistics_mpi_reduce_op) !=
+      MPI_SUCCESS)
+    error("Failed to create MPI reduction op for statistics.");
+}
+#endif
diff --git a/src/statistics.h b/src/statistics.h
new file mode 100644
index 0000000000000000000000000000000000000000..b007d1314c458254986862f99b7cb1669d995545
--- /dev/null
+++ b/src/statistics.h
@@ -0,0 +1,79 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_STATISTICS_H
+#define SWIFT_STATISTICS_H
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
+#include <stdio.h>
+
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+/* Local headers. */
+#include "lock.h"
+#include "space.h"
+
+/**
+ * @brief Quantities collected for physics statistics
+ */
+struct statistics {
+
+  /*! Kinetic energy */
+  double E_kin;
+
+  /*! Internal energy */
+  double E_int;
+
+  /*! Self potential energy */
+  double E_pot_self;
+
+  /*! External potential energy */
+  double E_pot_ext;
+
+  /*! Radiative energy */
+  double E_rad;
+
+  /*! Entropy */
+  double entropy;
+
+  /*! Mass */
+  double mass;
+
+  /*! Momentum */
+  double mom[3];
+
+  /*! Angular momentum */
+  double ang_mom[3];
+
+  /*! Lock for threaded access */
+  swift_lock_type lock;
+};
+
+void stats_collect(const struct space* s, struct statistics* stats);
+void stats_add(struct statistics* a, const struct statistics* b);
+void stats_print_to_file(FILE* file, const struct statistics* stats,
+                         double time);
+void stats_init(struct statistics* s);
+
+#ifdef WITH_MPI
+extern MPI_Datatype statistics_mpi_type;
+extern MPI_Op statistics_mpi_reduce_op;
+
+void stats_add_MPI(void* in, void* inout, int* len, MPI_Datatype* datatype);
+void stats_create_MPI_type(void);
+#endif
+
+#endif /* SWIFT_STATISTICS_H */
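
Taken together, the new module is meant to be driven in four steps: initialise an aggregator, collect over the local particles with the threadpool, reduce across ranks, and print one line per dump. The sketch below shows that flow; the engine fields (e->s, e->nodeID, e->time), the output file handle and the assumption that stats_create_MPI_type() was called once after MPI start-up are illustrative, not code from this patch.

#include "engine.h"
#include "statistics.h"

/* Sketch of a caller: collect statistics locally, reduce them over all
 * ranks and write one line on rank 0. */
void print_stats_sketch(struct engine *e, FILE *outfile) {

  struct statistics stats;
  stats_init(&stats);

  /* Thread-parallel pass over the local parts and gparts. */
  stats_collect(e->s, &stats);

#ifdef WITH_MPI
  /* Combine the per-rank aggregators element-wise on rank 0. */
  struct statistics global_stats;
  stats_init(&global_stats);
  MPI_Reduce(&stats, &global_stats, 1, statistics_mpi_type,
             statistics_mpi_reduce_op, 0, MPI_COMM_WORLD);
#else
  struct statistics global_stats = stats;
#endif

  if (e->nodeID == 0) stats_print_to_file(outfile, &global_stats, e->time);
}
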
diff --git a/src/threadpool.c b/src/threadpool.c
index 4ef75954b39603db0d442acc9be2bd95b39614d3..6bc887d96cb72804f0fbc8e2801a6522bf27f947 100644
--- a/src/threadpool.c
+++ b/src/threadpool.c
@@ -59,14 +59,22 @@ void *threadpool_runner(void *data) {
     pthread_mutex_unlock(&tp->thread_mutex);
 
     /* The index of the mapping task we will work on next. */
-    size_t task_ind;
-    while ((task_ind = atomic_add(&tp->map_data_count, tp->map_data_chunk)) <
-           tp->map_data_size) {
-      const int num_elements = task_ind + tp->map_data_chunk > tp->map_data_size
-                                   ? tp->map_data_size - task_ind
-                                   : tp->map_data_chunk;
+    while (1) {
+      /* Desired chunk size. */
+      size_t chunk_size =
+          (tp->map_data_size - tp->map_data_count) / (2 * tp->num_threads);
+      if (chunk_size > tp->map_data_chunk) chunk_size = tp->map_data_chunk;
+      if (chunk_size < 1) chunk_size = 1;
+
+      /* Get a chunk and check its size. */
+      size_t task_ind = atomic_add(&tp->map_data_count, chunk_size);
+      if (task_ind >= tp->map_data_size) break;
+      if (task_ind + chunk_size > tp->map_data_size)
+        chunk_size = tp->map_data_size - task_ind;
+
+      /* Call the mapper function. */
       tp->map_function((char *)tp->map_data + (tp->map_data_stride * task_ind),
-                       num_elements, tp->map_extra_data);
+                       chunk_size, tp->map_extra_data);
     }
   }
 }
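
The effect of the new chunking rule is easiest to see with a small standalone simulation of the schedule: each grab takes roughly 1/(2 * num_threads) of the remaining work, capped by the requested chunk size and floored at one element, so chunks shrink as the map nears completion and the threads finish at about the same time. The program below is sequential and uses arbitrary numbers (10000 elements, 8 threads, requested chunk of 1000); the real runner hands out chunks concurrently through atomic_add.

#include <stdio.h>

int main(void) {
  const size_t size = 10000, max_chunk = 1000;
  const int num_threads = 8;
  size_t count = 0;

  while (count < size) {
    /* Same rule as in threadpool_runner above. */
    size_t chunk = (size - count) / (2 * num_threads);
    if (chunk > max_chunk) chunk = max_chunk;
    if (chunk < 1) chunk = 1;
    if (count + chunk > size) chunk = size - count;

    printf("chunk of %4zu elements starting at %5zu\n", chunk, count);
    count += chunk;
  }
  return 0;
}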