diff --git a/src/engine.c b/src/engine.c
index 8267b52adb2013263c54103892956e5eb8aee01a..14eb656fd58c2523a1c48077900151e658546c22 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -66,19 +66,11 @@
 #include "timers.h"
 #include "units.h"
 
-const char *engine_policy_names[13] = {"none",
-                                       "rand",
-                                       "steal",
-                                       "keep",
-                                       "block",
-                                       "fix_dt",
-                                       "cpu_tight",
-                                       "mpi",
-                                       "numa_affinity",
-                                       "hydro",
-                                       "self_gravity",
-                                       "external_gravity",
-                                       "cosmology_integration"};
+const char *engine_policy_names[13] = {
+    "none",                 "rand",   "steal",        "keep",
+    "block",                "fix_dt", "cpu_tight",    "mpi",
+    "numa_affinity",        "hydro",  "self_gravity", "external_gravity",
+    "cosmology_integration"};
 
 /** The rank of the engine as a global variable (for messages). */
 int engine_rank;
@@ -98,7 +90,7 @@ static cpu_set_t entry_affinity;
  * @return The new #link pointer.
  */
 
-struct link *engine_addlink(struct engine *e, struct link *l, struct task *t) {
+void engine_addlink(struct engine *e, struct link **l, struct task *t) {
 
   /* Get the next free link. */
   const int ind = atomic_inc(&e->nr_links);
@@ -729,9 +721,9 @@ void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj,
     }
 
     /* Add them to the local cell. */
-    ci->send_xv = engine_addlink(e, ci->send_xv, t_xv);
-    ci->send_rho = engine_addlink(e, ci->send_rho, t_rho);
-    if (t_ti != NULL) ci->send_ti = engine_addlink(e, ci->send_ti, t_ti);
+    engine_addlink(e, &ci->send_xv, t_xv);
+    engine_addlink(e, &ci->send_rho, t_rho);
+    if (t_ti != NULL) engine_addlink(e, &ci->send_ti, t_ti);
   }
 
   /* Recurse? */
@@ -1267,14 +1259,13 @@ void engine_make_hydroloop_tasks(struct engine *e) {
  *
  * @param e The #engine.
  */
- 
+
 void engine_count_and_link_tasks_mapper(void *map_data, int num_elements,
                                         void *extra_data) {
 
   struct engine *e = (struct engine *)extra_data;
   struct task *tasks = (struct task *)map_data;
   struct scheduler *sched = &e->sched;
-  const int nr_tasks = sched->nr_tasks;
 
   for (int ind = 0; ind < num_elements; ind++) {
 
@@ -1317,12 +1308,14 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements,
       atomic_inc(&t->ci->nr_tasks);
       atomic_inc(&t->cj->nr_tasks);
       if (t->subtype == task_subtype_density) {
-        t->ci->density = engine_addlink(e, t->ci->density, t);
+        engine_addlink(e, &t->ci->density, t);
         atomic_inc(&t->ci->nr_density);
-        t->cj->density = engine_addlink(e, t->cj->density, t);
+        engine_addlink(e, &t->cj->density, t);
         atomic_inc(&t->cj->nr_density);
       }
     }
+  }
+}
 
 void engine_count_and_link_tasks_serial(struct engine *e) {
 
@@ -1369,9 +1362,9 @@ void engine_count_and_link_tasks_serial(struct engine *e) {
       atomic_inc(&t->ci->nr_tasks);
       atomic_inc(&t->cj->nr_tasks);
       if (t->subtype == task_subtype_density) {
-        t->ci->density = engine_addlink(e, t->ci->density, t);
+        engine_addlink(e, &t->ci->density, t);
         atomic_inc(&t->ci->nr_density);
-        t->cj->density = engine_addlink(e, t->cj->density, t);
+        engine_addlink(e, &t->cj->density, t);
         atomic_inc(&t->cj->nr_density);
       }
     }
@@ -1379,17 +1372,17 @@ void engine_count_and_link_tasks_serial(struct engine *e) {
 }
 
 /**
- * @brief Creates the dependency network for the hydro tasks of a given cell.
+ * @brief Creates the dependency network for the hydro tasks of a given
+ *cell.
  *
  * @param sched The #scheduler.
  * @param density The density task to link.
  * @param force The force task to link.
  * @param c The cell.
  */
-static inline void engine_make_hydro_loops_dependencies(struct scheduler *sched,
-                                                        struct task *density,
-                                                        struct task *force,
-                                                        struct cell *c) {
+void engine_make_hydro_loops_dependencies(struct scheduler *sched,
+                                          struct task *density,
+                                          struct task *force, struct cell *c) {
 
   /* init --> density loop --> ghost --> force loop --> kick */
   scheduler_addunlock(sched, c->super->init, density);
@@ -1466,33 +1459,49 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
     }
 
     /* Otherwise, sub interaction? */
-    else if (t->type == task_type_sub && t->subtype == task_subtype_density) {
+    else if (t->type == task_type_sub_pair &&
+             t->subtype == task_subtype_density) {
 
       /* Start by constructing the task for the second hydro loop */
       struct task *t2 =
-          scheduler_addtask(sched, task_type_sub, task_subtype_force, t->flags,
-                            0, t->ci, t->cj, 0);
+          scheduler_addtask(sched, task_type_sub_pair, task_subtype_force,
+                            t->flags, 0, t->ci, t->cj, 0);
 
       /* Add the link between the new loop and both cells */
       engine_addlink(e, &t->ci->force, t2);
       atomic_inc(&t->ci->nr_force);
-      if (t->cj != NULL) {
-        engine_addlink(e, &t->cj->force, t2);
-        atomic_inc(&t->cj->nr_force);
-      }
+      engine_addlink(e, &t->cj->force, t2);
+      atomic_inc(&t->cj->nr_force);
 
       /* Now, build all the dependencies for the hydro for the cells */
       /* that are local and are not descendant of the same super-cells */
       if (t->ci->nodeID == nodeID) {
         engine_make_hydro_loops_dependencies(sched, t, t2, t->ci);
       }
-      if (t->cj != NULL && t->cj->nodeID == nodeID &&
-          t->ci->super != t->cj->super) {
+      if (t->cj->nodeID == nodeID && t->ci->super != t->cj->super) {
         engine_make_hydro_loops_dependencies(sched, t, t2, t->cj);
       }
+    } else if (t->type == task_type_sub_self &&
+               t->subtype == task_subtype_density) {
+
+      /* Start by constructing the task for the second hydro loop */
+      struct task *t2 =
+          scheduler_addtask(sched, task_type_sub_self, task_subtype_force,
+                            t->flags, 0, t->ci, t->cj, 0);
+
+      /* Add the link between the new loop and both cells */
+      engine_addlink(e, &t->ci->force, t2);
+      atomic_inc(&t->ci->nr_force);
+
+      /* Now, build all the dependencies for the hydro for the cells */
+      /* that are local and are not descendant of the same super-cells */
+      if (t->ci->nodeID == nodeID) {
+        engine_make_hydro_loops_dependencies(sched, t, t2, t->ci);
+      }
     }
 
-    /* /\* Kick tasks should rely on the grav_down tasks of their cell. *\/ */
+    /* /\* Kick tasks should rely on the grav_down tasks of their cell. *\/
+       */
     /* else if (t->type == task_type_kick && t->ci->grav_down != NULL) */
     /*   scheduler_addunlock(sched, t->ci->grav_down, t); */
 
@@ -1564,7 +1573,7 @@ void engine_make_extra_hydroloop_tasks_serial(struct engine *e) {
                             t->flags, 0, t->ci, t->cj, 0);
 
       /* Add the link between the new loop and the cell */
-      t->ci->force = engine_addlink(e, t->ci->force, t2);
+      engine_addlink(e, &t->ci->force, t2);
       atomic_inc(&t->ci->nr_force);
 
       /* Now, build all the dependencies for the hydro for the cells */
@@ -1584,9 +1593,9 @@ void engine_make_extra_hydroloop_tasks_serial(struct engine *e) {
                             t->flags, 0, t->ci, t->cj, 0);
 
       /* Add the link between the new loop and both cells */
-      t->ci->force = engine_addlink(e, t->ci->force, t2);
+      engine_addlink(e, &t->ci->force, t2);
       atomic_inc(&t->ci->nr_force);
-      t->cj->force = engine_addlink(e, t->cj->force, t2);
+      engine_addlink(e, &t->cj->force, t2);
       atomic_inc(&t->cj->nr_force);
 
       /* Now, build all the dependencies for the hydro for the cells */
@@ -1599,7 +1608,8 @@ void engine_make_extra_hydroloop_tasks_serial(struct engine *e) {
       }
     }
 
-    /* /\* Kick tasks should rely on the grav_down tasks of their cell. *\/ */
+    /* /\* Kick tasks should rely on the grav_down tasks of their cell. *\/
+       */
     /* else if (t->type == task_type_kick && t->ci->grav_down != NULL) */
     /*   scheduler_addunlock(sched, t->ci->grav_down, t); */
 
@@ -1612,7 +1622,8 @@ void engine_make_extra_hydroloop_tasks_serial(struct engine *e) {
 }
 
 /**
- * @brief Constructs the top-level pair tasks for the gravity M-M interactions
+ * @brief Constructs the top-level pair tasks for the gravity M-M
+ *interactions
  *
  * Correct implementation is still lacking here.
  *
@@ -1647,7 +1658,8 @@ void engine_make_gravityinteraction_tasks(struct engine *e) {
 }
 
 /**
- * @brief Constructs the gravity tasks building the multipoles and propagating
+ * @brief Constructs the gravity tasks building the multipoles and
+ *propagating
  *them to the children
  *
  * Correct implementation is still lacking here.
@@ -1708,7 +1720,8 @@ void engine_maketasks(struct engine *e) {
   scheduler_splittasks(sched);
 
   /* Allocate the list of cell-task links. The maximum number of links
-     is the number of cells (s->tot_cells) times the number of neighbours (27)
+     is the number of cells (s->tot_cells) times the number of neighbours
+     (27)
      times the number of interaction types (2, density and force). */
   if (e->links != NULL) free(e->links);
   e->size_links = s->tot_cells * 27 * 2;
@@ -1716,7 +1729,8 @@ void engine_maketasks(struct engine *e) {
     error("Failed to allocate cell-task links.");
   e->nr_links = 0;
 
-  /* Add the gravity up/down tasks at the top-level cells and push them down. */
+  /* Add the gravity up/down tasks at the top-level cells and push them
+   * down. */
   if (e->policy & engine_policy_self_gravity)
     engine_make_gravityrecursive_tasks(e);
 
@@ -1724,7 +1738,8 @@ void engine_maketasks(struct engine *e) {
      store the density tasks in each cell, and make each sort
      depend on the sorts of its progeny. */
   /* threadpool_map(&e->threadpool, engine_count_and_link_tasks_mapper,
-                 sched->tasks, sched->nr_tasks, sizeof(struct task), 1000, e); */
+                 sched->tasks, sched->nr_tasks, sizeof(struct task), 1000,
+     e); */
   engine_count_and_link_tasks_serial(e);
 
   /* Append hierarchical tasks to each cells */
@@ -1740,8 +1755,10 @@ void engine_maketasks(struct engine *e) {
   /* Run through the tasks and make force tasks for each density task.
      Each force task depends on the cell ghosts and unlocks the kick task
      of its super-cell. */
-  /* threadpool_map(&e->threadpool, engine_make_extra_hydroloop_tasks_mapper,
-                 sched->tasks, sched->nr_tasks, sizeof(struct task), 1000, e); */
+  /* threadpool_map(&e->threadpool,
+     engine_make_extra_hydroloop_tasks_mapper,
+                 sched->tasks, sched->nr_tasks, sizeof(struct task), 1000,
+     e); */
   engine_make_extra_hydroloop_tasks_serial(e);
 
   /* Add the communication tasks if MPI is being used. */
@@ -1805,8 +1822,7 @@ void engine_marktasks_fixdt_mapper(void *map_data, int num_elements,
     struct task *t = &tasks[ind];
 
     /* Pair? */
-    if (t->type == task_type_pair ||
-        (t->type == task_type_sub_pair)) {
+    if (t->type == task_type_pair || (t->type == task_type_sub_pair)) {
 
       /* Local pointers. */
       const struct cell *ci = t->ci;
@@ -1855,15 +1871,14 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
 
     /* Single-cell task? */
     if (t->type == task_type_self || t->type == task_type_ghost ||
-        (t->type == task_type_sub && t->cj == NULL)) {
+        t->type == task_type_sub_self) {
 
       /* Set this task's skip. */
       t->skip = (t->ci->ti_end_min > ti_end);
     }
 
     /* Pair? */
-    else if (t->type == task_type_pair ||
-             (t->type == task_type_sub_pair)) {
+    else if (t->type == task_type_pair || (t->type == task_type_sub_pair)) {
 
       /* Local pointers. */
       const struct cell *ci = t->ci;
@@ -2368,7 +2383,8 @@ void engine_launch(struct engine *e, int nr_runners, unsigned int mask,
  *forward in time.
  *
  * @param e The #engine
- * @param flag_entropy_ICs Did the 'Internal Energy' of the particles actually
+ * @param flag_entropy_ICs Did the 'Internal Energy' of the particles
+ *actually
  *contain entropy ?
  */
 void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
@@ -2407,7 +2423,8 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
 
     mask |= 1 << task_type_self;
     mask |= 1 << task_type_pair;
-    mask |= 1 << task_type_sub;
+    mask |= 1 << task_type_sub_pair;
+    mask |= 1 << task_type_sub_self;
     mask |= 1 << task_type_ghost;
 
     submask |= 1 << task_subtype_density;
@@ -2690,7 +2707,8 @@ void engine_makeproxies(struct engine *e) {
 }
 
 /**
- * @brief Split the underlying space into regions and assign to separate nodes.
+ * @brief Split the underlying space into regions and assign to separate
+ *nodes.
  *
  * @param e The #engine.
  * @param initial_partition structure defining the cell partition technique
@@ -2807,7 +2825,7 @@ void engine_dump_snapshot(struct engine *e) {
 /**
  * @brief Returns the initial affinity the main thread is using.
  */
-static cpu_set_t *engine_entry_affinity() {
+cpu_set_t *engine_entry_affinity() {
 
   static int use_entry_affinity = 0;
 
@@ -2822,7 +2840,8 @@ static cpu_set_t *engine_entry_affinity() {
 #endif
 
 /**
- * @brief  Ensure the NUMA node on which we initialise (first touch) everything
+ * @brief  Ensure the NUMA node on which we initialise (first touch)
+ * everything
  *  doesn't change before engine_init allocates NUMA-local workers.
  */
 void engine_pin() {
@@ -3034,9 +3053,11 @@ void engine_init(struct engine *e, struct space *s,
 
   } /* with_aff */
 
-  /* Avoid (unexpected) interference between engine and runner threads. We can
+  /* Avoid (unexpected) interference between engine and runner threads. We
+   * can
    * do this once we've made at least one call to engine_entry_affinity and
-   * maybe numa_node_of_cpu(sched_getcpu()), even if the engine isn't already
+   * maybe numa_node_of_cpu(sched_getcpu()), even if the engine isn't
+   * already
    * pinned. Also unpin this when asked to not pin at all (!with_aff). */
   engine_unpin();
 #endif
@@ -3086,14 +3107,16 @@ void engine_init(struct engine *e, struct space *s,
   /* Check we have sensible time bounds */
   if (e->timeBegin >= e->timeEnd)
     error(
-        "Final simulation time (t_end = %e) must be larger than the start time "
+        "Final simulation time (t_end = %e) must be larger than the start "
+        "time "
         "(t_beg = %e)",
         e->timeEnd, e->timeBegin);
 
   /* Check we have sensible time-step values */
   if (e->dt_min > e->dt_max)
     error(
-        "Minimal time-step size (%e) must be smaller than maximal time-step "
+        "Minimal time-step size (%e) must be smaller than maximal "
+        "time-step "
         "size (%e)",
         e->dt_min, e->dt_max);
 
@@ -3147,7 +3170,8 @@ void engine_init(struct engine *e, struct space *s,
 
   if (e->timeFirstSnapshot < e->timeBegin)
     error(
-        "Time of first snapshot (%e) must be after the simulation start t=%e.",
+        "Time of first snapshot (%e) must be after the simulation start "
+        "t=%e.",
         e->timeFirstSnapshot, e->timeBegin);
 
   /* Find the time of the first output */
@@ -3172,18 +3196,7 @@ void engine_init(struct engine *e, struct space *s,
   /* Init the scheduler with enough tasks for the initial sorting tasks. */
   const int nr_tasks = 2 * s->tot_cells + 2 * e->nr_threads;
   scheduler_init(&e->sched, e->s, nr_tasks, nr_queues, scheduler_flag_steal,
-                 e->nodeID);
-
-  /* Create the sorting tasks. */
-  for (int i = 0; i < e->nr_threads; i++) {
-    scheduler_addtask(&e->sched, task_type_part_sort, task_subtype_none, i, 0,
-                      NULL, NULL, 0);
-
-    scheduler_addtask(&e->sched, task_type_gpart_sort, task_subtype_none, i, 0,
-                      NULL, NULL, 0);
-  }
-
-  scheduler_ranktasks(&e->sched);
+                 e->nodeID, &e->threadpool);
 
   /* Allocate and init the threads. */
   if ((e->runners = (struct runner *)malloc(sizeof(struct runner) *
diff --git a/src/scheduler.c b/src/scheduler.c
index 05f699557cc7aafb641112bf9d9493188ffde52c..02749bbba9d643d4893ed65b40d978a6831792fa 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -161,7 +161,7 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements,
           if (scheduler_dosub && ci->count < space_subsize / ci->count) {
 
             /* convert to a self-subtask. */
-            t->type = task_type_sub;
+            t->type = task_type_sub_self;
 
             /* Otherwise, make tasks explicitly. */
           } else {
@@ -224,7 +224,7 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements,
               sid != 0 && sid != 2 && sid != 6 && sid != 8) {
 
             /* Make this task a sub task. */
-            t->type = task_type_sub;
+            t->type = task_type_sub_pair;
 
             /* Otherwise, split it. */
           } else {
@@ -757,8 +757,11 @@ void scheduler_ranktasks(struct scheduler *s) {
   const int nr_tasks = s->nr_tasks;
 
   /* Run through the tasks and get all the waits right. */
-  threadpool_map(s->threadpool, scheduler_simple_rewait_mapper, tasks, nr_tasks,
-                 sizeof(struct task), 1000, NULL);
+  for (int k = 0; k < nr_tasks; k++) {
+    tid[k] = k;
+    for (int j = 0; j < tasks[k].nr_unlock_tasks; j++)
+      tasks[k].unlock_tasks[j]->wait += 1;
+  }
 
   /* Load the tids of tasks with no waits. */
   int left = 0;
@@ -1305,7 +1308,8 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
  */
 
 void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
-                    int nr_queues, unsigned int flags, int nodeID) {
+                    int nr_queues, unsigned int flags, int nodeID,
+                    struct threadpool *tp) {
 
   /* Init the lock. */
   lock_init(&s->lock);
@@ -1337,6 +1341,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
   s->flags = flags;
   s->space = space;
   s->nodeID = nodeID;
+  s->threadpool = tp;
 
   /* Init the tasks array. */
   s->size = 0;
diff --git a/src/space.c b/src/space.c
index 04214e49e0c66fc468c91f0098f75c10ef59c252..9499bbb4712c61f020f4aa88423a8dd8a6db3811 100644
--- a/src/space.c
+++ b/src/space.c
@@ -1226,6 +1226,7 @@ void space_split_mapper(void *map_data, int num_elements, void *extra_data) {
   for (int ind = 0; ind < num_elements; ind++) {
 
     struct cell *c = &cells[ind];
+    if (c == NULL) continue;
 
     const int count = c->count;
     const int gcount = c->gcount;
@@ -1271,7 +1272,7 @@ void space_split_mapper(void *map_data, int num_elements, void *extra_data) {
       }
 
       /* Split the cell data. */
-      cell_split(c);
+      cell_split(c, c->parts - s->parts);
 
       /* Remove any progeny with zero parts. */
       for (int k = 0; k < 8; k++)