diff --git a/src/cell.c b/src/cell.c
index 05a1990dd5355877cadcd0526252a6266dddcf6b..b9a49adee932e557eaba8d7e786be47d10f2ce1c 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -186,6 +186,7 @@ int cell_pack(struct cell *restrict c, struct pcell *restrict pc) {
   pc->count = c->count;
   pc->gcount = c->gcount;
   pc->scount = c->scount;
+
   c->tag = pc->tag = atomic_inc(&cell_next_tag) % cell_max_tag;
 #ifdef SWIFT_DEBUG_CHECKS
   pc->cellID = c->cellID;
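
Note (not part of the patch): the context line above already assigns each cell a unique tag modulo cell_max_tag; the rest of this patch uses that value directly as the MPI message tag instead of packing the sub-type into it as 6 * tag + k, because each sub-type now travels on its own communicator (see src/task.c below). A minimal sketch of an optional sanity check against the MPI tag limit — EXAMPLE_CELL_MAX_TAG and check_cell_tag_range() are hypothetical names, not SWIFT code:

    #include <mpi.h>
    #include <stdio.h>

    /* Hypothetical stand-in for cell_max_tag. */
    #define EXAMPLE_CELL_MAX_TAG (1 << 16)

    static void check_cell_tag_range(void) {
      void *attr = NULL;
      int flag = 0;
      /* MPI_TAG_UB is a predefined attribute of MPI_COMM_WORLD. */
      MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &attr, &flag);
      if (flag && EXAMPLE_CELL_MAX_TAG > *(int *)attr)
        fprintf(stderr, "Cell tags (< %d) may exceed MPI_TAG_UB (%d).\n",
                EXAMPLE_CELL_MAX_TAG, *(int *)attr);
    }
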
diff --git a/src/engine.c b/src/engine.c
index de032b1c939743f36e679844cf2e238bf30becd8..7be4de0f7c6a35d581964197d28aee5e2eaa8b85 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -1325,12 +1325,12 @@ void engine_addtasks_send_hydro(struct engine *e, struct cell *ci,
     if (t_xv == NULL) {
 
       t_xv = scheduler_addtask(s, task_type_send, task_subtype_xv,
-                               6 * ci->tag + 0, 0, ci, cj);
+                               ci->tag, 0, ci, cj);
       t_rho = scheduler_addtask(s, task_type_send, task_subtype_rho,
-                                6 * ci->tag + 1, 0, ci, cj);
+                                ci->tag, 0, ci, cj);
 #ifdef EXTRA_HYDRO_LOOP
       t_gradient = scheduler_addtask(s, task_type_send, task_subtype_gradient,
-                                     6 * ci->tag + 3, 0, ci, cj);
+                                     ci->tag, 0, ci, cj);
 #endif
 
 #ifdef EXTRA_HYDRO_LOOP
@@ -1414,7 +1414,7 @@ void engine_addtasks_send_gravity(struct engine *e, struct cell *ci,
     if (t_grav == NULL) {
 
       t_grav = scheduler_addtask(s, task_type_send, task_subtype_gpart,
-                                 6 * ci->tag + 4, 0, ci, cj);
+                                 ci->tag, 0, ci, cj);
 
       /* The sends should unlock the down pass. */
       scheduler_addunlock(s, t_grav, ci->super_gravity->grav_down);
@@ -1474,7 +1474,7 @@ void engine_addtasks_send_timestep(struct engine *e, struct cell *ci,
     if (t_ti == NULL) {
 
       t_ti = scheduler_addtask(s, task_type_send, task_subtype_tend,
-                               6 * ci->tag + 2, 0, ci, cj);
+                               ci->tag, 0, ci, cj);
 
       /* The super-cell's timestep task should unlock the send_ti task. */
       scheduler_addunlock(s, ci->super->timestep, t_ti);
@@ -1515,13 +1515,13 @@ void engine_addtasks_recv_hydro(struct engine *e, struct cell *c,
   if (t_xv == NULL && c->density != NULL) {
 
     /* Create the tasks. */
-    t_xv = scheduler_addtask(s, task_type_recv, task_subtype_xv, 6 * c->tag + 0,
+    t_xv = scheduler_addtask(s, task_type_recv, task_subtype_xv, c->tag,
                              0, c, NULL);
     t_rho = scheduler_addtask(s, task_type_recv, task_subtype_rho,
-                              6 * c->tag + 1, 0, c, NULL);
+                              c->tag, 0, c, NULL);
 #ifdef EXTRA_HYDRO_LOOP
     t_gradient = scheduler_addtask(s, task_type_recv, task_subtype_gradient,
-                                   6 * c->tag + 3, 0, c, NULL);
+                                   c->tag, 0, c, NULL);
 #endif
   }
 
@@ -1577,7 +1577,7 @@ void engine_addtasks_recv_gravity(struct engine *e, struct cell *c,
 
     /* Create the tasks. */
     t_grav = scheduler_addtask(s, task_type_recv, task_subtype_gpart,
-                               6 * c->tag + 4, 0, c, NULL);
+                               c->tag, 0, c, NULL);
   }
 
   c->recv_grav = t_grav;
@@ -1613,7 +1613,7 @@ void engine_addtasks_recv_timestep(struct engine *e, struct cell *c,
   if (t_ti == NULL && (c->grav != NULL || c->density != NULL)) {
 
     t_ti = scheduler_addtask(s, task_type_recv, task_subtype_tend,
-                             6 * c->tag + 2, 0, c, NULL);
+                             c->tag, 0, c, NULL);
   }
 
   c->recv_ti = t_ti;
@@ -4793,7 +4793,7 @@ void engine_step(struct engine *e) {
   if (e->verbose) engine_print_task_counts(e);
 
     /* Dump local cells and active particle counts. */
-    /* dumpCells("cells", 0, 0, 0, 0, e->s, e->nodeID, e->step); */
+    /* dumpCells("cells", 1, 0, 0, 0, e->s, e->nodeID, e->step); */
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Check that we have the correct total mass in the top-level multipoles */
@@ -6341,6 +6341,7 @@ void engine_config(int restart, struct engine *e, struct swift_params *params,
 #ifdef WITH_MPI
   part_create_mpi_types();
   stats_create_MPI_type();
+  task_create_mpi_comms();
 #endif
 
   /* Initialise the collection group. */
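
Note (not part of the patch): with the per-sub-type communicators created by task_create_mpi_comms() above, the bare ci->tag / c->tag is enough to match each send to its recv; messages of different sub-types can no longer collide even though they now share the same integer tag. A minimal sketch of that matching, assuming comm_xv and comm_rho are communicators duplicated from MPI_COMM_WORLD (as subtaskMPI_comms[] is in src/task.c) and post_example_recvs() is an illustrative name:

    #include <mpi.h>

    /* Post two receives that share a tag but use different communicators,
     * so they can never match the wrong send. */
    static void post_example_recvs(void *xv_buf, void *rho_buf, int count,
                                   MPI_Datatype type, int src, int cell_tag,
                                   MPI_Comm comm_xv, MPI_Comm comm_rho,
                                   MPI_Request req[2]) {
      MPI_Irecv(xv_buf, count, type, src, cell_tag, comm_xv, &req[0]);
      MPI_Irecv(rho_buf, count, type, src, cell_tag, comm_rho, &req[1]);
    }
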
diff --git a/src/scheduler.c b/src/scheduler.c
index 4974884651b02db57d851493a4fc8fe343483a05..a089f802175138059e4afa53a46ebddc4b731803 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -1558,27 +1558,31 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
                                                 t->ci->pcell_size);
           err = MPI_Irecv(
               t->buff, t->ci->pcell_size * sizeof(struct pcell_step), MPI_BYTE,
-              t->ci->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+              t->ci->nodeID, t->flags, subtaskMPI_comms[t->subtype], &t->req);
         } else if (t->subtype == task_subtype_xv ||
                    t->subtype == task_subtype_rho ||
                    t->subtype == task_subtype_gradient) {
           err = MPI_Irecv(t->ci->parts, t->ci->count, part_mpi_type,
-                          t->ci->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+                          t->ci->nodeID, t->flags,
+                          subtaskMPI_comms[t->subtype], &t->req);
           // message( "receiving %i parts with tag=%i from %i to %i." ,
           //     t->ci->count , t->flags , t->ci->nodeID , s->nodeID );
           // fflush(stdout);
         } else if (t->subtype == task_subtype_gpart) {
           err = MPI_Irecv(t->ci->gparts, t->ci->gcount, gpart_mpi_type,
-                          t->ci->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+                          t->ci->nodeID, t->flags,
+                          subtaskMPI_comms[t->subtype], &t->req);
         } else if (t->subtype == task_subtype_spart) {
           err = MPI_Irecv(t->ci->sparts, t->ci->scount, spart_mpi_type,
-                          t->ci->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+                          t->ci->nodeID, t->flags,
+                          subtaskMPI_comms[t->subtype], &t->req);
         } else if (t->subtype == task_subtype_multipole) {
           t->buff = (struct gravity_tensors *)malloc(
               sizeof(struct gravity_tensors) * t->ci->pcell_size);
           err = MPI_Irecv(
               t->buff, sizeof(struct gravity_tensors) * t->ci->pcell_size,
-              MPI_BYTE, t->ci->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+              MPI_BYTE, t->ci->nodeID, t->flags,
+              subtaskMPI_comms[t->subtype], &t->req);
         } else {
           error("Unknown communication sub-type");
         }
@@ -1600,44 +1604,53 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
               s->mpi_message_limit)
             err = MPI_Isend(
                 t->buff, t->ci->pcell_size * sizeof(struct pcell_step),
-                MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+                MPI_BYTE, t->cj->nodeID, t->flags,
+                subtaskMPI_comms[t->subtype], &t->req);
           else
             err = MPI_Issend(
                 t->buff, t->ci->pcell_size * sizeof(struct pcell_step),
-                MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+                MPI_BYTE, t->cj->nodeID, t->flags,
+                subtaskMPI_comms[t->subtype], &t->req);
         } else if (t->subtype == task_subtype_xv ||
                    t->subtype == task_subtype_rho ||
                    t->subtype == task_subtype_gradient) {
           if ((t->ci->count * sizeof(struct part)) > s->mpi_message_limit)
             err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type,
-                            t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+                            t->cj->nodeID, t->flags,
+                            subtaskMPI_comms[t->subtype], &t->req);
           else
             err = MPI_Issend(t->ci->parts, t->ci->count, part_mpi_type,
-                             t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+                             t->cj->nodeID, t->flags,
+                             subtaskMPI_comms[t->subtype], &t->req);
           // message( "sending %i parts with tag=%i from %i to %i." ,
           //     t->ci->count , t->flags , s->nodeID , t->cj->nodeID );
           // fflush(stdout);
         } else if (t->subtype == task_subtype_gpart) {
           if ((t->ci->gcount * sizeof(struct gpart)) > s->mpi_message_limit)
             err = MPI_Isend(t->ci->gparts, t->ci->gcount, gpart_mpi_type,
-                            t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+                            t->cj->nodeID, t->flags,
+                            subtaskMPI_comms[t->subtype], &t->req);
           else
             err = MPI_Issend(t->ci->gparts, t->ci->gcount, gpart_mpi_type,
-                             t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+                             t->cj->nodeID, t->flags,
+                             subtaskMPI_comms[t->subtype], &t->req);
         } else if (t->subtype == task_subtype_spart) {
           if ((t->ci->scount * sizeof(struct spart)) > s->mpi_message_limit)
             err = MPI_Isend(t->ci->sparts, t->ci->scount, spart_mpi_type,
-                            t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+                            t->cj->nodeID, t->flags,
+                            subtaskMPI_comms[t->subtype], &t->req);
           else
             err = MPI_Issend(t->ci->sparts, t->ci->scount, spart_mpi_type,
-                             t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+                             t->cj->nodeID, t->flags,
+                             subtaskMPI_comms[t->subtype], &t->req);
         } else if (t->subtype == task_subtype_multipole) {
           t->buff = (struct gravity_tensors *)malloc(
               sizeof(struct gravity_tensors) * t->ci->pcell_size);
           cell_pack_multipoles(t->ci, (struct gravity_tensors *)t->buff);
           err = MPI_Isend(
               t->buff, t->ci->pcell_size * sizeof(struct gravity_tensors),
-              MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+              MPI_BYTE, t->cj->nodeID, t->flags,
+              subtaskMPI_comms[t->subtype], &t->req);
         } else {
           error("Unknown communication sub-type");
         }
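
Note (not part of the patch): every send above keeps its existing mode policy — payloads larger than s->mpi_message_limit go through MPI_Isend, the rest through the synchronous MPI_Issend — and only the communicator argument changes from MPI_COMM_WORLD to subtaskMPI_comms[t->subtype]. A hedged sketch of that repeated pattern as a helper; the function name and parameters are illustrative, not SWIFT API:

    #include <mpi.h>
    #include <stddef.h>

    /* Choose the send mode by message size and route the message over the
     * sub-type's communicator instead of MPI_COMM_WORLD. */
    static int send_on_subtype_comm(void *buf, int count, MPI_Datatype type,
                                    size_t bytes, size_t message_limit,
                                    int dest, int tag, MPI_Comm subtype_comm,
                                    MPI_Request *req) {
      if (bytes > message_limit)
        return MPI_Isend(buf, count, type, dest, tag, subtype_comm, req);
      return MPI_Issend(buf, count, type, dest, tag, subtype_comm, req);
    }
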
diff --git a/src/task.c b/src/task.c
index 2782dabfc1369dedd43e9b42855a8b43acf2f1b7..406698e987d4d497d986171c58b321e40f8ad24b 100644
--- a/src/task.c
+++ b/src/task.c
@@ -63,6 +63,11 @@ const char *subtaskID_names[task_subtype_count] = {
     "none", "density", "gradient", "force", "grav",      "external_grav",
     "tend", "xv",      "rho",      "gpart", "multipole", "spart"};
 
+#ifdef WITH_MPI
+/* MPI communicators for the subtypes. */
+MPI_Comm subtaskMPI_comms[task_subtype_count];
+#endif
+
 /**
  * @brief Computes the overlap between the parts array of two given cells.
  *
@@ -485,3 +490,14 @@ void task_print(const struct task *t) {
           taskID_names[t->type], subtaskID_names[t->subtype], t->wait,
           t->nr_unlock_tasks, t->skip);
 }
+
+#ifdef WITH_MPI
+/**
+ * @brief Create global communicators for each of the subtask types.
+ */
+void task_create_mpi_comms(void) {
+  for (int i = 0; i < task_subtype_count; i++) {
+    MPI_Comm_dup(MPI_COMM_WORLD, &subtaskMPI_comms[i]);
+  }
+}
+#endif
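
Note (not part of the patch): task_create_mpi_comms() duplicates MPI_COMM_WORLD once per sub-type, so task_subtype_count extra communicators exist for the lifetime of the run. A possible companion cleanup, sketched here under the hypothetical name task_free_mpi_comms() — the patch itself adds no such routine:

    #ifdef WITH_MPI
    /* Release the duplicated communicators, e.g. before MPI_Finalize(). */
    void task_free_mpi_comms(void) {
      for (int i = 0; i < task_subtype_count; i++) {
        MPI_Comm_free(&subtaskMPI_comms[i]);
      }
    }
    #endif
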
diff --git a/src/task.h b/src/task.h
index 072d3979ce04990aaef46c5cc5eb0b8c62fdc860..58ea3a8cbb93b38b47ab7b6a243c3ee6c85d40b7 100644
--- a/src/task.h
+++ b/src/task.h
@@ -110,6 +110,13 @@ extern const char *taskID_names[];
  */
 extern const char *subtaskID_names[];
 
+/**
+ * @brief The MPI communicators for the different subtypes.
+ */
+#ifdef WITH_MPI
+extern MPI_Comm subtaskMPI_comms[task_subtype_count];
+#endif
+
 /**
  * @brief A task to be run by the #scheduler.
  */
@@ -187,5 +194,7 @@ float task_overlap(const struct task *ta, const struct task *tb);
 int task_lock(struct task *t);
 void task_do_rewait(struct task *t);
 void task_print(const struct task *t);
-
+#ifdef WITH_MPI
+void task_create_mpi_comms(void);
+#endif
 #endif /* SWIFT_TASK_H */
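
Usage note (not part of the patch): any caller that includes task.h can select the communicator for a sub-type by indexing subtaskMPI_comms[] with the sub-type identifier, provided task_create_mpi_comms() has already run (engine_config() does this right after the other MPI type setup). A trivial sketch, with get_xv_comm() as an illustrative name:

    #ifdef WITH_MPI
    /* Return the communicator used for task_subtype_xv exchanges; valid only
     * after task_create_mpi_comms() has been called. */
    static MPI_Comm get_xv_comm(void) {
      return subtaskMPI_comms[task_subtype_xv];
    }
    #endif
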