diff --git a/src/cell.h b/src/cell.h
index 89f7c954c29f23845cb9c876b93c6bb3d469fbe5..59780697f42bce4b1fb778d7fcdf471de9cb2b27 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -205,6 +205,11 @@ struct pcell_step {
  */
 struct cell {
 
+  /*! Number of hydro proxies this cell is part of (debugging counter). */
+  int num_hydro_proxies;
+
+  /*! Number of gravity proxies this cell is part of (debugging counter). */
+  int num_grav_proxies;
+
+  /*! Number of hydro pair tasks involving a foreign cell (debugging counter). */
+  int num_foreign_pair_hydro;
+
+  /*! Number of gravity pair tasks involving a foreign cell (debugging counter). */
+  int num_foreign_pair_grav;
+
   /*! The cell location on the grid. */
   double loc[3];
 
diff --git a/src/engine.c b/src/engine.c
index f728862487f1666302dd10c5b1bb7c128ae0a0ee..5669612c82b266de2b23df891d81bd65b171f726 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -3620,6 +3620,8 @@ void engine_makeproxies(struct engine *e) {
                         cell_width[2] * cell_width[2];
   const double r_max = sqrt(r_max2);
 
+  message("PROXIES: r_max= %e max_distance= %e", r_max, max_distance);
+
   /* Prepare the proxies and the proxy index. */
   if (e->proxy_ind == NULL)
     if ((e->proxy_ind = (int *)malloc(sizeof(int) * e->nr_nodes)) == NULL)
@@ -3651,6 +3653,12 @@ void engine_makeproxies(struct engine *e) {
     }
   }
 
+  /* Reset the debugging proxy counters. */
+  for (int i = 0; i < e->s->nr_cells; ++i) {
+    cells[i].num_hydro_proxies = 0;
+    cells[i].num_grav_proxies = 0;
+  }
+
   /* Let's be verbose about this choice */
   if (e->verbose)
     message(
@@ -3659,44 +3667,37 @@ void engine_makeproxies(struct engine *e) {
         delta, delta_m, delta_p);
 
   /* Loop over each cell in the space. */
-  int ind[3];
-  for (ind[0] = 0; ind[0] < cdim[0]; ind[0]++) {
-    for (ind[1] = 0; ind[1] < cdim[1]; ind[1]++) {
-      for (ind[2] = 0; ind[2] < cdim[2]; ind[2]++) {
-
+  for (int i = 0; i < cdim[0]; i++) {
+    for (int j = 0; j < cdim[1]; j++) {
+      for (int k = 0; k < cdim[2]; k++) {
+
         /* Get the cell ID. */
-        const int cid = cell_getid(cdim, ind[0], ind[1], ind[2]);
-
+        const int cid = cell_getid(cdim, i, j, k);
+
         /* and it's location */
         const double loc_i[3] = {cells[cid].loc[0], cells[cid].loc[1],
                                  cells[cid].loc[2]};
-
-        /* Loop over all its neighbours (periodic). */
-        for (int i = -delta_m; i <= delta_p; i++) {
-          int ii = ind[0] + i;
-          if (ii >= cdim[0])
-            ii -= cdim[0];
-          else if (ii < 0)
-            ii += cdim[0];
-          for (int j = -delta_m; j <= delta_p; j++) {
-            int jj = ind[1] + j;
-            if (jj >= cdim[1])
-              jj -= cdim[1];
-            else if (jj < 0)
-              jj += cdim[1];
-            for (int k = -delta_m; k <= delta_p; k++) {
-              int kk = ind[2] + k;
-              if (kk >= cdim[2])
-                kk -= cdim[2];
-              else if (kk < 0)
-                kk += cdim[2];
-
+
+        /* Loop over all its neighbours in range (with periodic wrapping). */
+        for (int ii = -delta_m; ii <= delta_p; ii++) {
+          int iii = i + ii;
+          if (!periodic && (iii < 0 || iii >= cdim[0])) continue;
+          iii = (iii + cdim[0]) % cdim[0];
+          for (int jj = -delta_m; jj <= delta_p; jj++) {
+            int jjj = j + jj;
+            if (!periodic && (jjj < 0 || jjj >= cdim[1])) continue;
+            jjj = (jjj + cdim[1]) % cdim[1];
+            for (int kk = -delta_m; kk <= delta_p; kk++) {
+              int kkk = k + kk;
+              if (!periodic && (kkk < 0 || kkk >= cdim[2])) continue;
+              kkk = (kkk + cdim[2]) % cdim[2];
+
               /* Get the cell ID. */
-              const int cjd = cell_getid(cdim, ii, jj, kk);
-
+              const int cjd = cell_getid(cdim, iii, jjj, kkk);
+
               /* Early abort (same cell) */
               if (cid == cjd) continue;
-
+
               /* Early abort (both same node) */
               if (cells[cid].nodeID == nodeID && cells[cjd].nodeID == nodeID)
                 continue;
@@ -3704,7 +3705,7 @@ void engine_makeproxies(struct engine *e) {
               /* Early abort (both foreign node) */
               if (cells[cid].nodeID != nodeID && cells[cjd].nodeID != nodeID)
                 continue;
-
+
               int proxy_type = 0;
 
               /* In the hydro case, only care about direct neighbours */
@@ -3715,15 +3716,15 @@ void engine_makeproxies(struct engine *e) {
 
                 /* This is super-ugly but checks for direct neighbours */
                 /* with periodic BC */
-                if (((abs(ind[0] - ii) <= 1 ||
-                      abs(ind[0] - ii - cdim[0]) <= 1 ||
-                      abs(ind[0] - ii + cdim[0]) <= 1) &&
-                     (abs(ind[1] - jj) <= 1 ||
-                      abs(ind[1] - jj - cdim[1]) <= 1 ||
-                      abs(ind[1] - jj + cdim[1]) <= 1) &&
-                     (abs(ind[2] - kk) <= 1 ||
-                      abs(ind[2] - kk - cdim[2]) <= 1 ||
-                      abs(ind[2] - kk + cdim[2]) <= 1)))
+                if (((abs(i - iii) <= 1 ||
+                      abs(i - iii - cdim[0]) <= 1 ||
+                      abs(i - iii + cdim[0]) <= 1) &&
+                     (abs(j - jjj) <= 1 ||
+                      abs(j - jjj - cdim[1]) <= 1 ||
+                      abs(j - jjj + cdim[1]) <= 1) &&
+                     (abs(k - kkk) <= 1 ||
+                      abs(k - kkk - cdim[2]) <= 1 ||
+                      abs(k - kkk + cdim[2]) <= 1)))
                   proxy_type |= (int)proxy_cell_type_hydro;
               }
 
@@ -3786,8 +3787,8 @@ void engine_makeproxies(struct engine *e) {
               if (cells[cid].nodeID == nodeID && cells[cjd].nodeID != nodeID) {
 
                 /* Do we already have a relationship with this node? */
-                int pid = e->proxy_ind[cells[cjd].nodeID];
-                if (pid < 0) {
+                int proxy_id = e->proxy_ind[cells[cjd].nodeID];
+                if (proxy_id < 0) {
                   if (e->nr_proxies == engine_maxproxies)
                     error("Maximum number of proxies exceeded.");
 
@@ -3797,24 +3798,34 @@ void engine_makeproxies(struct engine *e) {
 
                   /* Store the information */
                   e->proxy_ind[cells[cjd].nodeID] = e->nr_proxies;
-                  pid = e->nr_proxies;
+                  proxy_id = e->nr_proxies;
                   e->nr_proxies += 1;
+
+                  /* Check that proxy_id still fits in the cell.mpi.sendto bit-mask */
+                  if (proxy_id >= (int)(8 * sizeof(long long)))
+                    error("Created more than %zu proxies. cell.mpi.sendto will overflow.",
+                          8 * sizeof(long long));
                 }
 
                 /* Add the cell to the proxy */
-                proxy_addcell_in(&proxies[pid], &cells[cjd], proxy_type);
-                proxy_addcell_out(&proxies[pid], &cells[cid], proxy_type);
+                proxy_addcell_in(&proxies[proxy_id], &cells[cjd], proxy_type);
+                proxy_addcell_out(&proxies[proxy_id], &cells[cid], proxy_type);
+
+                if (proxy_type & (int)proxy_cell_type_gravity)
+                  cells[cid].num_grav_proxies++;
+                if (proxy_type & (int)proxy_cell_type_hydro)
+                  cells[cid].num_hydro_proxies++;
 
                 /* Store info about where to send the cell */
-                cells[cid].mpi.sendto |= (1ULL << pid);
+                cells[cid].mpi.sendto |= (1ULL << proxy_id);
               }
 
               /* Same for the symmetric case? */
               if (cells[cjd].nodeID == nodeID && cells[cid].nodeID != nodeID) {
 
                 /* Do we already have a relationship with this node? */
-                int pid = e->proxy_ind[cells[cid].nodeID];
-                if (pid < 0) {
+                int proxy_id = e->proxy_ind[cells[cid].nodeID];
+                if (proxy_id < 0) {
                   if (e->nr_proxies == engine_maxproxies)
                     error("Maximum number of proxies exceeded.");
 
@@ -3824,16 +3835,26 @@ void engine_makeproxies(struct engine *e) {
 
                   /* Store the information */
                   e->proxy_ind[cells[cid].nodeID] = e->nr_proxies;
-                  pid = e->nr_proxies;
+                  proxy_id = e->nr_proxies;
                   e->nr_proxies += 1;
+
+                  /* Check that proxy_id still fits in the cell.mpi.sendto bit-mask */
+                  if (proxy_id >= (int)(8 * sizeof(long long)))
+                    error("Created more than %zu proxies. cell.mpi.sendto will overflow.",
+                          8 * sizeof(long long));
                 }
 
                 /* Add the cell to the proxy */
-                proxy_addcell_in(&proxies[pid], &cells[cid], proxy_type);
-                proxy_addcell_out(&proxies[pid], &cells[cjd], proxy_type);
+                proxy_addcell_in(&proxies[proxy_id], &cells[cid], proxy_type);
+                proxy_addcell_out(&proxies[proxy_id], &cells[cjd], proxy_type);
+
+                if (proxy_type & (int)proxy_cell_type_gravity)
+                  cells[cjd].num_grav_proxies++;
+                if (proxy_type & (int)proxy_cell_type_hydro)
+                  cells[cjd].num_hydro_proxies++;
 
                 /* Store info about where to send the cell */
-                cells[cjd].mpi.sendto |= (1ULL << pid);
+                cells[cjd].mpi.sendto |= (1ULL << proxy_id);
               }
             }
           }
diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c
index e4431a8cf14998774623e3575ecfdae08333e556..cbf14caa18e27ea4b4d22f3b047e343147525d47 100644
--- a/src/engine_maketasks.c
+++ b/src/engine_maketasks.c
@@ -757,7 +757,7 @@ void engine_make_hierarchical_tasks_stars(struct engine *e, struct cell *c) {
 void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
                                            void *extra_data) {
 
-  struct engine *e = ((struct engine **)extra_data)[0];
+  struct engine *e = (struct engine *)extra_data;
   struct space *s = e->s;
   struct scheduler *sched = &e->sched;
   const int nodeID = e->nodeID;
@@ -802,15 +802,12 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
     /* Skip cells without gravity particles */
     if (ci->grav.count == 0) continue;
 
-    /* Is that cell local ? */
-    if (ci->nodeID != nodeID) continue;
-
-    /* If the cells is local build a self-interaction */
-    scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, NULL);
+    /* If the cell is local build a self-interaction */
+    if (ci->nodeID == nodeID)
+      scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci,
+                        NULL);
 
-    /* Recover the multipole information */
-    const struct gravity_tensors *const multi_i = ci->grav.multipole;
-    const double CoM_i[3] = {multi_i->CoM[0], multi_i->CoM[1], multi_i->CoM[2]};
+    message("delta_m=%d delta_p=%d", delta_m, delta_p);
 
 #ifdef SWIFT_DEBUG_CHECKS
     if (cell_getid(cdim, i, j, k) != cid)
@@ -824,54 +821,41 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
 #endif
 
     /* Loop over every other cell within (Manhattan) range delta */
-    for (int x = -delta_m; x <= delta_p; x++) {
-      int ii = i + x;
-      if (ii >= cdim[0])
-        ii -= cdim[0];
-      else if (ii < 0)
-        ii += cdim[0];
-      for (int y = -delta_m; y <= delta_p; y++) {
-        int jj = j + y;
-        if (jj >= cdim[1])
-          jj -= cdim[1];
-        else if (jj < 0)
-          jj += cdim[1];
-        for (int z = -delta_m; z <= delta_p; z++) {
-          int kk = k + z;
-          if (kk >= cdim[2])
-            kk -= cdim[2];
-          else if (kk < 0)
-            kk += cdim[2];
+    for (int ii = -delta_m; ii <= delta_p; ii++) {
+      int iii = i + ii;
+      if (!periodic && (iii < 0 || iii >= cdim[0])) continue;
+      iii = (iii + cdim[0]) % cdim[0];
+      for (int jj = -delta_m; jj <= delta_p; jj++) {
+        int jjj = j + jj;
+        if (!periodic && (jjj < 0 || jjj >= cdim[1])) continue;
+        jjj = (jjj + cdim[1]) % cdim[1];
+        for (int kk = -delta_m; kk <= delta_p; kk++) {
+          int kkk = k + kk;
+          if (!periodic && (kkk < 0 || kkk >= cdim[2])) continue;
+          kkk = (kkk + cdim[2]) % cdim[2];
 
           /* Get the cell */
-          const int cjd = cell_getid(cdim, ii, jj, kk);
+          const int cjd = cell_getid(cdim, iii, jjj, kkk);
           struct cell *cj = &cells[cjd];
 
-#ifdef SWIFT_DEBUG_CHECKS
-          const int iii = cjd / (cdim[1] * cdim[2]);
-          const int jjj = (cjd / cdim[2]) % cdim[1];
-          const int kkk = cjd % cdim[2];
-
-          if (ii != iii || jj != jjj || kk != kkk)
-            error(
-                "Incorrect calculation of indices (iii,jjj,kkk)=(%d,%d,%d) "
-                "cjd=%d",
-                iii, jjj, kkk, cjd);
-#endif
-
           /* Avoid duplicates of local pairs*/
-          if (cid <= cjd && cj->nodeID == nodeID) continue;
-
-          /* Skip cells without gravity particles */
-          if (cj->grav.count == 0) continue;
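+          /* Also skip empty cells and pairs where neither cell is local. */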
+          if (cid >= cjd || cj->grav.count == 0 ||
+              (ci->nodeID != nodeID && cj->nodeID != nodeID))
+            continue;
 
           /* Recover the multipole information */
-          const struct gravity_tensors *const multi_j = cj->grav.multipole;
+          const struct gravity_tensors *multi_i = ci->grav.multipole;
+          const struct gravity_tensors *multi_j = cj->grav.multipole;
+
+          if (multi_i == NULL && ci->nodeID != nodeID)
+            error("Multipole of ci was not exchanged properly via the proxies");
+          if (multi_j == NULL && cj->nodeID != nodeID)
+            error("Multipole of cj was not exchanged properly via the proxies");
 
           /* Get the distance between the CoMs */
-          double dx = CoM_i[0] - multi_j->CoM[0];
-          double dy = CoM_i[1] - multi_j->CoM[1];
-          double dz = CoM_i[2] - multi_j->CoM[2];
+          double dx = multi_i->CoM[0] - multi_j->CoM[0];
+          double dy = multi_i->CoM[1] - multi_j->CoM[1];
+          double dz = multi_i->CoM[2] - multi_j->CoM[2];
 
           /* Apply BC */
           if (periodic) {
@@ -894,6 +878,11 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
             /* Ok, we need to add a direct pair calculation */
             scheduler_addtask(sched, task_type_pair, task_subtype_grav, 0, 0,
                               ci, cj);
+
+            if (ci->nodeID == nodeID && cj->nodeID != engine_rank)
+              atomic_inc(&ci->num_foreign_pair_grav);
+            if (cj->nodeID == nodeID && ci->nodeID != engine_rank)
+              atomic_inc(&cj->num_foreign_pair_grav);
           }
         }
       }
@@ -935,12 +924,14 @@ void engine_make_hierarchical_tasks_mapper(void *map_data, int num_elements,
 void engine_make_self_gravity_tasks(struct engine *e) {
 
   struct space *s = e->s;
-  struct task **ghosts = NULL;
+
+  for (int i = 0; i < s->nr_cells; ++i) {
+    s->cells_top[i].num_foreign_pair_grav = 0;
+  }
 
   /* Create the multipole self and pair tasks. */
-  void *extra_data[2] = {e, ghosts};
   threadpool_map(&e->threadpool, engine_make_self_gravity_tasks_mapper, NULL,
-                 s->nr_cells, 1, 0, extra_data);
+                 s->nr_cells, 1, 0, e);
 }
 
 /**
@@ -1763,6 +1754,8 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements,
 
     /* Get the cell index. */
     const int cid = (size_t)(map_data) + ind;
+
+    /* Integer indices of the cell in the top-level grid */
     const int i = cid / (cdim[1] * cdim[2]);
     const int j = (cid / cdim[2]) % cdim[1];
     const int k = cid % cdim[2];
@@ -1773,7 +1766,7 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements,
     /* Skip cells without hydro particles */
     if (ci->hydro.count == 0) continue;
 
-    /* If the cells is local build a self-interaction */
+    /* If the cell is local build a self-interaction */
     if (ci->nodeID == nodeID)
       scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci,
                         NULL);
@@ -1805,6 +1798,11 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements,
           const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))];
           scheduler_addtask(sched, task_type_pair, task_subtype_density, sid, 0,
                             ci, cj);
+
+          if (ci->nodeID == nodeID && cj->nodeID != engine_rank)
+            atomic_inc(&ci->num_foreign_pair_hydro);
+          if (cj->nodeID == nodeID && ci->nodeID != engine_rank)
+            atomic_inc(&cj->num_foreign_pair_hydro);
         }
       }
     }
@@ -1829,6 +1827,10 @@ void engine_maketasks(struct engine *e) {
 
   ticks tic2 = getticks();
 
+  for (int i = 0; i < s->nr_cells; ++i) {
+    s->cells_top[i].num_foreign_pair_hydro = 0;
+  }
+
   /* Construct the first hydro loop over neighbours */
   if (e->policy & engine_policy_hydro)
     threadpool_map(&e->threadpool, engine_make_hydroloop_tasks_mapper, NULL,
@@ -2087,4 +2089,13 @@ void engine_maketasks(struct engine *e) {
   if (e->verbose)
     message("took %.3f %s (including reweight).",
             clocks_from_ticks(getticks() - tic), clocks_getunit());
+
+  /* Report the proxy and foreign-pair counters of every top-level cell. */
+  if (e->nodeID == 0)
+    for (int i = 0; i < e->s->nr_cells; ++i) {
+      message("cid= %d num_grav_proxy= %d num_hydro_proxy= %d num_foreign_grav= %d num_foreign_hydro= %d",
+              i, e->s->cells_top[i].num_grav_proxies, e->s->cells_top[i].num_hydro_proxies,
+              e->s->cells_top[i].num_foreign_pair_grav, e->s->cells_top[i].num_foreign_pair_hydro);
+    }
+
 }
diff --git a/src/proxy.c b/src/proxy.c
index 325ed78644b07a497374e40bfc8518edcb018593..6f16691bd418d4e497df128f5a6a8ac723221ce0 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -269,6 +269,106 @@ void proxy_cells_exchange_second(struct proxy *p) {
 #endif
 }
 
+#ifdef WITH_MPI
+
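+/* Threadpool mappers for the parallel parts of the proxy cell exchange. */
+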
+void proxy_cells_count_mapper(void *map_data, int num_elements,
+                              void *extra_data) {
+  struct cell *cells = (struct cell *)map_data;
+
+  for (int k = 0; k < num_elements; k++) {
+    if (cells[k].mpi.sendto) cells[k].mpi.pcell_size = cell_getsize(&cells[k]);
+  }
+}
+
+struct pack_mapper_data {
+  struct space *s;
+  int *offset;
+  struct pcell *pcells;
+  int with_gravity;
+};
+
+void proxy_cells_pack_mapper(void *map_data, int num_elements,
+                             void *extra_data) {
+  struct cell *cells = (struct cell *)map_data;
+  struct pack_mapper_data *data = (struct pack_mapper_data *)extra_data;
+
+  for (int k = 0; k < num_elements; k++) {
+    if (cells[k].mpi.sendto) {
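+      /* The cell's index in the top-level array gives its offset in the packed buffer. */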
+      ptrdiff_t ind = &cells[k] - data->s->cells_top;
+      cells[k].mpi.pcell = &data->pcells[data->offset[ind]];
+      cell_pack(&cells[k], cells[k].mpi.pcell, data->with_gravity);
+    }
+  }
+}
+
+void proxy_cells_exchange_first_mapper(void *map_data, int num_elements,
+                                       void *extra_data) {
+  struct proxy *proxies = (struct proxy *)map_data;
+
+  for (int k = 0; k < num_elements; k++) {
+    proxy_cells_exchange_first(&proxies[k]);
+  }
+}
+
+struct wait_and_unpack_mapper_data {
+  struct space *s;
+  int num_proxies;
+  MPI_Request *reqs_in;
+  struct proxy *proxies;
+  int with_gravity;
+  swift_lock_type lock;
+};
+
+void proxy_cells_wait_and_unpack_mapper(void *unused_map_data, int num_elements,
+                                        void *extra_data) {
+  struct wait_and_unpack_mapper_data *data =
+      (struct wait_and_unpack_mapper_data *)extra_data;
+
+  for (int k = 0; k < num_elements; k++) {
+    int pid = MPI_UNDEFINED;
+    MPI_Status status;
+    int res;
+
+    /* We need a lock to prevent concurrent calls to MPI_Waitany on
+       the same array of requests since this is not supported in the MPI
+       standard (v3.1). This is not really a problem since the threads
+       would block inside MPI_Waitany anyway. */
+    lock_lock(&data->lock);
+    if ((res = MPI_Waitany(data->num_proxies, data->reqs_in, &pid, &status)) !=
+            MPI_SUCCESS ||
+        pid == MPI_UNDEFINED)
+      mpi_error(res, "MPI_Waitany failed.");
+    if (lock_unlock(&data->lock) != 0) {
+      error("Failed to release lock.");
+    }
+
+    // message( "cell data from proxy %i has arrived." , pid );
+    for (int count = 0, j = 0; j < data->proxies[pid].nr_cells_in; j++)
+      count += cell_unpack(&data->proxies[pid].pcells_in[count],
+                           data->proxies[pid].cells_in[j], data->s,
+                           data->with_gravity);
+  }
+}
+
+void proxy_cells_unpack_mapper(void *map_data, int num_elements,
+                               void *extra_data) {
+
+  struct space *s = (struct space *)extra_data;
+  struct proxy *proxies = (struct proxy *)map_data;
+
+  for (int k = 0; k < num_elements; ++k) {
+
+    int count = 0;
+    for (int j = 0; j < proxies[k].nr_cells_in; j++) {
+      count += cell_unpack(&proxies[k].pcells_in[count],
+                           proxies[k].cells_in[j], s, s->gravity);
+    }
+
+  }
+}
+
+#endif  // WITH_MPI
+
 /**
  * @brief Exchange the cell structures with all proxies.
  *
@@ -294,13 +394,14 @@ void proxy_cells_exchange(struct proxy *proxies, int num_proxies,
 
   /* Run through the cells and get the size of the ones that will be sent off.
    */
+  threadpool_map(&s->e->threadpool, proxy_cells_count_mapper, s->cells_top,
+                 s->nr_cells, sizeof(struct cell), /*chunk=*/0,
+                 /*extra_data=*/NULL);
   int count_out = 0;
   int offset[s->nr_cells];
   for (int k = 0; k < s->nr_cells; k++) {
     offset[k] = count_out;
-    if (s->cells_top[k].mpi.sendto)
-      count_out +=
-          (s->cells_top[k].mpi.pcell_size = cell_getsize(&s->cells_top[k]));
+    if (s->cells_top[k].mpi.sendto) count_out += s->cells_top[k].mpi.pcell_size;
   }
 
   if (s->e->verbose)
@@ -316,19 +417,19 @@ void proxy_cells_exchange(struct proxy *proxies, int num_proxies,
   tic2 = getticks();
 
   /* Pack the cells. */
-  for (int k = 0; k < s->nr_cells; k++)
-    if (s->cells_top[k].mpi.sendto) {
-      cell_pack(&s->cells_top[k], &pcells[offset[k]], with_gravity);
-      s->cells_top[k].mpi.pcell = &pcells[offset[k]];
-    }
+  struct pack_mapper_data data = {s, offset, pcells, with_gravity};
+  threadpool_map(&s->e->threadpool, proxy_cells_pack_mapper, s->cells_top,
+                 s->nr_cells, sizeof(struct cell), /*chunk=*/0, &data);
 
   if (s->e->verbose)
     message("Packing cells took %.3f %s.", clocks_from_ticks(getticks() - tic2),
             clocks_getunit());
 
   /* Launch the first part of the exchange. */
+  threadpool_map(&s->e->threadpool, proxy_cells_exchange_first_mapper, proxies,
+                 num_proxies, sizeof(struct proxy), /*chunk=*/0,
+                 /*extra_data=*/NULL);
   for (int k = 0; k < num_proxies; k++) {
-    proxy_cells_exchange_first(&proxies[k]);
     reqs_in[k] = proxies[k].req_cells_count_in;
     reqs_out[k] = proxies[k].req_cells_count_out;
   }
@@ -356,23 +457,48 @@ void proxy_cells_exchange(struct proxy *proxies, int num_proxies,
 
   tic2 = getticks();
 
-  /* Wait for each pcell array to come in from the proxies. */
-  for (int k = 0; k < num_proxies; k++) {
-    int pid = MPI_UNDEFINED;
-    MPI_Status status;
-    if (MPI_Waitany(num_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
-        pid == MPI_UNDEFINED)
-      error("MPI_Waitany failed.");
-    // message( "cell data from proxy %i has arrived." , pid );
-    for (int count = 0, j = 0; j < proxies[pid].nr_cells_in; j++)
-      count += cell_unpack(&proxies[pid].pcells_in[count],
-                           proxies[pid].cells_in[j], s, with_gravity);
-  }
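+  /* Wait for all the pcell arrays to have arrived. */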
+  if (MPI_Waitall(num_proxies, reqs_in, MPI_STATUSES_IGNORE) != MPI_SUCCESS)
+    error("MPI_Waitall on recvs failed.");
+
+  if (s->e->verbose)
+    message("MPI_Waitall on recvs took %.3f %s.",
+            clocks_from_ticks(getticks() - tic2), clocks_getunit());
+
+  int *counters = (int *)malloc(s->e->nr_nodes * sizeof(int));
+  bzero(counters, s->e->nr_nodes * sizeof(int));
+
+  tic2 = getticks();
+
+  message("num proxies: %d", num_proxies);
+
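+  /* Unpack the received cells, one threadpool chunk per proxy. */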
+  threadpool_map(&s->e->threadpool, proxy_cells_unpack_mapper, proxies,
+                 num_proxies, sizeof(struct proxy), /*chunk=*/0, s);
 
   if (s->e->verbose)
     message("Un-packing cells took %.3f %s.",
             clocks_from_ticks(getticks() - tic2), clocks_getunit());
 
+  for (int i = 0; i < num_proxies; ++i) {
+    counters[proxies[i].nodeID] += proxies[i].nr_cells_in;
+  }
+
+  /* Print, rank by rank, the number of cells received from each node. */
+  for (int i = 0; i < s->e->nr_nodes; ++i) {
+    if (i == s->e->nodeID) {
+      printf("Rank %d: |", i);
+      int total = 0;
+      for (int j = 0; j < s->e->nr_nodes; ++j) {
+        total += counters[j];
+        printf(" %d", counters[j]);
+      }
+      printf("| %d |\n", total);
+    }
+    fflush(stdout);
+    MPI_Barrier(MPI_COMM_WORLD);
+  }
+  free(counters);
+
+
   /* Wait for all the sends to have finished too. */
   if (MPI_Waitall(num_proxies, reqs_out, MPI_STATUSES_IGNORE) != MPI_SUCCESS)
     error("MPI_Waitall on sends failed.");