diff --git a/src/cell.h b/src/cell.h
index 89f7c954c29f23845cb9c876b93c6bb3d469fbe5..59780697f42bce4b1fb778d7fcdf471de9cb2b27 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -205,6 +205,11 @@ struct pcell_step {
  */
 struct cell {
 
+  /*! Number of hydro proxies this cell takes part in (debug counter). */
+  int num_hydro_proxies;
+  /*! Number of gravity proxies this cell takes part in (debug counter). */
+  int num_grav_proxies;
+  /*! Number of hydro pair tasks involving a foreign cell (debug counter). */
+  int num_foreign_pair_hydro;
+  /*! Number of gravity pair tasks involving a foreign cell (debug counter). */
+  int num_foreign_pair_grav;
+
   /*! The cell location on the grid. */
   double loc[3];

diff --git a/src/engine.c b/src/engine.c
index f728862487f1666302dd10c5b1bb7c128ae0a0ee..5669612c82b266de2b23df891d81bd65b171f726 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -3620,6 +3620,8 @@ void engine_makeproxies(struct engine *e) {
                          cell_width[2] * cell_width[2];
   const double r_max = sqrt(r_max2);
 
+  message("PROXIES: r_max= %e max_distance= %e", r_max, max_distance);
+
   /* Prepare the proxies and the proxy index. */
   if (e->proxy_ind == NULL)
     if ((e->proxy_ind = (int *)malloc(sizeof(int) * e->nr_nodes)) == NULL)
@@ -3651,6 +3653,12 @@ void engine_makeproxies(struct engine *e) {
     }
   }
 
+  for (int i = 0; i < e->s->nr_cells; ++i) {
+    cells[i].num_hydro_proxies = 0;
+    cells[i].num_grav_proxies = 0;
+  }
+
   /* Let's be verbose about this choice */
   if (e->verbose)
     message(
@@ -3659,44 +3667,37 @@ void engine_makeproxies(struct engine *e) {
         delta, delta_m, delta_p);
 
   /* Loop over each cell in the space. */
-  int ind[3];
-  for (ind[0] = 0; ind[0] < cdim[0]; ind[0]++) {
-    for (ind[1] = 0; ind[1] < cdim[1]; ind[1]++) {
-      for (ind[2] = 0; ind[2] < cdim[2]; ind[2]++) {
-
+  for (int i = 0; i < cdim[0]; i++) {
+    for (int j = 0; j < cdim[1]; j++) {
+      for (int k = 0; k < cdim[2]; k++) {
+
         /* Get the cell ID. */
-        const int cid = cell_getid(cdim, ind[0], ind[1], ind[2]);
-
+        const int cid = cell_getid(cdim, i, j, k);
+
         /* and it's location */
         const double loc_i[3] = {cells[cid].loc[0], cells[cid].loc[1],
                                  cells[cid].loc[2]};
-
-        /* Loop over all its neighbours (periodic). */
-        for (int i = -delta_m; i <= delta_p; i++) {
-          int ii = ind[0] + i;
-          if (ii >= cdim[0])
-            ii -= cdim[0];
-          else if (ii < 0)
-            ii += cdim[0];
-          for (int j = -delta_m; j <= delta_p; j++) {
-            int jj = ind[1] + j;
-            if (jj >= cdim[1])
-              jj -= cdim[1];
-            else if (jj < 0)
-              jj += cdim[1];
-            for (int k = -delta_m; k <= delta_p; k++) {
-              int kk = ind[2] + k;
-              if (kk >= cdim[2])
-                kk -= cdim[2];
-              else if (kk < 0)
-                kk += cdim[2];
-
+
+        /* Loop over all its neighbours in range. */
+        for (int ii = -delta_m; ii <= delta_p; ii++) {
+          int iii = i + ii;
+          if (!periodic && (iii < 0 || iii >= cdim[0])) continue;
+          iii = (iii + cdim[0]) % cdim[0];
+          for (int jj = -delta_m; jj <= delta_p; jj++) {
+            int jjj = j + jj;
+            if (!periodic && (jjj < 0 || jjj >= cdim[1])) continue;
+            jjj = (jjj + cdim[1]) % cdim[1];
+            for (int kk = -delta_m; kk <= delta_p; kk++) {
+              int kkk = k + kk;
+              if (!periodic && (kkk < 0 || kkk >= cdim[2])) continue;
+              kkk = (kkk + cdim[2]) % cdim[2];
+
               /* Get the cell ID. */
-              const int cjd = cell_getid(cdim, ii, jj, kk);
-
+              const int cjd = cell_getid(cdim, iii, jjj, kkk);
+
               /* Early abort (same cell) */
               if (cid == cjd) continue;
-
+
               /* Early abort (both same node) */
               if (cells[cid].nodeID == nodeID && cells[cjd].nodeID == nodeID)
                 continue;
@@ -3704,7 +3705,7 @@ void engine_makeproxies(struct engine *e) {
               /* Early abort (both foreign node) */
               if (cells[cid].nodeID != nodeID && cells[cjd].nodeID != nodeID)
                 continue;
-
+
               int proxy_type = 0;
 
               /* In the hydro case, only care about direct neighbours */
@@ -3715,15 +3716,15 @@ void engine_makeproxies(struct engine *e) {
 
                 /* This is super-ugly but checks for direct neighbours */
                 /* with periodic BC */
-                if (((abs(ind[0] - ii) <= 1 ||
-                      abs(ind[0] - ii - cdim[0]) <= 1 ||
-                      abs(ind[0] - ii + cdim[0]) <= 1) &&
-                     (abs(ind[1] - jj) <= 1 ||
-                      abs(ind[1] - jj - cdim[1]) <= 1 ||
-                      abs(ind[1] - jj + cdim[1]) <= 1) &&
-                     (abs(ind[2] - kk) <= 1 ||
-                      abs(ind[2] - kk - cdim[2]) <= 1 ||
-                      abs(ind[2] - kk + cdim[2]) <= 1)))
+                if (((abs(i - iii) <= 1 ||
+                      abs(i - iii - cdim[0]) <= 1 ||
+                      abs(i - iii + cdim[0]) <= 1) &&
+                     (abs(j - jjj) <= 1 ||
+                      abs(j - jjj - cdim[1]) <= 1 ||
+                      abs(j - jjj + cdim[1]) <= 1) &&
+                     (abs(k - kkk) <= 1 ||
+                      abs(k - kkk - cdim[2]) <= 1 ||
+                      abs(k - kkk + cdim[2]) <= 1)))
                   proxy_type |= (int)proxy_cell_type_hydro;
               }
@@ -3786,8 +3787,8 @@ void engine_makeproxies(struct engine *e) {
               if (cells[cid].nodeID == nodeID && cells[cjd].nodeID != nodeID) {
 
                 /* Do we already have a relationship with this node? */
-                int pid = e->proxy_ind[cells[cjd].nodeID];
-                if (pid < 0) {
+                int proxy_id = e->proxy_ind[cells[cjd].nodeID];
+                if (proxy_id < 0) {
                   if (e->nr_proxies == engine_maxproxies)
                     error("Maximum number of proxies exceeded.");
 
@@ -3797,24 +3798,34 @@ void engine_makeproxies(struct engine *e) {
 
                   /* Store the information */
                   e->proxy_ind[cells[cjd].nodeID] = e->nr_proxies;
-                  pid = e->nr_proxies;
+                  proxy_id = e->nr_proxies;
                   e->nr_proxies += 1;
+
+                  /* Check the maximal proxy limit */
+                  if (proxy_id >= (int)(8 * sizeof(long long)))
+                    error("Created more than %zu proxies. cell.mpi.sendto will overflow.",
+                          8 * sizeof(long long));
                 }
 
                 /* Add the cell to the proxy */
-                proxy_addcell_in(&proxies[pid], &cells[cjd], proxy_type);
-                proxy_addcell_out(&proxies[pid], &cells[cid], proxy_type);
+                proxy_addcell_in(&proxies[proxy_id], &cells[cjd], proxy_type);
+                proxy_addcell_out(&proxies[proxy_id], &cells[cid], proxy_type);
+
+                if (proxy_type & (int)proxy_cell_type_gravity)
+                  cells[cid].num_grav_proxies++;
+                if (proxy_type & (int)proxy_cell_type_hydro)
+                  cells[cid].num_hydro_proxies++;
 
                 /* Store info about where to send the cell */
-                cells[cid].mpi.sendto |= (1ULL << pid);
+                cells[cid].mpi.sendto |= (1ULL << proxy_id);
               }
 
               /* Same for the symmetric case? */
               if (cells[cjd].nodeID == nodeID && cells[cid].nodeID != nodeID) {
 
                 /* Do we already have a relationship with this node? */
-                int pid = e->proxy_ind[cells[cid].nodeID];
-                if (pid < 0) {
+                int proxy_id = e->proxy_ind[cells[cid].nodeID];
+                if (proxy_id < 0) {
                   if (e->nr_proxies == engine_maxproxies)
                     error("Maximum number of proxies exceeded.");
 
@@ -3824,16 +3835,26 @@ void engine_makeproxies(struct engine *e) {
 
                   /* Store the information */
                   e->proxy_ind[cells[cid].nodeID] = e->nr_proxies;
-                  pid = e->nr_proxies;
+                  proxy_id = e->nr_proxies;
                   e->nr_proxies += 1;
+
+                  /* Check the maximal proxy limit */
+                  if (proxy_id >= (int)(8 * sizeof(long long)))
+                    error("Created more than %zu proxies. cell.mpi.sendto will overflow.",
cell.mpi.sendto will overflow.", + 8 * sizeof(long long)); } /* Add the cell to the proxy */ - proxy_addcell_in(&proxies[pid], &cells[cid], proxy_type); - proxy_addcell_out(&proxies[pid], &cells[cjd], proxy_type); + proxy_addcell_in(&proxies[proxy_id], &cells[cid], proxy_type); + proxy_addcell_out(&proxies[proxy_id], &cells[cjd], proxy_type); + + if(proxy_type & (int)proxy_cell_type_gravity) + cells[cjd].num_grav_proxies++; + if(proxy_type & (int)proxy_cell_type_hydro) + cells[cjd].num_hydro_proxies++; /* Store info about where to send the cell */ - cells[cjd].mpi.sendto |= (1ULL << pid); + cells[cjd].mpi.sendto |= (1ULL << proxy_id); } } } diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index e4431a8cf14998774623e3575ecfdae08333e556..cbf14caa18e27ea4b4d22f3b047e343147525d47 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -757,7 +757,7 @@ void engine_make_hierarchical_tasks_stars(struct engine *e, struct cell *c) { void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements, void *extra_data) { - struct engine *e = ((struct engine **)extra_data)[0]; + struct engine *e = (struct engine *) extra_data; struct space *s = e->s; struct scheduler *sched = &e->sched; const int nodeID = e->nodeID; @@ -802,15 +802,12 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements, /* Skip cells without gravity particles */ if (ci->grav.count == 0) continue; - /* Is that cell local ? */ - if (ci->nodeID != nodeID) continue; - - /* If the cells is local build a self-interaction */ - scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, NULL); + /* If the cell is local build a self-interaction */ + if (ci->nodeID == nodeID) + scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, + NULL); - /* Recover the multipole information */ - const struct gravity_tensors *const multi_i = ci->grav.multipole; - const double CoM_i[3] = {multi_i->CoM[0], multi_i->CoM[1], multi_i->CoM[2]}; + message("delta_m=%d delta_p=%d", delta_m, delta_p); #ifdef SWIFT_DEBUG_CHECKS if (cell_getid(cdim, i, j, k) != cid) @@ -824,54 +821,41 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements, #endif /* Loop over every other cell within (Manhattan) range delta */ - for (int x = -delta_m; x <= delta_p; x++) { - int ii = i + x; - if (ii >= cdim[0]) - ii -= cdim[0]; - else if (ii < 0) - ii += cdim[0]; - for (int y = -delta_m; y <= delta_p; y++) { - int jj = j + y; - if (jj >= cdim[1]) - jj -= cdim[1]; - else if (jj < 0) - jj += cdim[1]; - for (int z = -delta_m; z <= delta_p; z++) { - int kk = k + z; - if (kk >= cdim[2]) - kk -= cdim[2]; - else if (kk < 0) - kk += cdim[2]; + for (int ii = -delta_m; ii <= delta_p; ii++) { + int iii = i + ii; + if (!periodic && (iii < 0 || iii >= cdim[0])) continue; + iii = (iii + cdim[0]) % cdim[0]; + for (int jj = -delta_m; jj <= delta_p; jj++) { + int jjj = j + jj; + if (!periodic && (jjj < 0 || jjj >= cdim[1])) continue; + jjj = (jjj + cdim[1]) % cdim[1]; + for (int kk = -delta_m; kk <= delta_p; kk++) { + int kkk = k + kk; + if (!periodic && (kkk < 0 || kkk >= cdim[2])) continue; + kkk = (kkk + cdim[2]) % cdim[2]; /* Get the cell */ - const int cjd = cell_getid(cdim, ii, jj, kk); + const int cjd = cell_getid(cdim, iii, jjj, kkk); struct cell *cj = &cells[cjd]; -#ifdef SWIFT_DEBUG_CHECKS - const int iii = cjd / (cdim[1] * cdim[2]); - const int jjj = (cjd / cdim[2]) % cdim[1]; - const int kkk = cjd % cdim[2]; - - if (ii != iii || jj != jjj || kk != kkk) - error( - "Incorrect calculation 
diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c
index e4431a8cf14998774623e3575ecfdae08333e556..cbf14caa18e27ea4b4d22f3b047e343147525d47 100644
--- a/src/engine_maketasks.c
+++ b/src/engine_maketasks.c
@@ -757,7 +757,7 @@ void engine_make_hierarchical_tasks_stars(struct engine *e, struct cell *c) {
 void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
                                            void *extra_data) {
 
-  struct engine *e = ((struct engine **)extra_data)[0];
+  struct engine *e = (struct engine *)extra_data;
   struct space *s = e->s;
   struct scheduler *sched = &e->sched;
   const int nodeID = e->nodeID;
@@ -802,15 +802,12 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
     /* Skip cells without gravity particles */
     if (ci->grav.count == 0) continue;
 
-    /* Is that cell local ? */
-    if (ci->nodeID != nodeID) continue;
-
-    /* If the cells is local build a self-interaction */
-    scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, NULL);
+    /* If the cell is local build a self-interaction */
+    if (ci->nodeID == nodeID)
+      scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci,
+                        NULL);
 
-    /* Recover the multipole information */
-    const struct gravity_tensors *const multi_i = ci->grav.multipole;
-    const double CoM_i[3] = {multi_i->CoM[0], multi_i->CoM[1], multi_i->CoM[2]};
+    message("delta_m=%d delta_p=%d", delta_m, delta_p);
 
 #ifdef SWIFT_DEBUG_CHECKS
     if (cell_getid(cdim, i, j, k) != cid)
@@ -824,54 +821,41 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
 #endif
 
     /* Loop over every other cell within (Manhattan) range delta */
-    for (int x = -delta_m; x <= delta_p; x++) {
-      int ii = i + x;
-      if (ii >= cdim[0])
-        ii -= cdim[0];
-      else if (ii < 0)
-        ii += cdim[0];
-      for (int y = -delta_m; y <= delta_p; y++) {
-        int jj = j + y;
-        if (jj >= cdim[1])
-          jj -= cdim[1];
-        else if (jj < 0)
-          jj += cdim[1];
-        for (int z = -delta_m; z <= delta_p; z++) {
-          int kk = k + z;
-          if (kk >= cdim[2])
-            kk -= cdim[2];
-          else if (kk < 0)
-            kk += cdim[2];
+    for (int ii = -delta_m; ii <= delta_p; ii++) {
+      int iii = i + ii;
+      if (!periodic && (iii < 0 || iii >= cdim[0])) continue;
+      iii = (iii + cdim[0]) % cdim[0];
+      for (int jj = -delta_m; jj <= delta_p; jj++) {
+        int jjj = j + jj;
+        if (!periodic && (jjj < 0 || jjj >= cdim[1])) continue;
+        jjj = (jjj + cdim[1]) % cdim[1];
+        for (int kk = -delta_m; kk <= delta_p; kk++) {
+          int kkk = k + kk;
+          if (!periodic && (kkk < 0 || kkk >= cdim[2])) continue;
+          kkk = (kkk + cdim[2]) % cdim[2];
 
           /* Get the cell */
-          const int cjd = cell_getid(cdim, ii, jj, kk);
+          const int cjd = cell_getid(cdim, iii, jjj, kkk);
           struct cell *cj = &cells[cjd];
 
-#ifdef SWIFT_DEBUG_CHECKS
-          const int iii = cjd / (cdim[1] * cdim[2]);
-          const int jjj = (cjd / cdim[2]) % cdim[1];
-          const int kkk = cjd % cdim[2];
-
-          if (ii != iii || jj != jjj || kk != kkk)
-            error(
-                "Incorrect calculation of indices (iii,jjj,kkk)=(%d,%d,%d) "
-                "cjd=%d",
-                iii, jjj, kkk, cjd);
-#endif
-          /* Avoid duplicates of local pairs*/
-          if (cid <= cjd && cj->nodeID == nodeID) continue;
-
-          /* Skip cells without gravity particles */
-          if (cj->grav.count == 0) continue;
+          /* Avoid duplicates, empty cells and pairs of foreign cells */
+          if (cid >= cjd || cj->grav.count == 0 ||
+              (ci->nodeID != nodeID && cj->nodeID != nodeID)) continue;
 
           /* Recover the multipole information */
-          const struct gravity_tensors *const multi_j = cj->grav.multipole;
+          const struct gravity_tensors *multi_i = ci->grav.multipole;
+          const struct gravity_tensors *multi_j = cj->grav.multipole;
+
+          if (multi_i == NULL && ci->nodeID != nodeID)
+            error("Multipole of ci was not exchanged properly via the proxies");
+          if (multi_j == NULL && cj->nodeID != nodeID)
+            error("Multipole of cj was not exchanged properly via the proxies");
 
           /* Get the distance between the CoMs */
-          double dx = CoM_i[0] - multi_j->CoM[0];
-          double dy = CoM_i[1] - multi_j->CoM[1];
-          double dz = CoM_i[2] - multi_j->CoM[2];
+          double dx = multi_i->CoM[0] - multi_j->CoM[0];
+          double dy = multi_i->CoM[1] - multi_j->CoM[1];
+          double dz = multi_i->CoM[2] - multi_j->CoM[2];
 
           /* Apply BC */
           if (periodic) {
@@ -894,6 +878,11 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
           /* Ok, we need to add a direct pair calculation */
           scheduler_addtask(sched, task_type_pair, task_subtype_grav, 0, 0,
                             ci, cj);
+
+          if (ci->nodeID == nodeID && cj->nodeID != nodeID)
+            atomic_inc(&ci->num_foreign_pair_grav);
+          if (cj->nodeID == nodeID && ci->nodeID != nodeID)
+            atomic_inc(&cj->num_foreign_pair_grav);
         }
       }
     }
@@ -935,12 +924,14 @@ void engine_make_hierarchical_tasks_mapper(void *map_data, int num_elements,
 void engine_make_self_gravity_tasks(struct engine *e) {
 
   struct space *s = e->s;
-  struct task **ghosts = NULL;
+
+  for (int i = 0; i < s->nr_cells; ++i) {
+    s->cells_top[i].num_foreign_pair_grav = 0;
+  }
 
   /* Create the multipole self and pair tasks. */
-  void *extra_data[2] = {e, ghosts};
   threadpool_map(&e->threadpool, engine_make_self_gravity_tasks_mapper, NULL,
-                 s->nr_cells, 1, 0, extra_data);
+                 s->nr_cells, 1, 0, e);
 }
 
 /**
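The num_foreign_pair_grav counter above is incremented from inside a threadpool mapper, where several threads may reach the same cell at the same time, hence atomic_inc() instead of a plain ++. Below is a minimal illustration of the data race the atomic avoids, written with the GCC/Clang __atomic builtins rather than SWIFT's atomic.h wrapper (the builtins are an assumption about the toolchain).

#include <pthread.h>
#include <stdio.h>

static int counter = 0;

static void *worker(void *arg) {
  (void)arg;
  for (int i = 0; i < 100000; i++)
    __atomic_fetch_add(&counter, 1, __ATOMIC_RELAXED); /* like atomic_inc() */
  return NULL;
}

int main(void) {
  pthread_t t[4];
  for (int i = 0; i < 4; i++) pthread_create(&t[i], NULL, worker, NULL);
  for (int i = 0; i < 4; i++) pthread_join(t[i], NULL);
  /* Always prints 400000; with a plain counter++ some of the concurrent
     read-modify-write cycles would be lost. */
  printf("%d\n", counter);
  return 0;
}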
@@ -1763,6 +1754,8 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements,
 
     /* Get the cell index. */
     const int cid = (size_t)(map_data) + ind;
+
+    /* Integer indices of the cell in the top-level grid */
     const int i = cid / (cdim[1] * cdim[2]);
     const int j = (cid / cdim[2]) % cdim[1];
     const int k = cid % cdim[2];
@@ -1773,7 +1766,7 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements,
     /* Skip cells without hydro particles */
     if (ci->hydro.count == 0) continue;
 
-    /* If the cells is local build a self-interaction */
+    /* If the cell is local build a self-interaction */
     if (ci->nodeID == nodeID)
       scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci,
                         NULL);
@@ -1805,6 +1798,11 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements,
           const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))];
           scheduler_addtask(sched, task_type_pair, task_subtype_density, sid,
                             0, ci, cj);
+
+          if (ci->nodeID == nodeID && cj->nodeID != nodeID)
+            atomic_inc(&ci->num_foreign_pair_hydro);
+          if (cj->nodeID == nodeID && ci->nodeID != nodeID)
+            atomic_inc(&cj->num_foreign_pair_hydro);
         }
       }
     }
@@ -1829,6 +1827,10 @@ void engine_maketasks(struct engine *e) {
 
   ticks tic2 = getticks();
 
+  for (int i = 0; i < s->nr_cells; ++i) {
+    s->cells_top[i].num_foreign_pair_hydro = 0;
+  }
+
   /* Construct the first hydro loop over neighbours */
   if (e->policy & engine_policy_hydro)
     threadpool_map(&e->threadpool, engine_make_hydroloop_tasks_mapper, NULL,
@@ -2087,4 +2089,13 @@ void engine_maketasks(struct engine *e) {
   if (e->verbose)
     message("took %.3f %s (including reweight).",
             clocks_from_ticks(getticks() - tic), clocks_getunit());
+
+  if (e->nodeID == 0)
+    for (int i = 0; i < e->s->nr_cells; ++i) {
+      message("cid= %d num_grav_proxy= %d num_hydro_proxy= %d "
+              "num_foreign_grav= %d num_foreign_hydro= %d",
+              i, e->s->cells_top[i].num_grav_proxies,
+              e->s->cells_top[i].num_hydro_proxies,
+              e->s->cells_top[i].num_foreign_pair_grav,
+              e->s->cells_top[i].num_foreign_pair_hydro);
+    }
+
 }
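For context on the overflow guard added in engine.c above: cell.mpi.sendto is used as a 64-bit mask with one bit per proxy, so sendto |= (1ULL << proxy_id) is only well defined for proxy_id in [0, 63], and the error has to fire as soon as a 65th proxy would be created. A small self-contained sketch of that mask arithmetic (variable names are illustrative, not SWIFT code):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint64_t sendto = 0;

  /* Mark the cell for sending through proxies 0, 5 and 63. */
  const int proxy_ids[] = {0, 5, 63};
  for (int i = 0; i < 3; i++) sendto |= UINT64_C(1) << proxy_ids[i];

  /* Receiver side: recover the proxy list from the mask. */
  for (int p = 0; p < 64; p++)
    if (sendto & (UINT64_C(1) << p)) printf("send via proxy %d\n", p);

  /* proxy_id = 64 cannot be represented: shifting a 64-bit value by 64
     is undefined behaviour in C, hence the >= 8 * sizeof(long long)
     check in engine_makeproxies(). */
  return 0;
}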
diff --git a/src/proxy.c b/src/proxy.c
index 325ed78644b07a497374e40bfc8518edcb018593..6f16691bd418d4e497df128f5a6a8ac723221ce0 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -269,6 +269,106 @@ void proxy_cells_exchange_second(struct proxy *p) {
 #endif
 }
 
+#ifdef WITH_MPI
+
+void proxy_cells_count_mapper(void *map_data, int num_elements,
+                              void *extra_data) {
+  struct cell *cells = (struct cell *)map_data;
+
+  for (int k = 0; k < num_elements; k++) {
+    if (cells[k].mpi.sendto) cells[k].mpi.pcell_size = cell_getsize(&cells[k]);
+  }
+}
+
+struct pack_mapper_data {
+  struct space *s;
+  int *offset;
+  struct pcell *pcells;
+  int with_gravity;
+};
+
+void proxy_cells_pack_mapper(void *map_data, int num_elements,
+                             void *extra_data) {
+  struct cell *cells = (struct cell *)map_data;
+  struct pack_mapper_data *data = (struct pack_mapper_data *)extra_data;
+
+  for (int k = 0; k < num_elements; k++) {
+    if (cells[k].mpi.sendto) {
+      ptrdiff_t ind = &cells[k] - data->s->cells_top;
+      cells[k].mpi.pcell = &data->pcells[data->offset[ind]];
+      cell_pack(&cells[k], cells[k].mpi.pcell, data->with_gravity);
+    }
+  }
+}
+
+void proxy_cells_exchange_first_mapper(void *map_data, int num_elements,
+                                       void *extra_data) {
+  struct proxy *proxies = (struct proxy *)map_data;
+
+  for (int k = 0; k < num_elements; k++) {
+    proxy_cells_exchange_first(&proxies[k]);
+  }
+}
+
+struct wait_and_unpack_mapper_data {
+  struct space *s;
+  int num_proxies;
+  MPI_Request *reqs_in;
+  struct proxy *proxies;
+  int with_gravity;
+  swift_lock_type lock;
+};
+
+void proxy_cells_wait_and_unpack_mapper(void *unused_map_data,
+                                        int num_elements, void *extra_data) {
+  struct wait_and_unpack_mapper_data *data =
+      (struct wait_and_unpack_mapper_data *)extra_data;
+
+  for (int k = 0; k < num_elements; k++) {
+    int pid = MPI_UNDEFINED;
+    MPI_Status status;
+    int res;
+
+    /* We need a lock to prevent concurrent calls to MPI_Waitany on
+       the same array of requests since this is not supported in the MPI
+       standard (v3.1). This is not really a problem since the threads
+       would block inside MPI_Waitany anyway. */
+    lock_lock(&data->lock);
+    if ((res = MPI_Waitany(data->num_proxies, data->reqs_in, &pid, &status)) !=
+            MPI_SUCCESS ||
+        pid == MPI_UNDEFINED)
+      mpi_error(res, "MPI_Waitany failed.");
+    if (lock_unlock(&data->lock) != 0) {
+      error("Failed to release lock.");
+    }
+
+    // message( "cell data from proxy %i has arrived." , pid );
+    for (int count = 0, j = 0; j < data->proxies[pid].nr_cells_in; j++)
+      count += cell_unpack(&data->proxies[pid].pcells_in[count],
+                           data->proxies[pid].cells_in[j], data->s,
+                           data->with_gravity);
+  }
+}
+
+void proxy_cells_unpack_mapper(void *map_data, int num_elements,
+                               void *extra_data) {
+
+  struct space *s = (struct space *)extra_data;
+  struct proxy *proxies = (struct proxy *)map_data;
+
+  for (int k = 0; k < num_elements; ++k) {
+
+    int count = 0;
+    for (int j = 0; j < proxies[k].nr_cells_in; j++) {
+      count += cell_unpack(&proxies[k].pcells_in[count],
+                           proxies[k].cells_in[j], s, s->gravity);
+    }
+  }
+}
+
+#endif  // WITH_MPI
+
 /**
  * @brief Exchange the cell structures with all proxies.
  *
@@ -294,13 +394,14 @@ void proxy_cells_exchange(struct proxy *proxies, int num_proxies,
 
   /* Run through the cells and get the size of the ones that will be sent off.
   */
+  threadpool_map(&s->e->threadpool, proxy_cells_count_mapper, s->cells_top,
+                 s->nr_cells, sizeof(struct cell), /*chunk=*/0,
+                 /*extra_data=*/NULL);
   int count_out = 0;
   int offset[s->nr_cells];
   for (int k = 0; k < s->nr_cells; k++) {
     offset[k] = count_out;
-    if (s->cells_top[k].mpi.sendto)
-      count_out +=
-          (s->cells_top[k].mpi.pcell_size = cell_getsize(&s->cells_top[k]));
+    if (s->cells_top[k].mpi.sendto) count_out += s->cells_top[k].mpi.pcell_size;
   }
 
   if (s->e->verbose)
@@ -316,19 +417,19 @@ void proxy_cells_exchange(struct proxy *proxies, int num_proxies,
   tic2 = getticks();
 
   /* Pack the cells. */
-  for (int k = 0; k < s->nr_cells; k++)
-    if (s->cells_top[k].mpi.sendto) {
-      cell_pack(&s->cells_top[k], &pcells[offset[k]], with_gravity);
-      s->cells_top[k].mpi.pcell = &pcells[offset[k]];
-    }
+  struct pack_mapper_data data = {s, offset, pcells, with_gravity};
+  threadpool_map(&s->e->threadpool, proxy_cells_pack_mapper, s->cells_top,
+                 s->nr_cells, sizeof(struct cell), /*chunk=*/0, &data);
 
   if (s->e->verbose)
     message("Packing cells took %.3f %s.",
             clocks_from_ticks(getticks() - tic2), clocks_getunit());
 
   /* Launch the first part of the exchange. */
+  threadpool_map(&s->e->threadpool, proxy_cells_exchange_first_mapper, proxies,
+                 num_proxies, sizeof(struct proxy), /*chunk=*/0,
+                 /*extra_data=*/NULL);
   for (int k = 0; k < num_proxies; k++) {
-    proxy_cells_exchange_first(&proxies[k]);
     reqs_in[k] = proxies[k].req_cells_count_in;
     reqs_out[k] = proxies[k].req_cells_count_out;
   }
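The hunk below drops the per-proxy MPI_Waitany loop (and with it the lock that proxy_cells_wait_and_unpack_mapper needs in order to serialise MPI_Waitany calls) in favour of a single MPI_Waitall on all receive requests, after which the unpacking can run in parallel. A toy MPI program with the same post-everything-then-Waitall shape, here as a simple ring exchange (illustrative only, not SWIFT code):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[]) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int token = rank, incoming = -1;
  MPI_Request reqs[2];
  const int next = (rank + 1) % size;
  const int prev = (rank + size - 1) % size;

  /* Post all communications up front... */
  MPI_Irecv(&incoming, 1, MPI_INT, prev, 0, MPI_COMM_WORLD, &reqs[0]);
  MPI_Isend(&token, 1, MPI_INT, next, 0, MPI_COMM_WORLD, &reqs[1]);

  /* ...then complete them with one blocking call. Only one thread
     talks to MPI here, so no lock is required. */
  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);

  printf("rank %d received %d\n", rank, incoming); /* the "unpack" step */
  MPI_Finalize();
  return 0;
}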
@@ -356,23 +457,48 @@ void proxy_cells_exchange(struct proxy *proxies, int num_proxies,
 
   tic2 = getticks();
 
-  /* Wait for each pcell array to come in from the proxies. */
-  for (int k = 0; k < num_proxies; k++) {
-    int pid = MPI_UNDEFINED;
-    MPI_Status status;
-    if (MPI_Waitany(num_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
-        pid == MPI_UNDEFINED)
-      error("MPI_Waitany failed.");
-    // message( "cell data from proxy %i has arrived." , pid );
-    for (int count = 0, j = 0; j < proxies[pid].nr_cells_in; j++)
-      count += cell_unpack(&proxies[pid].pcells_in[count],
-                           proxies[pid].cells_in[j], s, with_gravity);
-  }
+  if (MPI_Waitall(num_proxies, reqs_in, MPI_STATUSES_IGNORE) != MPI_SUCCESS)
+    error("MPI_Waitall on recvs failed.");
+
+  if (s->e->verbose)
+    message("MPI_Waitall on recvs took %.3f %s.",
+            clocks_from_ticks(getticks() - tic2), clocks_getunit());
+
+  int *counters = (int *)malloc(s->e->nr_nodes * sizeof(int));
+  if (counters == NULL) error("Failed to allocate the counters.");
+  bzero(counters, s->e->nr_nodes * sizeof(int));
+
+  tic2 = getticks();
+
+  message("num proxies: %d", num_proxies);
+
+  threadpool_map(&s->e->threadpool, proxy_cells_unpack_mapper, proxies,
+                 num_proxies, sizeof(struct proxy), 0, s);
 
   if (s->e->verbose)
     message("Un-packing cells took %.3f %s.",
             clocks_from_ticks(getticks() - tic2), clocks_getunit());
 
+  for (int i = 0; i < num_proxies; ++i) {
+    counters[proxies[i].nodeID] += proxies[i].nr_cells_in;
+  }
+
+  for (int i = 0; i < s->e->nr_nodes; ++i) {
+    if (i == s->e->nodeID) {
+      printf("Rank %d: |", i);
+      int total = 0;
+      for (int j = 0; j < s->e->nr_nodes; ++j) {
+        total += counters[j];
+        printf(" %d", counters[j]);
+      }
+      printf("| %d |\n", total);
+    }
+    fflush(stdout);
+    MPI_Barrier(MPI_COMM_WORLD);
+  }
+  free(counters);
+
   /* Wait for all the sends to have finished too. */
   if (MPI_Waitall(num_proxies, reqs_out, MPI_STATUSES_IGNORE) != MPI_SUCCESS)
     error("MPI_Waitall on sends failed.");
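All of the mappers added in this patch follow the same threadpool_map() calling convention: each invocation receives a contiguous slice of the input array (map_data, num_elements) plus one shared extra_data pointer. The toy serial stand-in below mimics that contract to show how the slicing works; toy_map and square_mapper are illustrative stand-ins, not the SWIFT API, and SWIFT's real pool runs the chunks from several threads (choosing its own chunk size when given 0).

#include <stddef.h>
#include <stdio.h>

/* The mapper contract: work on one slice, share state via extra_data. */
typedef void (*mapper_fn)(void *map_data, int num_elements, void *extra_data);

/* Serial stand-in for threadpool_map(): cut the array into chunks and
   hand each chunk to the mapper. */
static void toy_map(mapper_fn fn, void *data, int count, size_t stride,
                    int chunk, void *extra_data) {
  char *base = (char *)data;
  for (int first = 0; first < count; first += chunk) {
    const int n = (count - first < chunk) ? count - first : chunk;
    fn(base + (size_t)first * stride, n, extra_data);
  }
}

static void square_mapper(void *map_data, int num_elements, void *extra_data) {
  int *v = (int *)map_data;
  (void)extra_data;
  for (int k = 0; k < num_elements; k++) v[k] *= v[k];
}

int main(void) {
  int v[7] = {1, 2, 3, 4, 5, 6, 7};
  toy_map(square_mapper, v, 7, sizeof(int), /*chunk=*/3, NULL);
  for (int k = 0; k < 7; k++) printf("%d ", v[k]); /* 1 4 9 16 25 36 49 */
  printf("\n");
  return 0;
}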