diff --git a/examples/test_bh_mpi.c b/examples/test_bh_mpi.c index d4e5fdafa94636c4a27e86b56d4c1a71096d5a5d..c6115f32d430300635dec984e8b3091b612e7b50 100644 --- a/examples/test_bh_mpi.c +++ b/examples/test_bh_mpi.c @@ -45,11 +45,11 @@ #define dist_min 0.5 /* Used for legacy walk only */ #define dist_cutoff_ratio 1.5 -#define ICHECK 9501 +#define ICHECK -1 #define SANITY_CHECKS #define NO_COM_AS_TASK #define NO_COUNTERS -#define NO_EXACT +#define EXACT @@ -146,6 +146,25 @@ enum task_type { /** Global variable for the pool of allocated cells. */ struct cell *cell_pool = NULL; + +/** + * + * @brief Checks whether the cells are direct neighbours or not. Cells can be different sizes. + * + */ +static inline int are_neighbours_different_size(struct cell *ci, struct cell *cj) { + + float min_dist = min_dist = 0.5*(ci->h+cj->h); + + for(int k = 0; k < 3; k++) + { + float center_i = ci->loc[k]; + float center_j = cj->loc[k]; + if( fabsf(center_i - center_j) > min_dist) return 0; + } + return 1; +} + /** * @brief Checks whether the cells are direct neighbours ot not. Both cells have * to be of the same size @@ -325,7 +344,7 @@ static inline void iact_pair_direct(struct qsched *s, struct cell *ci, struct ce r2 += dx[k] * dx[k]; } - if(r2 == 0) + /*if(r2 == 0) { printf("parts_j[j] = %.3f %.3f %.3f\n", parts_j[j].x[0], parts_j[j].x[1], parts_j[j].x[2]); printf("cj->parts[j] = %.3f %.3f %.3f\n", cj->parts[j].x[0], cj->parts[j].x[1] ,cj->parts[j].x[2]); @@ -335,7 +354,7 @@ static inline void iact_pair_direct(struct qsched *s, struct cell *ci, struct ce printf("ci->loc[0] = %.3f, cj->loc[0] = %.3f\n", ci->loc[0], cj->loc[0]); printf("ci->loc[1] = %.3f, cj->loc[1] = %.3f\n", ci->loc[1], cj->loc[1]); printf("ci->loc[2] = %.3f, cj->loc[2] = %.3f\n", ci->loc[2], cj->loc[2]); - } + }*/ /* Apply the gravitational acceleration. */ ir = 1.0f / sqrtf(r2); w = const_G * ir * ir * ir; @@ -471,95 +490,145 @@ void iact_self_direct(struct qsched *s, struct cell *c) { } } - - -/** - * @brief Loops all siblings of cells ci and cj and launches all - * the cell-monopole interactions that can be done between the siblings - * - * @param ci The first cell - * @param cj The second cell - */ static inline void iact_pair_pc(struct qsched *s, struct cell *ci, struct cell *cj) { - struct part_local *parts = NULL; - struct multipole_local mult_local[8]; - struct cell *cp, *cps; - int part_count, mult_count; - ci->parts = (struct part *) qsched_getresdata(s, ci->res_parts ); - cj->parts = (struct part *) qsched_getresdata(s, cj->res_parts ); - -#ifdef SANITY_CHECKS - if (ci->h != cj->h) - error(" Cells of different size !."); - - if ( ! are_neighbours(ci, cj) ) - error(" Cells that are not neighbours !"); -#endif - - /* Let's loop over all siblings of ci */ - for (cp = (struct cell*) qsched_getresdata(s, ci->firstchild); cp != (struct cell*) qsched_getresdata(s, ci->sibling); cp = (struct cell*) qsched_getresdata(s, cp->sibling) ) { + struct part_local *parts = NULL; + struct multipole_local mult_local[1]; + int part_count; + ci->parts = (struct part*) qsched_getresdata(s, ci->res_parts); + + /* Load particles. */ + part_count = ci->count; + if ((parts = (struct part_local *) malloc(sizeof(struct part_local) * part_count)) == NULL) + error("Failed to allocate local parts."); + for (int k = 0; k < part_count; k++) { + for (int j = 0; j < 3; j++) { + parts[k].x[j] = ci->parts[k].x[j] - ci->loc[j]; + parts[k].a[j] = 0.0f; + } + parts[k].mass = ci->parts[k].mass; + } - part_count = cp->count; - cp->parts = (struct part *) qsched_getresdata(s, cp->res_parts); - #ifdef SANITY_CHECKS - if(cp->parts == NULL) - { - error("cp->parts is NULL after we attempted to get the data from the scheduler."); - } - #endif - /* Get local copies of the particle data in cp. */ - if ((parts = (struct part_local *) malloc(sizeof(struct part_local) * part_count)) == NULL) - error("Failed to allocate local parts."); + mult_local[0].com[0] = cj->new.com[0] - ci->loc[0]; + mult_local[0].com[1] = cj->new.com[1] - ci->loc[1]; + mult_local[0].com[2] = cj->new.com[2] - ci->loc[2]; + mult_local[0].mass = cj->new.mass; + + #ifdef QUADRUPOLES + /* Final operation to get the quadrupoles */ + double imass = 1. / cj->new.mass; + mult_local[0].I_xx = cj->new.I_xx*imass - cj->new.com[0] * cj->new.com[0]; + mult_local[0].I_xy = cj->new.I_xy*imass - cj->new.com[0] * cj->new.com[1]; + mult_local[0].I_xz = cj->new.I_xz*imass - cj->new.com[0] * cj->new.com[2]; + mult_local[0].I_yy = cj->new.I_yy*imass - cj->new.com[1] * cj->new.com[1]; + mult_local[0].I_yz = cj->new.I_yz*imass - cj->new.com[1] * cj->new.com[2]; + mult_local[0].I_zz = cj->new.I_zz*imass - cj->new.com[2] * cj->new.com[2]; + #endif + /* Perform the interaction between the particles and the list of multipoles */ + make_interact_pc(parts, part_count, mult_local, 1); + + /* Clean up local parts? */ for (int k = 0; k < part_count; k++) { - for (int j = 0; j < 3; j++) { - parts[k].x[j] = cp->parts[k].x[j] - cp->loc[j]; - parts[k].a[j] = 0.0f; - } - parts[k].mass = cp->parts[k].mass; + for (int j = 0; j < 3; j++) ci->parts[k].a[j] += parts[k].a[j]; } - - mult_count = 0; - - /* Now loop over all the other cells */ - for (cps = (struct cell*) qsched_getresdata(s, cj->firstchild); cps != (struct cell*) qsched_getresdata(s, cj->sibling); cps = (struct cell*) qsched_getresdata(s, cps->sibling) ) { - - /* And build the list of multipoles to interact with */ - if ( ! are_neighbours(cp, cps) ) { + free(parts); +} - mult_local[mult_count].com[0] = cps->new.com[0] - cp->loc[0]; - mult_local[mult_count].com[1] = cps->new.com[1] - cp->loc[1]; - mult_local[mult_count].com[2] = cps->new.com[2] - cp->loc[2]; - mult_local[mult_count].mass = cps->new.mass; +#ifdef OLD_VERSION +/** + * @brief Interacts a cell ci with the monopole of cell cj, or recurses cj to the depth of ci if ci and cj are neighbours. + * + * @param ci The first cell (particles). + * @param cj The second cell (monopole). +*/ +static inline void iact_pair_pc(struct qsched *s, struct cell *ci, struct cell *cj) { -#ifdef QUADRUPOLES + struct part_local *parts = NULL; + struct multipole_local mult_local[8]; + struct cell *cps; + int part_count, mult_count; + ci->parts = (struct part* ) qsched_getresdata(s, ci->res_parts); - /* Final operation to get the quadrupoles */ - double imass = 1. / cps->new.mass; - mult_local[mult_count].I_xx = cps->new.I_xx*imass - cps->new.com[0] * cps->new.com[0]; - mult_local[mult_count].I_xy = cps->new.I_xy*imass - cps->new.com[0] * cps->new.com[1]; - mult_local[mult_count].I_xz = cps->new.I_xz*imass - cps->new.com[0] * cps->new.com[2]; - mult_local[mult_count].I_yy = cps->new.I_yy*imass - cps->new.com[1] * cps->new.com[1]; - mult_local[mult_count].I_yz = cps->new.I_yz*imass - cps->new.com[1] * cps->new.com[2]; - mult_local[mult_count].I_zz = cps->new.I_zz*imass - cps->new.com[2] * cps->new.com[2]; -#endif - - mult_count++; - } + /* Check if ci and cj are neighbours. */ + if(are_neighbours_different_size(ci, cj) && ci->h != cj->h && cj->split) + { + /* They are neighbours, so we need to recurse on cj.*/ + for (cps = (struct cell*) qsched_getresdata(s, cj->firstchild); cps != (struct cell*) qsched_getresdata(s, cj->sibling); cps = + (struct cell*) qsched_getresdata(s, cps->sibling) ) { + iact_pair_pc(s, ci, cps); + } + }else{ /* Else cj->h >= ci->h and they're not neighbours, so multipole interaction!*/ + /* Load particles. */ + part_count = ci->count; + if ((parts = (struct part_local *) malloc(sizeof(struct part_local) * part_count)) == NULL) + error("Failed to allocate local parts."); + for (int k = 0; k < part_count; k++) { + for (int j = 0; j < 3; j++) { + parts[k].x[j] = ci->parts[k].x[j] - ci->loc[j]; + parts[k].a[j] = 0.0f; + } + parts[k].mass = ci->parts[k].mass; + } + mult_count = 0; + if(cj->split && cj->h > ci->h) + { + /* Now loop over all the other cells */ + for (cps = (struct cell*) qsched_getresdata(s, cj->firstchild); cps != (struct cell*) qsched_getresdata(s, cj->sibling); cps = + (struct cell*) qsched_getresdata(s, cps->sibling) ) { + + mult_local[mult_count].com[0] = cps->new.com[0] - ci->loc[0]; + mult_local[mult_count].com[1] = cps->new.com[1] - ci->loc[1]; + mult_local[mult_count].com[2] = cps->new.com[2] - ci->loc[2]; + mult_local[mult_count].mass = cps->new.mass; + + #ifdef QUADRUPOLES + + /* Final operation to get the quadrupoles */ + double imass = 1. / cps->new.mass; + mult_local[mult_count].I_xx = cps->new.I_xx*imass - cps->new.com[0] * cps->new.com[0]; + mult_local[mult_count].I_xy = cps->new.I_xy*imass - cps->new.com[0] * cps->new.com[1]; + mult_local[mult_count].I_xz = cps->new.I_xz*imass - cps->new.com[0] * cps->new.com[2]; + mult_local[mult_count].I_yy = cps->new.I_yy*imass - cps->new.com[1] * cps->new.com[1]; + mult_local[mult_count].I_yz = cps->new.I_yz*imass - cps->new.com[1] * cps->new.com[2]; + mult_local[mult_count].I_zz = cps->new.I_zz*imass - cps->new.com[2] * cps->new.com[2]; + #endif + + mult_count++; + + } + }else + { + mult_count = 1; + mult_local[mult_count].com[0] = cj->new.com[0] - ci->loc[0]; + mult_local[mult_count].com[1] = cj->new.com[1] - ci->loc[1]; + mult_local[mult_count].com[2] = cj->new.com[2] - ci->loc[2]; + mult_local[mult_count].mass = cj->new.mass; + + #ifdef QUADRUPOLES + + /* Final operation to get the quadrupoles */ + double imass = 1. / cj->new.mass; + mult_local[mult_count].I_xx = cj->new.I_xx*imass - cj->new.com[0] * cj->new.com[0]; + mult_local[mult_count].I_xy = cj->new.I_xy*imass - cj->new.com[0] * cj->new.com[1]; + mult_local[mult_count].I_xz = cj->new.I_xz*imass - cj->new.com[0] * cj->new.com[2]; + mult_local[mult_count].I_yy = cj->new.I_yy*imass - cj->new.com[1] * cj->new.com[1]; + mult_local[mult_count].I_yz = cj->new.I_yz*imass - cj->new.com[1] * cj->new.com[2]; + mult_local[mult_count].I_zz = cj->new.I_zz*imass - cj->new.com[2] * cj->new.com[2]; + #endif + } + /* Perform the interaction between the particles and the list of multipoles */ + make_interact_pc(parts, part_count, mult_local, mult_count); + + /* Clean up local parts? */ + for (int k = 0; k < part_count; k++) { + for (int j = 0; j < 3; j++) ci->parts[k].a[j] += parts[k].a[j]; + } + free(parts); } - /* Perform the interaction between the particles and the list of multipoles */ - make_interact_pc(parts, part_count, mult_local, mult_count); - - /* Clean up local parts? */ - for (int k = 0; k < part_count; k++) { - for (int j = 0; j < 3; j++) cp->parts[k].a[j] += parts[k].a[j]; - } - free(parts); - } } - +#endif @@ -573,29 +642,6 @@ struct cell *cell_get(struct qsched *s) { res->res = resource; res->firstchild = -1; -#ifdef THINK_CAN_BE_DELETED - /* Allocate a new batch? */ - if (cell_pool == NULL) { - - /* Allocate the cell array. */ - if ((cell_pool = - (struct cell *)calloc(cell_pool_grow, sizeof(struct cell))) == - NULL) - error("Failed to allocate fresh batch of cells."); - - /* Link them up via their progeny pointers. */ - for (k = 1; k < cell_pool_grow; k++) - cell_pool[k - 1].firstchild = cell_pool[k].res; - } - - /* Pick a cell off the pool. */ - res = cell_pool; - cell_pool = cell_pool->firstchild; - - /* Clean up a few things. */ - res->res = qsched_res_none; - res->firstchild = 0; -#endif /* Return the cell. */ return res; } @@ -842,6 +888,96 @@ void cell_split(struct cell *c, struct qsched *s) { } +/** + * @brief Create the tasks for the particle-cell interactions. + * @param s The #sched in which to create the tasks. + * @param ci The first #cell. + * @param cj The second #cell. + * @param depth The current depth. + * @param leaf_depth The depth at which to start making tasks. + */ + +void create_pcs(struct qsched *s, struct cell *ci, struct cell *cj, int depth, int leaf_depth){ + + qsched_task_t tid; + qsched_task_t data[2]; + qsched_task_t cp, cps; + struct cell *cp1, *cp2; + + + /* If cj == NULL we're at the root.*/ + if(cj == NULL) + { + for (cp = ci->firstchild; cp != ci->sibling; cp = cp1->sibling) { + cp1 = (struct cell*) qsched_getresdata(s, cp); + /* Recurse over all pairs. */ + for (cps = cp1->sibling; cps != ci->sibling; cps = cp2->sibling) + { + cp2 = (struct cell*) qsched_getresdata(s, cps); + create_pcs(s, cp1, cp2, depth+1, leaf_depth); + } + } + /* Once we get back here we're done!*/ + return; + } + + /* If we're not yet at the level of making tasks */ + if(depth < leaf_depth) + { + for (cp = ci->firstchild; cp != ci->sibling; cp = cp1->sibling) { + cp1 = (struct cell*) qsched_getresdata(s, cp); + /* Recurse over all pairs. */ + for (cps = cj->firstchild; cps != cj->sibling; cps = cp2->sibling) + { + cp2 = (struct cell*) qsched_getresdata(s, cps); + create_pcs(s, cp1, cp2, depth+1, leaf_depth); + } + } + }else{ + /* We can make tasks if they're not neighbours! */ + if(are_neighbours(ci, cj)) + { + /* If they are neighbours we need to recurse if possible. If either is not split then we do the pair interaction so we can stop.*/ + if(ci->split && cj->split) + { + for (cp = ci->firstchild; cp != ci->sibling; cp = cp1->sibling) + { + cp1 = (struct cell*) qsched_getresdata(s, cp); + /* Recurse over all pairs. */ + for (cps = cj->firstchild; cps != cj->sibling; cps = cp2->sibling) + { + cp2 = (struct cell*) qsched_getresdata(s, cps); + create_pcs(s, cp1, cp2, depth+1, leaf_depth); + } + } + } + }else + { + /* Create the task. */ + data[0] = ci->res; + data[1] = cj->res; + tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, + sizeof(qsched_task_t) * 2, ci->count * 8 ); + + /* Add the resource and dependance */ + qsched_addlock(s, tid, ci->res_parts); + qsched_adduse(s, tid, ci->res); + qsched_adduse(s, tid, cj->res); + + /* Create the task. */ + data[0] = cj->res; + data[1] = ci->res; + tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, + sizeof(qsched_task_t) * 2, cj->count * 8 ); + + /* Add the resource and dependance */ + qsched_addlock(s, tid, cj->res_parts); + qsched_adduse(s, tid, cj->res); + qsched_adduse(s, tid, ci->res); + } + } +} + /** * @brief Create the tasks for the cell pair/self. * @@ -929,12 +1065,11 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { /* Recurse */ if (are_neighbours(cp1, cp2)) { create_tasks(s, cp1, cp2); - } - } - } + } + } + } - /* Let's also build particle-monopole tasks */ - +#ifdef OLD_SETUP /* Create the task. */ data[0] = ci->res; data[1] = cj->res; @@ -976,7 +1111,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { qsched_adduse(s, tid, cp1->res); } - +#endif /* Add the resource and dependance */ // qsched_addunlock(s, ci->com_tid, tid); @@ -992,7 +1127,6 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { tid = qsched_addtask(s, task_type_pair, task_flag_none, data, sizeof(qsched_task_t) * 2, ci->count * cj->count); - //TODO Check that cj->res_parts and ci->res_parts are distinct! struct part *part_j = (struct part*) qsched_getresdata(s, cj->res_parts ); struct part *part_i = (struct part*) qsched_getresdata(s, ci->res_parts ); ci->parts = part_i; @@ -1271,6 +1405,33 @@ if(s.rank == 0) if(s.rank == 0) { create_tasks(&s, root, NULL); + /* Compute the loweest depth of a leaf. */ + int depth = 1; + int leaf_depth = 0xFFFFFFF; + struct cell *temp = NULL; + long long int next = root->firstchild; + while(next > 0) + { + temp = (struct cell*) qsched_getresdata(&s, next); + if(temp->split) + { + depth += 1; + next = temp->firstchild; + }else + { + if(depth < leaf_depth) + { + leaf_depth = depth; + } + struct cell *temp2; + temp2 = qsched_getresdata(&s, temp->sibling); + if(temp2->h > temp->h) + depth -= 1; + next = temp->sibling; + } + } + message("leaf_depth = %i", leaf_depth); + create_pcs(&s, root, NULL, 0, 2); } printf("s.count = %i\n", s.count); //Mark all the schedulers dirty so they actually synchronize. diff --git a/examples/test_qr_mpi.c b/examples/test_qr_mpi.c index 6bb7c1d47cdacdefde3699ff2ee98c57edb74f85..989d626e2ea33dc997f1aca54ab60af1d29ff4c4 100644 --- a/examples/test_qr_mpi.c +++ b/examples/test_qr_mpi.c @@ -32,8 +32,9 @@ #include <fenv.h> #include <mpi.h> - +#ifdef WITH_CBLAS_LIB #include <cblas.h> +#endif /* Local includes. */ diff --git a/src/qsched.c b/src/qsched.c index c70a3495b5ed3606ca32247240f44b36bac1d045..7cfa4e9b9ed4d295572971304d11f43e8b985e39 100644 --- a/src/qsched.c +++ b/src/qsched.c @@ -40,6 +40,8 @@ #include <pthread.h> #endif +#define TASK_TIMERS + /* Local includes. */ #include "cycle.h" #include "atomic.h" @@ -861,6 +863,7 @@ void qsched_partition_compute_costs( struct qsched *s, idx_t *res_costs) for(j = 0; j < t->nr_uses; j++) { res_costs[getindex(t->uses[j],s)] += 1; +/* res_costs[getindex(t->uses[j],s)] = 1;*/ } } } @@ -2843,8 +2846,6 @@ for(i = 0; i < count; i++) free(parents); free(current_parents); - message("ts->count = %i", ts.count); - message("ts.count_unlockers = %i", ts.count_unlockers); /* All the nodes have created new tasks, we need to synchronize this across nodes now.*/ tsched_synchronize( &ts, s); @@ -3065,12 +3066,18 @@ for(i = 0; i < node_count; i++) // toc = getticks(); // message("METIS_PartGraphKway took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); //TODO Check the costs. + message("node_count = %i", node_count); int count_me = 0; for(i = 0; i < node_count; i++) { if(nodeIDs[i] == s->rank) + { +// if(s->res[noderef[i]].num_lockers > 0) + // message("My partition contains %lli with weight %i", noderef[i], nodelist[i]); count_me+= nodelist[i]; + } } + printf("\n"); message("My \"cost\" = %i", count_me); @@ -3498,6 +3505,9 @@ printf("Rank[%i]: qsched_prepare_mpi took %lli (= %e) ticks\n", s->rank, toc - tic, (float)(toc - tic)); MPI_Barrier(s->comm); + #ifdef TASK_TIMERS + s->start = getticks(); + #endif message("Beginning execution of quicksched"); tic = getticks(); #if defined( HAVE_OPENMP ) @@ -3513,12 +3523,18 @@ printf("Rank[%i]: qsched_prepare_mpi took %lli (= %e) ticks\n", s->rank, /* Loop as long as there are tasks. */ while ( ( t = qsched_gettask( s , qid ) ) != NULL ) { + #ifdef TASK_TIMERS + t->task_start = getticks() - s->start; + #endif /* Call the user-supplied function on the task with its data. */ fun( s, t->type , &s->data[ t->data ] ); /* Mark that task as done. */ qsched_done( s , t ); + #ifdef TASK_TIMERS + t->task_finish = getticks() - s->start; + #endif } /* loop as long as there are tasks. */ } /* parallel loop. */ @@ -3839,15 +3855,10 @@ void qsched_enqueue ( struct qsched *s , struct task *t ) { resID += (long long int) data[3]; int tag = data[4]; struct res *resource = &s->res[getindex(resID, s)]; - if(t->id == 281474976710656) - message("Emitting Isend for (2,2) after (2,2,2)"); int err; // printf("Sending tag = %i\n", tag); if(resource->data == NULL) { - message("resource->parent = %lli", resource->parent); - message("from = %i", from); - message("t->node= %i", t->node); if(resource->parent != -1) { struct res *temp = &s->res[getindex(resource->parent, s)]; @@ -4906,7 +4917,10 @@ void *temp; s->count += 1; t->node = s->rank; - + #ifdef TASK_TIMERS + t->node_executed = -1; + t->thread_executed = -1; + #endif /* The sched is now dirty. */ s->flags |= qsched_flag_dirty; /* Unlock the sched. */ diff --git a/src/qsched.h b/src/qsched.h index 6cdccaebbdee6b2558e6b6492d6b59cb61a15e0b..f488a892839e4f6c8b4a1b9dbef30cc5f4212a87 100644 --- a/src/qsched.h +++ b/src/qsched.h @@ -189,6 +189,9 @@ struct qsched { int size_users; int count_users; MPI_Comm comm; + #ifdef TASK_TIMERS + ticks start; + #endif #endif }; diff --git a/src/task.h b/src/task.h index 4305c22db5992bc9e0d6bdba7b0341853415b36f..bee97ca4408863b72417b5d8db506a2384cbb80a 100644 --- a/src/task.h +++ b/src/task.h @@ -70,6 +70,11 @@ struct task { int node; MPI_Request req; + #ifdef TASK_TIMERS + ticks task_start, task_finish; + int node_executed; + int thread_executed; + #endif #endif };