diff --git a/examples/test_bh_mpi.c b/examples/test_bh_mpi.c index 1e80224b1b8d58387bee4994a961fe5a04df7bcb..1f0689919be42d0818e757e9d771192241f8194f 100644 --- a/examples/test_bh_mpi.c +++ b/examples/test_bh_mpi.c @@ -33,6 +33,9 @@ #include <fenv.h> #include <mpi.h> +#define NO_TASK_TIMERS +#define NO_LOAD_BALANCE_EXACT + /* Local includes. */ #include "quicksched.h" #include "res.h" @@ -53,6 +56,8 @@ + + /** Data structure for the particles. */ struct part { double x[3]; @@ -814,6 +819,9 @@ void create_pcs(struct qsched *s, struct cell *ci, struct cell *cj, int depth, i qsched_task_t data[2]; qsched_task_t cp, cps; struct cell *cp1, *cp2; + #ifdef LOAD_BALANCE_EXACT + ticks tic, toc; + #endif #ifdef SANITY_CHECKS if(cj!= NULL && ci->h != cj->h) @@ -874,9 +882,15 @@ void create_pcs(struct qsched *s, struct cell *ci, struct cell *cj, int depth, i /* Create the task. */ data[0] = ci->res; data[1] = cj->res; + #ifdef LOAD_BALANCE_EXACT + tic = getticks(); + iact_pair_pc(s, ci, cj); + toc = getticks(); + tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(qsched_task_t) * 2, (toc-tic) / 100); + #else tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(qsched_task_t) * 2, ci->count * 8 ); - + #endif /* Add the resource and dependance */ qsched_addlock(s, tid, ci->res_parts); qsched_adduse(s, tid, ci->res); @@ -885,9 +899,15 @@ void create_pcs(struct qsched *s, struct cell *ci, struct cell *cj, int depth, i /* Create the task. 
*/ data[0] = cj->res; data[1] = ci->res; + #ifdef LOAD_BALANCE_EXACT + tic = getticks(); + iact_pair_pc(s, cj, ci); + toc = getticks(); + tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(qsched_task_t) * 2, (toc-tic) / 100); + #else tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(qsched_task_t) * 2, cj->count * 8 ); - + #endif /* Add the resource and dependance */ qsched_addlock(s, tid, cj->res_parts); qsched_adduse(s, tid, cj->res); @@ -911,6 +931,9 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { qsched_task_t data[2]; qsched_task_t cp, cps; struct cell *cp1, *cp2; + #ifdef LOAD_BALANCE_EXACT + ticks tic, toc; + #endif #ifdef SANITY_CHECKS @@ -947,9 +970,15 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { data[1] = -1; /* Create the task. */ + #ifdef LOAD_BALANCE_EXACT + tic = getticks(); + iact_self_direct(s, ci); + toc = getticks(); + tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(qsched_task_t) * 2, (toc-tic) / 100); + #else tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof( qsched_task_t) * 2, ci->count * ci->count / 2); - + #endif /* Add the resource (i.e. the cell) to the new task. */ qsched_addlock(s, tid, ci->res_parts); qsched_adduse(s, tid, ci->res); @@ -988,53 +1017,6 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { } } -#ifdef OLD_SETUP - /* Create the task. 
*/ - data[0] = ci->res; - data[1] = cj->res; - tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, - sizeof(qsched_task_t) * 2, ci->count * 8 ); - - /* Add the resource and dependance */ - qsched_addlock(s, tid, ci->res_parts); - qsched_adduse(s, tid, ci->res); - qsched_adduse(s, tid, cj->res); - for(cp = ci->firstchild; cp != ci->sibling; cp = cp1->sibling) - { - cp1 = (struct cell*) qsched_getresdata(s, cp); - qsched_adduse(s, tid, cp1->res); - } - for(cp = cj->firstchild; cp != cj->sibling; cp = cp1->sibling) - { - cp1 = (struct cell*) qsched_getresdata(s, cp); - qsched_adduse(s, tid, cp1->res); - } - - /* Create the task. */ - data[0] = cj->res; - data[1] = ci->res; - tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, - sizeof(qsched_task_t) * 2, cj->count * 8); - - qsched_addlock(s, tid, cj->res_parts); - qsched_adduse(s, tid, ci->res); - qsched_adduse(s, tid, cj->res); - for(cp = ci->firstchild; cp != ci->sibling; cp = cp1->sibling) - { - cp1 = (struct cell*) qsched_getresdata(s, cp); - qsched_adduse(s, tid, cp1->res); - } - for(cp = cj->firstchild; cp != cj->sibling; cp = cp1->sibling) - { - cp1 = (struct cell*) qsched_getresdata(s, cp); - qsched_adduse(s, tid, cp1->res); - } - -#endif - /* Add the resource and dependance */ -// qsched_addunlock(s, ci->com_tid, tid); - - } else { /* Otherwise, at least one of the cells is not split, build a direct * interaction. */ @@ -1043,9 +1025,15 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { data[1] = cj->res; /* Create the task. 
*/ + #ifdef LOAD_BALANCE_EXACT + tic = getticks(); + iact_pair_direct(s, ci, cj); + toc = getticks(); + tid = qsched_addtask(s, task_type_pair, task_flag_none, data, sizeof(qsched_task_t) * 2, (toc-tic) / 100); + #else tid = qsched_addtask(s, task_type_pair, task_flag_none, data, sizeof(qsched_task_t) * 2, ci->count * cj->count); - + #endif struct part *part_j = (struct part*) qsched_getresdata(s, cj->res_parts ); struct part *part_i = (struct part*) qsched_getresdata(s, ci->res_parts ); ci->parts = part_i; @@ -1330,6 +1318,15 @@ if(s.rank == 0) if(s.rank == 0) { create_tasks(&s, root, NULL); + #ifdef LOAD_BALANCE_EXACT + struct part *parts = (struct part*) qsched_getresdata(&s, root->res_parts); /* Root cell's particle data, typed as 'struct part' and named to match the loop below; NOTE(review): presumably zeroed to discard accelerations accumulated by the timing runs in create_tasks - confirm. */ + for(i = 0; i < root->count; i++) + { + parts[i].a[0] = 0.0f; + parts[i].a[1] = 0.0f; + parts[i].a[2] = 0.0f; + } + #endif /* Compute the loweest depth of a leaf. */ int depth = 1; int leaf_depth = 0xFFFFFFF; @@ -1357,7 +1354,7 @@ if(s.rank == 0) } message("leaf_depth = %i", leaf_depth); message("tasks before = %i", s.count); - create_pcs(&s, root, NULL, 0, leaf_depth-1); + create_pcs(&s, root, NULL, 0, leaf_depth-2); message("tasks after = %i", s.count); } printf("s.count = %i\n", s.count); @@ -1391,6 +1388,32 @@ for(i = 0; i < s.count_ranks; i++) MPI_Barrier(s.comm); } #endif +#ifdef TASK_TIMERS +//Each rank wants to loop through the tasks they executed and output the data, then synchronize. +int j; +if(s.rank == 0) +{ + file = fopen("task_timers.tks", "w"); + fclose(file); +} +for(i = 0; i < s.count_ranks; i++) +{ + if(i == s.rank) + { + file = fopen("task_timers.tks", "a"); + for(j = 0; j < s.task_ranks[s.count_ranks]; j++) + { + if(s.tasks[j].node_executed == s.rank) + { + struct task *t = &s.tasks[j]; + fprintf(file, "%lli %i %llu %llu %i %i\n", t->id, t->type, t->task_start, t->task_finish, t->node_executed, t->thread_executed); + } + } + fclose(file); + } + MPI_Barrier(s.comm); +} +#endif //Need to clean up everything. 
// free(parts); diff --git a/examples/test_qr_mpi.c b/examples/test_qr_mpi.c index ca83d75c14013be19488347ab050365855f2033a..c77c46c773da1b775217bdf73cbcc0f6abb73155 100644 --- a/examples/test_qr_mpi.c +++ b/examples/test_qr_mpi.c @@ -720,7 +720,7 @@ for(i = 0; i < s.count_ranks; i++) file = fopen("task_timers.tks", "a"); for(j = 0; j < s.task_ranks[s.count_ranks]; j++) { - if(s.tasks[j].node_executed == s.rank && s.tasks[j].type != -101) + if(s.tasks[j].node_executed == s.rank) { struct task *t = &s.tasks[j]; fprintf(file, "%lli %i %llu %llu %i %i\n", t->id, t->type, t->task_start, t->task_finish, t->node_executed, t->thread_executed); diff --git a/src/Makefile.am b/src/Makefile.am index 4d39c981ed5423256ec3562b3f1dfa6aa12461f5..b40775a0defa29b3e1aee5b501424fa80c6d2bf9 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -36,7 +36,7 @@ libquicksched_la_SOURCES = qsched.c queue.c #libquickschedMPI_la_CC = mpicc libquickschedMPI_la_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \ -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) -DTIMERS -std=gnu99 \ - -DWITH_MPI + -DWITH_MPI #-DTASK_TIMERS libquickschedMPI_la_SOURCES = qsched.c queue.c diff --git a/src/qsched.c b/src/qsched.c index b520f48f8e5ff4a1ed9e5e263d93889c9cb7883d..2be6b1751cfaf78c563d166035ceec80e4408e42 100644 --- a/src/qsched.c +++ b/src/qsched.c @@ -1632,12 +1632,16 @@ void *temp; t->nr_locks = 0; t->nr_uses = 0; t->id = id; + #ifdef TASK_TIMERS + t->node_executed = -1; + t->thread_executed = -1; + #endif /* Add a relative pointer to the data. */ memcpy( &ts->data[ ts->count_data ] , data , data_size ); t->data = &ts->data[ ts->count_data ] - ts->data; ts->count_data += data_size2; - + /* Increase the task counter. */ ts->count += 1; @@ -2565,6 +2569,8 @@ for(i = 0; i < count; i++) /* Update data_pos to the latest parent task in the top order. */ + /* We know we have the data correct as of the latest parent task in the topological order. 
*/ + if(data_pos[getindex(t->locks[j], s)] < last_index) data_pos[getindex(t->locks[j], s)] = last_index; sends_added+=1; @@ -2763,6 +2769,8 @@ for(i = 0; i < count; i++) /* Update data_pos to the latest parent task in the top order. */ + /* We know we have the data correct as of the latest parent task in the topological order. */ + if(data_pos[getindex(t->uses[j], s)] < last_index) data_pos[getindex(t->uses[j], s)] = last_index; sends_added+=1; @@ -2857,15 +2865,15 @@ void qsched_partition( struct qsched *s){ int i, j; struct task *t; int errors; -// ticks tic, toc; + ticks tic, toc; // struct res *r; res_costs = (idx_t*) calloc(s->res_ranks[s->count_ranks], sizeof(idx_t)); -// tic = getticks(); + tic = getticks(); qsched_partition_compute_costs( s, res_costs); -// toc = getticks(); -// message("qsched_partition_compute_costs took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); + toc = getticks(); + message("qsched_partition_compute_costs took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6); //All reduce res_costs; #if IDXTYPEWIDTH == 32 @@ -2900,10 +2908,10 @@ void qsched_partition( struct qsched *s){ for(i = 0; i < s->res_ranks[s->count_ranks]; i++) pos_in_nodelist[i] = -1; -// tic = getticks(); + tic = getticks(); qsched_partition_build_nodelist(s, nodelist, noderef, &node_count, res_costs, pos_in_nodelist); -// toc = getticks(); - // message("qsched_partition_build_nodelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); + toc = getticks(); + message("qsched_partition_build_nodelist took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6); //Build an edgelist where edges are of weight += task->weight for each task that locks both. If noderef doesn't contain, recurse until we find the ones it does contain (yuck). Build a set of adjacency lists. 
@@ -2933,10 +2941,10 @@ for(i = 0; i < node_count; i++) edge_sizes[i] = initial_size; } -// tic = getticks(); + tic = getticks(); qsched_partition_build_edgelist(s, edge_vwgts, edge_lists, edge_counts, edge_sizes, node_count, noderef, pos_in_nodelist); -// toc = getticks(); -// message("qsched_partition_build_edgelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); + toc = getticks(); + message("qsched_partition_build_edgelist took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6); idx_t edgelist_size = 0; for(i = 0; i < node_count; i++) @@ -2970,7 +2978,7 @@ for(i = 0; i < node_count; i++) tic = getticks(); qsched_partition_build_edgelist(s, edgelist, node_count, noderef, pos_in_nodelist); toc = getticks(); - message("qsched_partition_build_edgelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); + message("qsched_partition_build_edgelist took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6); #endif // free(pos_in_nodelist); @@ -3001,7 +3009,7 @@ for(i = 0; i < node_count; i++) edgelist_size++; } toc = getticks(); - message("Checking number of elements in new edgelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); + message("Checking number of elements in new edgelist took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6); //Make the new contiguous edge list. 
@@ -3023,7 +3031,7 @@ for(i = 0; i < node_count; i++) tic = getticks(); qsched_partition_edgelist_squash(s, edgelist_pos, edgelist_new, edgelist_vwgt, &edgelist_count, edgelist, node_count); toc = getticks(); - message("qsched_partition_edgelist_squash took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); + message("qsched_partition_edgelist_squash took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6); #endif /*#if IDXTYPEWIDTH == 32 @@ -3053,13 +3061,13 @@ for(i = 0; i < node_count; i++) if(nodeIDs == NULL) error("Failed to allocate nodeIDs"); idx_t temp_count_ranks = s->count_ranks; -// tic = getticks(); + tic = getticks(); if(s->count_ranks > 1) { if( METIS_PartGraphKway(&node_count, &one, edgelist_pos, edgelist_new, nodelist, NULL, edgelist_vwgt, &temp_count_ranks, NULL, NULL,options, objval, nodeIDs) != METIS_OK) error("Failed to partition\n"); } -// toc = getticks(); -// message("METIS_PartGraphKway took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); + toc = getticks(); + message("METIS_PartGraphKway took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6); //TODO Check the costs. // message("node_count = %i", node_count); long long int count_me = 0; @@ -3077,7 +3085,7 @@ for(i = 0; i < node_count; i++) -// tic = getticks(); + tic = getticks(); if(s->count_ranks > 1) { MPI_Request *reqs; @@ -3188,14 +3196,14 @@ if(s->count_ranks > 1) if(temp->node == s->rank && temp->data == NULL) error("Local resource has data set to NULL"); } -// toc = getticks(); -// message("qsched_partition synchronizing resources took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); + toc = getticks(); + message("qsched_partition synchronizing resources took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6); // Move the tasks time! // Tasks belong to the node of the resource they lock of highest size. // If they don't lock any resources, the resources they use of highest size. // Everyone does it for all tasks at the moment... 
// TODO ISSUE: Whoever is assigned bigger resources will run more tasks - not balanced well. Minimises communication, less relevant. -// tic = getticks(); + tic = getticks(); for(i = 0; i < s->task_ranks[s->count_ranks]; i++) { struct task *t = &s->tasks[i]; @@ -3229,8 +3237,8 @@ if(s->count_ranks > 1) } -// toc = getticks(); -// message("qsched_partition task \"movement\" took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); + toc = getticks(); + message("qsched_partition task \"movement\" took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6); //First we need to do a topological sort. int count = s->task_ranks[s->count_ranks]; @@ -3258,10 +3266,10 @@ if(s->count_ranks > 1) } //Now we just need to create the send/recv tasks from the dependencies. -// tic = getticks(); + tic = getticks(); qsched_partition_create_sends( s, tid); -// toc = getticks(); -// message("qsched_partition_create_sends took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); + toc = getticks(); + message("qsched_partition_create_sends took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6); free(edge_lists); free(edge_sizes); @@ -3367,10 +3375,10 @@ void qsched_prepare_mpi( struct qsched *s){ // toc = getticks(); // message("Cleaning up scheduler took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); } -// tic = getticks(); + tic = getticks(); qsched_partition(s); -// toc = getticks(); - // message("qsched_partition took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); + toc = getticks(); + message("qsched_partition took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6); long long int *tid = malloc(sizeof(long long int) * s->task_ranks[s->count_ranks]); if(tid == NULL) @@ -3456,30 +3464,6 @@ if(tid == NULL) num_recv++; qsched_enqueue( s , t ); } - //TODO Remove this debug stuff. 
- /* if(t->flags & task_flag_skip && t->type == task_type_send && t->id == 886394) - { - int* data = (int*)&s->data[ t->data ]; - int to = data[1]; - int from = data[0]; - int tag = data[4]; - long long int resid = (((long long int)data[2]) << 32) + data[3]; - struct res *res = &s->res[getindex(resid, s)]; - int res_owner = res->node; - if(to != s->rank) - { - message("Not sending task to %i from %i with tag %i, id = %lli", to, from,tag, t->id); - } - if(to == s->rank) - { - message("This task should come to %i from %i with tag %i, id = %lli", from , to,tag, t->id); - } - message("Resource owner is node %i", res_owner); - } - - - message("Initial setup %i send tasks", num_send); - message("Initial setup %i recv tasks", num_recv);*/ } /* Clean up. */ @@ -3493,7 +3477,6 @@ if(tid == NULL) if(!(t->flags & task_flag_skip) ) s->waiting++; } -// printf("s->waiting = %i, count = %i\n", s->waiting, count); /* Set the ready flag. */ s->flags |= qsched_flag_ready; @@ -3639,8 +3622,8 @@ void qsched_run_MPI ( struct qsched *s, int nr_threads, qsched_funtype fun ) { /* Prepare the scheduler*/ qsched_prepare_mpi( s ); ticks toc = getticks(); -printf("Rank[%i]: qsched_prepare_mpi took %lli (= %e) ticks\n", s->rank, - toc - tic, (float)(toc - tic)); +printf("Rank[%i]: qsched_prepare_mpi took %lli (= %.3fms) ticks\n", s->rank, + toc - tic, (float)(toc - tic)/2.67e6); #if defined( HAVE_PTHREAD )