diff --git a/examples/test_qr_mpi.c b/examples/test_qr_mpi.c
index c77c46c773da1b775217bdf73cbcc0f6abb73155..2a235235fee9146fbc9b8e82169b189b07b888fa 100644
--- a/examples/test_qr_mpi.c
+++ b/examples/test_qr_mpi.c
@@ -28,7 +28,7 @@
 #include <math.h>
 #include <float.h>
 #include <limits.h>
-#include <omp.h>
+//#include <omp.h>
 #include <fenv.h>
 #include <mpi.h>
 
@@ -909,9 +909,9 @@ int main(int argc, char* argv[]) {
     case 't':
       if (sscanf(optarg, "%d", &nr_threads) != 1)
        error("Error parsing number of threads.");
-      omp_set_num_threads(nr_threads);
-      message("omp_get_max_threads() = %i\n", omp_get_max_threads());
-      message("omp_get_num_procs() = %i\n", omp_get_num_procs());
+      // omp_set_num_threads(nr_threads);
+      // message("omp_get_max_threads() = %i\n", omp_get_max_threads());
+      // message("omp_get_num_procs() = %i\n", omp_get_num_procs());
      break;
    case '?':
      fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
diff --git a/src/qsched.c b/src/qsched.c
index 2be6b1751cfaf78c563d166035ceec80e4408e42..c3615f23ec7d8d7998bcfd5b5c1e811d99abe842 100644
--- a/src/qsched.c
+++ b/src/qsched.c
@@ -146,6 +146,8 @@ void* qsched_getresdata( struct qsched *s, qsched_res_t res_id )
             }
         }
     }
+    if(s->res[getindex(res_id, s)].data == NULL)
+        error("Data is NULL");
     return s->res[getindex(res_id, s)].data;
 }
 #endif
@@ -2225,15 +2227,15 @@ void qsched_integrate( struct qsched *s, struct tsched *ts)
 
     /* Once this is done, we need to sort out the arrays and fix the pointers
        to the locks/unlocks arrays.*/
-    #pragma omp parallel
+    ////#pragma ompparallel
     {
 
         /* Sort the unlocks. */
-        #pragma omp single nowait
+        // //#pragma ompsingle nowait
         qsched_quicksort( s->deps , s->deps_key , s->count_deps , 0 ,0xFFFFFFFFFFFFFFF );
 
         /* Sort the locks. */
-        #pragma omp single nowait
+        // //#pragma ompsingle nowait
         qsched_quicksort( s->locks , s->locks_key , s->count_locks , 0 , 0xFFFFFFFFFFFFFFF);
     }
 
@@ -2253,7 +2255,9 @@ void qsched_partition_create_sends( struct qsched *s, long long int *tid)
 {
     int i, j, k, l;
     int count = s->task_ranks[s->count_ranks];
+ticks tic, toc;
+tic = getticks();
 
     /* We need a temporary scheduler! */
     struct tsched ts;
@@ -2317,7 +2321,7 @@ int sends_added = 0;
 for(i = 0; i < count; i++)
 {
     struct task *t = &s->tasks[gettaskindex(tid[i], s)];
-
+    long long int ttid = t->id;
     /* If we don't execute this task locally then we can skip it!*/
     if( t->flags & task_flag_skip )
@@ -2328,31 +2332,34 @@ for(i = 0; i < count; i++)
 
     /* Loop through the locks, and for each resource that is not local create
        a send/recv pair. */
     for(j = 0; j < t->nr_locks; j++){
+        int lock_index = getindex(t->locks[j],s);
+        struct res *res = &s->res[lock_index];
+
         /* We need to get each resource that is non-local.*/
-        if(s->res[getindex(t->locks[j],s)].node != s->rank){
+        if(res->node != s->rank){
             /* If this data isn't already sent to us:*/
-            if(first_recv[getindex(t->locks[j], s)] == -1){
+            if(first_recv[lock_index] == -1){
                 /* Find out the task data*/
-                task_data[0] = s->res[getindex(t->locks[j],s)].node;
+                task_data[0] = res->node;
                 task_data[1] = s->rank;
-                task_data[2] = (int)(s->res[getindex(t->locks[j],s)].ID >> 32);
-                task_data[3] = (int)(s->res[getindex(t->locks[j],s)].ID & 0xFFFFFFFF);
+                task_data[2] = (int)(res->ID >> 32);
+                task_data[3] = (int)(res->ID & 0xFFFFFFFF);
                 task_data[4] = sends_added;
                 /* Create the send task.
                 */
-                send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->locks[j],s)].size );
+                send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , res->size );
                 /* The send task needs to lock the resource. */
-                tsched_addlock( &ts, send_task_id, s->res[getindex(t->locks[j], s)].ID);
+                tsched_addlock( &ts, send_task_id, res->ID);
                 /* Create the recv task*/
-                recv_task_id = tsched_addtask(&ts, task_type_recv , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->locks[j],s)].size );
+                recv_task_id = tsched_addtask(&ts, task_type_recv , 0 , task_data , 5 * sizeof(int) , res->size );
                 /* The recv task unlocks this task locally. */
                 tsched_addunlock( &ts, recv_task_id, t->id);
                 /* The recv task needs to lock the resource. */
-                tsched_addlock( &ts, recv_task_id, s->res[getindex(t->locks[j], s)].ID);
+                tsched_addlock( &ts, recv_task_id, res->ID);
                 sends_added +=1;
-                first_recv[getindex(t->locks[j], s)] = recv_task_id;
+                first_recv[lock_index] = recv_task_id;
             }else{
-                tsched_addunlock( &ts, first_recv[getindex(t->locks[j], s)], t->id);
+                tsched_addunlock( &ts, first_recv[lock_index], t->id);
             }
         }
@@ -2362,30 +2369,34 @@ for(i = 0; i < count; i++)
 
     /* Loop through the uses, and for each resource that is not local create
        a send/recv pair. */
     for(j = 0; j < t->nr_uses; j++){
-        if(s->res[getindex(t->uses[j],s)].node != s->rank){
+
+        int use_index = getindex(t->uses[j],s);
+        struct res *res = &s->res[use_index];
+
+        if(res->node != s->rank){
             /* Find out the task data*/
             /* If this data isn't already sent to us:*/
-            if(first_recv[getindex(t->uses[j], s)] == -1){
-                task_data[0] = s->res[getindex(t->uses[j],s)].node;
+            if(first_recv[use_index] == -1){
+                task_data[0] = res->node;
                 task_data[1] = s->rank;
-                task_data[2] = (int)(s->res[getindex(t->uses[j],s)].ID >> 32);
-                task_data[3] = (int)(s->res[getindex(t->uses[j],s)].ID & 0xFFFFFFFF);
+                task_data[2] = (int)(res->ID >> 32);
+                task_data[3] = (int)(res->ID & 0xFFFFFFFF);
                 task_data[4] = sends_added;
                 /* Create the send task. */
-                send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->uses[j],s)].size );
+                send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , res->size );
                 /* The send task needs to lock the resource. */
-                tsched_addlock( &ts, send_task_id, s->res[getindex(t->uses[j], s)].ID);
+                tsched_addlock( &ts, send_task_id, res->ID);
                 /* Create the recv task*/
-                recv_task_id = tsched_addtask(&ts, task_type_recv , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->uses[j],s)].size );
+                recv_task_id = tsched_addtask(&ts, task_type_recv , 0 , task_data , 5 * sizeof(int) , res->size );
                 /* The recv task unlocks this task locally. */
                 tsched_addunlock( &ts, recv_task_id, t->id);
                 /* The recv task needs to lock the resource. */
-                tsched_addlock( &ts, recv_task_id, s->res[getindex(t->uses[j], s)].ID);
+                tsched_addlock( &ts, recv_task_id, res->ID);
                 sends_added +=1;
-                first_recv[getindex(t->uses[j], s)] = recv_task_id;
+                first_recv[use_index] = recv_task_id;
             }else{
-                tsched_addunlock( &ts, first_recv[getindex(t->uses[j], s)], t->id);
+                tsched_addunlock( &ts, first_recv[use_index], t->id);
             }
@@ -2397,12 +2408,14 @@ for(i = 0; i < count; i++)
     /* If the task has dependencies*/
     /*Find the parents.
     */
     num_parents = 0;
-    for(j = 0; j < i && num_parents < wait_init[gettaskindex(t->id, s)]; j++)
+    int max = wait_init[gettaskindex(t->id, s)];
+    for(j = 0; j < i && num_parents < max; j++)
     {
         struct task *t2 = &s->tasks[gettaskindex(tid[j], s)];
-        for(k = 0; k < t2->nr_unlocks; k++)
+        int nr_un = t2->nr_unlocks;
+        for(k = 0; k < nr_un; k++)
         {
-            if(t2->unlocks[k] == t->id)
+            if(t2->unlocks[k] == ttid)
             {
                 /* If parents array is full we need to extend it.*/
                 if(num_parents == size_parents -1)
@@ -2430,22 +2443,30 @@ for(i = 0; i < count; i++)
     /* Loop over the locked resources */
     for(j = 0; j < t->nr_locks; j++)
     {
+        int lock_index = getindex(t->locks[j],s);
+        struct res *res = &s->res[lock_index];
+
+
+        if(res->node == s->rank)
+            continue;
         /* Find the parents that unlock this resource. */
         num_current = 0;
         int num_skipped = 0;
         int last_index = -1;
+        int tlock = t->locks[j];
         for(k = num_parents-1; k >= 0; k--)
         {
+
             struct task *t2 = &s->tasks[ gettaskindex(tid[parents[k]], s)];
             int found = 0;
             for(l = 0; l < t2->nr_locks; l++)
             {
-                if(t2->locks[l] == t->locks[j])
+                if(t2->locks[l] == tlock)
                 {
                     found = 1;
                     current_parents[num_current++] = parents[k];
-                    if(k > last_index)
-                        last_index = k;
+                    if(parents[k] > last_index)
+                        last_index = parents[k];
                     /* If we skipped the parent task, increment the number we skipped. */
                     if(t2->flags & task_flag_skip)
                         num_skipped++;
@@ -2455,12 +2476,12 @@ for(i = 0; i < count; i++)
                 while(r->parent != -1)
                 {
                     r = &s->res[getindex(r->parent,s)];
-                    if(r->ID == t->locks[j])
+                    if(r->ID == tlock)
                     {
                         found = 1;
                         current_parents[num_current++] = parents[k];
-                        if(k > last_index)
-                            last_index = k;
+                        if(parents[k] > last_index)
+                            last_index = parents[k];
                         /* If we skipped the parent task, increment the number we skipped. */
                         if(t2->flags & task_flag_skip)
                             num_skipped++;
@@ -2470,7 +2491,7 @@ for(i = 0; i < count; i++)
             if(found)
                 continue;
-            r = &s->res[getindex(t->locks[l],s)];
+            r = res;
             while(r->parent != -1)
             {
                 r = &s->res[getindex(r->parent,s)];
@@ -2478,8 +2499,8 @@ for(i = 0; i < count; i++)
                 {
                     found = 1;
                     current_parents[num_current++] = parents[k];
-                    if(k > last_index)
-                        last_index = k;
+                    if(parents[k] > last_index)
+                        last_index = parents[k];
                     /* If we skipped the parent task, increment the number we skipped. */
                     if(t2->flags & task_flag_skip)
                         num_skipped++;
@@ -2523,6 +2544,7 @@ for(i = 0; i < count; i++)
             continue;
         }/* If no current parents.*/
 
+
         /* Check how many parents we skipped. */
         if(num_skipped > 0)
         {
@@ -2631,6 +2653,8 @@ for(i = 0; i < count; i++)
     /* Loop over the used resources */
     for(j = 0; j < t->nr_uses; j++)
     {
+        if(s->res[getindex(t->uses[j],s)].node == s->rank)
+            continue;
         /* Find the parents that unlock this resource. */
         num_current = 0;
         int num_skipped = 0;
@@ -2721,7 +2745,6 @@ for(i = 0; i < count; i++)
             continue;
         }/* If no current parents.*/
 
-
         /* Check how many parents we skipped. */
         if(num_skipped > 0)
         {
@@ -2849,12 +2872,20 @@ for(i = 0; i < count; i++)
 
     free(parents);
     free(current_parents);
+    toc = getticks();
+    message("qsched_partition_create_sends_creation took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
+    message("tsched_count = %i", ts.count);
+
+    tic = getticks();
     /* All the nodes have created new tasks, we need to synchronize this across
        nodes now.*/
     tsched_synchronize( &ts, s);
 
     /* Once we have synchronized, we need to integrate the temporary scheduler
        into quicksched's scheduler.
     */
     qsched_integrate( s, &ts);
+    toc = getticks();
+    message("qsched_partition_create_sends_synchronize took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
+
     tsched_free( &ts) ;
 }
@@ -3293,7 +3324,7 @@ void qsched_prepare_mpi( struct qsched *s){
     struct task *t, *tasks;
 
     TIMER_TIC
-// ticks tic, toc;
+    // ticks tic, toc;
 
     /* Lock the sched. */
     lock_lock( &s->lock );
@@ -3325,11 +3356,11 @@ void qsched_prepare_mpi( struct qsched *s){
 //    count = s->res_ranks[s->count_ranks];
         /* Do the sorts in parallel, if possible. */
 //        tic = getticks();
-        #pragma omp parallel
+//        //#pragma ompparallel
         {
 
             /* Sort the unlocks. */
-            #pragma omp single nowait
+//            //#pragma ompsingle nowait
             qsched_quicksort( s->deps , s->deps_key , s->count_deps , 0 , 0xFFFFFFFFFFFFFFF);
             /*for(k = 1; k < s->count_deps; k++)
             {
@@ -3340,17 +3371,17 @@ void qsched_prepare_mpi( struct qsched *s){
             }
             }*/
             /* Sort the locks. */
-            #pragma omp single nowait
+            // //#pragma ompsingle nowait
             qsched_quicksort( s->locks , s->locks_key , s->count_locks , 0 , 0xFFFFFFFFFFFFFFF);
 
             /* Sort the uses. */
-            #pragma omp single nowait
+            // //#pragma ompsingle nowait
             qsched_quicksort( s->uses , s->uses_key , s->count_uses , 0 , 0xFFFFFFFFFFFFFFF );
 
-            #pragma omp single nowait
+            // //#pragma ompsingle nowait
             qsched_quicksort( s->users, s->users_key, s->count_users, 0, 0xFFFFFFFFFFFFFFF);
 
-            #pragma omp single nowait
+//            //#pragma ompsingle nowait
             qsched_quicksort( s->lockers, s->lockers_key, s->count_lockers, 0, 0xFFFFFFFFFFFFFFF);
         }
@@ -3375,10 +3406,10 @@ void qsched_prepare_mpi( struct qsched *s){
 //        toc = getticks();
 //        message("Cleaning up scheduler took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
     }
-    tic = getticks();
+//    tic = getticks();
     qsched_partition(s);
-    toc = getticks();
-    message("qsched_partition took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
+//    toc = getticks();
+//    message("qsched_partition took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
 
     long long int *tid = malloc(sizeof(long long int) * s->task_ranks[s->count_ranks]);
     if(tid == NULL)
@@ -3700,7 +3731,12 @@ printf("Rank[%i]: qsched_prepare_mpi took %lli (= %.3fms) ticks\n", s->rank,
             error( "Failed to create pthread." );
 #if defined( HAVE_LIBNUMA)
         CPU_ZERO(&cpuset);
-        CPU_SET(cpuid[tid], &cpuset);
+#define SHMEM
+    #ifdef SHMEM
+        CPU_SET(cpuid[tid+s->rank], &cpuset);
+    #else
+        CPU_SET(cpuid[tid], &cpuset);
+    #endif
         if ( pthread_setaffinity_np(s->runners[tid].thread, sizeof(cpu_set_t), &cpuset ) != 0){
             error("Failed to set thread affinity.");
         }
@@ -3722,12 +3758,24 @@ printf("Rank[%i]: qsched_prepare_mpi took %lli (= %.3fms) ticks\n", s->rank,
 
     toc = getticks();
-    printf("Rank[%i]: Execution took %lli (= %e) ticks\n", s->rank,
-           toc - tic, (float)(toc - tic));
+/*    printf("Rank[%i]: Execution took %lli (= %e) ticks\n", s->rank,
+           toc - tic, (float)(toc - tic));*/
 //TIMINGS BASED ON COSMA4!
     message("Execution took %.3f milliseconds.", (toc-tic)/2.67e6);
 MPI_Barrier(s->comm);
-
+
+    float max = (float)(toc-tic)/2.67e6;
+    float total = (float)(toc-tic)/2.67e6;
+
+    MPI_Allreduce(MPI_IN_PLACE, &max, 1, MPI_FLOAT, MPI_MAX, s->comm);
+    MPI_Allreduce(MPI_IN_PLACE, &total, 1, MPI_FLOAT, MPI_SUM, s->comm);
+
+    if(s->rank == 0)
+    {
+        message("Max runtime = %fms", max);
+        message("Total runtime = %fms", total*nr_threads);
+    }
+
 #else
 #if defined( HAVE_OPENMP )
 MPI_Barrier(s->comm);
@@ -3737,7 +3785,7 @@ MPI_Barrier(s->comm);
 message("Beginning execution of quicksched");
     tic = getticks();
     /* Parallel loop.
     */
-    #pragma omp parallel num_threads( nr_threads )
+    //#pragma ompparallel num_threads( nr_threads )
     {
         /* Local variable. */
         struct task *t;
@@ -3802,7 +3850,7 @@ void qsched_run_openmp ( struct qsched *s , int nr_threads , qsched_funtype fun
     qsched_prepare( s );
 
     /* Parallel loop. */
-    #pragma omp parallel num_threads( nr_threads )
+    //#pragma ompparallel num_threads( nr_threads )
     {
         /* Local variable. */
         struct task *t;
@@ -4577,13 +4625,13 @@ void qsched_quicksort (long long int *restrict data ,long long int *restrict ind
 
         /* Recurse on the left? */
         if ( j > 0 && pivot > min ) {
-            #pragma omp task untied
+//            //#pragma omptask untied
             qsched_quicksort( data , ind , j+1 , min , pivot );
         }
 
         /* Recurse on the right? */
         if ( i < N && pivot+1 < max ) {
-            #pragma omp task untied
+//            //#pragma omptask untied
             qsched_quicksort( &data[i], &ind[i], N-i , pivot+1 , max );
         }
@@ -4628,19 +4676,19 @@ void qsched_prepare ( struct qsched *s ) {
     if ( s->flags & qsched_flag_dirty ) {
 
         /* Do the sorts in parallel, if possible. */
-        #pragma omp parallel
+        //#pragma ompparallel
         {
 
             /* Sort the unlocks. */
-            #pragma omp single nowait
+            //#pragma ompsingle nowait
             qsched_sort( s->deps , s->deps_key , s->count_deps , 0 , count - 1 );
 
             /* Sort the locks. */
-            #pragma omp single nowait
+            //#pragma ompsingle nowait
             qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 );
 
             /* Sort the uses. */
-            #pragma omp single nowait
+            //#pragma ompsingle nowait
             qsched_sort( s->uses , s->uses_key , s->count_uses , 0 , count - 1 );
 
        }
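
Note on the runtime figures added to qsched_run_MPI above: the patch converts the local tick count to milliseconds with a hard-coded 2.67e6 ticks/ms (a 2.67 GHz clock, per the COSMA4 comment), then reduces the per-rank value with MPI_MAX (slowest rank) and MPI_SUM (aggregate) and prints on rank 0; it also scales the summed time by nr_threads, presumably to approximate core-time. The sketch below is illustrative only: it shows the same reduction pattern in a self-contained program, but uses MPI_Wtime() instead of getticks() so no machine-specific tick rate is assumed, and it reports the plain sum.

    /* Minimal sketch of the max/total runtime reduction used in the patch.
     * Assumption: MPI_Wtime() replaces getticks()/2.67e6, so results are in
     * seconds and do not depend on the CPU's tick rate. */
    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char *argv[]) {
        int rank;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        double tic = MPI_Wtime();
        /* ... rank-local work goes here ... */
        double elapsed = MPI_Wtime() - tic;

        /* In-place reductions: max = slowest rank, total = sum over ranks. */
        double max = elapsed, total = elapsed;
        MPI_Allreduce(MPI_IN_PLACE, &max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
        MPI_Allreduce(MPI_IN_PLACE, &total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

        if (rank == 0) {
            printf("Max runtime   = %f s\n", max);
            printf("Total runtime = %f s\n", total);
        }

        MPI_Finalize();
        return 0;
    }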