From e11e626623eb338ea30627178ff3bf9746a180d0 Mon Sep 17 00:00:00 2001
From: d74ksy <aidan.chalk@durham.ac.uk>
Date: Thu, 5 Nov 2015 10:29:41 +0000
Subject: [PATCH] Attempted to fix some bugs in send task creation. Still
 problems when running, so attempting a run on cosma 4

---
 examples/test_bh_mpi.c |  45 ++++++++++------
 examples/test_qr_mpi.c |  56 ++++++++++++++++---
 src/qsched.c           | 119 ++++++++++++++++++++++++++---------------
 3 files changed, 155 insertions(+), 65 deletions(-)

diff --git a/examples/test_bh_mpi.c b/examples/test_bh_mpi.c
index deef473..d4e5fda 100644
--- a/examples/test_bh_mpi.c
+++ b/examples/test_bh_mpi.c
@@ -45,7 +45,7 @@
 #define dist_min 0.5 /* Used for legacy walk only */
 #define dist_cutoff_ratio 1.5
-#define ICHECK -1
+#define ICHECK 9501
 #define SANITY_CHECKS
 #define NO_COM_AS_TASK
 #define NO_COUNTERS
@@ -350,15 +350,21 @@ static inline void iact_pair_direct(struct qsched *s, struct cell *ci, struct ce
       ++count_direct_unsorted;
 #endif

-#if ICHECK >= 0 && 0
+#if ICHECK >= 0
       if (parts_i[i].id == ICHECK)
-        printf(
-            "[NEW] Interaction with particle id= %d (pair i) a=( %e %e %e )\n",
-            parts_j[j].id, -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]);
-
-      if (parts_j[j].id == ICHECK)
-        printf(
-            "[NEW] Interaction with particle id= %d (pair j) a=( %e %e %e )\n",
+        /*message(
+            "[NEW] Interaction with particle id= %d (pair i) a=( %e %e %e )",
+            cj->parts[j].id, -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]);*/
+        message(
+            "%d %e %e %e",
+            cj->parts[j].id, -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]);
+
+      if (cj->parts[j].id == ICHECK)
+        /*message(
+            "[NEW] Interaction with particle id= %d (pair j) a=( %e %e %e )",
+            parts_i[i].id, mj * w * dx[0], mj * w * dx[1], mj * w * dx[2]);*/
+        message(
+            "%d %e %e %e",
             parts_i[i].id, mj * w * dx[0], mj * w * dx[1], mj * w * dx[2]);

 #endif
@@ -440,12 +446,16 @@ void iact_self_direct(struct qsched *s, struct cell *c) {

 #if ICHECK >= 0
       if (c->parts[i].id == ICHECK)
-        message("[NEW] Interaction with particle id= %d (self i)",
-                c->parts[j].id);
+/*        message("[NEW] Interaction with particle id= %d (self i) a=( %e %e %e )",
+                c->parts[j].id, -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]);*/
+        message("%d %e %e %e",
+                c->parts[j].id, -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]);

       if (c->parts[j].id == ICHECK)
-        message("[NEW] Interaction with particle id= %d (self j)",
-                c->parts[i].id);
+/*        message("[NEW] Interaction with particle id= %d (self j) a =( %e %e %e )",
+                c->parts[i].id, -mj * w * dx[0], -mj * w * dx[1], -mj * w * dx[2]);*/
+        message("%d %e %e %e",
+                c->parts[i].id, -mj * w * dx[0], -mj * w * dx[1], -mj * w * dx[2]);
 #endif
     } /* loop over every other particle.
         */
@@ -1090,6 +1100,10 @@ void output_parts(struct qsched *s)
                 struct part cur = parts[j];
                 fprintf(file, "%i %e %e %e %e %e %e %e %e %e %e %e %e %e\n", cur.id, cur.mass, cur.x[0], cur.x[1], cur.x[2], cur.a_exact[0], cur.a_exact[1], cur.a_exact[2], cur.a_legacy[0], cur.a_legacy[1], cur.a_legacy[2], cur.a[0], cur.a[1], cur.a[2]);
+                #if ICHECK >= 0
+                if(cur.id == ICHECK)
+                    message("Final accelerations = %e %e %e", cur.a[0], cur.a[1], cur.a[2]);
+                #endif
             }
         }
 }
@@ -1269,9 +1283,10 @@ qsched_run_MPI(&s, nr_threads, runner);
 if(s.rank == 0)
 {
     file = fopen("particle_dump.out", "w");
-    fprintf(file,
+    /*fprintf(file,
             "ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
-            "a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");
+            "a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
+// fprintf(file, "");
     fclose(file);
 }

diff --git a/examples/test_qr_mpi.c b/examples/test_qr_mpi.c
index 0a75ef7..3dcddff 100644
--- a/examples/test_qr_mpi.c
+++ b/examples/test_qr_mpi.c
@@ -480,6 +480,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
 //  ticks tic, toc_run, tot_setup, tot_run = 0;
 #ifdef TASK_TIMERS
   ticks tic, toc_run;
+  ticks start;
 #endif
 char processor_name[MPI_MAX_PROCESSOR_NAME];
 int name_len;
@@ -488,6 +489,9 @@ char processor_name[MPI_MAX_PROCESSOR_NAME];
 long long int *task_finish;
 int *task_tids;
 int *task_types;
+int *task_i;
+int *task_j;
+int *task_k;
 #endif

 // Initialize the MPI environment
@@ -593,10 +597,31 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
        printf("Ran DSSRFT with k = %lli, j = %lli, i = %lli\n", kk, jj, ii);
    }*/
     toc_run = getticks();
-    task_start[tid] = tic;
-    task_finish[tid] = toc_run;
+    message("tid = %lli", tid);
+    message("sizeof arrays = %i", m*n*m*n);
+    task_start[tid] = tic - start;
+    task_finish[tid] = toc_run - start;
     task_tids[tid] = omp_get_thread_num();
     task_types[tid] = type;
+    if(type == task_DGEQRF){
+        task_i[tid] = (int)idata[2];
+        task_j[tid] = (int)idata[2];
+        task_k[tid] = (int)idata[2];
+    }else if (type == task_DLARFT)
+    {
+        task_i[tid] = (int)idata[3];
+        task_j[tid] = (int)idata[4];
+        task_k[tid] = (int)idata[3];
+    }else if(type == task_DTSQRF)
+    {
+        task_i[tid] = (int)idata[4];
+        task_j[tid] = (int)idata[3];
+        task_k[tid] = (int)idata[3];
+    }else{
+        task_i[tid] = (int)idata[5];
+        task_j[tid] = (int)idata[6];
+        task_k[tid] = (int)idata[4];
+    }
 }
 #endif
 }
@@ -604,11 +629,14 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
 qsched_init(&s, 2, qsched_flag_none, MPI_COMM_WORLD);

 #ifdef TASK_TIMERS
-task_start = (long long int*)calloc(sizeof(long long int), m*n*K*K);
-printf("Created task_start of size %i", m*n*K*K);
-task_finish = (long long int*)calloc(sizeof(long long int), m*n*K*K);
-task_tids = (int*)calloc(sizeof(int), m*n*K*K);
-task_types = (int*)calloc(sizeof(int), m*n*K*K);
+task_start = (long long int*)calloc(sizeof(long long int), m*n*m*n);
+printf("Created task_start of size %i", m*n*m*n);
+task_finish = (long long int*)calloc(sizeof(long long int), m*n*m*n);
+task_tids = (int*)calloc(sizeof(int), m*n*m*n);
+task_types = (int*)calloc(sizeof(int), m*n*m*n);
+task_i = (int*)calloc(sizeof(int), m*n*m*n);
+task_j = (int*)calloc(sizeof(int), m*n*m*n);
+task_k = (int*)calloc(sizeof(int), m*n*m*n);
 #endif

@@ -671,6 +699,10 @@ if(s.rank == 0) {
 #endif
             qsched_addlock(&s, tid_new, rid[k * m + k]);
             qsched_addlock(&s, tid_new, tau_id[k*m+k]);
+            if(k == 10){
+                message("Task ID of task (10, 10, 10) = %lli", tid_new);
+                message("Resource ID of cell (10,10) = %lli", rid[k*m+k]);
+            }
             if(k == 0)
             {
                 memcpy(rids[k*m+k],
                        &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
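A note on the TASK_TIMERS changes above: each task's start/finish stamps are now stored as offsets from a per-rank origin taken right after an MPI barrier (see the hunk at @@ -774 below), so the timelines collected on different ranks roughly share a zero point, and the timer arrays are sized by m*n*m*n, presumably so that any task id used as an index stays in bounds. The following is a minimal sketch of that timing pattern only, not the code in this patch; `ticks`/`getticks()` are assumed to be the cycle-counter helpers the benchmark already uses, and every other name is illustrative.

#include <mpi.h>

/* Sketch: take a per-rank time origin right after a barrier, then record
 * each task's begin/end stamps as offsets from that origin so that the
 * per-rank timelines can be overlaid later. */
static ticks run_origin;

static void timing_begin_run(MPI_Comm comm) {
  MPI_Barrier(comm);        /* every rank passes this point together      */
  run_origin = getticks();  /* per-rank origin, approximately aligned     */
}

static void timing_record(long long int tid, ticks tic, ticks toc,
                          long long int *start, long long int *finish) {
  start[tid] = tic - run_origin;   /* store offsets, not raw tick counts  */
  finish[tid] = toc - run_origin;
}
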
@@ -695,6 +727,10 @@ if(s.rank == 0) {
                 tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, MPI_data,
                                          sizeof(long long int) * 3, 300);
 #endif
+                if(j == 11 && k == 10){
+                    message("Task ID of task (10, 11, 10) = %lli", tid_new);
+                    message("Adding a use to rid = %lli", rid[k*m+k]);
+                }
                 if(k == 0)
                 {
                     memcpy(rids[j*m+k],
                            &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
@@ -774,6 +810,10 @@ if(s.rank == 0) {
     } /* build the tasks. */
 }
+#ifdef TASK_TIMERS
+    MPI_Barrier(s.comm);
+    start = getticks();
+#endif
 qsched_run_MPI(&s, nr_threads, runner);
 // Print off a hello world message
 printf("Hello world from processor %s, rank = %i, count_ranks = %i\n",
@@ -794,7 +834,7 @@ FILE *file = NULL;
 for(i = 0; i < m*n*K*K; i++)
 {
     if(task_types[i] > 0)
-        fprintf(file, "%i %i %lli %lli\n", task_types[i], s.rank*nr_threads+task_tids[i],task_start[i], task_finish[i] );
+        fprintf(file, "%i %i %lli %lli %i %i %i\n", task_types[i], s.rank*nr_threads+task_tids[i],task_start[i], task_finish[i] ,task_i[i], task_j[i], task_k[i] );
 }

diff --git a/src/qsched.c b/src/qsched.c
index 4d97515..a7cffa0 100644
--- a/src/qsched.c
+++ b/src/qsched.c
@@ -1415,6 +1415,19 @@ void qsched_partition_topsort( struct qsched *s, long long int *tid)
 void qsched_partition_compute_waitinit(struct qsched *s, int *wait_init)
 {
     int i;
+    int k, j;
+    struct task *t;
+    int count = s->task_ranks[s->count_ranks];
+    for( k = 0; k < count; k++) {
+        s->tasks[k].wait = 0;
+    }
+
+    for ( k = 0 ; k < count ; k++ ) {
+        t = &s->tasks[k];
+        // if ( !( t->flags & task_flag_skip ) )
+        for ( j = 0 ; j < t->nr_unlocks ; j++ )
+            s->tasks[ gettaskindex(t->unlocks[j], s) ].wait += 1;
+    }
     for(i = 0; i < s->task_ranks[s->count_ranks]; i++)
     {
         wait_init[i] = s->tasks[i].wait;
@@ -2258,7 +2271,7 @@ if(first_recv == NULL)
     error("Failed to allocate first_send array.");
 for(i = 0; i < s->res_ranks[s->count_ranks]; i++)
 {
-    data_pos[i] = s->res[i].node;
+    data_pos[i] = -1;
     first_recv[i] = -1;
     data_task[i] = -1;
 }
@@ -2288,6 +2301,7 @@ for(i = 0; i < count; i++)
 {
     struct task *t = &s->tasks[gettaskindex(tid[i], s)];

+    /* If we don't execute this task locally then we can skip it!*/
     if( t->flags & task_flag_skip )
         continue;

@@ -2366,7 +2380,6 @@ for(i = 0; i < count; i++)
     } /* If the task has no dependencies. */

     /* If the task has dependencies*/
-    /*Find the parents. */
     num_parents = 0;
     for(j = 0; j < i && num_parents < wait_init[gettaskindex(t->id, s)]; j++)
@@ -2600,7 +2613,7 @@ for(i = 0; i < count; i++)
     }/* j over locks. */

-/* Loop over the locked resources */
+/* Loop over the used resources */
     for(j = 0; j < t->nr_uses; j++)
     {
         /* Find the parents that unlock this resource. */
@@ -2624,7 +2637,8 @@ for(i = 0; i < count; i++)
             }
         }
     }/* k over parents. */
-
+/*    if( t->uses[j] == 340)
+        message("We found the use! From task %lli debugf = %i", tid[i], debugf);*/
 /* We have the current parents.
    */
 if(num_current == 0)
 {
@@ -2730,14 +2744,36 @@ for(i = 0; i < count; i++)
 } /* All the parents of this task are completed before the send task can occur also!*/
                 int found = 0;
+/*                message("data_pos for this resources is %i",data_pos[getindex(t->uses[j], s)]);
+                message("Owner of the data is %i", s->res[getindex(t->uses[j], s)].node);
+                message("Trying to reach ts.tasks[%lli]", (send_task[getindex(t->uses[j], s)] & 0xFFFFFFFFF) - ts.id_count);
+                message("send_task = %lli", send_task[getindex(t->uses[j], s)]);
+                message("send_task & 0xFFFFFFFFF = %lli", send_task[getindex(t->uses[j], s)] & 0xFFFFFFFFF);
+                message("j=%i, i = %i, k = %i", j, i, k);*/
                 struct task *temp = &ts.tasks[(send_task[getindex(t->uses[j], s)] & 0xFFFFFFFFF) - ts.id_count];
-                for(l = 0; l < temp->nr_unlocks; l++)
+//                for(l = 0; l < temp->nr_unlocks; l++)
+                for(l = 0; l < ts.count_unlockers; l++)
                 {
-                    if(temp->unlocks[l] == tid[current_parents[k]] && ts.deps_key[l] == t->uses[j])
+                    /*if(s.rank == 0 && i == 3 && j == 0 && k == 1)
+                    {
+                        message("temp->unlocks[l] = %i", temp->unlocks[l]);
+                        message("tid[current_parents[k]] = %i", tid[current_parents[k]]);
+
+                    }*/
+
+                    //TODO What are we actually doing here?.
+                    //TODO We want to look through ALL of ts.unlocks (as I don't think they have been sorted yet - maybe we're lucky and they're in order but i'm not sure thats the case).
+                    //TODO We want to find if the current_parent unlocks this task.
+                    if(ts.unlockers_key[l] == temp->id && ts.unlockers[l] == tid[current_parents[k]])
                     {
                         found = 1;
                         break;
-                    }
+                    }
+/*                  if(temp->unlocks[l] == tid[current_parents[k]] && ts.deps_key[l] == t->uses[j])
+                    {
+                        found = 1;
+                        break;
+                    } */
                 }
                 if(!found)
                 {
@@ -2767,6 +2803,8 @@ for(i = 0; i < count; i++)
 free(parents);
 free(current_parents);

+message("ts->count = %i", ts.count);
+message("ts.count_unlockers = %i", ts.count_unlockers);
 /* All the nodes have created new tasks, we need to synchronize this across nodes now.*/
 tsched_synchronize( &ts, s);

@@ -2783,15 +2821,15 @@ void qsched_partition( struct qsched *s){
     int i, j;
     struct task *t;
     int errors;
-    ticks tic, toc;
+//    ticks tic, toc;
 //    struct res *r;
     res_costs = (idx_t*) calloc(s->res_ranks[s->count_ranks], sizeof(idx_t));

-    tic = getticks();
+//    tic = getticks();
     qsched_partition_compute_costs( s, res_costs);
-    toc = getticks();
-    message("qsched_partition_compute_costs took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+//    toc = getticks();
+//    message("qsched_partition_compute_costs took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
     //All reduce res_costs;
 #if IDXTYPEWIDTH == 32
@@ -2826,10 +2864,10 @@ void qsched_partition( struct qsched *s){
     for(i = 0; i < s->res_ranks[s->count_ranks]; i++)
         pos_in_nodelist[i] = -1;

-    tic = getticks();
+//    tic = getticks();
     qsched_partition_build_nodelist(s, nodelist, noderef, &node_count, res_costs, pos_in_nodelist);
-    toc = getticks();
-    message("qsched_partition_build_nodelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+//    toc = getticks();
+    // message("qsched_partition_build_nodelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
 //Build an edgelist where edges are of weight += task->weight for each task that locks both. If noderef doesn't contain, recurse until we find the ones it does contain (yuck). Build a set of adjacency lists.
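The long comment above describes the graph that qsched_partition() hands to METIS in the hunks that follow: resources become nodes weighted by their cost, and each task that touches two resources adds its weight to the edge between them. As a rough, self-contained illustration of the METIS side of that step only (this is not the patch's code; adj_lists, adj_wgts and the other names are placeholders, and error handling is kept minimal), the per-node adjacency lists are flattened into CSR arrays and passed to METIS_PartGraphKway with the same argument shape used in the call below:

#include <stdlib.h>
#include <metis.h>

/* Sketch: flatten per-node adjacency lists into the CSR form
 * (xadj/adjncy/adjwgt) that METIS expects, then partition the nodes
 * over nparts ranks.  All names here are illustrative. */
int partition_nodes(idx_t node_count, idx_t nparts,
                    idx_t **adj_lists, idx_t **adj_wgts, idx_t *adj_counts,
                    idx_t *node_wgts, idx_t *node_to_rank) {
  idx_t i, j, nedges = 0;
  for (i = 0; i < node_count; i++) nedges += adj_counts[i];

  idx_t *xadj = malloc(sizeof(idx_t) * (node_count + 1));
  idx_t *adjncy = malloc(sizeof(idx_t) * nedges);
  idx_t *adjwgt = malloc(sizeof(idx_t) * nedges);
  if (xadj == NULL || adjncy == NULL || adjwgt == NULL) return -1;

  /* xadj[i] marks where node i's edges start in adjncy/adjwgt. */
  xadj[0] = 0;
  for (i = 0; i < node_count; i++) {
    for (j = 0; j < adj_counts[i]; j++) {
      adjncy[xadj[i] + j] = adj_lists[i][j];
      adjwgt[xadj[i] + j] = adj_wgts[i][j];
    }
    xadj[i + 1] = xadj[i] + adj_counts[i];
  }

  idx_t ncon = 1, objval = 0;
  idx_t options[METIS_NOPTIONS];
  METIS_SetDefaultOptions(options);

  /* Same argument shape as the qsched_partition() call: node weights,
   * no vertex sizes, edge weights, default balance constraints. */
  int ret = METIS_PartGraphKway(&node_count, &ncon, xadj, adjncy, node_wgts,
                                NULL, adjwgt, &nparts, NULL, NULL, options,
                                &objval, node_to_rank);

  free(xadj); free(adjncy); free(adjwgt);
  return ret == METIS_OK ? 0 : -1;
}

METIS writes one target rank per node into node_to_rank, which is what the nodeIDs array receives in the real call further down.
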
@@ -2859,17 +2897,15 @@ for(i = 0; i < node_count; i++)
         edge_sizes[i] = initial_size;
     }

-    tic = getticks();
+//    tic = getticks();
     qsched_partition_build_edgelist(s, edge_vwgts, edge_lists, edge_counts, edge_sizes, node_count, noderef, pos_in_nodelist);
-    toc = getticks();
-    message("qsched_partition_build_edgelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+//    toc = getticks();
+//    message("qsched_partition_build_edgelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
     idx_t edgelist_size = 0;
     for(i = 0; i < node_count; i++)
         edgelist_size += edge_counts[i];
-printf("edgelist_size = %i\n", edgelist_size);
-printf("node_count = %i\n", node_count);
 /* Convert the edgelists to METIS version.*/
     idx_t *edgelist_new = malloc(sizeof(idx_t) * edgelist_size);
     if(edgelist_new == NULL)
@@ -2981,13 +3017,13 @@ for(i = 0; i < node_count; i++)
     if(nodeIDs == NULL)
         error("Failed to allocate nodeIDs");
     idx_t temp_count_ranks = s->count_ranks;
-    tic = getticks();
+//    tic = getticks();
     if(s->count_ranks > 1) {
         if( METIS_PartGraphKway(&node_count, &one, edgelist_pos, edgelist_new, nodelist, NULL, edgelist_vwgt, &temp_count_ranks, NULL, NULL,options, objval, nodeIDs) != METIS_OK)
             error("Failed to partition\n");
     }
-    toc = getticks();
-    message("METIS_PartGraphKway took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+//    toc = getticks();
+//    message("METIS_PartGraphKway took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
     //TODO Check the costs.
     int count_me = 0;
     for(i = 0; i < node_count; i++)
@@ -2999,7 +3035,7 @@ for(i = 0; i < node_count; i++)

-    tic = getticks();
+//    tic = getticks();
     MPI_Request *reqs;
     reqs = (MPI_Request*) calloc(sizeof(MPI_Request) , node_count * 2);
     for(i = 0; i < node_count * 2; i++)
@@ -3108,14 +3144,14 @@ for(i = 0; i < node_count; i++)
         if(temp->node == s->rank && temp->data == NULL)
             error("Local resource has data set to NULL");
     }
-    toc = getticks();
-    message("qsched_partition synchronizing resources took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+//    toc = getticks();
+//    message("qsched_partition synchronizing resources took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
 // Move the tasks time!
 // Tasks belong to the node of the resource they lock of highest size.
 // If they don't lock any resources, the resources they use of highest size.
 // Everyone does it for all tasks at the moment...
 // TODO ISSUE: Whoever is assigned bigger resources will run more tasks - not balanced well. Minimises communication, less relevant.
-    tic = getticks();
+//    tic = getticks();
     for(i = 0; i < s->task_ranks[s->count_ranks]; i++)
     {
         struct task *t = &s->tasks[i];
@@ -3149,8 +3185,8 @@ for(i = 0; i < node_count; i++)
     }

-    toc = getticks();
-    message("qsched_partition task \"movement\" took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+//    toc = getticks();
+//    message("qsched_partition task \"movement\" took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
     //First we need to do a topological sort.
     int count = s->task_ranks[s->count_ranks];
@@ -3178,10 +3214,10 @@ for(i = 0; i < node_count; i++)
     }

 //Now we just need to create the send/recv tasks from the dependencies.
-    tic = getticks();
+//    tic = getticks();
     qsched_partition_create_sends( s, tid);
-    toc = getticks();
-    message("qsched_partition_create_sends took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+//    toc = getticks();
+//    message("qsched_partition_create_sends took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));

     free(edge_lists);
     free(edge_sizes);
@@ -3206,7 +3242,7 @@ void qsched_prepare_mpi( struct qsched *s){
     struct task *t, *tasks;

     TIMER_TIC
-    ticks tic, toc;
+//    ticks tic, toc;

     /* Lock the sched. */
     lock_lock( &s->lock );
@@ -3229,15 +3265,15 @@ void qsched_prepare_mpi( struct qsched *s){
     /* If the sched is dirty... */
     if ( s->flags & qsched_flag_dirty ) {
-        tic = getticks();
+//        tic = getticks();
         qsched_sync_schedulers( s );
-        toc = getticks();
-        message("qsched_sync_schedulers took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+//        toc = getticks();
+//        message("qsched_sync_schedulers took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
         tasks = s->tasks;
         count = s->count;
 //        count = s->res_ranks[s->count_ranks];
         /* Do the sorts in parallel, if possible. */
-        tic = getticks();
+//        tic = getticks();
 #pragma omp parallel
     {
@@ -3285,14 +3321,13 @@ void qsched_prepare_mpi( struct qsched *s){
         /* All cleaned-up now! */
         s->flags &= ~qsched_flag_dirty;
-        toc = getticks();
-        message("Cleaning up scheduler took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+//        toc = getticks();
+        // message("Cleaning up scheduler took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
         }

-    tic = getticks();
+//    tic = getticks();
     qsched_partition(s);
-    toc = getticks();
-    message("qsched_partition took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
-    //return;
+//    toc = getticks();
+    // message("qsched_partition took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));

     long long int *tid = malloc(sizeof(long long int) * s->task_ranks[s->count_ranks]);
     if(tid == NULL)
-- 
GitLab
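A closing note on the dependency check this patch changes in the hunk at @@ -2730 of src/qsched.c: rather than walking the new send task's own unlocks array, the code now scans the temporary scheduler's full list of (unlocked task, unlocking task) pairs — the TODO notes that this list has not necessarily been sorted yet — to decide whether the current parent already unlocks the send task, and records the answer in `found`. A minimal sketch of that membership test, with hypothetical names mirroring ts.unlockers_key/ts.unlockers (this is an illustration, not the function in the patch):

/* Sketch only: linear scan over the (unlocked-task, unlocking-task) pairs
 * recorded by the temporary scheduler.  Returns 1 if `parent` already
 * unlocks `task`, 0 otherwise. */
static int parent_already_unlocks(const long long int *unlockers_key,
                                  const long long int *unlockers,
                                  int count_unlockers,
                                  long long int task, long long int parent) {
  int l;
  for (l = 0; l < count_unlockers; l++)
    if (unlockers_key[l] == task && unlockers[l] == parent) return 1;
  return 0;
}

If the pair list were kept sorted by key, this linear scan could be replaced by a binary search over unlockers_key, which may become worthwhile once the number of send tasks grows.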