diff --git a/src/qsched.c b/src/qsched.c index 9f47dafb5d06ef8b7cce9508ec38d3d270a3104e..25a92611b173898be735b7d47cb649e9799c5c8a 100644 --- a/src/qsched.c +++ b/src/qsched.c @@ -865,7 +865,7 @@ void qsched_partition_compute_costs( struct qsched *s, idx_t *res_costs) } } -void qsched_partition_build_nodelist(struct qsched *s, idx_t *nodelist, idx_t *noderef, idx_t *node_count_input, idx_t *res_costs, idx_t *pos_in_nodelist) +void qsched_partition_build_nodelist(struct qsched *s, idx_t *nodelist, long long int *noderef, idx_t *node_count_input, idx_t *res_costs, idx_t *pos_in_nodelist) { idx_t node_count = *node_count_input; struct res *r; @@ -936,7 +936,7 @@ void qsched_partition_build_nodelist(struct qsched *s, idx_t *nodelist, idx_t *n } void qsched_partition_build_edgelist(struct qsched *s, idx_t **edge_vwgts, idx_t **edge_lists, idx_t *edge_counts, idx_t *edge_sizes, - idx_t node_count, idx_t *noderef, idx_t *pos_in_nodelist ) + idx_t node_count, long long int *noderef, idx_t *pos_in_nodelist ) { int i, j, k, l; struct task *t; @@ -2341,6 +2341,7 @@ for(i = 0; i < count; i++) task_data[3] = (int)(s->res[getindex(t->uses[j],s)].ID & 0xFFFFFFFF); task_data[4] = sends_added; + /* Create the send task. */ send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->uses[j],s)].size ); /* The send task needs to lock the resource. */ @@ -2473,6 +2474,7 @@ for(i = 0; i < count; i++) task_data[3] = (int)(s->res[getindex(t->locks[j],s)].ID & 0xFFFFFFFF); task_data[4] = sends_added; + /* Create the send task. */ send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->locks[j],s)].size ); /* The send task needs to lock the resource. */ @@ -2505,7 +2507,8 @@ for(i = 0; i < count; i++) task_data[2] = (int)(s->res[getindex(t->locks[j],s)].ID >> 32); task_data[3] = (int)(s->res[getindex(t->locks[j],s)].ID & 0xFFFFFFFF); task_data[4] = sends_added; - + + /* Create the send task. */ send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->locks[j],s)].size ); /* The send task needs to lock the resource. */ @@ -2634,6 +2637,7 @@ for(i = 0; i < count; i++) task_data[3] = (int)(s->res[getindex(t->uses[j],s)].ID & 0xFFFFFFFF); task_data[4] = sends_added; + /* Create the send task. */ send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->uses[j],s)].size ); /* The send task needs to lock the resource. */ @@ -2807,12 +2811,12 @@ void qsched_partition( struct qsched *s){ //Build a nodelist of the highest level hierarchical resources which are locked or used. idx_t *nodelist; idx_t node_count=0; - idx_t *noderef; + long long int *noderef; idx_t *pos_in_nodelist; nodelist = (idx_t *) calloc( s->res_ranks[s->count_ranks], sizeof(idx_t) ); if(nodelist == NULL) error("Failed to allocate nodelist"); - noderef = (idx_t *) calloc(s->res_ranks[s->count_ranks], sizeof(idx_t) ); + noderef = (long long int *) calloc(s->res_ranks[s->count_ranks], sizeof(long long int) ); if(noderef == NULL) error("Failed to allocate noderef"); pos_in_nodelist = (idx_t *) malloc(s->res_ranks[s->count_ranks] * sizeof(idx_t) ); @@ -2896,7 +2900,7 @@ for(i = 0; i < node_count; i++) message("qsched_partition_build_edgelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); #endif - free(pos_in_nodelist); + // free(pos_in_nodelist); //This might work but for safety I've made all nodes compute the edgelist anyway. /*#if IDXTYPEWIDTH == 32 @@ -2972,14 +2976,15 @@ for(i = 0; i < node_count; i++) objval[1] = 0; objval[2] = 0; idx_t *nodeIDs; - nodeIDs = (idx_t *)malloc( sizeof(idx_t) * node_count ); + nodeIDs = (idx_t *)calloc( node_count, sizeof(idx_t) ); if(nodeIDs == NULL) error("Failed to allocate nodeIDs"); idx_t temp_count_ranks = s->count_ranks; tic = getticks(); - if( METIS_PartGraphKway(&node_count, &one, edgelist_pos, edgelist_new, nodelist, NULL, edgelist_vwgt, &temp_count_ranks, NULL, NULL,options, objval, nodeIDs) != METIS_OK) - error("Failed to partition\n"); - + if(s->count_ranks > 1) { + if( METIS_PartGraphKway(&node_count, &one, edgelist_pos, edgelist_new, nodelist, NULL, edgelist_vwgt, &temp_count_ranks, NULL, NULL,options, objval, nodeIDs) != METIS_OK) + error("Failed to partition\n"); + } toc = getticks(); message("METIS_PartGraphKway took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic)); //TODO Check the costs. @@ -3022,8 +3027,8 @@ for(i = 0; i < node_count; i++) error("A local resource has no data associated with it."); MPI_Isend(s->res[noderef[i]].data, s->res[noderef[i]].size, MPI_BYTE, nodeIDs[i], i, s->comm, &reqs[reqnr] ); reqnr++; - s->res[noderef[i]].node = nodeIDs[i]; } + s->res[noderef[i]].node = nodeIDs[i]; } } @@ -3050,8 +3055,8 @@ for(i = 0; i < node_count; i++) for(j = 0; j < s->res_ranks[s->count_ranks]; j++) { struct res *temp = &s->res[j]; - if(temp->num_lockers == 0 && temp->num_users == 0) - continue; + // if(temp->num_lockers == 0 && temp->num_users == 0) + // continue; struct res *parent; int offset = temp->offset; while(temp->parent != -1) @@ -3747,6 +3752,18 @@ void qsched_enqueue ( struct qsched *s , struct task *t ) { struct res *resource = &s->res[getindex(resID, s)]; int err; // printf("Sending tag = %i\n", tag); + if(resource->data == NULL) + { + message("resource->parent = %lli", resource->parent); + message("from = %i", from); + message("t->node= %i", t->node); + if(resource->parent != -1) + { + struct res *temp = &s->res[getindex(resource->parent, s)]; + message("parent->node = %i", temp->node); + } + error("We're sending a resource that has data = NULL, resource->ID = %lli and resource-node = %i", resource->ID, resource->node); + } if ((err = MPI_Isend(resource->data, resource->size, MPI_BYTE, to, tag, s->comm, &t->req)) != MPI_SUCCESS) {