Commit 60922b1a authored by aidan

Hopefully fixed the issues with hierarchical task generation

parent 045b696d
@@ -90,7 +90,9 @@ float4 *parts_a_m_host;
 double2 *com_xy_host;
 double *com_z_host;
 float *com_mass_host;
+double2 *parts_pos_xy_temp;
+double *parts_pos_z_temp;
+float4 *parts_a_m_temp;
@@ -113,9 +115,9 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
     /* Loop over cell i.*/
     for(i = parts_i + threadIdx.x; i < parts_i + count_i; i += blockDim.x) {
-        xi[0] = parts_xy[i].x;
-        xi[1] = parts_xy[i].y;
-        xi[2] = parts_z[i];
+        xi[0] = parts_pos_xy[i].x;
+        xi[1] = parts_pos_xy[i].y;
+        xi[2] = parts_pos_z[i];
         for(k = 0; k < 3; k++) {
             ai[k] = 0.0f;
         }
@@ -154,9 +156,9 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
         }
     /* Loop over cell j. */
     for(i = parts_j + threadIdx.x; i < parts_j + count_j; i += blockDim.x) {
-        xi[0] = parts_xy[i].x;
-        xi[1] = parts_xy[i].y;
-        xi[2] = parts_z[i];
+        xi[0] = parts_pos_xy[i].x;
+        xi[1] = parts_pos_xy[i].y;
+        xi[2] = parts_pos_z[i];
         for(k = 0; k < 3; k++) {
             ai[k] = 0.0f;
         }
@@ -290,23 +292,40 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
     struct cell *cp, *cps;
+    if(leaf->split)
+    {
+        printf("Leaf split = 1, oh dear.");
+        asm("trap;");
+    }
+    if(ci->split > 1)
+    {
+        printf("Cell %i had split > 1\n", ci - cells);
+        asm("trap;");
+    }
+    if(cj->split > 1)
+    {
+        printf("cell %i had split > 1\n", cj - cells);
+        asm("trap;");
+    }
     for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
         if(is_inside(leaf, cp)) break;
     }
     if(are_neighbours_different_size(cp, cj)) {
-        for(cps = &cells[cj->firstchild]; cps != &cells[ci->sibling]; cps = &cells[cps->sibling]) {
+        for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
             if(are_neighbours(cp, cps)) {
                 if(cp->split && cps->split) {
                     iact_pair_pc(cp, cps, leaf);
                 }
             } else {
                 make_interact_pc(leaf, cps);
+                __syncthreads();
             }
         }
     }
+    __syncthreads();
 }
/**
@@ -320,6 +339,17 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
     struct cell *cp, *cps;
+    if(leaf->split)
+    {
+        printf("Leaf split = 1, oh dear.");
+        asm("trap;");
+    }
+    if(c->split > 1)
+    {
+        printf("Cell had split > 1\n");
+        asm("trap;");
+    }
     /* Find the subcell of c the leaf is in. */
     for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) {
         if(is_inside(leaf, cp)) break;
@@ -606,23 +636,23 @@ void cell_split(int c, struct qsched *s) {
     {
         if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[c].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
-        printf("tempxy = %p\n", temp_xy);
+        // printf("tempxy = %p\n", temp_xy);
         cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
-                sizeof(double2) * cell_pool[c].count, parts_pos_xy + cell_pool[c].parts);
+                sizeof(double2) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts);
     }
     if(cell_pool[c].resz == qsched_res_none)
     {
         if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[c].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
         cell_pool[c].resz = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_z,
-                sizeof(double) * cell_pool[c].count, parts_pos_z + cell_pool[c].parts);
+                sizeof(double) * cell_pool[c].count, parts_pos_z_temp + cell_pool[c].parts);
     }
     if(cell_pool[c].resm == qsched_res_none)
     {
         if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[c].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
         cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m,
-                sizeof(float4) * cell_pool[c].count, parts_a_m + cell_pool[c].parts);
+                sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts);
     }
     // error("Cell has no resource");*///TODO
@@ -731,15 +761,15 @@ void cell_split(int c, struct qsched *s) {
         if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
         cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy,
-                sizeof(double2) * cell_pool[progenitors[k]].count, parts_pos_xy + cell_pool[progenitors[k]].parts);
+                sizeof(double2) * cell_pool[progenitors[k]].count, parts_pos_xy_temp + cell_pool[progenitors[k]].parts);
         if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
         cell_pool[progenitors[k]].resz = qsched_addres(s, qsched_owner_none, cell->resz, temp_z,
-                sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z + cell_pool[progenitors[k]].parts);
+                sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z_temp + cell_pool[progenitors[k]].parts);
         if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
         cell_pool[progenitors[k]].resm = qsched_addres(s, qsched_owner_none, cell->resm, temp_a_m,
-                sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m + cell_pool[progenitors[k]].parts);
+                sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m_temp + cell_pool[progenitors[k]].parts);
     }
     /* Find the first non-empty progenitor */
@@ -784,6 +814,9 @@ void cell_split(int c, struct qsched *s) {
     int data[2] = {root, c};
     int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
                              2 * sizeof(int), 1);
+    qsched_addlock(s, tid, cell_pool[root].res);
+    qsched_addlock(s, tid, cell_pool[root].resz);
+    qsched_addlock(s, tid, cell_pool[root].resm);
     qsched_addlock(s, tid, cell_pool[c].res);
     qsched_addlock(s, tid, cell_pool[c].resz);
     qsched_addlock(s, tid, cell_pool[c].resm);
@@ -913,9 +946,7 @@ void test_bh(int N, int runs, char *fileName) {
     struct qsched s;
     ticks tic, toc_run, tot_setup = 0, tot_run = 0;
     int countMultipoles = 0, countPairs = 0, countCoMs = 0;
-    double2 *parts_pos_xy_temp;
-    double *parts_pos_z_temp;
-    float4 *parts_a_m_temp;
+    struct cell *gpu_ptr_cells;
     cudaFree(0);
     if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
@@ -934,10 +965,10 @@ void test_bh(int N, int runs, char *fileName) {
     if( cudaMalloc(&parts_pos_xy_temp, sizeof(double2) * N) != cudaSuccess)
         error("Failed to allocate device parts array");
-    printf("parts_pos_xy_temp = %p\n", parts_pos_xy_temp);
+    // printf("parts_pos_xy_temp = %p\n", parts_pos_xy_temp);
     if( cudaMemcpyToSymbol(parts_pos_xy, &parts_pos_xy_temp, sizeof(double2*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
         error("Failed to set device symbol for parts array");
-    printf("parts_pos_xy = %p\n", parts_pos_xy);
+    // printf("parts_pos_xy = %p\n", parts_pos_xy);
     if( cudaMalloc(&parts_pos_z_temp, sizeof(double) * N) != cudaSuccess)
         error("Failed to allocate device parts array");
     if( cudaMemcpyToSymbol(parts_pos_z, &parts_pos_z_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
@@ -1018,6 +1049,10 @@ void test_bh(int N, int runs, char *fileName) {
     message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
     message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
+    for(k = 0; k < num_cells; k++)
+        if(cell_pool[k].split > 1 )
+            printf("Split > 1\n");
     create_tasks(&s, root, NULL);
     message("total number of tasks: %i.", s.count);
@@ -1033,6 +1068,56 @@ void test_bh(int N, int runs, char *fileName) {
         parts_a_m_host[i].z = 0.0;
     }
+    /* Copy the cells to the device. */
+    if( cudaMalloc( &gpu_ptr_cells , sizeof(struct cell) * used_cells) != cudaSuccess)
+        error("Failed to allocate cells on the GPU");
+    if( cudaMemcpy( gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy cells to the GPU");
+    if( cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell*), 0, cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy cell pointer to the GPU");
+    double2 *com_temp;
+    double *comz_temp;
+    float *comm_temp;
+    if(cudaMalloc( &com_temp, sizeof(double2) * used_cells) != cudaSuccess)
+        error("Failed to allocate com on the GPU");
+    if( cudaMemcpy( com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy com to the GPU");
+    if( cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy com pointer to the GPU");
+    if(cudaMalloc( &comz_temp, sizeof(double) * used_cells) != cudaSuccess)
+        error("Failed to allocate com on the GPU");
+    if( cudaMemcpy( comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy com to the GPU");
+    if( cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy com pointer to the GPU");
+    if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
+        error("Failed to allocate com on the GPU");
+    if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy com to the GPU");
+    if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy com pointer to the GPU");
+    /* for(i = 0; i < s.count; i++){
+        int *idata = (int*)&s.data[s.tasks[i].data];
+        if(s.tasks[i].type == task_type_self)
+            printf("Self task with data[0] = %i\n", idata[0]);
+        if(s.tasks[i].type == task_type_pair) {
+            printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
+        }
+        if(s.tasks[i].type == task_type_self_pc) {
+            printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
+        }
+    }*/
     //Run code.
     printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
     qsched_run_CUDA( &s , func );
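A note on the device-pointer pattern used throughout this hunk: a buffer is allocated with cudaMalloc, and its address is then published into a __device__ global via cudaMemcpyToSymbol so kernels can reach it by name. A minimal sketch of that pattern; the symbol name parts_z_dev and the size N are hypothetical stand-ins for the parts_pos_z symbol and particle count above:

#include <cuda_runtime.h>
#include <stdio.h>

__device__ double *parts_z_dev; /* hypothetical device-side global, analogous to parts_pos_z */

int main(void) {
    double *tmp; /* host-side handle to device memory, like parts_pos_z_temp */
    int N = 1024;
    if (cudaMalloc(&tmp, sizeof(double) * N) != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device parts array\n");
        return 1;
    }
    /* Publish the device address into the __device__ symbol so kernels
       can dereference parts_z_dev directly. */
    if (cudaMemcpyToSymbol(parts_z_dev, &tmp, sizeof(double *), 0,
                           cudaMemcpyHostToDevice) != cudaSuccess) {
        fprintf(stderr, "Failed to set device symbol for parts array\n");
        return 1;
    }
    cudaFree(tmp);
    return 0;
}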
/*
 * Not used.
 */
void qsched_prepare_deps( struct qsched *s )
{
    int **is_loaded;
    int **parents;
    int i, k, j, use, usek, usem;
    int *num_parents;
    is_loaded = (int**)malloc(sizeof(int*) * s->count);
    parents = (int**)malloc(sizeof(int*) * s->count);
    num_parents = (int*)malloc(sizeof(int) * s->count);
    bzero(num_parents, sizeof(int) * s->count);
    /* Bytes per task bitmask: one bit per resource, rounded up to whole
       ints (count_res/32 + 1 ints, matching the word loops below). */
    k = sizeof(int) * (s->count_res/32 + 1);
    for(i = 0; i < s->count; i++)
    {
        is_loaded[i] = (int*)malloc(k);
        bzero(is_loaded[i], k);
    }
    /* is_loaded[i][k] is a bitmask over resources: resource r maps to word
       r/32, bit 31 - r%32, and marks r as already loaded for task i. */
    /* Reset the waits to 0... */
    for( k = 0 ; k < s->count ; k++ )
    {
        s->tasks[k].wait = 0;
    }

    /* Run through the tasks and set the waits... */
    for ( k = 0 ; k < s->count ; k++ ) {
        struct task *t = &s->tasks[k];
        if ( !( t->flags & task_flag_skip ) )
            for ( j = 0 ; j < t->nr_unlocks ; j++ )
                s->tasks[ t->unlocks[j] ].wait += 1;
    }
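    /* After this pass each task's wait equals its in-degree: the number of
       unskipped tasks that must unlock it before it may run. */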
    /* Sort the tasks topologically. */
    int *tid = (int *)malloc( sizeof(int) * s->count );
    for ( j = 0 , k = 0 ; k < s->count ; k++ )
        if ( s->tasks[k].wait == 0 ) {
            tid[j] = k;
            j += 1;
        }
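    /* Breadth-first sweep (Kahn's algorithm): popping a task decrements the
       wait count of every task it unlocks; tasks that reach zero are
       appended, so tid ends up in topological order. */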
    for ( k = 0 ; k < j ; k++ ) {
        struct task *t = &s->tasks[ tid[k] ];
        for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ )
            if ( ( s->tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) {
                tid[j] = t->unlocks[kk];
                j += 1;
            }
    }
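    /* If the sweep visited fewer than count tasks, some task never reached
       wait == 0, i.e. the dependency graph contains a cycle. */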
    if ( k < s->count )
    {
        error( "Circular dependencies detected." );
    }
    /* Store the maximum number of parents of a task. */
    int max_parents = 0;

    /* Compute the number of parents for each task. */
    for(i = s->count-1; i >= 0; i--)
    {
        for(j = 0; j < s->tasks[i].nr_unlocks; j++)
        {
            num_parents[s->tasks[i].unlocks[j]]++;
        }
    }
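    /* num_parents[t] now holds the in-degree of task t: how many tasks
       unlock it. */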
    /* Allocate memory to store parents in. */
    for(i = 0; i < s->count; i++)
    {
        if(num_parents[i] > 0)
            parents[i] = (int*)calloc(num_parents[i], sizeof(int));
        else
            parents[i] = NULL;
        if(num_parents[i] > max_parents)
        {
            max_parents = num_parents[i];
        }
        num_parents[i] = 0;
    }
    /* This seems to be outdated and unused? */
    for(i = 0; i < s->count; i++)
    {
        if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload)
            continue;
        for(k = 0; k < s->tasks[i].nr_uses; k++)
        {
            use = s->tasks[i].uses[k];
            usek = use >> 5; // use / 32.
            usem = use & 31; // use % 32.
            if((is_loaded[i][usek] & (1 << (31-usem))) == 0 )
            {
                qsched_addunlock(s, s->res[use].task, i);
                printf("We actually did this?!");
                is_loaded[i][usek] |= (1 << (31-usem));
            }
        }
        for(k = 0; k < s->tasks[i].nr_unlocks; k++)
        {
            if(s->tasks[s->tasks[i].unlocks[k]].type == type_load ||
               s->tasks[s->tasks[i].unlocks[k]].type == type_unload)
                continue;
            for(j = 0; j < s->count_res/32 + 1; j++)
            {
                is_loaded[s->tasks[i].unlocks[k]][j] |= is_loaded[i][j];
            }
            parents[s->tasks[i].unlocks[k]][num_parents[s->tasks[i].unlocks[k]]] = i;
            num_parents[s->tasks[i].unlocks[k]] = num_parents[s->tasks[i].unlocks[k]] + 1;
        }
    }
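    /* parents[t] now lists the direct predecessors of task t, and each
       task's is_loaded mask has been OR-ed into the tasks it unlocks. */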
    max_parents = 0;
    /* Recompute the bitmask size in bytes; k was clobbered by the loops
       above and no longer holds it. */
    k = sizeof(int) * (s->count_res/32 + 1);
    for(i = 0; i < s->count; i++)
    {
        if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload)
            continue;
        if(num_parents[i] > max_parents)
        {
            max_parents = num_parents[i];
        }
        bzero(is_loaded[i], k);
    }
    for(i = s->count-1; i >= 0; i--)
    {
        if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload)
            continue;
        for(k = 0; k < s->tasks[i].nr_uses; k++)
        {
            use = s->tasks[i].uses[k];
            usek = use >> 5; // use / 32.
            usem = use & 31; // use % 32.
            if((is_loaded[i][usek] & (1 << (31-usem))) == 0 )
            {
                qsched_addunlock(s, i, s->res[use].utask);
                is_loaded[i][usek] |= (1 << (31-usem));
            }
        }
        for(k = 0; k < num_parents[i]; k++)
        {
            for(j = 0; j < s->count_res/32 + 1; j++)
            {
                is_loaded[parents[i][k]][j] |= is_loaded[i][j];
            }
        }
    }
}
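For readers tracing the bit arithmetic above: resource id use maps to word use >> 5 and bit 31 - (use & 31). A minimal self-contained sketch of that convention; the helper names bitmask_set and bitmask_test are hypothetical, not part of qsched:

#include <stdio.h>

/* Resource r lives in word r >> 5, bit 31 - (r & 31), matching the
   (1 << (31 - usem)) masks used in qsched_prepare_deps() above. */
static inline void bitmask_set(int *mask, int r) {
    mask[r >> 5] |= 1 << (31 - (r & 31));
}

static inline int bitmask_test(const int *mask, int r) {
    return (mask[r >> 5] & (1 << (31 - (r & 31)))) != 0;
}

int main(void) {
    int mask[2] = {0, 0};  /* room for 64 resources */
    bitmask_set(mask, 33); /* word 1, bit 30 */
    printf("%d %d\n", bitmask_test(mask, 33), bitmask_test(mask, 34)); /* prints "1 0" */
    return 0;
}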