Commit 320d4466 authored by aidan

Added a non-recursive version of the pc functions. Performance is currently poor.

parent 60922b1a
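
For context: the "pc" (particle-cell) tasks approximate the gravity a whole cell exerts on a particle by a single interaction with the cell's centre of mass. A minimal sketch of that per-particle update, using const_G and rsqrtf as in the kernels below (the function name and float3 packing are illustrative, not part of the commit):

/* Illustrative sketch of the monopole particle-cell update performed by
 * make_interact_pc below; const_G matches the #define in this file. */
__device__ void pc_monopole(float3 pi, float3 com, float m_com, float ai[3]) {
    float dx[3];
    dx[0] = com.x - pi.x;
    dx[1] = com.y - pi.y;
    dx[2] = com.z - pi.z;
    float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
    float ir = rsqrtf(r2);                    /* 1 / r */
    float w = m_com * const_G * ir * ir * ir; /* G * M / r^3 */
    for (int k = 0; k < 3; k++)
        ai[k] += w * dx[k];                   /* acceleration towards the com */
}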
@@ -59,7 +59,7 @@ unsigned short int split, sorted;
int parts, firstchild, sibling;
int res, resz, resm, com_tid;
}__attribute__((aligned(64)));
};//__attribute__((aligned(64)));
#define const_G 1
@@ -105,13 +105,16 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
__shared__ float4 parts_am[cell_maxparts];
/*if(threadIdx.x == 0)
printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split,
cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
/* Load particles of cell j into shared memory */
for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
/*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
}
}*/
/* Loop over cell i.*/
for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
@@ -123,25 +126,27 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
mi = parts_a_m[i].w;
for(j = 0; j < count_j; j++) {
for(j = parts_j; j < parts_j + count_j; j++) {
r2 = 0.0f;
dx[0] = xi[0] - parts_xy[j].x;
dx[1] = xi[1] - parts_xy[j].y;
dx[2] = xi[2] - parts_z[j];
dx[0] = xi[0] - parts_pos_xy[j].x;
dx[1] = xi[1] - parts_pos_xy[j].y;
dx[2] = xi[2] - parts_pos_z[j];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
// ir = 1.0f / sqrtf(r2);
// ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_am[j].w;
mj = parts_a_m[j].w;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
// atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
// atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
// atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
}
atomicAdd(&parts_a_m[i].x, ai[0]);
atomicAdd(&parts_a_m[i].y, ai[1]);
atomicAdd(&parts_a_m[i].z, ai[2]);
@@ -149,11 +154,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
/* Load particles of cell i into shared memory */
for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
/*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
}
}*/
/*Loop over cell j. */
for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
xi[0] = parts_pos_xy[i].x;
@@ -164,11 +169,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
mi = parts_a_m[i].w;
for(j = 0; j < count_j; j++) {
for(j = parts_i; j < parts_i + count_i; j++) {
r2 = 0.0f;
dx[0] = xi[0] - parts_xy[j].x;
dx[1] = xi[1] - parts_xy[j].y;
dx[2] = xi[2] - parts_z[j];
dx[0] = xi[0] - parts_pos_xy[j].x;
dx[1] = xi[1] - parts_pos_xy[j].y;
dx[2] = xi[2] - parts_pos_z[j];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
@@ -176,12 +181,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_am[j].w;
mj = parts_a_m[j].w;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
}
atomicAdd(&parts_a_m[i].x, ai[0]);
atomicAdd(&parts_a_m[i].y, ai[1]);
atomicAdd(&parts_a_m[i].z, ai[2]);
@@ -190,25 +194,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
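
Note that in iact_pair_direct above, the staging of cell j's (and later cell i's) particles into shared memory has been commented out, so the inner loop now reads partner positions from the global parts_pos_* arrays on every iteration; this is a plausible cause of the poor performance the commit message mentions. A hedged sketch of the disabled staging pattern (array names as declared at the top of the kernel; the barrier is an assumption, the original loop had none):

/* Inside iact_pair_direct: each thread copies a strided subset of cell j's
 * particles into the block's shared arrays, so the inner loop can index
 * parts_xy/parts_z instead of global memory. */
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
for (k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j;
     k += blockDim.x, j += blockDim.x) {
    parts_xy[j] = parts_pos_xy[k]; /* coalesced global reads */
    parts_z[j] = parts_pos_z[k];
}
__syncthreads(); /* staging must complete before any thread consumes it */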
/*__device__ void iact_pair(int celli, int cellj) {
struct cell *ci, *cj;
ci = &cells[celli];
cj = &cells[cellj];
if(Check if neighbours0)
{
if(ci->split && cj->split) {
//Split both cells and do all possible pairs.
}else {
iact_pair_direct(ci, cj);
}
}
}*/
__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
int i, k;
@@ -218,16 +203,29 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
int count = leaf->count;
int parts = leaf->parts;
int cell_j = cj - cells;
int temp;
float r2, dx[3], ir, w;
// if(cell_j < 0)
// {
// if(threadIdx.x == 0)
// printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
// __syncthreads();
// asm("trap;");
// }
// if(threadIdx.x == 0)
// printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
temp = cell_j;
/* Init the com's data.*/
j_com_xy = com_xy[cell_j];
j_com_z = com_z[cell_j];
j_com_mass = com_mass[cell_j];
for(i = parts; i < parts+count; i++) {
for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
r2 = 0.0;
dx[0] = j_com_xy.x - parts_pos_xy[i].x;
r2 += dx[0] * dx[0];
@@ -238,11 +236,18 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
ir = rsqrtf(r2);
w = j_com_mass * const_G * ir * ir * ir;
parts_a_m[i].x += w * dx[0];
parts_a_m[i].y += w * dx[1];
parts_a_m[i].z += w * dx[2];
/* __threadfence();
if(!isfinite(w * dx[0])){
printf("Error in make_interact_pc, j_com_mass = %f, cell_j = %i, temp = %i, i = %i, threadIdx.x=%i\n", j_com_mass, cell_j, temp, i, threadIdx.x); asm("trap;");}
if(!isfinite(w * dx[1])){
printf("Error in make_interact_pc\n"); asm("trap;");}
if(!isfinite(w * dx[2])){
printf("Error in make_interact_pc\n"); asm("trap;");}*/
atomicAdd( &parts_a_m[i].x , w * dx[0]);
atomicAdd( &parts_a_m[i].y , w * dx[1]);
atomicAdd( &parts_a_m[i].z , w * dx[2]);
}
//__syncthreads();
}
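
The particle loop in make_interact_pc now strides by blockDim.x, so the threads of a block split the leaf's particles between them, and the plain += accumulation becomes atomicAdd, since other blocks may update the same parts_a_m entries concurrently. Restated as a sketch (names as in the function above):

/* Thread-strided form of the make_interact_pc loop: thread t handles
 * particles parts+t, parts+t+blockDim.x, ... and accumulates atomically. */
for (i = parts + threadIdx.x; i < parts + count; i += blockDim.x) {
    dx[0] = j_com_xy.x - parts_pos_xy[i].x;
    dx[1] = j_com_xy.y - parts_pos_xy[i].y;
    dx[2] = j_com_z - parts_pos_z[i];
    r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
    ir = rsqrtf(r2);
    w = j_com_mass * const_G * ir * ir * ir;
    atomicAdd(&parts_a_m[i].x, w * dx[0]);
    atomicAdd(&parts_a_m[i].y, w * dx[1]);
    atomicAdd(&parts_a_m[i].z, w * dx[2]);
}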
/**
@@ -291,38 +296,41 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
struct cell *cp ,*cps;
int leafnum = leaf - cells;
//if(threadIdx.x == 0 && leafnum == 23)
// printf("cj = %i\n", cj - cells);
// printf("%i\n", leafnum);
if(leaf->split)
{
printf("Leaf split = 1, oh dear.");
asm("trap;");
}
if(ci->split > 1)
{
printf("Cell %i had split > 1\n", ci - cells);
asm("trap;");
}
if(cj->split > 1)
{
printf("cell %i had split > 1\n", cj - cells);
asm("trap;");
}
// if(threadIdx.x == 0)
/// printf("ci = %i, cj = %i, leaf = %i\n", ci - cells, cj - cells, leaf - cells);
for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
}
if(are_neighbours_different_size(cp, cj)) {
for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
if(are_neighbours(cp, cps)) {
if(cp->split && cps->split) {
iact_pair_pc(cp, cps, leaf);
}
} else {
make_interact_pc(leaf, cps);
// if(threadIdx.x == 0 && leafnum == 23)
// printf("leafnum = %i with cps = %i here\n", leafnum, cps - cells);
__syncthreads();
}
}
}else{
for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
// if(threadIdx.x == 0 && leafnum == 23)
// printf("leafnum = %i with cps = %i\n", leafnum, cps - cells);
make_interact_pc(leaf, cps);
}
}
__syncthreads();
@@ -339,7 +347,7 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
struct cell *cp, *cps;
if(leaf->split)
/*if(leaf->split)
{
printf("Leaf split = 1, oh dear.");
asm("trap;");
@@ -348,9 +356,26 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
{
printf("Cell had split > 1\n");
asm("trap;");
}
}*/
/* Find the subcell of c the leaf is in.*/
/*cp = c;
cps = c;
while(c->split)
{
for(cp = &cells[cp->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]){
if(is_inside(leaf, cp)) break;
}
if(cp->split){
for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) {
if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
}
}
c = cp;
}*/
for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
}
@@ -364,7 +389,7 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
if(cp != cps && cps->split) iact_pair_pc(cp,cps,leaf);
}
}
}//TODO
}
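
The commented-out block above is the non-recursive traversal this commit introduces: rather than recursing, it walks down the tree in a while loop, at each level finding the child that contains the leaf, interacting it with its split siblings, and then descending into it. A hedged restatement (the function name is illustrative; cells, is_inside and iact_pair_pc are the ones in this file):

/* Iterative descent replacing the recursion in iact_self_pc. */
__device__ void iact_self_pc_iter(struct cell *c, struct cell *leaf) {
    struct cell *cp, *cps;
    while (c->split) {
        /* Find the subcell of c the leaf is in. */
        for (cp = &cells[c->firstchild]; cp != &cells[c->sibling];
             cp = &cells[cp->sibling])
            if (is_inside(leaf, cp)) break;
        /* Interact it with each of its split siblings. */
        if (cp->split)
            for (cps = &cells[c->firstchild]; cps != &cells[c->sibling];
                 cps = &cells[cps->sibling])
                if (cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
        c = cp; /* descend one level */
    }
}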
@@ -385,7 +410,8 @@ __device__ void iact_self_direct(int cellID) {
int count;
int i,j,k;
//if(threadIdx.x == 0)
// printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
//If cell is split, interact each child with itself, and with each of its siblings.
/*if(c->split) {
//TODO
@@ -395,24 +421,24 @@ __device__ void iact_self_direct(int cellID) {
count = c->count;
int z = threadIdx.x;
/* Load particle data into shared memory*/
for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
/*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
parts_xy[z] = parts_pos_xy[k];
parts_z[z] = parts_pos_z[k];
parts_am[z] = parts_a_m[k];
}
__syncthreads();
for(i = threadIdx.x; i < count; i += blockDim.x)
__syncthreads();*/
for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
{
xi[0] = parts_xy[i].x;
xi[1] = parts_xy[i].y;
xi[2] = parts_z[i];
xi[0] = parts_pos_xy[i].x;
xi[1] = parts_pos_xy[i].y;
xi[2] = parts_pos_z[i];
for(k = 0; k < 3; k++) {
ai[k] = 0.0;
}
mi = parts_a_m[i].w;
//for(j = i+1; j!= i; j = (j+1)%count)
for(j = 0; j < count; j++)
for(j = parts; j < parts+count; j++)
{
if(i != j){
@@ -430,7 +456,7 @@ __device__ void iact_self_direct(int cellID) {
//ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_am[j].w;
mj = parts_a_m[j].w;
for(k = 0; k < 3; k++) {
ai[k] -= w * dx[k] * mj;
}
@@ -813,10 +839,10 @@ void cell_split(int c, struct qsched *s) {
// struct cell *data[2] = {root, c};
int data[2] = {root, c};
int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
2 * sizeof(int), 1);
qsched_addlock(s, tid, cell_pool[root].res);
qsched_addlock(s, tid, cell_pool[root].resz);
qsched_addlock(s, tid, cell_pool[root].resm);
2 * sizeof(int), 3000);
/*qsched_adduse(s, tid, cell_pool[root].res);
qsched_adduse(s, tid, cell_pool[root].resz);
qsched_adduse(s, tid, cell_pool[root].resm);*/
qsched_addlock(s, tid, cell_pool[c].res);
qsched_addlock(s, tid, cell_pool[c].resz);
qsched_addlock(s, tid, cell_pool[c].resm);
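
The final argument of qsched_addtask is the cost hint the scheduler sorts tasks by; this hunk replaces the particle-count-based costs with fixed values (3000 here, 2 and 1 for the self and pair tasks below) and trades the exclusive locks on the root cell for commented-out qsched_adduse read access. The general pattern, as used throughout this file (the comments on what each resource guards are an inference from the array names, not stated in the code):

/* Create a task with a payload and a cost hint, then declare the
 * resources it needs. addlock takes exclusive access; adduse (commented
 * out above) appears to declare shared, non-exclusive access instead. */
int data[2] = {root, c};
int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
                         2 * sizeof(int), 3000 /* fixed cost hint */);
qsched_addlock(s, tid, cell_pool[c].res);  /* presumably the x/y positions */
qsched_addlock(s, tid, cell_pool[c].resz); /* presumably the z positions */
qsched_addlock(s, tid, cell_pool[c].resm); /* presumably accel. + mass */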
@@ -859,8 +885,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
else{
data[0] = ci - cell_pool;
data[1] = -1;
tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, ci->count*ci->count/2);
tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, 2);
qsched_addlock(s, tid, ci->res);
qsched_addlock(s, tid, ci->resz);
qsched_addlock(s, tid, ci->resm);
@@ -868,7 +893,9 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
}
/* Else its a pair!*/
else{
if(are_neighbours_host(ci,cj)){/* Cells are neighbours */
if(!are_neighbours_host(ci,cj)){/* Cells are neighbours */
}else{
/*Are both split? */
if(ci->split && cj->split)
{
@@ -885,7 +912,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
/* Create the task. */
tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
sizeof(struct cell *) * 2, ci->count * cj->count);
sizeof(struct cell *) * 2, 1);
/* Add the resources. */
qsched_addlock(s, tid, ci->res);
@@ -909,7 +936,6 @@ __device__ void runner( int type , void *data ) {
int *idata = (int *)data;
int i = idata[0];
int j = idata[1];
switch ( type ) {
case task_type_self:
iact_self_direct(i);
@@ -1044,18 +1070,31 @@ void test_bh(int N, int runs, char *fileName) {
c = cell_pool[c].firstchild;
}
}
message("root.sibling = %i, root.split = %i", root->sibling, root->split);
printf("nr_leaves = %i\n", nr_leaves);
message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
for(k = 0; k < num_cells; k++)
/* for(k = 0; k < num_cells; k++)
if(cell_pool[k].split > 1 )
printf("Split > 1\n");
printf("Split > 1\n");*/
create_tasks(&s, root, NULL);
int self = 0, pair = 0, pc = 0;
for(k = 0; k < s.count; k++)
{
if(s.tasks[k].type == task_type_self)
self++;
else if (s.tasks[k].type == task_type_pair)
pair++;
else if (s.tasks[k].type >= 0)
pc++;
}
message("total number of tasks: %i.", s.count);
message("total number of pair tasks: %i.", pair);
message("total number of self tasks: %i.", self);
message("total number of pc tasks: %i.", pc);
message("total number of cells: %i.", number);
message("total number of deps: %i.", s.count_deps);
message("total number of res: %i.", s.count_res);
@@ -1098,7 +1137,7 @@ float *comm_temp;
if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( comm_temp, com_z_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
@@ -1115,13 +1154,37 @@ float *comm_temp;
}
}*/
// printf("com_mass_host[152] = %f\n", com_mass_host[152]);
//Run code.
printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
qsched_run_CUDA( &s , func );
}
qsched_print_cuda_timers(&s);
k = 0;
printf("%e, %e, %e, %e, %e, %e, %e\n", parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
parts_a_m_host[k].x, parts_a_m_host[k].y, parts_a_m_host[k].z);
struct task* tasks = qsched_get_timers( &s , s.count );
for(i = 0; i < s.count; i++)
{
printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
// printf("\n");
}
}
/* Dump the particles to a file */
file = fopen("particle_dump.dat", "w");
/* fprintf(file,
"# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
"a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
for (k = 0; k < N; ++k)
fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n",
parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
parts_a_m_host[k].x, parts_a_m_host[k].y, parts_a_m_host[k].z);
fclose(file);
}