diff --git a/examples/test_bh_2.cu b/examples/test_bh_2.cu index 7981f6b9877e8445dd4f7c192672d00dd4b55a96..0cdae9765a9956e669dcf550052e390708676994 100644 --- a/examples/test_bh_2.cu +++ b/examples/test_bh_2.cu @@ -59,7 +59,7 @@ unsigned short int split, sorted; int parts, firstchild, sibling; int res, resz, resm, com_tid; -}__attribute__((aligned(64))); +};//__attribute__((aligned(64))); #define const_G 1 @@ -105,13 +105,16 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c __shared__ double2 parts_xy[cell_maxparts]; __shared__ double parts_z[cell_maxparts]; __shared__ float4 parts_am[cell_maxparts]; + /*if(threadIdx.x == 0) + printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split, + cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/ /* Load particles of cell j into shared memory */ - for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) { + /*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) { parts_xy[j] = parts_pos_xy[k]; parts_z[j] = parts_pos_z[k]; parts_am[j] = parts_a_m[k]; - } + }*/ /* Loop over cell i.*/ for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) { @@ -123,25 +126,27 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c } mi = parts_a_m[i].w; - for(j = 0; j < count_j; j++) { + for(j = parts_j; j < parts_j + count_j; j++) { r2 = 0.0f; - dx[0] = xi[0] - parts_xy[j].x; - dx[1] = xi[1] - parts_xy[j].y; - dx[2] = xi[2] - parts_z[j]; + dx[0] = xi[0] - parts_pos_xy[j].x; + dx[1] = xi[1] - parts_pos_xy[j].y; + dx[2] = xi[2] - parts_pos_z[j]; r2 += dx[0] * dx[0]; r2 += dx[1] * dx[1]; r2 += dx[2] * dx[2]; -// ir = 1.0f / sqrtf(r2); + // ir = 1.0f / sqrtf(r2); ir = rsqrtf(r2); w = const_G * ir * ir * ir; - mj = parts_am[j].w; + mj = parts_a_m[j].w; for(k = 0; k < 3; k++) { ai[k] -= dx[k] * mj * w; } + // atomicAdd(&parts_a_m[j].x, w*dx[0]*mi); + // atomicAdd(&parts_a_m[j].y, w*dx[1]*mi); + // atomicAdd(&parts_a_m[j].z, w*dx[2]*mi); } - atomicAdd(&parts_a_m[i].x, ai[0]); atomicAdd(&parts_a_m[i].y, ai[1]); atomicAdd(&parts_a_m[i].z, ai[2]); @@ -149,11 +154,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c } /* Load particles of cell i into shared memory */ - for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) { + /*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) { parts_xy[j] = parts_pos_xy[k]; parts_z[j] = parts_pos_z[k]; parts_am[j] = parts_a_m[k]; - } + }*/ /*Loop over cell j. 
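+ (Editor's note, a hedged reading of the kernel below: per pair it computes
+  a_i -= const_G * m_j * (x_i - x_j) / r^3,
+ via ir = rsqrtf(r2) and w = const_G*ir*ir*ir, so ai[k] -= dx[k]*mj*w.
+ The commented-out atomicAdds above would apply the equal-and-opposite
+ update to cell j's particles in the same pass, i.e. Newton's third law,
+ which would make this second, mirrored loop redundant.)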
*/ for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) { xi[0] = parts_pos_xy[i].x; @@ -164,11 +169,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c } mi = parts_a_m[i].w; - for(j = 0; j < count_j; j++) { + for(j = parts_i; j < parts_i + count_i; j++) { r2 = 0.0f; - dx[0] = xi[0] - parts_xy[j].x; - dx[1] = xi[1] - parts_xy[j].y; - dx[2] = xi[2] - parts_z[j]; + dx[0] = xi[0] - parts_pos_xy[j].x; + dx[1] = xi[1] - parts_pos_xy[j].y; + dx[2] = xi[2] - parts_pos_z[j]; r2 += dx[0] * dx[0]; r2 += dx[1] * dx[1]; r2 += dx[2] * dx[2]; @@ -176,12 +181,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c ir = rsqrtf(r2); w = const_G * ir * ir * ir; - mj = parts_am[j].w; + mj = parts_a_m[j].w; for(k = 0; k < 3; k++) { ai[k] -= dx[k] * mj * w; } } - atomicAdd(&parts_a_m[i].x, ai[0]); atomicAdd(&parts_a_m[i].y, ai[1]); atomicAdd(&parts_a_m[i].z, ai[2]); @@ -190,25 +194,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c } -/*__device__ void iact_pair(int celli, int cellj) { - - struct cell *ci, *cj; - ci = &cells[celli]; - cj = &cells[cellj]; - - if(Check if neighbours0) - { - if(ci->split && cj->split) { - //Split both cells and do all possible pairs. - - }else { - iact_pair_direct(ci, cj); - } - - } - -}*/ - __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) { int i, k; @@ -218,16 +203,29 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell int count = leaf->count; int parts = leaf->parts; int cell_j = cj - cells; + int temp; float r2, dx[3], ir, w; + // if(cell_j < 0) +// { +// if(threadIdx.x == 0) + // printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x); + // __syncthreads(); +// asm("trap;"); +// } + + // if(threadIdx.x == 0) + // printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z); + + + temp = cell_j; /* Init the com's data.*/ j_com_xy = com_xy[cell_j]; j_com_z = com_z[cell_j]; j_com_mass = com_mass[cell_j]; - for(i = parts; i < parts+count; i++) { - + for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) { r2 = 0.0; dx[0] = j_com_xy.x - parts_pos_xy[i].x; r2 += dx[0] * dx[0]; @@ -238,11 +236,18 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell ir = rsqrtf(r2); w = j_com_mass * const_G * ir * ir * ir; - - parts_a_m[i].x += w * dx[0]; - parts_a_m[i].y += w * dx[1]; - parts_a_m[i].z += w * dx[2]; + /* __threadfence(); + if(!isfinite(w * dx[0])){ + printf("Error in make_interact_pc, j_com_mass = %f, cell_j = %i, temp = %i, i = %i, threadIdx.x=%i\n", j_com_mass, cell_j, temp, i, threadIdx.x); asm("trap;");} + if(!isfinite(w * dx[1])){ + printf("Error in make_interact_pc\n"); asm("trap;");} + if(!isfinite(w * dx[2])){ + printf("Error in make_interact_pc\n"); asm("trap;");}*/ + atomicAdd( &parts_a_m[i].x , w * dx[0]); + atomicAdd( &parts_a_m[i].y , w * dx[1]); + atomicAdd( &parts_a_m[i].z , w * dx[2]); } +//__syncthreads(); } /** @@ -291,38 +296,41 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) { __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) { struct cell *cp ,*cps; + int leafnum = leaf - cells; +//if(threadIdx.x == 0 && leafnum == 23) + // printf("cj = %i\n", cj - cells); +// printf("%i\n", leafnum); - if(leaf->split) - { - printf("Leaf split = 1, oh dear."); - asm("trap;"); - } -if(ci->split > 1) - { - printf("Cell %i had split > 1\n", ci - cells); - asm("trap;"); - } 
- if(cj->split > 1) - { - printf("cell %i had split > 1\n", cj - cells); - asm("trap;"); - } + // if(threadIdx.x == 0) + /// printf("ci = %i, cj = %i, leaf = %i\n", ci - cells, cj - cells, leaf - cells); for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) { if(is_inside(leaf, cp)) break; } if(are_neighbours_different_size(cp, cj)) { + for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) { + if(are_neighbours(cp, cps)) { if(cp->split && cps->split) { iact_pair_pc(cp, cps, leaf); } } else { make_interact_pc(leaf, cps); + // if(threadIdx.x == 0 && leafnum == 23) + // printf("leafnum = %i with cps = %i here\n", leafnum, cps - cells); __syncthreads(); } } + }else{ + + for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) { + // if(threadIdx.x == 0 && leafnum == 23) + // printf("leafnum = %i with cps = %i\n", leafnum, cps - cells); + make_interact_pc(leaf, cps); + } + } __syncthreads(); @@ -339,7 +347,7 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) { struct cell *cp, *cps; - if(leaf->split) + /*if(leaf->split) { printf("Leaf split = 1, oh dear."); asm("trap;"); @@ -348,9 +356,26 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) { { printf("Cell had split > 1\n"); asm("trap;"); - } + }*/ /* Find the subcell of c the leaf is in.*/ + + /*cp = c; + cps = c; + while(c->split) + { + for(cp = &cells[cp->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]){ + if(is_inside(leaf, cp)) break; + } + if(cp->split){ + for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) { + + if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf); + } + } + c = cp; + }*/ + for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) { if(is_inside(leaf, cp)) break; } @@ -364,7 +389,7 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) { if(cp != cps && cps->split) iact_pair_pc(cp,cps,leaf); } - } + }//TODO } @@ -385,7 +410,8 @@ __device__ void iact_self_direct(int cellID) { int count; int i,j,k; - + //if(threadIdx.x == 0) + // printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split); //If cell is split, interact each child with itself, and with each of its siblings. 
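+  // Editor's note (hedged sketch, not part of this patch): a common
+  // alternative to the i != j guard in the inner loop below is Plummer
+  // softening with some small eps > 0, e.g.
+  //   r2 = eps*eps;      // instead of r2 = 0.0
+  //   ir = rsqrtf(r2);   // finite even for i == j, where dx = {0,0,0}
+  // The self-pair then contributes exactly zero force and the branch can go.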
/*if(c->split) {
  //TODO

@@ -395,24 +421,24 @@ __device__ void iact_self_direct(int cellID) {
     count = c->count;
     int z = threadIdx.x;
     /* Load particle data into shared memory*/
-    for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
+    /*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
       parts_xy[z] = parts_pos_xy[k];
       parts_z[z] = parts_pos_z[k];
       parts_am[z] = parts_a_m[k];
     }
-    __syncthreads();
-    for(i = threadIdx.x; i < count; i += blockDim.x)
+    __syncthreads();*/
+    for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
     {
-      xi[0] = parts_xy[i].x;
-      xi[1] = parts_xy[i].y;
-      xi[2] = parts_z[i];
+      xi[0] = parts_pos_xy[i].x;
+      xi[1] = parts_pos_xy[i].y;
+      xi[2] = parts_pos_z[i];
       for(k = 0; k < 3; k++) {
         ai[k] = 0.0;
       }
       mi = parts_a_m[i].w;

       //for(j = i+1; j!= i; j = (j+1)%count)
-      for(j = 0; j < count; j++)
+      for(j = parts; j < parts+count; j++)
       {
         if(i != j){

@@ -430,7 +456,7 @@ __device__ void iact_self_direct(int cellID) {
           //ir = 1.0f / sqrtf(r2);
           ir = rsqrtf(r2);
           w = const_G * ir * ir * ir;
-          mj = parts_am[j].w;
+          mj = parts_a_m[j].w;
           for(k = 0; k < 3; k++) {
             ai[k] -= w * dx[k] * mj;
           }
@@ -813,10 +839,10 @@ void cell_split(int c, struct qsched *s) {
// struct cell *data[2] = {root, c};
    int data[2] = {root, c};
    int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
-                            2 * sizeof(int), 1);
-   qsched_addlock(s, tid, cell_pool[root].res);
-   qsched_addlock(s, tid, cell_pool[root].resz);
-   qsched_addlock(s, tid, cell_pool[root].resm);
+                            2 * sizeof(int), 3000);
+   /*qsched_adduse(s, tid, cell_pool[root].res);
+   qsched_adduse(s, tid, cell_pool[root].resz);
+   qsched_adduse(s, tid, cell_pool[root].resm);*/
    qsched_addlock(s, tid, cell_pool[c].res);
    qsched_addlock(s, tid, cell_pool[c].resz);
    qsched_addlock(s, tid, cell_pool[c].resm);
@@ -859,8 +885,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
   else{
     data[0] = ci - cell_pool;
     data[1] = -1;
-
-    tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, ci->count*ci->count/2);
+    tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, 2);
     qsched_addlock(s, tid, ci->res);
     qsched_addlock(s, tid, ci->resz);
     qsched_addlock(s, tid, ci->resm);
@@ -868,7 +893,9 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
   /* Else its a pair!*/
   else{
-    if(are_neighbours_host(ci,cj)){/* Cells are neighbours */
+    if(!are_neighbours_host(ci,cj)){/* Cells are not neighbours: nothing to do. */
+
+    }else{
     /*Are both split? */
     if(ci->split && cj->split)
     {
@@ -885,7 +912,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
       /* Create the task. */
       tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
-                           sizeof(struct cell *) * 2, ci->count * cj->count);
+                           sizeof(struct cell *) * 2, 1);

      /* Add the resources.
*/ qsched_addlock(s, tid, ci->res); @@ -909,7 +936,6 @@ __device__ void runner( int type , void *data ) { int *idata = (int *)data; int i = idata[0]; int j = idata[1]; - switch ( type ) { case task_type_self: iact_self_direct(i); @@ -1044,18 +1070,31 @@ void test_bh(int N, int runs, char *fileName) { c = cell_pool[c].firstchild; } } - message("root.sibling = %i, root.split = %i", root->sibling, root->split); - printf("nr_leaves = %i\n", nr_leaves); message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves)); message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts); - for(k = 0; k < num_cells; k++) + /* for(k = 0; k < num_cells; k++) if(cell_pool[k].split > 1 ) - printf("Split > 1\n"); + printf("Split > 1\n");*/ create_tasks(&s, root, NULL); + int self = 0, pair = 0, pc = 0; + + for(k = 0; k < s.count; k++) + { + if(s.tasks[k].type == task_type_self) + self++; + else if (s.tasks[k].type == task_type_pair) + pair++; + else if (s.tasks[k].type >= 0) + pc++; + } + message("total number of tasks: %i.", s.count); + message("total number of pair tasks: %i.", pair); + message("total number of self tasks: %i.", self); + message("total number of pc tasks: %i.", pc); message("total number of cells: %i.", number); message("total number of deps: %i.", s.count_deps); message("total number of res: %i.", s.count_res); @@ -1098,7 +1137,7 @@ float *comm_temp; if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess) error("Failed to allocate com on the GPU"); - if( cudaMemcpy( comm_temp, com_z_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) error("failed to copy com to the GPU"); if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess) error("Failed to copy com pointer to the GPU"); @@ -1115,13 +1154,37 @@ float *comm_temp; } }*/ - + // printf("com_mass_host[152] = %f\n", com_mass_host[152]); //Run code. - printf("gpu_data = %p\n", (int*)s.res[0].gpu_data); +// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data); qsched_run_CUDA( &s , func ); - } + qsched_print_cuda_timers(&s); + + k = 0; + printf("%e, %e, %e, %e, %e, %e, %e\n", parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k], + parts_a_m_host[k].x, parts_a_m_host[k].y, parts_a_m_host[k].z); +struct task* tasks = qsched_get_timers( &s , s.count ); + for(i = 0; i < s.count; i++) + { + printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); + // printf("\n"); + + } + +} + /* Dump the particles to a file */ + file = fopen("particle_dump.dat", "w"); +/* fprintf(file, + "# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x " + "a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/ + for (k = 0; k < N; ++k) + fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n", + parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k], + parts_a_m_host[k].x, parts_a_m_host[k].y, parts_a_m_host[k].z); + fclose(file); + } diff --git a/examples/test_bh_3.cu b/examples/test_bh_3.cu new file mode 100644 index 0000000000000000000000000000000000000000..a1adfbd8bc88d788c83b51f1bd560013af1e2643 --- /dev/null +++ b/examples/test_bh_3.cu @@ -0,0 +1,1265 @@ +/******************************************************************************* + * This file is part of QuickSched. 
* Copyright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+ *                    Aidan Chalk (aidan.chalk@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+* *****************************************************************************/
+
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Standard includes. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <omp.h>
+#include <fenv.h>
+
+/* Local includes. */
+extern "C"{
+#include "quicksched.h"
+#include "res.h"
+}
+#include "cuda_queue.h"
+
+/** Task types. */
+enum task_type {
+  task_type_self = 0,
+  task_type_pair,
+  task_type_self_pc,
+  task_type_com,
+  task_type_count
+};
+
+struct cell{
+
+double2 loc_xy;
+double loc_z;
+double h;
+int count;
+unsigned short int split, sorted;
+int parts, firstchild, sibling;
+int res, /*resz, resm,*/ com_tid;
+
+};//__attribute__((aligned(64)));
+
+struct part{
+double loc[3];
+float a[3];
+float m;
+};//__attribute__((aligned(64)));
+
+
+#define const_G 1
+/* Required variables to obtain cells. */
+#define cell_maxparts 256
+#define CELL_STRETCH 2
+#define INITIAL_CELLS 256
+struct cell *cell_pool = NULL;
+int used_cells=0;
+int num_cells = 0;
+int cell_size = INITIAL_CELLS*sizeof(struct cell);
+
+/* Device locations for the particle values. */
+//__device__ double2 *parts_pos_xy;
+//__device__ double *parts_pos_z;
+//__device__ float4 *parts_a_m;
+__device__ double2 *com_xy;
+__device__ double *com_z;
+__device__ float *com_mass;
+__device__ struct part *parts_cuda;
+__device__ struct cell *cells;
+
+
+/* Host locations for the particle values.
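+ Editor's note: this version replaces the separate parts_pos_xy /
+ parts_pos_z / parts_a_m arrays of test_bh_2.cu with a single
+ array-of-structures (struct part), so one qsched resource per cell now
+ covers position, acceleration and mass, e.g. (hedged illustration):
+   struct part p = parts_host[k];  // p.loc[3], p.a[3], p.m
+ The commented-out pointers below are the old layout, kept for reference.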
*/ +//double2 *parts_pos_xy_host; +//double *parts_pos_z_host; +//float4 *parts_a_m_host; +double2 *com_xy_host; +double *com_z_host; +float *com_mass_host; +double2 *parts_pos_xy_temp; +double *parts_pos_z_temp; +float4 *parts_a_m_temp; + +struct part *parts_host; +struct part *parts_temp; + + +__device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) { + int i, j, k; + int count_i = ci->count, count_j = cj->count; + int parts_i = ci->parts, parts_j = cj->parts; + double xi[3]; + float dx[3], ai[3], mi, mj, r2, w, ir; + __shared__ double2 parts_xy[cell_maxparts]; + __shared__ double parts_z[cell_maxparts]; + __shared__ float4 parts_am[cell_maxparts]; + /*if(threadIdx.x == 0) + printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split, + cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/ + + /* Load particles of cell j into shared memory */ + /*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) { + parts_xy[j] = parts_pos_xy[k]; + parts_z[j] = parts_pos_z[k]; + parts_am[j] = parts_a_m[k]; + }*/ + + /* Loop over cell i.*/ + for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) { + xi[0] = parts_cuda[i].loc[0]; + xi[1] = parts_cuda[i].loc[1]; + xi[2] = parts_cuda[i].loc[2]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0f; + } + mi = parts_cuda[i].m; + + for(j = parts_j; j < parts_j + count_j; j++) { + r2 = 0.0f; + dx[0] = xi[0] - parts_cuda[j].loc[0]; + dx[1] = xi[1] - parts_cuda[j].loc[1]; + dx[2] = xi[2] - parts_cuda[j].loc[2]; + r2 += dx[0] * dx[0]; + r2 += dx[1] * dx[1]; + r2 += dx[2] * dx[2]; + + + // ir = 1.0f / sqrtf(r2); + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_cuda[j].m; + for(k = 0; k < 3; k++) { + ai[k] -= dx[k] * mj * w; + } + // atomicAdd(&parts_a_m[j].x, w*dx[0]*mi); + // atomicAdd(&parts_a_m[j].y, w*dx[1]*mi); + // atomicAdd(&parts_a_m[j].z, w*dx[2]*mi); + } + atomicAdd(&parts_cuda[i].a[0], ai[0]); + atomicAdd(&parts_cuda[i].a[1], ai[1]); + atomicAdd(&parts_cuda[i].a[2], ai[2]); + + } + + /* Load particles of cell i into shared memory */ + /*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) { + parts_xy[j] = parts_pos_xy[k]; + parts_z[j] = parts_pos_z[k]; + parts_am[j] = parts_a_m[k]; + }*/ + /*Loop over cell j. 
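+ (Editor's note: with the staging loops above commented out, the __shared__
+ buffers parts_xy / parts_z / parts_am in this kernel are never read or
+ written, yet each block still reserves cell_maxparts * 40 bytes of shared
+ memory for them (16 for a double2, 8 for a double, 16 for a float4). If the
+ direct parts_cuda loads are kept, removing the declarations should free
+ that space; a hedged suggestion, the occupancy effect is untested here.)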
*/
+  for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
+    xi[0] = parts_cuda[i].loc[0];
+    xi[1] = parts_cuda[i].loc[1];
+    xi[2] = parts_cuda[i].loc[2];
+    for(k = 0; k < 3; k++) {
+      ai[k] = 0.0f;
+    }
+    mi = parts_cuda[i].m;
+
+    for(j = parts_i; j < parts_i + count_i; j++) {
+      r2 = 0.0f;
+      dx[0] = xi[0] - parts_cuda[j].loc[0];
+      dx[1] = xi[1] - parts_cuda[j].loc[1];
+      dx[2] = xi[2] - parts_cuda[j].loc[2];
+      r2 += dx[0] * dx[0];
+      r2 += dx[1] * dx[1];
+      r2 += dx[2] * dx[2];
+
+
+      ir = rsqrtf(r2);
+      w = const_G * ir * ir * ir;
+      mj = parts_cuda[j].m;
+      for(k = 0; k < 3; k++) {
+        ai[k] -= dx[k] * mj * w;
+      }
+    }
+    atomicAdd(&parts_cuda[i].a[0], ai[0]);
+    atomicAdd(&parts_cuda[i].a[1], ai[1]);
+    atomicAdd(&parts_cuda[i].a[2], ai[2]);
+
+  }
+
+}
+
+__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
+
+  int i, k;
+  double2 j_com_xy;
+  double j_com_z;
+  float j_com_mass;
+  int count = leaf->count;
+  int parts = leaf->parts;
+  int cell_j = cj - cells;
+  int temp;
+  float r2, dx[3], ir, w;
+
+  // if(cell_j < 0)
+// {
+// if(threadIdx.x == 0)
+  // printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
+  // __syncthreads();
+// asm("trap;");
+// }
+
+  // if(threadIdx.x == 0)
+  // printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
+
+
+  temp = cell_j;
+
+  /* Init the com's data.*/
+  j_com_xy = com_xy[cell_j];
+  j_com_z = com_z[cell_j];
+  j_com_mass = com_mass[cell_j];
+
+  for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
+    r2 = 0.0;
+    dx[0] = j_com_xy.x - parts_cuda[i].loc[0];
+    r2 += dx[0] * dx[0];
+    dx[1] = j_com_xy.y - parts_cuda[i].loc[1];
+    r2 += dx[1] * dx[1];
+    dx[2] = j_com_z - parts_cuda[i].loc[2];
+    r2 += dx[2] * dx[2];
+
+    ir = rsqrtf(r2);
+    w = j_com_mass * const_G * ir * ir * ir;
+    atomicAdd( &parts_cuda[i].a[0] , w * dx[0]);
+    atomicAdd( &parts_cuda[i].a[1] , w * dx[1]);
+    atomicAdd( &parts_cuda[i].a[2] , w * dx[2]);
+  }
+//__syncthreads();
+}
+
+/**
+ * @brief Checks whether the cells are direct neighbours or not
+ */
+__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) {
+
+  int k;
+  float dx[3];
+  double cih = ci->h, cjh = cj->h;
+
+  float min_dist = 0.5*(cih+cjh);
+
+  float center_i = ci->loc_xy.x + 0.5*cih;
+  float center_j = cj->loc_xy.x + 0.5*cjh;
+  dx[0] = fabsf(center_i - center_j);
+  center_i = ci->loc_xy.y + 0.5*cih;
+  center_j = cj->loc_xy.y + 0.5*cjh;
+  dx[1] = fabsf(center_i - center_j);
+  center_i = ci->loc_z + 0.5*cih;
+  center_j = cj->loc_z + 0.5*cjh;
+  dx[2] = fabsf(center_i - center_j);
+  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+}
+
+__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) {
+  int k;
+  float dx[3];
+  float min_dist = ci->h;
+  float center_i = ci->loc_xy.x;
+  float center_j = cj->loc_xy.x;
+  dx[0] = fabsf(center_i - center_j);
+  center_i = ci->loc_xy.y;
+  center_j = cj->loc_xy.y;
+  dx[1] = fabsf(center_i - center_j);
+  center_i = ci->loc_z;
+  center_j = cj->loc_z;
+  dx[2] = fabsf(center_i - center_j);
+  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+}
+
+__device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
+  return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count);
+}
+
+__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
+
+  struct cell *cp ,*cps;
+
+  /*
+  Find child of ci containing leaf.
+ if child and cj are_neighbours_different_size + Loop through children of cj. + If child and childj are neighbours + If both split -> recurse + else + make_interact_pc + else + loop through children of cj + make interact pc + + */ + struct cell *loopend; + cp = ci; + while(cp->split) + { + for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) { + if(is_inside(leaf, cp)) break; + } + + if(are_neighbours_different_size(cp, cj)) { + loopend = &cells[cj->sibling]; + cps = &cells[cj->firstchild]; + while(cps != loopend) + { + if(!are_neighbours(cp, cps)){ + make_interact_pc(leaf, cps); + //__syncthreads(); + cps = &cells[cps->sibling]; + }else{ + if(cps->split) + cps = &cells[cps->firstchild]; + else + cps = &cells[cps->sibling]; + } + + } + + }else{ + for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) { + make_interact_pc(leaf, cps); + // __syncthreads(); + + } + break; + } + + ci = cp; + } + + + + + + + /* for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) { + if(is_inside(leaf, cp)) break; + } + + if(are_neighbours_different_size(cp, cj)) { + + for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) { + + if(!are_neighbours(cp, cps)) { + make_interact_pc(leaf, cps); + __syncthreads(); + } else { + if(cp->split && cps->split) { + iact_pair_pc(cp, cps, leaf); + } + + } + } + }else{ + + for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) { + make_interact_pc(leaf, cps); + } + + }*/ + +// __syncthreads(); +} + +/** + * @brief Compute the interactions between all particles in a leaf and + * and all the monopoles in the cell c + * + * @param c The #cell containing the monopoles + * @param leaf The #cell containing the particles + */ +__device__ void iact_self_pc(struct cell *c, struct cell *leaf) { + + struct cell *cp, *cps; + + /*if(leaf->split) + { + printf("Leaf split = 1, oh dear."); + asm("trap;"); + } + if(c->split > 1) + { + printf("Cell had split > 1\n"); + asm("trap;"); + }*/ + + /* Find the subcell of c the leaf is in.*/ + + cp = c; + cps = c; + while(c->split) + { + for(cp = &cells[cp->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]){ + if(is_inside(leaf, cp)) break; + } + if(cp->split){ + for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) { + + if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf); + } + } + c = cp; + } + + /*for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) { + if(is_inside(leaf, cp)) break; + } + + if(cp->split) { + + iact_self_pc(cp, leaf); + + for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) { + + if(cp != cps && cps->split) iact_pair_pc(cp,cps,leaf); + } + + }*///TODO +} + + + +/** + * @brief Compute the interactions between all particles in a cell. + * + * @param cellID The cell ID to compute interactions on. 
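+ *
+ * Per pair (i, j) inside the cell the kernel accumulates
+ *   ai[k] -= const_G * m_j * dx[k] / r^3,  with dx = x_i - x_j,
+ * taking 1/r from rsqrtf(r2); the i == j self-pair is skipped explicitly.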
+ */
+__device__ void iact_self_direct(int cellID) {
+  struct cell *c = &cells[cellID];
+  double xi[3] = {0.0, 0.0, 0.0};
+  float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
+  __shared__ double2 parts_xy[cell_maxparts];
+  __shared__ double parts_z[cell_maxparts];
+  __shared__ float4 parts_am[cell_maxparts];
+  int parts;
+  int count;
+  int i,j,k;
+
+  //if(threadIdx.x == 0)
+  // printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
+  //If cell is split, interact each child with itself, and with each of its siblings.
+  /*if(c->split) {
+    //TODO
+
+  } else {*/
+    parts = c->parts;
+    count = c->count;
+    int z = threadIdx.x;
+    /* Load particle data into shared memory*/
+    /*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
+      parts_xy[z] = parts_pos_xy[k];
+      parts_z[z] = parts_pos_z[k];
+      parts_am[z] = parts_a_m[k];
+    }
+    __syncthreads();*/
+    for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
+    {
+      xi[0] = parts_cuda[i].loc[0];
+      xi[1] = parts_cuda[i].loc[1];
+      xi[2] = parts_cuda[i].loc[2];
+      for(k = 0; k < 3; k++) {
+        ai[k] = 0.0;
+      }
+      mi = parts_cuda[i].m;
+
+      //for(j = i+1; j!= i; j = (j+1)%count)
+      for(j = parts; j < parts+count; j++)
+      {
+        if(i != j){
+
+          /* Compute the pairwise distance. */
+          r2 = 0.0;
+          dx[0] = xi[0] - parts_cuda[j].loc[0];
+          r2 += dx[0]*dx[0];
+          dx[1] = xi[1] - parts_cuda[j].loc[1];
+          r2 += dx[1]*dx[1];
+          dx[2] = xi[2] - parts_cuda[j].loc[2];
+          r2 += dx[2]*dx[2];
+
+          /* Apply the gravitational acceleration. */
+
+          //ir = 1.0f / sqrtf(r2);
+          ir = rsqrtf(r2);
+          w = const_G * ir * ir * ir;
+          mj = parts_cuda[j].m;
+          for(k = 0; k < 3; k++) {
+            ai[k] -= w * dx[k] * mj;
+          }
+
+        }
+      }
+      //Update.
+      atomicAdd(&parts_cuda[i].a[0],ai[0] );
+      atomicAdd(&parts_cuda[i].a[1],ai[1] );
+      atomicAdd(&parts_cuda[i].a[2],ai[2] );
+    }
+
+
+}
+
+
+/**
+ * @brief Checks whether the cells are direct neighbours or not.
Both cells have
+ * to be of the same size
+ */
+static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
+
+  int k;
+  float dx[3];
+
+#ifdef SANITY_CHECKS
+  if (ci->h != cj->h)
+    error(" Cells of different size in distance calculation.");
+#endif
+
+  /* Maximum allowed distance */
+  float min_dist = ci->h;
+
+  /* (Manhattan) Distance between the cells */
+  double2 loc1=ci->loc_xy, loc2=cj->loc_xy;
+  float center_i = loc1.x;
+  float center_j = loc2.x;
+  dx[0] = fabs(center_i - center_j);
+  center_i = loc1.y;
+  center_j = loc2.y;
+  dx[1] = fabs(center_i - center_j);
+  center_i = ci->loc_z;
+  center_j = cj->loc_z;
+  dx[2] = fabs(center_i - center_j);
+
+  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+}
+
+
+struct cell *cell_get()
+{
+  struct cell *res;
+
+  if(num_cells == 0)
+  {
+    cell_pool = (struct cell*) calloc(INITIAL_CELLS, sizeof(struct cell));
+    if(cell_pool == NULL)
+      error("Failed to allocate cell_pool");
+    com_xy_host = (double2*) calloc(INITIAL_CELLS, sizeof(double2));
+    if(com_xy_host == NULL)
+      error("Failed to allocate cell_pool");
+    com_z_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
+    if(com_z_host == NULL)
+      error("Failed to allocate cell_pool");
+    com_mass_host = (float*) calloc(INITIAL_CELLS, sizeof(float));
+    if(com_mass_host == NULL)
+      error("Failed to allocate cell_pool");
+    num_cells = INITIAL_CELLS;
+  }
+
+  if(used_cells >= num_cells)
+  {
+    /* Stretch */
+    struct cell *new_pool;
+    cell_size *= CELL_STRETCH;
+    new_pool = (struct cell*) calloc(num_cells*CELL_STRETCH, sizeof(struct cell));
+    if(new_pool == NULL)
+      error("Failed to allocate new_pool");
+    if(cell_pool != NULL)
+      memcpy(new_pool, cell_pool, num_cells*sizeof(struct cell));
+
+
+
+    double2 *tempxy = (double2*) calloc(num_cells*CELL_STRETCH, sizeof(double2));
+    if(tempxy == NULL)
+      error("Failed to allocate tempxy");
+    memcpy(tempxy, com_xy_host, sizeof(double2)*num_cells);
+    free(com_xy_host);
+    com_xy_host = tempxy;
+    double *tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
+    if(tempz == NULL)
+      error("Failed to allocate tempz");
+    memcpy(tempz, com_z_host, num_cells*sizeof(double));
+    free(com_z_host);
+    com_z_host = tempz;
+    float *tempm = (float*) calloc(num_cells*CELL_STRETCH, sizeof(float));
+    if(tempm == NULL)
+      error("Failed to allocate tempm");
+    memcpy(tempm, com_mass_host, num_cells*sizeof(float));
+    free(com_mass_host);
+    com_mass_host = tempm;
+
+    num_cells *= CELL_STRETCH;
+    free(cell_pool);
+    cell_pool = new_pool;
+
+    message("Increased size of arrays");
+  }
+  used_cells++;
+  cell_pool[used_cells-1].sibling = -1;
+  cell_pool[used_cells-1].firstchild = -1;
+  cell_pool[used_cells-1].res = qsched_res_none;
+  //cell_pool[used_cells-1].resz = qsched_res_none;
+  //cell_pool[used_cells-1].resm = qsched_res_none;
+  return &cell_pool[used_cells-1];
+}
+
+void comp_com(struct cell *c){
+
+  int k, count = c->count;
+  int cpi;
+  struct cell *cp;
+  int parts = c->parts;
+  double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
+
+  if(c->split) {
+    for(cp = &cell_pool[(cpi = c->firstchild)]; cpi != c->sibling; cp = &cell_pool[(cpi = cp->sibling)]) {
+      float cp_mass = com_mass_host[cpi];
+      com[0] += com_xy_host[cpi].x * cp_mass;
+      com[1] += com_xy_host[cpi].y * cp_mass;
+      com[2] += com_z_host[cpi] * cp_mass;
+      mass += cp_mass;
+    }
+
+
+  /* Otherwise collect the multipole from the particles */
+  } else {
+
+    for(k = parts; k < parts+count; k++)
+    {
+      float p_mass = parts_host[k].m;
+      com[0] += parts_host[k].loc[0] * p_mass;
+      com[1] +=
parts_host[k].loc[1] * p_mass; + com[2] += parts_host[k].loc[2] * p_mass; + mass += p_mass; + } + } + + + k = c - cell_pool; + /* Store the COM data, if it was collected. */ + if(mass > 0.0) { + float imass = 1.0f/mass; + com_xy_host[k].x = com[0] * imass; + com_xy_host[k].y = com[1] * imass; + com_z_host[k] = com[2] * imass; + com_mass_host[k] = mass; + }else + { + com_xy_host[k].x = 0.0; + com_xy_host[k].y = 0.0; + com_z_host[k] = 0.0; + com_mass_host[k] = 0.0f; + } + + + +} + +/** + * @brief Sort the parts into eight bins along the given pivots and + * fill the multipoles. Also adds the hierarchical resources + * to the sched (TODO). + * + * @param c The #cell to be split. + * @param N The total number of parts. + * @param s The #sched to store the resources. + */ +void cell_split(int c, struct qsched *s) { + int i, j, k, kk, count = cell_pool[c].count; + int parts = cell_pool[c].parts; + struct part temp_part; + double2 tempxy; + double tempxy1; + float4 tempxy2; + struct cell *cp, *cell; + int left[8], right[8]; + double pivot[3]; + static int root = -1; +// struct cell *progenitors[8]; + int progenitors[8]; + int c1 = c; + struct part *temp_xy; + double *temp_z; + float4 *temp_a_m; + + /* Set the root cell. */ + if (root < 0) { + root = c; + cell_pool[c].sibling = -1; + } + + if(cell_pool[c].res == qsched_res_none) + { + if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[c].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); +// printf("tempxy = %p\n", temp_xy); + cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy, + sizeof(struct part) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts); + } + /*if(cell_pool[c].resz == qsched_res_none) + { + if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[c].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[c].resz = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_z, + sizeof(double) * cell_pool[c].count, parts_pos_z_temp + cell_pool[c].parts); + } + if(cell_pool[c].resm == qsched_res_none) + { + if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[c].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m, + sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts); + }*/ + // error("Cell has no resource");*///TODO + + if(count > cell_maxparts ) + { + cell_pool[c].split = 1; + + for(k = 0; k < 8; k++) + { + progenitors[k] = (cp = cell_get()) - cell_pool; + cell = &cell_pool[c]; + cp->loc_xy = cell->loc_xy; + cp->loc_z = cell->loc_z; + cp->h = cell->h*0.5; + if(k & 4) cp->loc_xy.x += cp->h; + if(k & 2) cp->loc_xy.y += cp->h; + if(k & 1) cp->loc_z += cp->h; + } + + /* Init the pivots.*/ + pivot[0] = cell->loc_xy.x + cell->h * 0.5; + pivot[1] = cell->loc_xy.y + cell->h * 0.5; + pivot[2] = cell->loc_z + cell->h * 0.5; + + /* Split along the x axis. */ + i = parts; + j = parts+count-1; + while(i < j) + { + while(i <= parts+count-1 && parts_host[i].loc[0] < pivot[0]) i += 1; + while(j >= parts && parts_host[j].loc[0] >= pivot[0]) j -= 1; + if(i < j){ + temp_part = parts_host[i]; + parts_host[i] = parts_host[j]; + parts_host[j] = temp_part; + } + } + left[1] = i; + right[1] = parts+count-1; + left[0] = parts; + right[0] = j; + + /* Split along the y axis twice. 
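+ (Editor's note: after the x pass, bin 0 = [left[0],right[0]] holds parts
+ with x < pivot[0] and bin 1 those with x >= pivot[0]. Each y pass below
+ splits bin k into bins 2k (y < pivot[1]) and 2k+1 (y >= pivot[1]), and the
+ four z passes repeat the scheme, so the eight octants land in
+ progenitors[0..7] order, matching the k&4 / k&2 / k&1 offsets above.)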
*/ + for (k = 1; k >= 0; k--) { + i = left[k]; + j = right[k]; + while(i <= j){ + while(i <= right[k] && parts_host[i].loc[1] < pivot[1]) i += 1; + while(j >= left[k] && parts_host[j].loc[1] >= pivot[1]) j -= 1; + if(i < j) + { + temp_part = parts_host[i]; + parts_host[i] = parts_host[j]; + parts_host[j] = temp_part; + } + } + left[2*k+1] = i; + right[2*k+1] = right[k]; + left[2*k] = left[k]; + right[2*k] = j; + } + /* Split along the z axis four times.*/ + for(k = 3; k >=0; k--) + { + i = left[k]; + j = right[k]; + while(i <= j){ + while(i <= right[k] && parts_host[i].loc[2] < pivot[2]) i += 1; + while(j >= left[k] && parts_host[j].loc[2] >= pivot[2]) j -= 1; + if(i < j) + { + temp_part = parts_host[i]; + parts_host[i] = parts_host[j]; + parts_host[j] = temp_part; + } + } + left[2 * k + 1] = i; + right[2 * k + 1] = right[k]; + left[2 * k] = left[k]; + right[2 * k] = j; + } + + /* Store the counts and offsets. */ + for(k = 0; k < 8; k++) + { + cell_pool[progenitors[k]].count = right[k]-left[k]+1; + cell_pool[progenitors[k]].parts = left[k]; + if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy, + sizeof(struct part) * cell_pool[progenitors[k]].count, parts_temp + cell_pool[progenitors[k]].parts); + /*if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[progenitors[k]].resz = qsched_addres(s, qsched_owner_none, cell->resz, temp_z, + sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z_temp + cell_pool[progenitors[k]].parts); + if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[progenitors[k]].resm = qsched_addres(s, qsched_owner_none, cell->resm, temp_a_m, + sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m_temp + cell_pool[progenitors[k]].parts);*/ + } + + /* Find the first non-empty progenitor */ + for(k = 0; k < 8; k++) + { + if(cell_pool[progenitors[k]].count > 0) + { + cell->firstchild = progenitors[k]; + break; + } + } + + #ifdef SANITY_CHECKS + if(cell->firstchild == -1) + error("Cell has been split but all children have 0 parts"); + #endif + + /*Prepare the pointers*/ + for(k = 0; k < 8; k++) + { + /* Find the next non-empty sibling */ + for(kk = k+1; kk < 8; ++kk){ + if(cell_pool[progenitors[kk]].count > 0){ + cell_pool[progenitors[k]].sibling = progenitors[kk]; + break; + } + } + + /* No non-empty sibling, go back a level.*/ + if(kk == 8) cell_pool[progenitors[k]].sibling = cell->sibling; + + } + + /* Recurse */ + for(k = 0; k < 8; k++) + if(cell_pool[progenitors[k]].count > 0) cell_split(progenitors[k], s); + + /* Otherwise we're at a leaf so we need to make the cell's particle-cell task. 
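+ (Editor's note: each leaf spawns a single task_type_self_pc task that
+ re-walks the tree from the root. The fixed weight 3000 passed to
+ qsched_addtask below reads as a hand-tuned cost estimate (hedged reading),
+ and only the leaf's own resource is locked; the qsched_adduse calls on the
+ root's resources are left commented out.)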
*/
+  } else {
+
+// struct cell *data[2] = {root, c};
+    int data[2] = {root, c};
+    int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
+                             2 * sizeof(int), 3000);
+    /*qsched_adduse(s, tid, cell_pool[root].res);
+    qsched_adduse(s, tid, cell_pool[root].resz);
+    qsched_adduse(s, tid, cell_pool[root].resm);*/
+    qsched_addlock(s, tid, cell_pool[c].res);
+   // qsched_addlock(s, tid, cell_pool[c].resz);
+   // qsched_addlock(s, tid, cell_pool[c].resm);
+  }
+
+#ifndef COM_AS_TASK
+  comp_com(&cell_pool[c]);
+#endif
+}
+
+/**
+ * @brief Create the tasks for the cell pair/self.
+ *
+ * @param s The #sched in which to create the tasks.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
+void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
+
+  qsched_task_t tid;
+  int data[2];
+  struct cell /**data[2],*/ *cp, *cps;
+  int cpi;
+
+
+  if(cj == NULL)
+  {
+    if(ci->split)
+    {
+      for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling])
+      {
+        //Self Interaction.
+        create_tasks(s, cp, NULL);
+
+        for(cps = &cell_pool[cp->sibling]; cps != &cell_pool[ci->sibling]; cps = &cell_pool[cps->sibling])
+          create_tasks(s, cp, cps);
+      }
+    }
+    /* Self task */
+    else{
+      data[0] = ci - cell_pool;
+      data[1] = -1;
+      tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, 2);
+      qsched_addlock(s, tid, ci->res);
+      //qsched_addlock(s, tid, ci->resz);
+      //qsched_addlock(s, tid, ci->resm);
+    }
+  }
+  /* Else it's a pair! */
+  else{
+    if(!are_neighbours_host(ci,cj)){/* Cells are not neighbours: nothing to do. */
+
+    }else{
+      /*Are both split? */
+      if(ci->split && cj->split)
+      {
+        /* Recurse over both cells. */
+        for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling])
+          for(cps = &cell_pool[cj->firstchild]; cps != &cell_pool[cj->sibling]; cps = &cell_pool[cps->sibling])
+            create_tasks(s, cp, cps);
+
+      /* Otherwise, at least one of the cells is not split, build a direct
+       * interaction. */
+      }else{
+        data[0] = ci-cell_pool;
+        data[1] = cj-cell_pool;
+
+        /* Create the task. */
+        tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
+                             sizeof(struct cell *) * 2, 1);
+
+        /* Add the resources. */
+        qsched_addlock(s, tid, ci->res);
+       // qsched_addlock(s, tid, ci->resz);
+       // qsched_addlock(s, tid, ci->resm);
+        qsched_addlock(s, tid, cj->res);
+       // qsched_addlock(s, tid, cj->resz);
+       // qsched_addlock(s, tid, cj->resm);
+      }
+    }
+
+  }
+
+
+
+}
+
+
+__device__ void runner( int type , void *data ) {
+
+  int *idata = (int *)data;
+  int i = idata[0];
+  int j = idata[1];
+  switch ( type ) {
+    case task_type_self:
+      iact_self_direct(i);
+      break;
+    case task_type_pair:
+      iact_pair_direct(&cells[i], &cells[j]);
+      break;
+    case task_type_self_pc:
+      iact_self_pc( &cells[i], &cells[j] );
+      break;
+    default:
+      asm("trap;");
+  }
+  __syncthreads();
+}
+
+__device__ qsched_funtype function = runner;
+
+
+qsched_funtype func;
+/**
+ * @brief Set up and run a task-based Barnes-Hut N-body solver.
+ *
+ * @param N The number of random particles to use.
+ * @param runs Number of force evaluations to use as a benchmark.
+ * @param fileName Input file name. If @c NULL or an empty string, random
+ * particle positions will be used.
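+ *
+ * Expected file format (matching the fscanf calls below), one particle per
+ * line: integer ID, three doubles for the position in [0,1]^3, float mass:
+ *   0 0.125 0.500 0.875 1.0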
+ */ +void test_bh(int N, int runs, char *fileName) { + int i, k; + struct cell *root; + struct part *parts; + FILE *file; + struct qsched s; + ticks tic, toc_run, tot_setup = 0, tot_run = 0; + int countMultipoles = 0, countPairs = 0, countCoMs = 0; + struct cell *gpu_ptr_cells; + + cudaFree(0); + if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess) + error("Failed to copy function pointer from device"); + + /* Initialize the scheduler. */ + qsched_init(&s, 1, qsched_flag_none); + + //Create host particle arrays. + if( cudaMallocHost(&parts_host, sizeof(struct part) * N) != cudaSuccess) + error("Failed to allocate parts array"); + + + + if( cudaMalloc(&parts_temp, sizeof(struct part) * N) != cudaSuccess) + error("Failed to allocate device parts array"); + if( cudaMemcpyToSymbol(parts_cuda, &parts_temp, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to set device symbol for parts array"); + + + if(fileName == NULL || fileName[0] == 0) { + for(k = 0; k < N; k++) { + parts_host[k].loc[0] = ((double)rand())/ RAND_MAX; + parts_host[k].loc[1] = ((double)rand())/ RAND_MAX; + parts_host[k].loc[2] = ((double)rand())/ RAND_MAX; + parts_host[k].m = ((double)rand()) / RAND_MAX; + } + + } else { + file = fopen(fileName, "r"); + int tempxy; + if(file) { + for(k = 0; k < N; k++) { + if(fscanf(file, "%d", &tempxy) != 1) + error("Failed to read ID"); + if(fscanf(file, "%lf%lf%lf", &parts_host[k].loc[0], &parts_host[k].loc[1], &parts_host[k].loc[2]) != 3) + error("Failed to read position of part %i.", k); + if(fscanf(file, "%f", &parts_host[k].m) != 1) + error("Failed to read mass of part %i.", k); + } + fclose(file); + } + } + + /* Init the cells. */ + root = cell_get(); + root->loc_xy.x = 0.0; + root->loc_xy.y = 0.0; + root->loc_z = 0.0; + root->h = 1.0; + root->count = N; + root->parts = 0; + root->sibling = -1; + int c = root-cell_pool; + cell_split(root - cell_pool, &s); + root = &cell_pool[c]; + int nr_leaves = 0; + int maxparts=0, minparts=1000000; + int number = 0; + while(c >= 0) { + if(cell_pool[c].count > 0) + { + number++; + if(cell_pool[c].res == qsched_res_none) + message("cell %i has no res", c); + //if(cell_pool[c].resz == qsched_res_none) + // message("cell %i has no resz", c); + //if(cell_pool[c].resm == qsched_res_none) + // message("cell %i has no resm", c); + } + if(!cell_pool[c].split) { + nr_leaves++; + if(cell_pool[c].count > maxparts) + { + maxparts = cell_pool[c].count; + } + if(cell_pool[c].count < minparts) + { + minparts = cell_pool[c].count; + } + c = cell_pool[c].sibling; + } else { + c = cell_pool[c].firstchild; + } + } + message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves)); + message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts); + + /* for(k = 0; k < num_cells; k++) + if(cell_pool[k].split > 1 ) + printf("Split > 1\n");*/ + + create_tasks(&s, root, NULL); + + int self = 0, pair = 0, pc = 0; + + for(k = 0; k < s.count; k++) + { + if(s.tasks[k].type == task_type_self) + self++; + else if (s.tasks[k].type == task_type_pair) + pair++; + else if (s.tasks[k].type >= 0) + pc++; + } + + message("total number of tasks: %i.", s.count); + message("total number of pair tasks: %i.", pair); + message("total number of self tasks: %i.", self); + message("total number of pc tasks: %i.", pc); + message("total number of cells: %i.", number); + message("total number of deps: %i.", s.count_deps); + message("total number of res: %i.", s.count_res); + message("total 
number of locks: %i.", s.count_locks); + + for(k = 0; k < runs; k++) { + for(i = 0; i < N; ++i) { + parts_host[i].a[0] = 0.0; + parts_host[i].a[1] = 0.0; + parts_host[i].a[2] = 0.0; + } + + + /* Copy the cells to the device. */ + if( cudaMalloc( &gpu_ptr_cells , sizeof(struct cell) * used_cells) != cudaSuccess) + error("Failed to allocate cells on the GPU"); + if( cudaMemcpy( gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy cells to the GPU"); + if( cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell*), 0, cudaMemcpyHostToDevice) != cudaSuccess ) + error("Failed to copy cell pointer to the GPU"); + +double2 *com_temp; +double *comz_temp; +float *comm_temp; + + if(cudaMalloc( &com_temp, sizeof(double2) * used_cells) != cudaSuccess) + error("Failed to allocate com on the GPU"); + if( cudaMemcpy( com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + error("failed to copy com to the GPU"); + if( cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy com pointer to the GPU"); + + + if(cudaMalloc( &comz_temp, sizeof(double) * used_cells) != cudaSuccess) + error("Failed to allocate com on the GPU"); + if( cudaMemcpy( comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + error("failed to copy com to the GPU"); + if( cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy com pointer to the GPU"); + + if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess) + error("Failed to allocate com on the GPU"); + if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + error("failed to copy com to the GPU"); + if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy com pointer to the GPU"); + + /* for(i = 0; i < s.count; i++){ + int *idata = (int*)&s.data[s.tasks[i].data]; + if(s.tasks[i].type == task_type_self) + printf("Self task with data[0] = %i\n", idata[0]); + if(s.tasks[i].type == task_type_pair) { + printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]); + } + if(s.tasks[i].type == task_type_self_pc) { + printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]); + } + }*/ + + // printf("com_mass_host[152] = %f\n", com_mass_host[152]); + + + //Run code. 
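+  /* Editor's sketch (hedged, left disabled like the other debug aids here):
+     a cheap host-side sanity check before the launch is that the root's
+     multipole mass matches the particle total, since comp_com accumulated it:
+  float msum = 0.0f;
+  for(i = 0; i < N; i++) msum += parts_host[i].m;
+  if(fabsf(com_mass_host[root - cell_pool] - msum) > 1e-3f*msum)
+    message("COM mass mismatch: %f vs %f",
+            com_mass_host[root - cell_pool], msum);
+  */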
+// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data); + qsched_run_CUDA( &s , func ); + qsched_print_cuda_timers(&s); + + k = 0; + printf("%e, %e, %e, %e, %e, %e, %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2], + parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]); +struct task* tasks = qsched_get_timers( &s , s.count ); + for(i = 0; i < s.count; i++) + { + printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); + // printf("\n"); + + } + +} + /* Dump the particles to a file */ + file = fopen("particle_dump.dat", "w"); +/* fprintf(file, + "# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x " + "a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/ + for (k = 0; k < N; ++k) + fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n", + parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2], + parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]); + fclose(file); + +} + + +int main(int argc, char *argv[]) { + int c, nr_threads; + int N = 1000, runs = 1; + char fileName[100] = {0}; + + /* Parse the options */ + while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) { + case 'n': + if (sscanf(optarg, "%d", &N) != 1) + error("Error parsing number of particles."); + break; + case 'r': + if (sscanf(optarg, "%d", &runs) != 1) + error("Error parsing number of runs."); + break; + case 't': + if (sscanf(optarg, "%d", &nr_threads) != 1) + error("Error parsing number of threads."); +// omp_set_num_threads(nr_threads); + break; + case 'f': + if (sscanf(optarg, "%s", &fileName[0]) != 1) + error("Error parsing file name."); + break; + case '?': + fprintf(stderr, "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file] " + "[-c Nparts] [-i Niterations] \n", + argv[0]); + fprintf(stderr, "Solves the N-body problem using a Barnes-Hut\n" + "tree code with N random particles read from a file in " + "[0,1]^3 using" + "nr_threads threads.\n" + "A test of the neighbouring cells interaction with " + "Nparts per cell is also run Niterations times.\n"); + exit(EXIT_FAILURE); + } + + /* Tree node information */ + printf("Size of cell: %zu bytes.\n", sizeof(struct cell)); + + /* Part information */ +// printf("Size of part: %zu bytes.\n", sizeof(struct part)); + + /* Dump arguments. */ + if (fileName[0] == 0) { + message("Computing the N-body problem over %i random particles using %i " + "threads (%i runs).", + N, nr_threads, runs); + } else { + message("Computing the N-body problem over %i particles read from '%s' " + "using %i threads (%i runs).", + N, fileName, nr_threads, runs); + } + + /* Run the BH test. */ + test_bh(N, runs, fileName); +}