Commit 320d4466 authored by aidan

Added a non-recursive version of pc functions. Poor performance currently

parent 60922b1a
@@ -59,7 +59,7 @@ unsigned short int split, sorted;
 int parts, firstchild, sibling;
 int res, resz, resm, com_tid;
-}__attribute__((aligned(64)));
+};//__attribute__((aligned(64)));
 #define const_G 1
@@ -105,13 +105,16 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
 __shared__ double2 parts_xy[cell_maxparts];
 __shared__ double parts_z[cell_maxparts];
 __shared__ float4 parts_am[cell_maxparts];
+/*if(threadIdx.x == 0)
+printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split,
+cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
 /* Load particles of cell j into shared memory */
-for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
+/*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
 parts_xy[j] = parts_pos_xy[k];
 parts_z[j] = parts_pos_z[k];
 parts_am[j] = parts_a_m[k];
-}
+}*/
 /* Loop over cell i.*/
 for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
@@ -123,25 +126,27 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
 }
 mi = parts_a_m[i].w;
-for(j = 0; j < count_j; j++) {
+for(j = parts_j; j < parts_j + count_j; j++) {
 r2 = 0.0f;
-dx[0] = xi[0] - parts_xy[j].x;
-dx[1] = xi[1] - parts_xy[j].y;
-dx[2] = xi[2] - parts_z[j];
+dx[0] = xi[0] - parts_pos_xy[j].x;
+dx[1] = xi[1] - parts_pos_xy[j].y;
+dx[2] = xi[2] - parts_pos_z[j];
 r2 += dx[0] * dx[0];
 r2 += dx[1] * dx[1];
 r2 += dx[2] * dx[2];
 // ir = 1.0f / sqrtf(r2);
 ir = rsqrtf(r2);
 w = const_G * ir * ir * ir;
-mj = parts_am[j].w;
+mj = parts_a_m[j].w;
 for(k = 0; k < 3; k++) {
 ai[k] -= dx[k] * mj * w;
 }
+// atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
+// atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
+// atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
 }
 atomicAdd(&parts_a_m[i].x, ai[0]);
 atomicAdd(&parts_a_m[i].y, ai[1]);
 atomicAdd(&parts_a_m[i].z, ai[2]);
@@ -149,11 +154,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
 }
 /* Load particles of cell i into shared memory */
-for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
+/*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
 parts_xy[j] = parts_pos_xy[k];
 parts_z[j] = parts_pos_z[k];
 parts_am[j] = parts_a_m[k];
-}
+}*/
 /*Loop over cell j. */
 for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
 xi[0] = parts_pos_xy[i].x;
@@ -164,11 +169,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
 }
 mi = parts_a_m[i].w;
-for(j = 0; j < count_j; j++) {
+for(j = parts_i; j < parts_i + count_i; j++) {
 r2 = 0.0f;
-dx[0] = xi[0] - parts_xy[j].x;
-dx[1] = xi[1] - parts_xy[j].y;
-dx[2] = xi[2] - parts_z[j];
+dx[0] = xi[0] - parts_pos_xy[j].x;
+dx[1] = xi[1] - parts_pos_xy[j].y;
+dx[2] = xi[2] - parts_pos_z[j];
 r2 += dx[0] * dx[0];
 r2 += dx[1] * dx[1];
 r2 += dx[2] * dx[2];
@@ -176,12 +181,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
 ir = rsqrtf(r2);
 w = const_G * ir * ir * ir;
-mj = parts_am[j].w;
+mj = parts_a_m[j].w;
 for(k = 0; k < 3; k++) {
 ai[k] -= dx[k] * mj * w;
 }
 }
 atomicAdd(&parts_a_m[i].x, ai[0]);
 atomicAdd(&parts_a_m[i].y, ai[1]);
 atomicAdd(&parts_a_m[i].z, ai[2]);
@@ -190,25 +194,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
 }
-/*__device__ void iact_pair(int celli, int cellj) {
-struct cell *ci, *cj;
-ci = &cells[celli];
-cj = &cells[cellj];
-if(Check if neighbours0)
-{
-if(ci->split && cj->split) {
-//Split both cells and do all possible pairs.
-}else {
-iact_pair_direct(ci, cj);
-}
-}
-}*/
 __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
 int i, k;
@@ -218,16 +203,29 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
 int count = leaf->count;
 int parts = leaf->parts;
 int cell_j = cj - cells;
+int temp;
 float r2, dx[3], ir, w;
+// if(cell_j < 0)
+// {
+// if(threadIdx.x == 0)
+// printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
+// __syncthreads();
+// asm("trap;");
+// }
+// if(threadIdx.x == 0)
+// printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
+temp = cell_j;
 /* Init the com's data.*/
 j_com_xy = com_xy[cell_j];
 j_com_z = com_z[cell_j];
 j_com_mass = com_mass[cell_j];
-for(i = parts; i < parts+count; i++) {
+for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
 r2 = 0.0;
 dx[0] = j_com_xy.x - parts_pos_xy[i].x;
 r2 += dx[0] * dx[0];
@@ -238,11 +236,18 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
 ir = rsqrtf(r2);
 w = j_com_mass * const_G * ir * ir * ir;
-parts_a_m[i].x += w * dx[0];
-parts_a_m[i].y += w * dx[1];
-parts_a_m[i].z += w * dx[2];
+/* __threadfence();
+if(!isfinite(w * dx[0])){
+printf("Error in make_interact_pc, j_com_mass = %f, cell_j = %i, temp = %i, i = %i, threadIdx.x=%i\n", j_com_mass, cell_j, temp, i, threadIdx.x); asm("trap;");}
+if(!isfinite(w * dx[1])){
+printf("Error in make_interact_pc\n"); asm("trap;");}
+if(!isfinite(w * dx[2])){
+printf("Error in make_interact_pc\n"); asm("trap;");}*/
+atomicAdd( &parts_a_m[i].x , w * dx[0]);
+atomicAdd( &parts_a_m[i].y , w * dx[1]);
+atomicAdd( &parts_a_m[i].z , w * dx[2]);
 }
+//__syncthreads();
 }
 /**
@@ -291,38 +296,41 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
 __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
 struct cell *cp ,*cps;
+int leafnum = leaf - cells;
+//if(threadIdx.x == 0 && leafnum == 23)
+// printf("cj = %i\n", cj - cells);
+// printf("%i\n", leafnum);
-if(leaf->split)
-{
-printf("Leaf split = 1, oh dear.");
-asm("trap;");
-}
-if(ci->split > 1)
-{
-printf("Cell %i had split > 1\n", ci - cells);
-asm("trap;");
-}
-if(cj->split > 1)
-{
-printf("cell %i had split > 1\n", cj - cells);
-asm("trap;");
-}
+// if(threadIdx.x == 0)
+/// printf("ci = %i, cj = %i, leaf = %i\n", ci - cells, cj - cells, leaf - cells);
 for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
 if(is_inside(leaf, cp)) break;
 }
 if(are_neighbours_different_size(cp, cj)) {
 for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
 if(are_neighbours(cp, cps)) {
 if(cp->split && cps->split) {
 iact_pair_pc(cp, cps, leaf);
 }
 } else {
 make_interact_pc(leaf, cps);
+// if(threadIdx.x == 0 && leafnum == 23)
+// printf("leafnum = %i with cps = %i here\n", leafnum, cps - cells);
 __syncthreads();
 }
 }
 }else{
 for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
+// if(threadIdx.x == 0 && leafnum == 23)
+// printf("leafnum = %i with cps = %i\n", leafnum, cps - cells);
 make_interact_pc(leaf, cps);
 }
 }
 __syncthreads();
@@ -339,7 +347,7 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
 struct cell *cp, *cps;
-if(leaf->split)
+/*if(leaf->split)
 {
 printf("Leaf split = 1, oh dear.");
 asm("trap;");
@@ -348,9 +356,26 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
 {
 printf("Cell had split > 1\n");
 asm("trap;");
-}
+}*/
 /* Find the subcell of c the leaf is in.*/
+/*cp = c;
+cps = c;
+while(c->split)
+{
+for(cp = &cells[cp->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]){
+if(is_inside(leaf, cp)) break;
+}
+if(cp->split){
+for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) {
+if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
+}
+}
+c = cp;
+}*/
 for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) {
 if(is_inside(leaf, cp)) break;
 }
@@ -364,7 +389,7 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
 if(cp != cps && cps->split) iact_pair_pc(cp,cps,leaf);
 }
-}
+}//TODO
 }
@@ -385,7 +410,8 @@ __device__ void iact_self_direct(int cellID) {
 int count;
 int i,j,k;
+//if(threadIdx.x == 0)
+// printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
 //If cell is split, interact each child with itself, and with each of its siblings.
 /*if(c->split) {
 //TODO
@@ -395,24 +421,24 @@ __device__ void iact_self_direct(int cellID) {
 count = c->count;
 int z = threadIdx.x;
 /* Load particle data into shared memory*/
-for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
+/*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
 parts_xy[z] = parts_pos_xy[k];
 parts_z[z] = parts_pos_z[k];
 parts_am[z] = parts_a_m[k];
 }
-__syncthreads();
+__syncthreads();*/
-for(i = threadIdx.x; i < count; i += blockDim.x)
+for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
 {
-xi[0] = parts_xy[i].x;
-xi[1] = parts_xy[i].y;
-xi[2] = parts_z[i];
+xi[0] = parts_pos_xy[i].x;
+xi[1] = parts_pos_xy[i].y;
+xi[2] = parts_pos_z[i];
 for(k = 0; k < 3; k++) {
 ai[k] = 0.0;
 }
 mi = parts_a_m[i].w;
 //for(j = i+1; j!= i; j = (j+1)%count)
-for(j = 0; j < count; j++)
+for(j = parts; j < parts+count; j++)
 {
 if(i != j){
@@ -430,7 +456,7 @@ __device__ void iact_self_direct(int cellID) {
 //ir = 1.0f / sqrtf(r2);
 ir = rsqrtf(r2);
 w = const_G * ir * ir * ir;
-mj = parts_am[j].w;
+mj = parts_a_m[j].w;
 for(k = 0; k < 3; k++) {
 ai[k] -= w * dx[k] * mj;
 }
@@ -813,10 +839,10 @@ void cell_split(int c, struct qsched *s) {
 // struct cell *data[2] = {root, c};
 int data[2] = {root, c};
 int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
-2 * sizeof(int), 1);
+2 * sizeof(int), 3000);
-qsched_addlock(s, tid, cell_pool[root].res);
-qsched_addlock(s, tid, cell_pool[root].resz);
-qsched_addlock(s, tid, cell_pool[root].resm);
+/*qsched_adduse(s, tid, cell_pool[root].res);
+qsched_adduse(s, tid, cell_pool[root].resz);
+qsched_adduse(s, tid, cell_pool[root].resm);*/
 qsched_addlock(s, tid, cell_pool[c].res);
 qsched_addlock(s, tid, cell_pool[c].resz);
 qsched_addlock(s, tid, cell_pool[c].resm);
@@ -859,8 +885,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
 else{
 data[0] = ci - cell_pool;
 data[1] = -1;
-tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, ci->count*ci->count/2);
+tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, 2);
 qsched_addlock(s, tid, ci->res);
 qsched_addlock(s, tid, ci->resz);
 qsched_addlock(s, tid, ci->resm);
@@ -868,7 +893,9 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
 }
 /* Else its a pair!*/
 else{
-if(are_neighbours_host(ci,cj)){/* Cells are neighbours */
+if(!are_neighbours_host(ci,cj)){/* Cells are neighbours */
+}else{
 /*Are both split? */
 if(ci->split && cj->split)
 {
@@ -885,7 +912,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
 /* Create the task. */
 tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
-sizeof(struct cell *) * 2, ci->count * cj->count);
+sizeof(struct cell *) * 2, 1);
 /* Add the resources. */
 qsched_addlock(s, tid, ci->res);
@@ -909,7 +936,6 @@ __device__ void runner( int type , void *data ) {
 int *idata = (int *)data;
 int i = idata[0];
 int j = idata[1];
-
 switch ( type ) {
 case task_type_self:
 iact_self_direct(i);
@@ -1044,18 +1070,31 @@ void test_bh(int N, int runs, char *fileName) {
 c = cell_pool[c].firstchild;
 }
 }
+message("root.sibling = %i, root.split = %i", root->sibling, root->split);
+printf("nr_leaves = %i\n", nr_leaves);
 message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
 message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
-for(k = 0; k < num_cells; k++)
-if(cell_pool[k].split > 1 )
-printf("Split > 1\n");
+/* for(k = 0; k < num_cells; k++)
+if(cell_pool[k].split > 1 )
+printf("Split > 1\n");*/
 create_tasks(&s, root, NULL);
+int self = 0, pair = 0, pc = 0;
+for(k = 0; k < s.count; k++)
+{
+if(s.tasks[k].type == task_type_self)
+self++;
+else if (s.tasks[k].type == task_type_pair)
+pair++;
+else if (s.tasks[k].type >= 0)
+pc++;
+}
 message("total number of tasks: %i.", s.count);
+message("total number of pair tasks: %i.", pair);
+message("total number of self tasks: %i.", self);
+message("total number of pc tasks: %i.", pc);
 message("total number of cells: %i.", number);
 message("total number of deps: %i.", s.count_deps);
 message("total number of res: %i.", s.count_res);
@@ -1098,7 +1137,7 @@ float *comm_temp;
 if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
 error("Failed to allocate com on the GPU");
-if( cudaMemcpy( comm_temp, com_z_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
 error("failed to copy com to the GPU");
 if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
 error("Failed to copy com pointer to the GPU");
@@ -1115,13 +1154,37 @@ void test_bh(int N, int runs, char *fileName) {
 }
 }*/
+// printf("com_mass_host[152] = %f\n", com_mass_host[152]);
 //Run code.
-printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
+// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
 qsched_run_CUDA( &s , func );
-}
+qsched_print_cuda_timers(&s);
+k = 0;
+printf("%e, %e, %e, %e, %e, %e, %e\n", parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
+parts_a_m_host[k].x, parts_a_m_host[k].y, parts_a_m_host[k].z);
+struct task* tasks = qsched_get_timers( &s , s.count );
+for(i = 0; i < s.count; i++)
+{
+printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
+// printf("\n");
+}
+}
+/* Dump the particles to a file */
+file = fopen("particle_dump.dat", "w");
+/* fprintf(file,
+"# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
+"a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
+for (k = 0; k < N; ++k)
+fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n",
+parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
+parts_a_m_host[k].x, parts_a_m_host[k].y, parts_a_m_host[k].z);
+fclose(file);
 }
/*******************************************************************************
* This file is part of QuickSched.
* Copyright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
* Aidan Chalk (aidan.chalk@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* *****************************************************************************/
/* Config parameters. */
#include "../config.h"
/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <omp.h>
#include <fenv.h>
/* Local includes. */
extern "C"{
#include "quicksched.h"
#include "res.h"
}
#include "cuda_queue.h"
/** Task types. */
enum task_type {
task_type_self = 0,
task_type_pair,
task_type_self_pc,
task_type_com,
task_type_count
};
struct cell{
double2 loc_xy;
double loc_z;
double h;
int count;
unsigned short int split, sorted;
int parts, firstchild, sibling;
int res, /*resz, resm,*/ com_tid;
};//__attribute__((aligned(64)));
struct part{
double loc[3];
float a[3];
float m;
};//__attribute__((aligned(64)));
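/* Gravitational constant; set to 1 since the code works in normalised units
(positions and masses are drawn from [0,1]). */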
#define const_G 1
/* Required variables to obtain cells. */
#define cell_maxparts 256
#define CELL_STRETCH 2
#define INITIAL_CELLS 256
struct cell *cell_pool = NULL;
int used_cells=0;
int num_cells = 0;
int cell_size = INITIAL_CELLS*sizeof(struct cell);
/* Device locations for the particle values. */
//__device__ double2 *parts_pos_xy;
//__device__ double *parts_pos_z;
//__device__ float4 *parts_a_m;
__device__ double2 *com_xy;
__device__ double *com_z;
__device__ float *com_mass;
__device__ struct part *parts_cuda;
__device__ struct cell *cells;
/* Host locations for the particle values. */
//double2 *parts_pos_xy_host;
//double *parts_pos_z_host;
//float4 *parts_a_m_host;
double2 *com_xy_host;
double *com_z_host;
float *com_mass_host;
double2 *parts_pos_xy_temp;
double *parts_pos_z_temp;
float4 *parts_a_m_temp;
struct part *parts_host;
struct part *parts_temp;
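/**
* @brief Compute the direct particle-particle interactions between the two
* neighbouring cells ci and cj. Each thread takes one particle of ci (then of
* cj) per iteration, loops over every particle of the other cell and
* accumulates a_i -= G * m_j * dx / r^3 before writing back atomically.
*/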
__device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
int i, j, k;
int count_i = ci->count, count_j = cj->count;
int parts_i = ci->parts, parts_j = cj->parts;
double xi[3];
float dx[3], ai[3], mi, mj, r2, w, ir;
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
__shared__ float4 parts_am[cell_maxparts];
/*if(threadIdx.x == 0)
printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split,
cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
/* Load particles of cell j into shared memory */
/*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
}*/
/* Loop over cell i.*/
for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
xi[0] = parts_cuda[i].loc[0];
xi[1] = parts_cuda[i].loc[1];
xi[2] = parts_cuda[i].loc[2];
for(k = 0; k < 3; k++) {
ai[k] = 0.0f;
}
mi = parts_cuda[i].m;
for(j = parts_j; j < parts_j + count_j; j++) {
r2 = 0.0f;
dx[0] = xi[0] - parts_cuda[j].loc[0];
dx[1] = xi[1] - parts_cuda[j].loc[1];
dx[2] = xi[2] - parts_cuda[j].loc[2];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
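/* Apply the (unsoftened) gravitational acceleration; rsqrtf is the fast
reciprocal square root, so w = G / r^3. */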
// ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_cuda[j].m;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
// atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
// atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
// atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
}
atomicAdd(&parts_cuda[i].a[0], ai[0]);
atomicAdd(&parts_cuda[i].a[1], ai[1]);
atomicAdd(&parts_cuda[i].a[2], ai[2]);
}
/* Load particles of cell i into shared memory */
/*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
}*/
/*Loop over cell j. */
for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
xi[0] = parts_cuda[i].loc[0];
xi[1] = parts_cuda[i].loc[1];
xi[2] = parts_cuda[i].loc[2];
for(k = 0; k < 3; k++) {
ai[k] = 0.0f;
}
mi = parts_cuda[i].m;
for(j = parts_i; j < parts_i + count_i; j++) {
r2 = 0.0f;
dx[0] = xi[0] - parts_cuda[j].loc[0];
dx[1] = xi[1] - parts_cuda[j].loc[1];
dx[2] = xi[2] - parts_cuda[j].loc[2];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_cuda[j].m;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
}
atomicAdd(&parts_cuda[i].a[0], ai[0]);
atomicAdd(&parts_cuda[i].a[1], ai[1]);
atomicAdd(&parts_cuda[i].a[2], ai[2]);
}
}
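/**
* @brief Interact all particles of a leaf with the monopole (centre of mass)
* of cell cj: a_i += G * M_j * dx / r^3, with dx pointing towards the com.
*/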
__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
int i, k;
double2 j_com_xy;
double j_com_z;
float j_com_mass;
int count = leaf->count;
int parts = leaf->parts;
int cell_j = cj - cells;
int temp;
float r2, dx[3], ir, w;
// if(cell_j < 0)
// {
// if(threadIdx.x == 0)
// printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
// __syncthreads();
// asm("trap;");
// }
// if(threadIdx.x == 0)
// printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
temp = cell_j;
/* Init the com's data.*/
j_com_xy = com_xy[cell_j];
j_com_z = com_z[cell_j];
j_com_mass = com_mass[cell_j];
for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
r2 = 0.0;
dx[0] = j_com_xy.x - parts_cuda[i].loc[0];
r2 += dx[0] * dx[0];
dx[1] = j_com_xy.y - parts_cuda[i].loc[1];
r2 += dx[1] * dx[1];
dx[2] = j_com_z - parts_cuda[i].loc[2];
r2 += dx[2] * dx[2];
ir = rsqrtf(r2);
w = j_com_mass * const_G * ir * ir * ir;
atomicAdd( &parts_cuda[i].a[0] , w * dx[0]);
atomicAdd( &parts_cuda[i].a[1] , w * dx[1]);
atomicAdd( &parts_cuda[i].a[2] , w * dx[2]);
}
//__syncthreads();
}
/**
* @brief Checks whether the cells are direct neighbours or not. The cells may differ in size.
*/
__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) {
int k;
float dx[3];
double cih = ci->h, cjh = cj->h;
float min_dist = 0.5*(cih+cjh);
float center_i = ci->loc_xy.x + 0.5*cih;
float center_j = cj->loc_xy.x + 0.5*cjh;
dx[0] = fabsf(center_i - center_j);
center_i = ci->loc_xy.y + 0.5*cih;
center_j = cj->loc_xy.y + 0.5*cjh;
dx[1] = fabsf(center_i - center_j);
center_i = ci->loc_z + 0.5*cih;
center_j = cj->loc_z + 0.5*cjh;
dx[2] = fabsf(center_i - center_j);
return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
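/**
* @brief Checks whether two cells of the same size are direct neighbours,
* comparing the offset of their lower corners along each axis with the cell
* edge length.
*/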
__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) {
int k;
float dx[3];
float min_dist = ci->h;
float center_i = ci->loc_xy.x;
float center_j = cj->loc_xy.x;
dx[0] = fabsf(center_i - center_j);
center_i = ci->loc_xy.y;
center_j = cj->loc_xy.y;
dx[1] = fabsf(center_i - center_j);
center_i = ci->loc_z;
center_j = cj->loc_z;
dx[2] = fabsf(center_i - center_j);
return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
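/* Particle indices are contiguous per subtree, so the leaf lies inside c
exactly when its first particle index falls within c's particle range. */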
__device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count);
}
__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
struct cell *cp ,*cps;
/*
Find child of ci containing leaf.
if child and cj are_neighbours_different_size
Loop through children of cj.
If child and childj are neighbours
If both split -> recurse
else
make_interact_pc
else
loop through children of cj
make interact pc
*/
struct cell *loopend;
cp = ci;
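/* Iterative form of the traversal sketched above: each pass of the while loop
descends one level towards the leaf, opening cj's children in place rather
than recursing. */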
while(cp->split)
{
for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
}
if(are_neighbours_different_size(cp, cj)) {
loopend = &cells[cj->sibling];
cps = &cells[cj->firstchild];
while(cps != loopend)
{
if(!are_neighbours(cp, cps)){
make_interact_pc(leaf, cps);
//__syncthreads();
cps = &cells[cps->sibling];
}else{
if(cps->split)
cps = &cells[cps->firstchild];
else
cps = &cells[cps->sibling];
}
}
}else{
for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
make_interact_pc(leaf, cps);
// __syncthreads();
}
break;
}
ci = cp;
}
/* for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
}
if(are_neighbours_different_size(cp, cj)) {
for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
if(!are_neighbours(cp, cps)) {
make_interact_pc(leaf, cps);
__syncthreads();
} else {
if(cp->split && cps->split) {
iact_pair_pc(cp, cps, leaf);
}
}
}
}else{
for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
make_interact_pc(leaf, cps);
}
}*/
// __syncthreads();
}
/**
* @brief Compute the interactions between all particles in a leaf and
* and all the monopoles in the cell c
*
* @param c The #cell containing the monopoles
* @param leaf The #cell containing the particles
*/
__device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
struct cell *cp, *cps;
/*if(leaf->split)
{
printf("Leaf split = 1, oh dear.");
asm("trap;");
}
if(c->split > 1)
{
printf("Cell had split > 1\n");
asm("trap;");
}*/
/* Find the subcell of c the leaf is in.*/
cp = c;
cps = c;
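/* Non-recursive descent: repeatedly step c down to the child cp that contains
the leaf, pairing cp against each of its split siblings on the way down. */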
while(c->split)
{
for(cp = &cells[cp->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]){
if(is_inside(leaf, cp)) break;
}
if(cp->split){
for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) {
if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
}
}
c = cp;
}
/*for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
}
if(cp->split) {
iact_self_pc(cp, leaf);
for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) {
if(cp != cps && cps->split) iact_pair_pc(cp,cps,leaf);
}
}*///TODO
}
/**
* @brief Compute the interactions between all particles in a cell.
*
* @param cellID The cell ID to compute interactions on.
*/
__device__ void iact_self_direct(int cellID) {
struct cell *c = &cells[cellID];
double xi[3] = {0.0, 0.0, 0.0};
float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
__shared__ float4 parts_am[cell_maxparts];
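/* Shared-memory staging buffers; the copy-in below is currently commented
out, so the loops read the particle data straight from global memory. */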
int parts;
int count;
int i,j,k;
//if(threadIdx.x == 0)
// printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
//If cell is split, interact each child with itself, and with each of its siblings.
/*if(c->split) {
//TODO
} else {*/
parts = c->parts;
count = c->count;
int z = threadIdx.x;
/* Load particle data into shared memory*/
/*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
parts_xy[z] = parts_pos_xy[k];
parts_z[z] = parts_pos_z[k];
parts_am[z] = parts_a_m[k];
}
__syncthreads();*/
for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
{
xi[0] = parts_cuda[i].loc[0];
xi[1] = parts_cuda[i].loc[1];
xi[2] = parts_cuda[i].loc[2];
for(k = 0; k < 3; k++) {
ai[k] = 0.0;
}
mi = parts_cuda[i].m;
//for(j = i+1; j!= i; j = (j+1)%count)
for(j = parts; j < parts+count; j++)
{
if(i != j){
/* Compute the pairwise distance. */
r2 = 0.0;
dx[0] = xi[0] - parts_cuda[j].loc[0];
r2 += dx[0]*dx[0];
dx[1] = xi[1] - parts_cuda[j].loc[1];
r2 += dx[1]*dx[1];
dx[2] = xi[2] - parts_cuda[j].loc[2];
r2 += dx[2]*dx[2];
/* Apply the gravitational acceleration. */
//ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_cuda[j].m;
for(k = 0; k < 3; k++) {
ai[k] -= w * dx[k] * mj;
}
}
}
//Update.
atomicAdd(&parts_cuda[i].a[0],ai[0] );
atomicAdd(&parts_cuda[i].a[1],ai[1] );
atomicAdd(&parts_cuda[i].a[2],ai[2] );
}
}
/**
* @brief Checks whether the cells are direct neighbours or not. Both cells have
* to be of the same size.
*/
static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
int k;
float dx[3];
#ifdef SANITY_CHECKS
if (ci->h != cj->h)
error(" Cells of different size in distance calculation.");
#endif
/* Maximum allowed distance */
float min_dist = ci->h;
/* Distance between the cells along each axis */
double2 loc1=ci->loc_xy, loc2=cj->loc_xy;
float center_i = loc1.x;
float center_j = loc2.x;
dx[0] = fabs(center_i - center_j);
center_i = loc1.y;
center_j = loc2.y;
dx[1] = fabs(center_i - center_j);
center_i = ci->loc_z;
center_j = cj->loc_z;
dx[2] = fabs(center_i - center_j);
return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
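/**
* @brief Get a new cell from the pool, growing the pool (and the matching
* centre-of-mass arrays) by a factor of CELL_STRETCH when it runs out.
*/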
struct cell *cell_get()
{
struct cell *res;
if(num_cells == 0)
{
cell_pool = (struct cell*) calloc(INITIAL_CELLS, sizeof(struct cell));
if(cell_pool == NULL)
error("Failed to allocate cell_pool");
com_xy_host = (double2*) calloc(INITIAL_CELLS, sizeof(double2));
if(com_xy_host == NULL)
error("Failed to allocate cell_pool");
com_z_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
if(com_z_host == NULL)
error("Failed to allocate cell_pool");
com_mass_host = (float*) calloc(INITIAL_CELLS, sizeof(float));
if(com_mass_host == NULL)
error("Failed to allocate cell_pool");
num_cells = INITIAL_CELLS;
}
if(used_cells >= num_cells)
{
/* Stretch */
struct cell *new_pool;
cell_size *= CELL_STRETCH;
new_pool = (struct cell*) calloc(num_cells*CELL_STRETCH, sizeof(struct cell));
if(new_pool == NULL)
error("Failed to allocate new_pool");
if(cell_pool != NULL)
memcpy(new_pool, cell_pool, num_cells*sizeof(struct cell));
double2 *tempxy = (double2*) calloc(num_cells*CELL_STRETCH, sizeof(double2));
if(tempxy == NULL)
error("Failed to allocate tempxy");
memcpy(tempxy, com_xy_host, sizeof(double2)*num_cells);
free(com_xy_host);
com_xy_host = tempxy;
double *tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
if(tempz == NULL)
error("Failed to allocate tempz");
memcpy(tempz, com_z_host, num_cells*sizeof(double));
free(com_z_host);
com_z_host = tempz;
float *tempm = (float*) calloc(num_cells*CELL_STRETCH, sizeof(float));
if(tempm == NULL)
error("Failed to allocate tempm");
memcpy(tempm, com_mass_host, num_cells*sizeof(float));
free(com_mass_host);
com_mass_host = tempm;
num_cells *= CELL_STRETCH;
free(cell_pool);
cell_pool = new_pool;
message("Increased size of arrays");
}
used_cells++;
cell_pool[used_cells-1].sibling = -1;
cell_pool[used_cells-1].firstchild = -1;
cell_pool[used_cells-1].res = qsched_res_none;
//cell_pool[used_cells-1].resz = qsched_res_none;
//cell_pool[used_cells-1].resm = qsched_res_none;
return &cell_pool[used_cells-1];
}
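/**
* @brief Compute the centre of mass of a cell, from its children's monopoles
* if it is split, otherwise directly from its particles.
*/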
void comp_com(struct cell *c){
int k, count = c->count;
int cpi;
struct cell *cp;
int parts = c->parts;
double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
if(c->split) {
for(cp = &cell_pool[(cpi = c->firstchild)]; cpi != c->sibling; cp = &cell_pool[(cpi = cp->sibling)]) {
float cp_mass = com_mass_host[cpi];
com[0] += com_xy_host[cpi].x * cp_mass;
com[1] += com_xy_host[cpi].y * cp_mass;
com[2] += com_z_host[cpi] * cp_mass;
mass += cp_mass;
}
/* Otherwise collect the multipole from the particles */
} else {
for(k = parts; k < parts+count; k++)
{
float p_mass = parts_host[k].m;
com[0] += parts_host[k].loc[0] * p_mass;
com[1] += parts_host[k].loc[1] * p_mass;
com[2] += parts_host[k].loc[2] * p_mass;
mass += p_mass;
}
}
k = c - cell_pool;
/* Store the COM data, if it was collected. */
if(mass > 0.0) {
float imass = 1.0f/mass;
com_xy_host[k].x = com[0] * imass;
com_xy_host[k].y = com[1] * imass;
com_z_host[k] = com[2] * imass;
com_mass_host[k] = mass;
}else
{
com_xy_host[k].x = 0.0;
com_xy_host[k].y = 0.0;
com_z_host[k] = 0.0;
com_mass_host[k] = 0.0f;
}
}
/**
* @brief Sort the parts into eight bins along the given pivots and
* fill the multipoles. Also adds the hierarchical resources
* to the sched (TODO).
*
* @param c The #cell to be split.
* @param N The total number of parts.
* @param s The #sched to store the resources.
*/
void cell_split(int c, struct qsched *s) {
int i, j, k, kk, count = cell_pool[c].count;
int parts = cell_pool[c].parts;
struct part temp_part;
double2 tempxy;
double tempxy1;
float4 tempxy2;
struct cell *cp, *cell;
int left[8], right[8];
double pivot[3];
static int root = -1;
// struct cell *progenitors[8];
int progenitors[8];
int c1 = c;
struct part *temp_xy;
double *temp_z;
float4 *temp_a_m;
/* Set the root cell. */
if (root < 0) {
root = c;
cell_pool[c].sibling = -1;
}
if(cell_pool[c].res == qsched_res_none)
{
if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[c].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
// printf("tempxy = %p\n", temp_xy);
cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
sizeof(struct part) * cell_pool[c].count, parts_temp + cell_pool[c].parts);
}
/*if(cell_pool[c].resz == qsched_res_none)
{
if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[c].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[c].resz = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_z,
sizeof(double) * cell_pool[c].count, parts_pos_z_temp + cell_pool[c].parts);
}
if(cell_pool[c].resm == qsched_res_none)
{
if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[c].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m,
sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts);
}*/
// error("Cell has no resource");*///TODO
if(count > cell_maxparts )
{
cell_pool[c].split = 1;
for(k = 0; k < 8; k++)
{
progenitors[k] = (cp = cell_get()) - cell_pool;
cell = &cell_pool[c];
cp->loc_xy = cell->loc_xy;
cp->loc_z = cell->loc_z;
cp->h = cell->h*0.5;
if(k & 4) cp->loc_xy.x += cp->h;
if(k & 2) cp->loc_xy.y += cp->h;
if(k & 1) cp->loc_z += cp->h;
}
/* Init the pivots.*/
pivot[0] = cell->loc_xy.x + cell->h * 0.5;
pivot[1] = cell->loc_xy.y + cell->h * 0.5;
pivot[2] = cell->loc_z + cell->h * 0.5;
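/* Partition the particles octant by octant: one pass along x below, then two
along y and four along z, so that bin k ends up holding the octant selected
by the bits (k&4, k&2, k&1). */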
/* Split along the x axis. */
i = parts;
j = parts+count-1;
while(i < j)
{
while(i <= parts+count-1 && parts_host[i].loc[0] < pivot[0]) i += 1;
while(j >= parts && parts_host[j].loc[0] >= pivot[0]) j -= 1;
if(i < j){
temp_part = parts_host[i];
parts_host[i] = parts_host[j];
parts_host[j] = temp_part;
}
}
left[1] = i;
right[1] = parts+count-1;
left[0] = parts;
right[0] = j;
/* Split along the y axis twice. */
for (k = 1; k >= 0; k--) {
i = left[k];
j = right[k];
while(i <= j){
while(i <= right[k] && parts_host[i].loc[1] < pivot[1]) i += 1;
while(j >= left[k] && parts_host[j].loc[1] >= pivot[1]) j -= 1;
if(i < j)
{
temp_part = parts_host[i];
parts_host[i] = parts_host[j];
parts_host[j] = temp_part;
}
}
left[2*k+1] = i;
right[2*k+1] = right[k];
left[2*k] = left[k];
right[2*k] = j;
}
/* Split along the z axis four times.*/
for(k = 3; k >=0; k--)
{
i = left[k];
j = right[k];
while(i <= j){
while(i <= right[k] && parts_host[i].loc[2] < pivot[2]) i += 1;
while(j >= left[k] && parts_host[j].loc[2] >= pivot[2]) j -= 1;
if(i < j)
{
temp_part = parts_host[i];
parts_host[i] = parts_host[j];
parts_host[j] = temp_part;
}
}
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
}
/* Store the counts and offsets. */
for(k = 0; k < 8; k++)
{
cell_pool[progenitors[k]].count = right[k]-left[k]+1;
cell_pool[progenitors[k]].parts = left[k];
if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy,
sizeof(struct part) * cell_pool[progenitors[k]].count, parts_temp + cell_pool[progenitors[k]].parts);
/*if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[progenitors[k]].resz = qsched_addres(s, qsched_owner_none, cell->resz, temp_z,
sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z_temp + cell_pool[progenitors[k]].parts);
if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[progenitors[k]].resm = qsched_addres(s, qsched_owner_none, cell->resm, temp_a_m,
sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m_temp + cell_pool[progenitors[k]].parts);*/
}
/* Find the first non-empty progenitor */
for(k = 0; k < 8; k++)
{
if(cell_pool[progenitors[k]].count > 0)
{
cell->firstchild = progenitors[k];
break;
}
}
#ifdef SANITY_CHECKS
if(cell->firstchild == -1)
error("Cell has been split but all children have 0 parts");
#endif
/*Prepare the pointers*/
for(k = 0; k < 8; k++)
{
/* Find the next non-empty sibling */
for(kk = k+1; kk < 8; ++kk){
if(cell_pool[progenitors[kk]].count > 0){
cell_pool[progenitors[k]].sibling = progenitors[kk];
break;
}
}
/* No non-empty sibling, go back a level.*/
if(kk == 8) cell_pool[progenitors[k]].sibling = cell->sibling;
}
/* Recurse */
for(k = 0; k < 8; k++)
if(cell_pool[progenitors[k]].count > 0) cell_split(progenitors[k], s);
/* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */
} else {
// struct cell *data[2] = {root, c};
int data[2] = {root, c};
int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
2 * sizeof(int), 3000);
/*qsched_adduse(s, tid, cell_pool[root].res);
qsched_adduse(s, tid, cell_pool[root].resz);
qsched_adduse(s, tid, cell_pool[root].resm);*/
qsched_addlock(s, tid, cell_pool[c].res);
// qsched_addlock(s, tid, cell_pool[c].resz);
// qsched_addlock(s, tid, cell_pool[c].resm);
}
#ifndef COM_AS_TASK
comp_com(&cell_pool[c]);
#endif
}
/**
* @brief Create the tasks for the cell pair/self.
*
* @param s The #sched in which to create the tasks.
* @param ci The first #cell.
* @param cj The second #cell.
*/
void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
qsched_task_t tid;
int data[2];
struct cell /**data[2],*/ *cp, *cps;
int cpi;
if(cj == NULL)
{
if(ci->split)
{
for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling])
{
//Self Interaction.
create_tasks(s, cp, NULL);
for(cps = &cell_pool[cp->sibling]; cps != &cell_pool[ci->sibling]; cps = &cell_pool[cps->sibling])
create_tasks(s, cp, cps);
}
}
/* Self task */
else{
data[0] = ci - cell_pool;
data[1] = -1;
tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, 2);
qsched_addlock(s, tid, ci->res);
//qsched_addlock(s, tid, ci->resz);
//qsched_addlock(s, tid, ci->resm);
}
}
/* Else it's a pair! */
else{
if(!are_neighbours_host(ci,cj)){/* Cells are not neighbours: nothing to do. */
}else{
/*Are both split? */
if(ci->split && cj->split)
{
/* Recurse over both cells. */
for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling])
for(cps = &cell_pool[cj->firstchild]; cps != &cell_pool[cj->sibling]; cps = &cell_pool[cps->sibling])
create_tasks(s, cp, cps);
/* Otherwise, at least one of the cells is not split, build a direct
* interaction. */
}else{
data[0] = ci-cell_pool;
data[1] = cj-cell_pool;
/* Create the task. */
tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
sizeof(int) * 2, 1);
/* Add the resources. */
qsched_addlock(s, tid, ci->res);
// qsched_addlock(s, tid, ci->resz);
// qsched_addlock(s, tid, ci->resm);
qsched_addlock(s, tid, cj->res);
// qsched_addlock(s, tid, cj->resz);
// qsched_addlock(s, tid, cj->resm);
}
}
}
}
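/**
* @brief Task dispatch function executed on the device. data carries two cell
* indices: {cell, -1} for a self task, {ci, cj} for a pair task and
* {root, leaf} for a particle-cell task.
*/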
__device__ void runner( int type , void *data ) {
int *idata = (int *)data;
int i = idata[0];
int j = idata[1];
switch ( type ) {
case task_type_self:
iact_self_direct(i);
break;
case task_type_pair:
iact_pair_direct(&cells[i], &cells[j]);
break;
case task_type_self_pc:
iact_self_pc( &cells[i], &cells[j] );
break;
default:
asm("trap;");
}
__syncthreads();
}
__device__ qsched_funtype function = runner;
qsched_funtype func;
/**
* @brief Set up and run a task-based Barnes-Hutt N-body solver.
*
* @param N The number of random particles to use.
* @param runs Number of force evaluations to use as a benchmark.
* @param fileName Input file name. If @c NULL or an empty string, random
* particle positions will be used.
*/
void test_bh(int N, int runs, char *fileName) {
int i, k;
struct cell *root;
struct part *parts;
FILE *file;
struct qsched s;
ticks tic, toc_run, tot_setup = 0, tot_run = 0;
int countMultipoles = 0, countPairs = 0, countCoMs = 0;
struct cell *gpu_ptr_cells;
cudaFree(0);
if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
error("Failed to copy function pointer from device");
/* Initialize the scheduler. */
qsched_init(&s, 1, qsched_flag_none);
//Create host particle arrays.
if( cudaMallocHost(&parts_host, sizeof(struct part) * N) != cudaSuccess)
error("Failed to allocate parts array");
if( cudaMalloc(&parts_temp, sizeof(struct part) * N) != cudaSuccess)
error("Failed to allocate device parts array");
if( cudaMemcpyToSymbol(parts_cuda, &parts_temp, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to set device symbol for parts array");
if(fileName == NULL || fileName[0] == 0) {
for(k = 0; k < N; k++) {
parts_host[k].loc[0] = ((double)rand())/ RAND_MAX;
parts_host[k].loc[1] = ((double)rand())/ RAND_MAX;
parts_host[k].loc[2] = ((double)rand())/ RAND_MAX;
parts_host[k].m = ((double)rand()) / RAND_MAX;
}
} else {
file = fopen(fileName, "r");
int tempxy;
if(file) {
for(k = 0; k < N; k++) {
if(fscanf(file, "%d", &tempxy) != 1)
error("Failed to read ID");
if(fscanf(file, "%lf%lf%lf", &parts_host[k].loc[0], &parts_host[k].loc[1], &parts_host[k].loc[2]) != 3)
error("Failed to read position of part %i.", k);
if(fscanf(file, "%f", &parts_host[k].m) != 1)
error("Failed to read mass of part %i.", k);
}
fclose(file);
}
}
/* Init the cells. */
root = cell_get();
root->loc_xy.x = 0.0;
root->loc_xy.y = 0.0;
root->loc_z = 0.0;
root->h = 1.0;
root->count = N;
root->parts = 0;
root->sibling = -1;
int c = root-cell_pool;
cell_split(root - cell_pool, &s);
root = &cell_pool[c];
int nr_leaves = 0;
int maxparts=0, minparts=1000000;
int number = 0;
while(c >= 0) {
if(cell_pool[c].count > 0)
{
number++;
if(cell_pool[c].res == qsched_res_none)
message("cell %i has no res", c);
//if(cell_pool[c].resz == qsched_res_none)
// message("cell %i has no resz", c);
//if(cell_pool[c].resm == qsched_res_none)
// message("cell %i has no resm", c);
}
if(!cell_pool[c].split) {
nr_leaves++;
if(cell_pool[c].count > maxparts)
{
maxparts = cell_pool[c].count;
}
if(cell_pool[c].count < minparts)
{
minparts = cell_pool[c].count;
}
c = cell_pool[c].sibling;
} else {
c = cell_pool[c].firstchild;
}
}
message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
/* for(k = 0; k < num_cells; k++)
if(cell_pool[k].split > 1 )
printf("Split > 1\n");*/
create_tasks(&s, root, NULL);
int self = 0, pair = 0, pc = 0;
for(k = 0; k < s.count; k++)
{
if(s.tasks[k].type == task_type_self)
self++;
else if (s.tasks[k].type == task_type_pair)
pair++;
else if (s.tasks[k].type >= 0)
pc++;
}
message("total number of tasks: %i.", s.count);
message("total number of pair tasks: %i.", pair);
message("total number of self tasks: %i.", self);
message("total number of pc tasks: %i.", pc);
message("total number of cells: %i.", number);
message("total number of deps: %i.", s.count_deps);
message("total number of res: %i.", s.count_res);
message("total number of locks: %i.", s.count_locks);
for(k = 0; k < runs; k++) {
for(i = 0; i < N; ++i) {
parts_host[i].a[0] = 0.0;
parts_host[i].a[1] = 0.0;
parts_host[i].a[2] = 0.0;
}
/* Copy the cells to the device. */
if( cudaMalloc( &gpu_ptr_cells , sizeof(struct cell) * used_cells) != cudaSuccess)
error("Failed to allocate cells on the GPU");
if( cudaMemcpy( gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy cells to the GPU");
if( cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell*), 0, cudaMemcpyHostToDevice) != cudaSuccess )
error("Failed to copy cell pointer to the GPU");
double2 *com_temp;
double *comz_temp;
float *comm_temp;
if(cudaMalloc( &com_temp, sizeof(double2) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
if(cudaMalloc( &comz_temp, sizeof(double) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
/* for(i = 0; i < s.count; i++){
int *idata = (int*)&s.data[s.tasks[i].data];
if(s.tasks[i].type == task_type_self)
printf("Self task with data[0] = %i\n", idata[0]);
if(s.tasks[i].type == task_type_pair) {
printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
}
if(s.tasks[i].type == task_type_self_pc) {
printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
}
}*/
// printf("com_mass_host[152] = %f\n", com_mass_host[152]);
//Run code.
// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
qsched_run_CUDA( &s , func );
qsched_print_cuda_timers(&s);
k = 0;
printf("%e, %e, %e, %e, %e, %e, %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
struct task* tasks = qsched_get_timers( &s , s.count );
for(i = 0; i < s.count; i++)
{
printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
// printf("\n");
}
}
/* Dump the particles to a file */
file = fopen("particle_dump.dat", "w");
/* fprintf(file,
"# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
"a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
for (k = 0; k < N; ++k)
fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n",
parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
fclose(file);
}
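/* Entry point. A typical invocation (binary name hypothetical) would be:
./test_bh_cuda -n 100000 -r 4 -f input.dat */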
int main(int argc, char *argv[]) {
int c, nr_threads = 1;
int N = 1000, runs = 1;
char fileName[100] = {0};
/* Parse the options */
while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) {
case 'n':
if (sscanf(optarg, "%d", &N) != 1)
error("Error parsing number of particles.");
break;
case 'r':
if (sscanf(optarg, "%d", &runs) != 1)
error("Error parsing number of runs.");
break;
case 't':
if (sscanf(optarg, "%d", &nr_threads) != 1)
error("Error parsing number of threads.");
// omp_set_num_threads(nr_threads);
break;
case 'f':
if (sscanf(optarg, "%s", &fileName[0]) != 1)
error("Error parsing file name.");
break;
case '?':
fprintf(stderr, "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file] "
"[-c Nparts] [-i Niterations] \n",
argv[0]);
fprintf(stderr, "Solves the N-body problem using a Barnes-Hut\n"
"tree code with N random particles read from a file in "
"[0,1]^3 using"
"nr_threads threads.\n"
"A test of the neighbouring cells interaction with "
"Nparts per cell is also run Niterations times.\n");
exit(EXIT_FAILURE);
}
/* Tree node information */
printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
/* Part information */
// printf("Size of part: %zu bytes.\n", sizeof(struct part));
/* Dump arguments. */
if (fileName[0] == 0) {
message("Computing the N-body problem over %i random particles using %i "
"threads (%i runs).",
N, nr_threads, runs);
} else {
message("Computing the N-body problem over %i particles read from '%s' "
"using %i threads (%i runs).",
N, fileName, nr_threads, runs);
}
/* Run the BH test. */
test_bh(N, runs, fileName);
}