Commit 320d4466 authored by aidan

Added a non-recursive version of the pc functions. Performance is currently poor.

parent 60922b1a
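
For context: the "pc" (particle-cell) tasks approximate the gravity a whole cell exerts on a particle by a single interaction with the cell's centre of mass. A minimal sketch of that per-particle update, using const_G and rsqrtf as in the kernels below (the function name and float3 packing are illustrative, not part of the commit):

/* Illustrative sketch of the monopole particle-cell update performed by
 * make_interact_pc below; const_G matches the #define in this file. */
__device__ void pc_monopole(float3 pi, float3 com, float m_com, float ai[3]) {
    float dx[3];
    dx[0] = com.x - pi.x;
    dx[1] = com.y - pi.y;
    dx[2] = com.z - pi.z;
    float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
    float ir = rsqrtf(r2);                    /* 1 / r */
    float w = m_com * const_G * ir * ir * ir; /* G * M / r^3 */
    for (int k = 0; k < 3; k++)
        ai[k] += w * dx[k];                   /* acceleration towards the com */
}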
@@ -59,7 +59,7 @@ unsigned short int split, sorted;
int parts, firstchild, sibling;
int res, resz, resm, com_tid;
}__attribute__((aligned(64)));
};//__attribute__((aligned(64)));
#define const_G 1
@@ -105,13 +105,16 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
__shared__ float4 parts_am[cell_maxparts];
/*if(threadIdx.x == 0)
printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split,
cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
/* Load particles of cell j into shared memory */
for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
/*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
}
}*/
/* Loop over cell i.*/
for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
@@ -123,25 +126,27 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
mi = parts_a_m[i].w;
for(j = 0; j < count_j; j++) {
for(j = parts_j; j < parts_j + count_j; j++) {
r2 = 0.0f;
dx[0] = xi[0] - parts_xy[j].x;
dx[1] = xi[1] - parts_xy[j].y;
dx[2] = xi[2] - parts_z[j];
dx[0] = xi[0] - parts_pos_xy[j].x;
dx[1] = xi[1] - parts_pos_xy[j].y;
dx[2] = xi[2] - parts_pos_z[j];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
// ir = 1.0f / sqrtf(r2);
// ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_am[j].w;
mj = parts_a_m[j].w;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
// atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
// atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
// atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
}
atomicAdd(&parts_a_m[i].x, ai[0]);
atomicAdd(&parts_a_m[i].y, ai[1]);
atomicAdd(&parts_a_m[i].z, ai[2]);
@@ -149,11 +154,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
/* Load particles of cell i into shared memory */
for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
/*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
}
}*/
/*Loop over cell j. */
for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
xi[0] = parts_pos_xy[i].x;
@@ -164,11 +169,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
mi = parts_a_m[i].w;
for(j = 0; j < count_j; j++) {
for(j = parts_i; j < parts_i + count_i; j++) {
r2 = 0.0f;
dx[0] = xi[0] - parts_xy[j].x;
dx[1] = xi[1] - parts_xy[j].y;
dx[2] = xi[2] - parts_z[j];
dx[0] = xi[0] - parts_pos_xy[j].x;
dx[1] = xi[1] - parts_pos_xy[j].y;
dx[2] = xi[2] - parts_pos_z[j];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
@@ -176,12 +181,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_am[j].w;
mj = parts_a_m[j].w;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
}
atomicAdd(&parts_a_m[i].x, ai[0]);
atomicAdd(&parts_a_m[i].y, ai[1]);
atomicAdd(&parts_a_m[i].z, ai[2]);
@@ -190,25 +194,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
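
Note that in iact_pair_direct above, the staging of cell j's (and later cell i's) particles into shared memory has been commented out, so the inner loop now reads partner positions from the global parts_pos_* arrays on every iteration; this is a plausible cause of the poor performance the commit message mentions. A hedged sketch of the disabled staging pattern (array names as declared at the top of the kernel; the barrier is an assumption, the original loop had none):

/* Inside iact_pair_direct: each thread copies a strided subset of cell j's
 * particles into the block's shared arrays, so the inner loop can index
 * parts_xy/parts_z instead of global memory. */
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
for (k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j;
     k += blockDim.x, j += blockDim.x) {
    parts_xy[j] = parts_pos_xy[k]; /* coalesced global reads */
    parts_z[j] = parts_pos_z[k];
}
__syncthreads(); /* staging must complete before any thread consumes it */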
/*__device__ void iact_pair(int celli, int cellj) {
struct cell *ci, *cj;
ci = &cells[celli];
cj = &cells[cellj];
if(Check if neighbours0)
{
if(ci->split && cj->split) {
//Split both cells and do all possible pairs.
}else {
iact_pair_direct(ci, cj);
}
}
}*/
__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
int i, k;
@@ -218,16 +203,29 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
int count = leaf->count;
int parts = leaf->parts;
int cell_j = cj - cells;
int temp;
float r2, dx[3], ir, w;
// if(cell_j < 0)
// {
// if(threadIdx.x == 0)
// printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
// __syncthreads();
// asm("trap;");
// }
// if(threadIdx.x == 0)
// printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
temp = cell_j;
/* Init the com's data.*/
j_com_xy = com_xy[cell_j];
j_com_z = com_z[cell_j];
j_com_mass = com_mass[cell_j];
for(i = parts; i < parts+count; i++) {
for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
r2 = 0.0;
dx[0] = j_com_xy.x - parts_pos_xy[i].x;
r2 += dx[0] * dx[0];
@@ -238,11 +236,18 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
ir = rsqrtf(r2);
w = j_com_mass * const_G * ir * ir * ir;
parts_a_m[i].x += w * dx[0];
parts_a_m[i].y += w * dx[1];
parts_a_m[i].z += w * dx[2];
/* __threadfence();
if(!isfinite(w * dx[0])){
printf("Error in make_interact_pc, j_com_mass = %f, cell_j = %i, temp = %i, i = %i, threadIdx.x=%i\n", j_com_mass, cell_j, temp, i, threadIdx.x); asm("trap;");}
if(!isfinite(w * dx[1])){
printf("Error in make_interact_pc\n"); asm("trap;");}
if(!isfinite(w * dx[2])){
printf("Error in make_interact_pc\n"); asm("trap;");}*/
atomicAdd( &parts_a_m[i].x , w * dx[0]);
atomicAdd( &parts_a_m[i].y , w * dx[1]);
atomicAdd( &parts_a_m[i].z , w * dx[2]);
}
//__syncthreads();
}
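
The particle loop in make_interact_pc now strides by blockDim.x, so the threads of a block split the leaf's particles between them, and the plain += accumulation becomes atomicAdd, since other blocks may update the same parts_a_m entries concurrently. Restated as a sketch (names as in the function above):

/* Thread-strided form of the make_interact_pc loop: thread t handles
 * particles parts+t, parts+t+blockDim.x, ... and accumulates atomically. */
for (i = parts + threadIdx.x; i < parts + count; i += blockDim.x) {
    dx[0] = j_com_xy.x - parts_pos_xy[i].x;
    dx[1] = j_com_xy.y - parts_pos_xy[i].y;
    dx[2] = j_com_z - parts_pos_z[i];
    r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
    ir = rsqrtf(r2);
    w = j_com_mass * const_G * ir * ir * ir;
    atomicAdd(&parts_a_m[i].x, w * dx[0]);
    atomicAdd(&parts_a_m[i].y, w * dx[1]);
    atomicAdd(&parts_a_m[i].z, w * dx[2]);
}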
/**
@@ -291,38 +296,41 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
struct cell *cp ,*cps;
int leafnum = leaf - cells;
//if(threadIdx.x == 0 && leafnum == 23)
// printf("cj = %i\n", cj - cells);
// printf("%i\n", leafnum);
if(leaf->split)
{
printf("Leaf split = 1, oh dear.");
asm("trap;");
}
if(ci->split > 1)
{
printf("Cell %i had split > 1\n", ci - cells);
asm("trap;");
}
if(cj->split > 1)
{
printf("cell %i had split > 1\n", cj - cells);
asm("trap;");
}
// if(threadIdx.x == 0)
/// printf("ci = %i, cj = %i, leaf = %i\n", ci - cells, cj - cells, leaf - cells);
for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
}
if(are_neighbours_different_size(cp, cj)) {
for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
if(are_neighbours(cp, cps)) {
if(cp->split && cps->split) {
iact_pair_pc(cp, cps, leaf);
}
} else {
make_interact_pc(leaf, cps);
// if(threadIdx.x == 0 && leafnum == 23)
// printf("leafnum = %i with cps = %i here\n", leafnum, cps - cells);
__syncthreads();
}
}
}else{
for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
// if(threadIdx.x == 0 && leafnum == 23)
// printf("leafnum = %i with cps = %i\n", leafnum, cps - cells);
make_interact_pc(leaf, cps);
}
}
__syncthreads();
@@ -339,7 +347,7 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
struct cell *cp, *cps;
if(leaf->split)
/*if(leaf->split)
{
printf("Leaf split = 1, oh dear.");
asm("trap;");
@@ -348,9 +356,26 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
{
printf("Cell had split > 1\n");
asm("trap;");
}
}*/
/* Find the subcell of c the leaf is in.*/
/*cp = c;
cps = c;
while(c->split)
{
for(cp = &cells[cp->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]){
if(is_inside(leaf, cp)) break;
}
if(cp->split){
for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) {
if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
}
}
c = cp;
}*/
for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
}
@@ -364,7 +389,7 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
if(cp != cps && cps->split) iact_pair_pc(cp,cps,leaf);
}
}
}//TODO
}
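
The commented-out block above is the non-recursive traversal this commit introduces: rather than recursing, it walks down the tree in a while loop, at each level finding the child that contains the leaf, interacting it with its split siblings, and then descending into it. A hedged restatement (the function name is illustrative; cells, is_inside and iact_pair_pc are the ones in this file):

/* Iterative descent replacing the recursion in iact_self_pc. */
__device__ void iact_self_pc_iter(struct cell *c, struct cell *leaf) {
    struct cell *cp, *cps;
    while (c->split) {
        /* Find the subcell of c the leaf is in. */
        for (cp = &cells[c->firstchild]; cp != &cells[c->sibling];
             cp = &cells[cp->sibling])
            if (is_inside(leaf, cp)) break;
        /* Interact it with each of its split siblings. */
        if (cp->split)
            for (cps = &cells[c->firstchild]; cps != &cells[c->sibling];
                 cps = &cells[cps->sibling])
                if (cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
        c = cp; /* descend one level */
    }
}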
@@ -385,7 +410,8 @@ __device__ void iact_self_direct(int cellID) {
int count;
int i,j,k;
//if(threadIdx.x == 0)
// printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
//If cell is split, interact each child with itself, and with each of its siblings.
/*if(c->split) {
//TODO
@@ -395,24 +421,24 @@ __device__ void iact_self_direct(int cellID) {
count = c->count;
int z = threadIdx.x;
/* Load particle data into shared memory*/
for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
/*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
parts_xy[z] = parts_pos_xy[k];
parts_z[z] = parts_pos_z[k];
parts_am[z] = parts_a_m[k];
}
__syncthreads();
for(i = threadIdx.x; i < count; i += blockDim.x)
__syncthreads();*/
for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
{
xi[0] = parts_xy[i].x;
xi[1] = parts_xy[i].y;
xi[2] = parts_z[i];
xi[0] = parts_pos_xy[i].x;
xi[1] = parts_pos_xy[i].y;
xi[2] = parts_pos_z[i];
for(k = 0; k < 3; k++) {
ai[k] = 0.0;
}
mi = parts_a_m[i].w;
//for(j = i+1; j!= i; j = (j+1)%count)
for(j = 0; j < count; j++)
for(j = parts; j < parts+count; j++)
{
if(i != j){
@@ -430,7 +456,7 @@ __device__ void iact_self_direct(int cellID) {
//ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_am[j].w;
mj = parts_a_m[j].w;
for(k = 0; k < 3; k++) {
ai[k] -= w * dx[k] * mj;
}
@@ -813,10 +839,10 @@ void cell_split(int c, struct qsched *s) {
// struct cell *data[2] = {root, c};
int data[2] = {root, c};
int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
2 * sizeof(int), 1);
qsched_addlock(s, tid, cell_pool[root].res);
qsched_addlock(s, tid, cell_pool[root].resz);
qsched_addlock(s, tid, cell_pool[root].resm);
2 * sizeof(int), 3000);
/*qsched_adduse(s, tid, cell_pool[root].res);
qsched_adduse(s, tid, cell_pool[root].resz);
qsched_adduse(s, tid, cell_pool[root].resm);*/
qsched_addlock(s, tid, cell_pool[c].res);
qsched_addlock(s, tid, cell_pool[c].resz);
qsched_addlock(s, tid, cell_pool[c].resm);
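
The final argument of qsched_addtask is the cost hint the scheduler sorts tasks by; this hunk replaces the particle-count-based costs with fixed values (3000 here, 2 and 1 for the self and pair tasks below) and trades the exclusive locks on the root cell for commented-out qsched_adduse read access. The general pattern, as used throughout this file (the comments on what each resource guards are an inference from the array names, not stated in the code):

/* Create a task with a payload and a cost hint, then declare the
 * resources it needs. addlock takes exclusive access; adduse (commented
 * out above) appears to declare shared, non-exclusive access instead. */
int data[2] = {root, c};
int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
                         2 * sizeof(int), 3000 /* fixed cost hint */);
qsched_addlock(s, tid, cell_pool[c].res);  /* presumably the x/y positions */
qsched_addlock(s, tid, cell_pool[c].resz); /* presumably the z positions */
qsched_addlock(s, tid, cell_pool[c].resm); /* presumably accel. + mass */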
@@ -859,8 +885,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
else{
data[0] = ci - cell_pool;
data[1] = -1;
tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, ci->count*ci->count/2);
tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, 2);
qsched_addlock(s, tid, ci->res);
qsched_addlock(s, tid, ci->resz);
qsched_addlock(s, tid, ci->resm);
@@ -868,7 +893,9 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
}
/* Else its a pair!*/
else{
if(are_neighbours_host(ci,cj)){/* Cells are neighbours */
if(!are_neighbours_host(ci,cj)){/* Cells are neighbours */
}else{
/*Are both split? */
if(ci->split && cj->split)
{
@@ -885,7 +912,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
/* Create the task. */
tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
sizeof(struct cell *) * 2, ci->count * cj->count);
sizeof(struct cell *) * 2, 1);
/* Add the resources. */
qsched_addlock(s, tid, ci->res);
@@ -909,7 +936,6 @@ __device__ void runner( int type , void *data ) {
int *idata = (int *)data;
int i = idata[0];
int j = idata[1];
switch ( type ) {
case task_type_self:
iact_self_direct(i);
@@ -1044,18 +1070,31 @@ void test_bh(int N, int runs, char *fileName) {
c = cell_pool[c].firstchild;
}
}
message("root.sibling = %i, root.split = %i", root->sibling, root->split);
printf("nr_leaves = %i\n", nr_leaves);
message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
for(k = 0; k < num_cells; k++)
/* for(k = 0; k < num_cells; k++)
if(cell_pool[k].split > 1 )
printf("Split > 1\n");
printf("Split > 1\n");*/
create_tasks(&s, root, NULL);
int self = 0, pair = 0, pc = 0;
for(k = 0; k < s.count; k++)
{
if(s.tasks[k].type == task_type_self)
self++;
else if (s.tasks[k].type == task_type_pair)
pair++;
else if (s.tasks[k].type >= 0)
pc++;
}
message("total number of tasks: %i.", s.count);
message("total number of pair tasks: %i.", pair);
message("total number of self tasks: %i.", self);
message("total number of pc tasks: %i.", pc);
message("total number of cells: %i.", number);
message("total number of deps: %i.", s.count_deps);
message("total number of res: %i.", s.count_res);
@@ -1098,7 +1137,7 @@ float *comm_temp;
if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( comm_temp, com_z_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
@@ -1115,13 +1154,37 @@ float *comm_temp;
}
}*/
// printf("com_mass_host[152] = %f\n", com_mass_host[152]);
//Run code.
printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
qsched_run_CUDA( &s , func );
}
qsched_print_cuda_timers(&s);
k = 0;
printf("%e, %e, %e, %e, %e, %e, %e\n", parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
parts_a_m_host[k].x, parts_a_m_host[k].y, parts_a_m_host[k].z);
struct task* tasks = qsched_get_timers( &s , s.count );
for(i = 0; i < s.count; i++)
{
printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
// printf("\n");
}
}
/* Dump the particles to a file */
file = fopen("particle_dump.dat", "w");
/* fprintf(file,
"# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
"a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
for (k = 0; k < N; ++k)
fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n",
parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
parts_a_m_host[k].x, parts_a_m_host[k].y, parts_a_m_host[k].z);
fclose(file);
}