From 320d4466aaf6acb0789370ee738cdabd96fe1d2c Mon Sep 17 00:00:00 2001
From: aidan <aidan@gtx690.dur.ac.uk>
Date: Wed, 25 Feb 2015 14:51:15 +0000
Subject: [PATCH] Added a non-recursive version of the pc functions. Performance
 is currently poor.
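
The particle-cell (pc) interactions are now computed with an explicit,
level-by-level tree walk instead of recursion: iact_self_pc() descends one
level per loop iteration towards the cell containing the leaf, and
iact_pair_pc() steps into split neighbouring children through their
firstchild/sibling links rather than calling itself. A condensed sketch of
the new loop in iact_self_pc() (see test_bh_3.cu for the full version):

    cp = c;
    while (c->split) {
        /* Find the child of c that contains the leaf. */
        for (cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling])
            if (is_inside(leaf, cp)) break;
        /* Interact the leaf with the other split children on this level. */
        if (cp->split)
            for (cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling])
                if (cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
        /* Descend one level and repeat. */
        c = cp;
    }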
---
examples/test_bh_2.cu | 237 +++++---
examples/test_bh_3.cu | 1265 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 1415 insertions(+), 87 deletions(-)
create mode 100644 examples/test_bh_3.cu
diff --git a/examples/test_bh_2.cu b/examples/test_bh_2.cu
index 7981f6b..0cdae97 100644
--- a/examples/test_bh_2.cu
+++ b/examples/test_bh_2.cu
@@ -59,7 +59,7 @@ unsigned short int split, sorted;
int parts, firstchild, sibling;
int res, resz, resm, com_tid;
-}__attribute__((aligned(64)));
+};//__attribute__((aligned(64)));
#define const_G 1
@@ -105,13 +105,16 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
__shared__ float4 parts_am[cell_maxparts];
+ /*if(threadIdx.x == 0)
+ printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split,
+ cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
/* Load particles of cell j into shared memory */
- for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
+ /*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
- }
+ }*/
/* Loop over cell i.*/
for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
@@ -123,25 +126,27 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
mi = parts_a_m[i].w;
- for(j = 0; j < count_j; j++) {
+ for(j = parts_j; j < parts_j + count_j; j++) {
r2 = 0.0f;
- dx[0] = xi[0] - parts_xy[j].x;
- dx[1] = xi[1] - parts_xy[j].y;
- dx[2] = xi[2] - parts_z[j];
+ dx[0] = xi[0] - parts_pos_xy[j].x;
+ dx[1] = xi[1] - parts_pos_xy[j].y;
+ dx[2] = xi[2] - parts_pos_z[j];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
-// ir = 1.0f / sqrtf(r2);
+ // ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
- mj = parts_am[j].w;
+ mj = parts_a_m[j].w;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
+ // atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
+ // atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
+ // atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
}
-
atomicAdd(&parts_a_m[i].x, ai[0]);
atomicAdd(&parts_a_m[i].y, ai[1]);
atomicAdd(&parts_a_m[i].z, ai[2]);
@@ -149,11 +154,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
/* Load particles of cell i into shared memory */
- for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
+ /*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
- }
+ }*/
/*Loop over cell j. */
for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
xi[0] = parts_pos_xy[i].x;
@@ -164,11 +169,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
mi = parts_a_m[i].w;
- for(j = 0; j < count_j; j++) {
+ for(j = parts_i; j < parts_i + count_i; j++) {
r2 = 0.0f;
- dx[0] = xi[0] - parts_xy[j].x;
- dx[1] = xi[1] - parts_xy[j].y;
- dx[2] = xi[2] - parts_z[j];
+ dx[0] = xi[0] - parts_pos_xy[j].x;
+ dx[1] = xi[1] - parts_pos_xy[j].y;
+ dx[2] = xi[2] - parts_pos_z[j];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
@@ -176,12 +181,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
- mj = parts_am[j].w;
+ mj = parts_a_m[j].w;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
}
-
atomicAdd(&parts_a_m[i].x, ai[0]);
atomicAdd(&parts_a_m[i].y, ai[1]);
atomicAdd(&parts_a_m[i].z, ai[2]);
@@ -190,25 +194,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
-/*__device__ void iact_pair(int celli, int cellj) {
-
- struct cell *ci, *cj;
- ci = &cells[celli];
- cj = &cells[cellj];
-
- if(Check if neighbours0)
- {
- if(ci->split && cj->split) {
- //Split both cells and do all possible pairs.
-
- }else {
- iact_pair_direct(ci, cj);
- }
-
- }
-
-}*/
-
__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
int i, k;
@@ -218,16 +203,29 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
int count = leaf->count;
int parts = leaf->parts;
int cell_j = cj - cells;
+ int temp;
float r2, dx[3], ir, w;
+ // if(cell_j < 0)
+// {
+// if(threadIdx.x == 0)
+ // printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
+ // __syncthreads();
+// asm("trap;");
+// }
+
+ // if(threadIdx.x == 0)
+ // printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
+
+
+ temp = cell_j;
/* Init the com's data.*/
j_com_xy = com_xy[cell_j];
j_com_z = com_z[cell_j];
j_com_mass = com_mass[cell_j];
- for(i = parts; i < parts+count; i++) {
-
+ for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
r2 = 0.0;
dx[0] = j_com_xy.x - parts_pos_xy[i].x;
r2 += dx[0] * dx[0];
@@ -238,11 +236,18 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
ir = rsqrtf(r2);
w = j_com_mass * const_G * ir * ir * ir;
-
- parts_a_m[i].x += w * dx[0];
- parts_a_m[i].y += w * dx[1];
- parts_a_m[i].z += w * dx[2];
+ /* __threadfence();
+ if(!isfinite(w * dx[0])){
+ printf("Error in make_interact_pc, j_com_mass = %f, cell_j = %i, temp = %i, i = %i, threadIdx.x=%i\n", j_com_mass, cell_j, temp, i, threadIdx.x); asm("trap;");}
+ if(!isfinite(w * dx[1])){
+ printf("Error in make_interact_pc\n"); asm("trap;");}
+ if(!isfinite(w * dx[2])){
+ printf("Error in make_interact_pc\n"); asm("trap;");}*/
+ atomicAdd( &parts_a_m[i].x , w * dx[0]);
+ atomicAdd( &parts_a_m[i].y , w * dx[1]);
+ atomicAdd( &parts_a_m[i].z , w * dx[2]);
}
+//__syncthreads();
}
/**
@@ -291,38 +296,41 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
struct cell *cp ,*cps;
+ int leafnum = leaf - cells;
+//if(threadIdx.x == 0 && leafnum == 23)
+ // printf("cj = %i\n", cj - cells);
+// printf("%i\n", leafnum);
- if(leaf->split)
- {
- printf("Leaf split = 1, oh dear.");
- asm("trap;");
- }
-if(ci->split > 1)
- {
- printf("Cell %i had split > 1\n", ci - cells);
- asm("trap;");
- }
- if(cj->split > 1)
- {
- printf("cell %i had split > 1\n", cj - cells);
- asm("trap;");
- }
+ // if(threadIdx.x == 0)
+ /// printf("ci = %i, cj = %i, leaf = %i\n", ci - cells, cj - cells, leaf - cells);
for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
}
if(are_neighbours_different_size(cp, cj)) {
+
for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
+
if(are_neighbours(cp, cps)) {
if(cp->split && cps->split) {
iact_pair_pc(cp, cps, leaf);
}
} else {
make_interact_pc(leaf, cps);
+ // if(threadIdx.x == 0 && leafnum == 23)
+ // printf("leafnum = %i with cps = %i here\n", leafnum, cps - cells);
__syncthreads();
}
}
+ }else{
+
+ for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
+ // if(threadIdx.x == 0 && leafnum == 23)
+ // printf("leafnum = %i with cps = %i\n", leafnum, cps - cells);
+ make_interact_pc(leaf, cps);
+ }
+
}
__syncthreads();
@@ -339,7 +347,7 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
struct cell *cp, *cps;
- if(leaf->split)
+ /*if(leaf->split)
{
printf("Leaf split = 1, oh dear.");
asm("trap;");
@@ -348,9 +356,26 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
{
printf("Cell had split > 1\n");
asm("trap;");
- }
+ }*/
/* Find the subcell of c the leaf is in.*/
+
+ /*cp = c;
+ cps = c;
+ while(c->split)
+ {
+ for(cp = &cells[cp->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]){
+ if(is_inside(leaf, cp)) break;
+ }
+ if(cp->split){
+ for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) {
+
+ if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
+ }
+ }
+ c = cp;
+ }*/
+
for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
}
@@ -364,7 +389,7 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
if(cp != cps && cps->split) iact_pair_pc(cp,cps,leaf);
}
- }
+ }//TODO
}
@@ -385,7 +410,8 @@ __device__ void iact_self_direct(int cellID) {
int count;
int i,j,k;
-
+ //if(threadIdx.x == 0)
+ // printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
//If cell is split, interact each child with itself, and with each of its siblings.
/*if(c->split) {
//TODO
@@ -395,24 +421,24 @@ __device__ void iact_self_direct(int cellID) {
count = c->count;
int z = threadIdx.x;
/* Load particle data into shared memory*/
- for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
+ /*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
parts_xy[z] = parts_pos_xy[k];
parts_z[z] = parts_pos_z[k];
parts_am[z] = parts_a_m[k];
}
- __syncthreads();
- for(i = threadIdx.x; i < count; i += blockDim.x)
+ __syncthreads();*/
+ for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
{
- xi[0] = parts_xy[i].x;
- xi[1] = parts_xy[i].y;
- xi[2] = parts_z[i];
+ xi[0] = parts_pos_xy[i].x;
+ xi[1] = parts_pos_xy[i].y;
+ xi[2] = parts_pos_z[i];
for(k = 0; k < 3; k++) {
ai[k] = 0.0;
}
mi = parts_a_m[i].w;
//for(j = i+1; j!= i; j = (j+1)%count)
- for(j = 0; j < count; j++)
+ for(j = parts; j < parts+count; j++)
{
if(i != j){
@@ -430,7 +456,7 @@ __device__ void iact_self_direct(int cellID) {
//ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
- mj = parts_am[j].w;
+ mj = parts_a_m[j].w;
for(k = 0; k < 3; k++) {
ai[k] -= w * dx[k] * mj;
}
@@ -813,10 +839,10 @@ void cell_split(int c, struct qsched *s) {
// struct cell *data[2] = {root, c};
int data[2] = {root, c};
int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
- 2 * sizeof(int), 1);
- qsched_addlock(s, tid, cell_pool[root].res);
- qsched_addlock(s, tid, cell_pool[root].resz);
- qsched_addlock(s, tid, cell_pool[root].resm);
+ 2 * sizeof(int), 3000);
+ /*qsched_adduse(s, tid, cell_pool[root].res);
+ qsched_adduse(s, tid, cell_pool[root].resz);
+ qsched_adduse(s, tid, cell_pool[root].resm);*/
qsched_addlock(s, tid, cell_pool[c].res);
qsched_addlock(s, tid, cell_pool[c].resz);
qsched_addlock(s, tid, cell_pool[c].resm);
@@ -859,8 +885,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
else{
data[0] = ci - cell_pool;
data[1] = -1;
-
- tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, ci->count*ci->count/2);
+ tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, 2);
qsched_addlock(s, tid, ci->res);
qsched_addlock(s, tid, ci->resz);
qsched_addlock(s, tid, ci->resm);
@@ -868,7 +893,9 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
}
/* Else its a pair!*/
else{
- if(are_neighbours_host(ci,cj)){/* Cells are neighbours */
+ if(!are_neighbours_host(ci,cj)){/* Cells are not neighbours: nothing to do */
+
+ }else{
/*Are both split? */
if(ci->split && cj->split)
{
@@ -885,7 +912,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
/* Create the task. */
tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
- sizeof(struct cell *) * 2, ci->count * cj->count);
+ sizeof(struct cell *) * 2, 1);
/* Add the resources. */
qsched_addlock(s, tid, ci->res);
@@ -909,7 +936,6 @@ __device__ void runner( int type , void *data ) {
int *idata = (int *)data;
int i = idata[0];
int j = idata[1];
-
switch ( type ) {
case task_type_self:
iact_self_direct(i);
@@ -1044,18 +1070,31 @@ void test_bh(int N, int runs, char *fileName) {
c = cell_pool[c].firstchild;
}
}
- message("root.sibling = %i, root.split = %i", root->sibling, root->split);
- printf("nr_leaves = %i\n", nr_leaves);
message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
- for(k = 0; k < num_cells; k++)
+ /* for(k = 0; k < num_cells; k++)
if(cell_pool[k].split > 1 )
- printf("Split > 1\n");
+ printf("Split > 1\n");*/
create_tasks(&s, root, NULL);
+ int self = 0, pair = 0, pc = 0;
+
+ for(k = 0; k < s.count; k++)
+ {
+ if(s.tasks[k].type == task_type_self)
+ self++;
+ else if (s.tasks[k].type == task_type_pair)
+ pair++;
+ else if (s.tasks[k].type >= 0)
+ pc++;
+ }
+
message("total number of tasks: %i.", s.count);
+ message("total number of pair tasks: %i.", pair);
+ message("total number of self tasks: %i.", self);
+ message("total number of pc tasks: %i.", pc);
message("total number of cells: %i.", number);
message("total number of deps: %i.", s.count_deps);
message("total number of res: %i.", s.count_res);
@@ -1098,7 +1137,7 @@ float *comm_temp;
if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
- if( cudaMemcpy( comm_temp, com_z_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+ if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
@@ -1115,13 +1154,37 @@ float *comm_temp;
}
}*/
-
+ // printf("com_mass_host[152] = %f\n", com_mass_host[152]);
//Run code.
- printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
+// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
qsched_run_CUDA( &s , func );
- }
+ qsched_print_cuda_timers(&s);
+
+ k = 0;
+ printf("%e, %e, %e, %e, %e, %e, %e\n", parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
+ parts_a_m_host[k].x, parts_a_m_host[k].y, parts_a_m_host[k].z);
+struct task* tasks = qsched_get_timers( &s , s.count );
+ for(i = 0; i < s.count; i++)
+ {
+ printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
+ // printf("\n");
+
+ }
+
+}
+ /* Dump the particles to a file */
+ file = fopen("particle_dump.dat", "w");
+/* fprintf(file,
+ "# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
+ "a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
+ for (k = 0; k < N; ++k)
+ fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n",
+ parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
+ parts_a_m_host[k].x, parts_a_m_host[k].y, parts_a_m_host[k].z);
+ fclose(file);
+
}
diff --git a/examples/test_bh_3.cu b/examples/test_bh_3.cu
new file mode 100644
index 0000000..a1adfbd
--- /dev/null
+++ b/examples/test_bh_3.cu
@@ -0,0 +1,1265 @@
+/*******************************************************************************
+ * This file is part of QuickSched.
+ * Copyright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+ * Aidan Chalk (aidan.chalk@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+* *****************************************************************************/
+
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Standard includes. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <omp.h>
+#include <fenv.h>
+
+/* Local includes. */
+extern "C"{
+#include "quicksched.h"
+#include "res.h"
+}
+#include "cuda_queue.h"
+
+/** Task types. */
+enum task_type {
+ task_type_self = 0,
+ task_type_pair,
+ task_type_self_pc,
+ task_type_com,
+ task_type_count
+};
+
+struct cell{
+
+double2 loc_xy;
+double loc_z;
+double h;
+int count;
+unsigned short int split, sorted;
+int parts, firstchild, sibling;
+int res, /*resz, resm,*/ com_tid;
+
+};//__attribute__((aligned(64)));
+
+struct part{
+double loc[3];
+float a[3];
+float m;
+};//__attribute__((aligned(64)));
+
+
+#define const_G 1
+/* Required variables to obtain cells. */
+#define cell_maxparts 256
+#define CELL_STRETCH 2
+#define INITIAL_CELLS 256
+struct cell *cell_pool = NULL;
+int used_cells=0;
+int num_cells = 0;
+int cell_size = INITIAL_CELLS*sizeof(struct cell);
+
+/* Device locations for the particle values. */
+//__device__ double2 *parts_pos_xy;
+//__device__ double *parts_pos_z;
+//__device__ float4 *parts_a_m;
+__device__ double2 *com_xy;
+__device__ double *com_z;
+__device__ float *com_mass;
+__device__ struct part *parts_cuda;
+__device__ struct cell *cells;
+
+
+/* Host locations for the particle values. */
+//double2 *parts_pos_xy_host;
+//double *parts_pos_z_host;
+//float4 *parts_a_m_host;
+double2 *com_xy_host;
+double *com_z_host;
+float *com_mass_host;
+double2 *parts_pos_xy_temp;
+double *parts_pos_z_temp;
+float4 *parts_a_m_temp;
+
+struct part *parts_host;
+struct part *parts_temp;
+
+
+__device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
+ int i, j, k;
+ int count_i = ci->count, count_j = cj->count;
+ int parts_i = ci->parts, parts_j = cj->parts;
+ double xi[3];
+ float dx[3], ai[3], mi, mj, r2, w, ir;
+ __shared__ double2 parts_xy[cell_maxparts];
+ __shared__ double parts_z[cell_maxparts];
+ __shared__ float4 parts_am[cell_maxparts];
+ /*if(threadIdx.x == 0)
+ printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split,
+ cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
+
+ /* Load particles of cell j into shared memory */
+ /*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
+ parts_xy[j] = parts_pos_xy[k];
+ parts_z[j] = parts_pos_z[k];
+ parts_am[j] = parts_a_m[k];
+ }*/
+
+ /* Loop over cell i.*/
+ for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
+ xi[0] = parts_cuda[i].loc[0];
+ xi[1] = parts_cuda[i].loc[1];
+ xi[2] = parts_cuda[i].loc[2];
+ for(k = 0; k < 3; k++) {
+ ai[k] = 0.0f;
+ }
+ mi = parts_cuda[i].m;
+
+ for(j = parts_j; j < parts_j + count_j; j++) {
+ r2 = 0.0f;
+ dx[0] = xi[0] - parts_cuda[j].loc[0];
+ dx[1] = xi[1] - parts_cuda[j].loc[1];
+ dx[2] = xi[2] - parts_cuda[j].loc[2];
+ r2 += dx[0] * dx[0];
+ r2 += dx[1] * dx[1];
+ r2 += dx[2] * dx[2];
+
+
+ // ir = 1.0f / sqrtf(r2);
+ ir = rsqrtf(r2);
+ w = const_G * ir * ir * ir;
+ mj = parts_cuda[j].m;
+ for(k = 0; k < 3; k++) {
+ ai[k] -= dx[k] * mj * w;
+ }
+ // atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
+ // atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
+ // atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
+ }
+ atomicAdd(&parts_cuda[i].a[0], ai[0]);
+ atomicAdd(&parts_cuda[i].a[1], ai[1]);
+ atomicAdd(&parts_cuda[i].a[2], ai[2]);
+
+ }
+
+ /* Load particles of cell i into shared memory */
+ /*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
+ parts_xy[j] = parts_pos_xy[k];
+ parts_z[j] = parts_pos_z[k];
+ parts_am[j] = parts_a_m[k];
+ }*/
+ /*Loop over cell j. */
+ for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
+ xi[0] = parts_cuda[i].loc[0];
+ xi[1] = parts_cuda[i].loc[1];
+ xi[2] = parts_cuda[i].loc[2];
+ for(k = 0; k < 3; k++) {
+ ai[k] = 0.0f;
+ }
+ mi = parts_cuda[i].m;
+
+ for(j = parts_i; j < parts_i + count_i; j++) {
+ r2 = 0.0f;
+ dx[0] = xi[0] - parts_cuda[j].loc[0];
+ dx[1] = xi[1] - parts_cuda[j].loc[1];
+ dx[2] = xi[2] - parts_cuda[j].loc[2];
+ r2 += dx[0] * dx[0];
+ r2 += dx[1] * dx[1];
+ r2 += dx[2] * dx[2];
+
+
+ ir = rsqrtf(r2);
+ w = const_G * ir * ir * ir;
+ mj = parts_cuda[j].m;
+ for(k = 0; k < 3; k++) {
+ ai[k] -= dx[k] * mj * w;
+ }
+ }
+ atomicAdd(&parts_cuda[i].a[0], ai[0]);
+ atomicAdd(&parts_cuda[i].a[1], ai[1]);
+ atomicAdd(&parts_cuda[i].a[2], ai[2]);
+
+ }
+
+}
+
+__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
+
+ int i, k;
+ double2 j_com_xy;
+ double j_com_z;
+ float j_com_mass;
+ int count = leaf->count;
+ int parts = leaf->parts;
+ int cell_j = cj - cells;
+ int temp;
+ float r2, dx[3], ir, w;
+
+ // if(cell_j < 0)
+// {
+// if(threadIdx.x == 0)
+ // printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
+ // __syncthreads();
+// asm("trap;");
+// }
+
+ // if(threadIdx.x == 0)
+ // printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
+
+
+ temp = cell_j;
+
+ /* Init the com's data.*/
+ j_com_xy = com_xy[cell_j];
+ j_com_z = com_z[cell_j];
+ j_com_mass = com_mass[cell_j];
+
+ for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
+ r2 = 0.0;
+ dx[0] = j_com_xy.x - parts_cuda[i].loc[0];
+ r2 += dx[0] * dx[0];
+ dx[1] = j_com_xy.y - parts_cuda[i].loc[1];
+ r2 += dx[1] * dx[1];
+ dx[2] = j_com_z - parts_cuda[i].loc[2];
+ r2 += dx[2] * dx[2];
+
+ ir = rsqrtf(r2);
+ w = j_com_mass * const_G * ir * ir * ir;
+ atomicAdd( &parts_cuda[i].a[0] , w * dx[0]);
+ atomicAdd( &parts_cuda[i].a[1] , w * dx[1]);
+ atomicAdd( &parts_cuda[i].a[2] , w * dx[2]);
+ }
+//__syncthreads();
+}
+
+/**
+ * @brief Checks whether the cells are direct neighbours or not
+ */
+__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) {
+
+ int k;
+ float dx[3];
+ double cih = ci->h, cjh = cj->h;
+
+ float min_dist = 0.5*(cih+cjh);
+
+ float center_i = ci->loc_xy.x + 0.5*cih;
+ float center_j = cj->loc_xy.x + 0.5*cjh;
+ dx[0] = fabsf(center_i - center_j);
+ center_i = ci->loc_xy.y + 0.5*cih;
+ center_j = cj->loc_xy.y + 0.5*cjh;
+ dx[1] = fabsf(center_i - center_j);
+ center_i = ci->loc_z + 0.5*cih;
+ center_j = cj->loc_z + 0.5*cjh;
+ dx[2] = fabsf(center_i - center_j);
+ return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+}
+
+__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) {
+ int k;
+ float dx[3];
+ float min_dist = ci->h;
+ float center_i = ci->loc_xy.x;
+ float center_j = cj->loc_xy.x;
+ dx[0] = fabsf(center_i - center_j);
+ center_i = ci->loc_xy.y;
+ center_j = cj->loc_xy.y;
+ dx[1] = fabsf(center_i - center_j);
+ center_i = ci->loc_z;
+ center_j = cj->loc_z;
+ dx[2] = fabsf(center_i - center_j);
+ return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+}
+
+__device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
+ return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count);
+}
+
+__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
+
+ struct cell *cp ,*cps;
+
+ /*
+ Find child of ci containing leaf.
+ if child and cj are_neighbours_different_size
+ Loop through children of cj.
+ If child and childj are neighbours
+ If both split -> recurse
+ else
+ make_interact_pc
+ else
+ loop through children of cj
+ make interact pc
+
+ */
+ struct cell *loopend;
+ cp = ci;
+ while(cp->split)
+ {
+ for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
+ if(is_inside(leaf, cp)) break;
+ }
+
+ if(are_neighbours_different_size(cp, cj)) {
+ loopend = &cells[cj->sibling];
+ cps = &cells[cj->firstchild];
+ while(cps != loopend)
+ {
+ if(!are_neighbours(cp, cps)){
+ make_interact_pc(leaf, cps);
+ //__syncthreads();
+ cps = &cells[cps->sibling];
+ }else{
+ if(cps->split)
+ cps = &cells[cps->firstchild];
+ else
+ cps = &cells[cps->sibling];
+ }
+
+ }
+
+ }else{
+ for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
+ make_interact_pc(leaf, cps);
+ // __syncthreads();
+
+ }
+ break;
+ }
+
+ ci = cp;
+ }
+
+
+
+
+
+
+ /* for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
+ if(is_inside(leaf, cp)) break;
+ }
+
+ if(are_neighbours_different_size(cp, cj)) {
+
+ for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
+
+ if(!are_neighbours(cp, cps)) {
+ make_interact_pc(leaf, cps);
+ __syncthreads();
+ } else {
+ if(cp->split && cps->split) {
+ iact_pair_pc(cp, cps, leaf);
+ }
+
+ }
+ }
+ }else{
+
+ for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
+ make_interact_pc(leaf, cps);
+ }
+
+ }*/
+
+// __syncthreads();
+}
+
+/**
+ * @brief Compute the interactions between all particles in a leaf
+ * and all the monopoles in the cell c
+ *
+ * @param c The #cell containing the monopoles
+ * @param leaf The #cell containing the particles
+ */
+__device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
+
+ struct cell *cp, *cps;
+
+ /*if(leaf->split)
+ {
+ printf("Leaf split = 1, oh dear.");
+ asm("trap;");
+ }
+ if(c->split > 1)
+ {
+ printf("Cell had split > 1\n");
+ asm("trap;");
+ }*/
+
+ /* Find the subcell of c the leaf is in.*/
+
+ cp = c;
+ cps = c;
+ while(c->split)
+ {
+ for(cp = &cells[cp->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]){
+ if(is_inside(leaf, cp)) break;
+ }
+ if(cp->split){
+ for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) {
+
+ if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
+ }
+ }
+ c = cp;
+ }
+
+ /*for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) {
+ if(is_inside(leaf, cp)) break;
+ }
+
+ if(cp->split) {
+
+ iact_self_pc(cp, leaf);
+
+ for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) {
+
+ if(cp != cps && cps->split) iact_pair_pc(cp,cps,leaf);
+ }
+
+ }*///TODO
+}
+
+
+
+/**
+ * @brief Compute the interactions between all particles in a cell.
+ *
+ * @param cellID The cell ID to compute interactions on.
+ */
+__device__ void iact_self_direct(int cellID) {
+ struct cell *c = &cells[cellID];
+ double xi[3] = {0.0, 0.0, 0.0};
+ float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
+ __shared__ double2 parts_xy[cell_maxparts];
+ __shared__ double parts_z[cell_maxparts];
+ __shared__ float4 parts_am[cell_maxparts];
+ int parts;
+ int count;
+ int i,j,k;
+
+ //if(threadIdx.x == 0)
+ // printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
+ //If cell is split, interact each child with itself, and with each of its siblings.
+ /*if(c->split) {
+ //TODO
+
+ } else {*/
+ parts = c->parts;
+ count = c->count;
+ int z = threadIdx.x;
+ /* Load particle data into shared memory*/
+ /*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
+ parts_xy[z] = parts_pos_xy[k];
+ parts_z[z] = parts_pos_z[k];
+ parts_am[z] = parts_a_m[k];
+ }
+ __syncthreads();*/
+ for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
+ {
+ xi[0] = parts_cuda[i].loc[0];
+ xi[1] = parts_cuda[i].loc[1];
+ xi[2] = parts_cuda[i].loc[2];
+ for(k = 0; k < 3; k++) {
+ ai[k] = 0.0;
+ }
+ mi = parts_cuda[i].m;
+
+ //for(j = i+1; j!= i; j = (j+1)%count)
+ for(j = parts; j < parts+count; j++)
+ {
+ if(i != j){
+
+ /* Compute the pairwise distance. */
+ r2 = 0.0;
+ dx[0] = xi[0] - parts_cuda[j].loc[0];
+ r2 += dx[0]*dx[0];
+ dx[1] = xi[1] - parts_cuda[j].loc[1];
+ r2 += dx[1]*dx[1];
+ dx[2] = xi[2] - parts_cuda[j].loc[2];
+ r2 += dx[2]*dx[2];
+
+ /* Apply the gravitational acceleration. */
+
+ //ir = 1.0f / sqrtf(r2);
+ ir = rsqrtf(r2);
+ w = const_G * ir * ir * ir;
+ mj = parts_cuda[j].m;
+ for(k = 0; k < 3; k++) {
+ ai[k] -= w * dx[k] * mj;
+ }
+
+ }
+ }
+ //Update.
+ atomicAdd(&parts_cuda[i].a[0],ai[0] );
+ atomicAdd(&parts_cuda[i].a[1],ai[1] );
+ atomicAdd(&parts_cuda[i].a[2],ai[2] );
+ }
+
+
+}
+
+
+/**
+ * @brief Checks whether the cells are direct neighbours or not. Both cells have
+ * to be of the same size
+ */
+static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
+
+ int k;
+ float dx[3];
+
+#ifdef SANITY_CHECKS
+ if (ci->h != cj->h)
+ error(" Cells of different size in distance calculation.");
+#endif
+
+ /* Maximum allowed distance */
+ float min_dist = ci->h;
+
+ /* (Manhattan) Distance between the cells */
+ double2 loc1=ci->loc_xy, loc2=cj->loc_xy;
+ float center_i = loc1.x;
+ float center_j = loc2.x;
+ dx[0] = fabs(center_i - center_j);
+ center_i = loc1.y;
+ center_j = loc2.y;
+ dx[1] = fabs(center_i - center_j);
+ center_i = ci->loc_z;
+ center_j = cj->loc_z;
+ dx[2] = fabs(center_i - center_j);
+
+ return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+}
+
+
+struct cell *cell_get()
+{
+ struct cell *res;
+
+ if(num_cells == 0)
+ {
+ cell_pool = (struct cell*) calloc(INITIAL_CELLS, sizeof(struct cell));
+ if(cell_pool == NULL)
+ error("Failed to allocate cell_pool");
+ com_xy_host = (double2*) calloc(INITIAL_CELLS, sizeof(double2));
+ if(com_xy_host == NULL)
+ error("Failed to allocate cell_pool");
+ com_z_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
+ if(com_z_host == NULL)
+ error("Failed to allocate cell_pool");
+ com_mass_host = (float*) calloc(INITIAL_CELLS, sizeof(float));
+ if(com_mass_host == NULL)
+ error("Failed to allocate cell_pool");
+ num_cells = INITIAL_CELLS;
+ }
+
+ if(used_cells >= num_cells)
+ {
+ /* Stretch */
+ struct cell *new_pool;
+ cell_size *= CELL_STRETCH;
+ new_pool = (struct cell*) calloc(num_cells*CELL_STRETCH, sizeof(struct cell));
+ if(new_pool == NULL)
+ error("Failed to allocate new_pool");
+ if(cell_pool != NULL)
+ memcpy(new_pool, cell_pool, num_cells*sizeof(struct cell));
+
+
+
+ double2 *tempxy = (double2*) calloc(num_cells*CELL_STRETCH, sizeof(double2));
+ if(tempxy == NULL)
+ error("Failed to allocate tempxy");
+ memcpy(tempxy, com_xy_host, sizeof(double2)*num_cells);
+ free(com_xy_host);
+ com_xy_host = tempxy;
+ double *tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
+ if(tempz == NULL)
+ error("Failed to allocate tempz");
+ memcpy(tempz, com_z_host, num_cells*sizeof(double));
+ free(com_z_host);
+ com_z_host = tempz;
+ float *tempm = (float*) calloc(num_cells*CELL_STRETCH, sizeof(float));
+ if(tempm == NULL)
+ error("Failed to allocate tempm");
+ memcpy(tempm, com_mass_host, num_cells*sizeof(float));
+ free(com_mass_host);
+ com_mass_host = tempm;
+
+ num_cells *= CELL_STRETCH;
+ free(cell_pool);
+ cell_pool = new_pool;
+
+ message("Increased size of arrays");
+ }
+ used_cells++;
+ cell_pool[used_cells-1].sibling = -1;
+ cell_pool[used_cells-1].firstchild = -1;
+ cell_pool[used_cells-1].res = qsched_res_none;
+ //cell_pool[used_cells-1].resz = qsched_res_none;
+ //cell_pool[used_cells-1].resm = qsched_res_none;
+ return &cell_pool[used_cells-1];
+}
+
+void comp_com(struct cell *c){
+
+ int k, count = c->count;
+ int cpi;
+ struct cell *cp;
+ int parts = c->parts;
+ double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
+
+ if(c->split) {
+ for(cp = &cell_pool[(cpi = c->firstchild)]; cpi != c->sibling; cp = &cell_pool[(cpi = cp->sibling)]) {
+ float cp_mass = com_mass_host[cpi];
+ com[0] += com_xy_host[cpi].x * cp_mass;
+ com[1] += com_xy_host[cpi].y * cp_mass;
+ com[2] += com_z_host[cpi] * cp_mass;
+ mass += cp_mass;
+ }
+
+
+ /* Otherwise collect the multipole from the particles */
+ } else {
+
+ for(k = parts; k < parts+count; k++)
+ {
+ float p_mass = parts_host[k].m;
+ com[0] += parts_host[k].loc[0] * p_mass;
+ com[1] += parts_host[k].loc[1] * p_mass;
+ com[2] += parts_host[k].loc[2] * p_mass;
+ mass += p_mass;
+ }
+ }
+
+
+ k = c - cell_pool;
+ /* Store the COM data, if it was collected. */
+ if(mass > 0.0) {
+ float imass = 1.0f/mass;
+ com_xy_host[k].x = com[0] * imass;
+ com_xy_host[k].y = com[1] * imass;
+ com_z_host[k] = com[2] * imass;
+ com_mass_host[k] = mass;
+ }else
+ {
+ com_xy_host[k].x = 0.0;
+ com_xy_host[k].y = 0.0;
+ com_z_host[k] = 0.0;
+ com_mass_host[k] = 0.0f;
+ }
+
+
+
+}
+
+/**
+ * @brief Sort the parts into eight bins along the given pivots and
+ * fill the multipoles. Also adds the hierarchical resources
+ * to the sched (TODO).
+ *
+ * @param c The #cell to be split.
+ * @param N The total number of parts.
+ * @param s The #sched to store the resources.
+ */
+void cell_split(int c, struct qsched *s) {
+ int i, j, k, kk, count = cell_pool[c].count;
+ int parts = cell_pool[c].parts;
+ struct part temp_part;
+ double2 tempxy;
+ double tempxy1;
+ float4 tempxy2;
+ struct cell *cp, *cell;
+ int left[8], right[8];
+ double pivot[3];
+ static int root = -1;
+// struct cell *progenitors[8];
+ int progenitors[8];
+ int c1 = c;
+ struct part *temp_xy;
+ double *temp_z;
+ float4 *temp_a_m;
+
+ /* Set the root cell. */
+ if (root < 0) {
+ root = c;
+ cell_pool[c].sibling = -1;
+ }
+
+ if(cell_pool[c].res == qsched_res_none)
+ {
+ if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[c].parts], 0) != cudaSuccess )
+ error("Failed to get host device pointer.");
+// printf("tempxy = %p\n", temp_xy);
+ cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
+ sizeof(struct part) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts);
+ }
+ /*if(cell_pool[c].resz == qsched_res_none)
+ {
+ if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[c].parts], 0) != cudaSuccess )
+ error("Failed to get host device pointer.");
+ cell_pool[c].resz = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_z,
+ sizeof(double) * cell_pool[c].count, parts_pos_z_temp + cell_pool[c].parts);
+ }
+ if(cell_pool[c].resm == qsched_res_none)
+ {
+ if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[c].parts], 0) != cudaSuccess )
+ error("Failed to get host device pointer.");
+ cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m,
+ sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts);
+ }*/
+ // error("Cell has no resource");*///TODO
+
+ if(count > cell_maxparts )
+ {
+ cell_pool[c].split = 1;
+
+ for(k = 0; k < 8; k++)
+ {
+ progenitors[k] = (cp = cell_get()) - cell_pool;
+ cell = &cell_pool[c];
+ cp->loc_xy = cell->loc_xy;
+ cp->loc_z = cell->loc_z;
+ cp->h = cell->h*0.5;
+ if(k & 4) cp->loc_xy.x += cp->h;
+ if(k & 2) cp->loc_xy.y += cp->h;
+ if(k & 1) cp->loc_z += cp->h;
+ }
+
+ /* Init the pivots.*/
+ pivot[0] = cell->loc_xy.x + cell->h * 0.5;
+ pivot[1] = cell->loc_xy.y + cell->h * 0.5;
+ pivot[2] = cell->loc_z + cell->h * 0.5;
+
+ /* Split along the x axis. */
+ i = parts;
+ j = parts+count-1;
+ while(i < j)
+ {
+ while(i <= parts+count-1 && parts_host[i].loc[0] < pivot[0]) i += 1;
+ while(j >= parts && parts_host[j].loc[0] >= pivot[0]) j -= 1;
+ if(i < j){
+ temp_part = parts_host[i];
+ parts_host[i] = parts_host[j];
+ parts_host[j] = temp_part;
+ }
+ }
+ left[1] = i;
+ right[1] = parts+count-1;
+ left[0] = parts;
+ right[0] = j;
+
+ /* Split along the y axis twice. */
+ for (k = 1; k >= 0; k--) {
+ i = left[k];
+ j = right[k];
+ while(i <= j){
+ while(i <= right[k] && parts_host[i].loc[1] < pivot[1]) i += 1;
+ while(j >= left[k] && parts_host[j].loc[1] >= pivot[1]) j -= 1;
+ if(i < j)
+ {
+ temp_part = parts_host[i];
+ parts_host[i] = parts_host[j];
+ parts_host[j] = temp_part;
+ }
+ }
+ left[2*k+1] = i;
+ right[2*k+1] = right[k];
+ left[2*k] = left[k];
+ right[2*k] = j;
+ }
+ /* Split along the z axis four times.*/
+ for(k = 3; k >=0; k--)
+ {
+ i = left[k];
+ j = right[k];
+ while(i <= j){
+ while(i <= right[k] && parts_host[i].loc[2] < pivot[2]) i += 1;
+ while(j >= left[k] && parts_host[j].loc[2] >= pivot[2]) j -= 1;
+ if(i < j)
+ {
+ temp_part = parts_host[i];
+ parts_host[i] = parts_host[j];
+ parts_host[j] = temp_part;
+ }
+ }
+ left[2 * k + 1] = i;
+ right[2 * k + 1] = right[k];
+ left[2 * k] = left[k];
+ right[2 * k] = j;
+ }
+
+ /* Store the counts and offsets. */
+ for(k = 0; k < 8; k++)
+ {
+ cell_pool[progenitors[k]].count = right[k]-left[k]+1;
+ cell_pool[progenitors[k]].parts = left[k];
+ if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
+ error("Failed to get host device pointer.");
+ cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy,
+ sizeof(struct part) * cell_pool[progenitors[k]].count, parts_temp + cell_pool[progenitors[k]].parts);
+ /*if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
+ error("Failed to get host device pointer.");
+ cell_pool[progenitors[k]].resz = qsched_addres(s, qsched_owner_none, cell->resz, temp_z,
+ sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z_temp + cell_pool[progenitors[k]].parts);
+ if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
+ error("Failed to get host device pointer.");
+ cell_pool[progenitors[k]].resm = qsched_addres(s, qsched_owner_none, cell->resm, temp_a_m,
+ sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m_temp + cell_pool[progenitors[k]].parts);*/
+ }
+
+ /* Find the first non-empty progenitor */
+ for(k = 0; k < 8; k++)
+ {
+ if(cell_pool[progenitors[k]].count > 0)
+ {
+ cell->firstchild = progenitors[k];
+ break;
+ }
+ }
+
+ #ifdef SANITY_CHECKS
+ if(cell->firstchild == -1)
+ error("Cell has been split but all children have 0 parts");
+ #endif
+
+ /*Prepare the pointers*/
+ for(k = 0; k < 8; k++)
+ {
+ /* Find the next non-empty sibling */
+ for(kk = k+1; kk < 8; ++kk){
+ if(cell_pool[progenitors[kk]].count > 0){
+ cell_pool[progenitors[k]].sibling = progenitors[kk];
+ break;
+ }
+ }
+
+ /* No non-empty sibling, go back a level.*/
+ if(kk == 8) cell_pool[progenitors[k]].sibling = cell->sibling;
+
+ }
+
+ /* Recurse */
+ for(k = 0; k < 8; k++)
+ if(cell_pool[progenitors[k]].count > 0) cell_split(progenitors[k], s);
+
+ /* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */
+ } else {
+
+// struct cell *data[2] = {root, c};
+ int data[2] = {root, c};
+ int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
+ 2 * sizeof(int), 3000);
+ /*qsched_adduse(s, tid, cell_pool[root].res);
+ qsched_adduse(s, tid, cell_pool[root].resz);
+ qsched_adduse(s, tid, cell_pool[root].resm);*/
+ qsched_addlock(s, tid, cell_pool[c].res);
+ // qsched_addlock(s, tid, cell_pool[c].resz);
+ // qsched_addlock(s, tid, cell_pool[c].resm);
+ }
+
+#ifndef COM_AS_TASK
+ comp_com(&cell_pool[c]);
+#endif
+}
+
+/**
+ * @brief Create the tasks for the cell pair/self.
+ *
+ * @param s The #sched in which to create the tasks.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
+void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
+
+ qsched_task_t tid;
+ int data[2];
+ struct cell /**data[2],*/ *cp, *cps;
+ int cpi;
+
+
+ if(cj == NULL)
+ {
+ if(ci->split)
+ {
+ for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling])
+ {
+ //Self Interaction.
+ create_tasks(s, cp, NULL);
+
+ for(cps = &cell_pool[cp->sibling]; cps != &cell_pool[ci->sibling]; cps = &cell_pool[cps->sibling])
+ create_tasks(s, cp, cps);
+ }
+ }
+ /* Self task */
+ else{
+ data[0] = ci - cell_pool;
+ data[1] = -1;
+ tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, 2);
+ qsched_addlock(s, tid, ci->res);
+ //qsched_addlock(s, tid, ci->resz);
+ //qsched_addlock(s, tid, ci->resm);
+ }
+ }
+ /* Else it's a pair! */
+ else{
+ if(!are_neighbours_host(ci,cj)){/* Cells are not neighbours: nothing to do */
+
+ }else{
+ /*Are both split? */
+ if(ci->split && cj->split)
+ {
+ /* Recurse over both cells. */
+ for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling])
+ for(cps = &cell_pool[cj->firstchild]; cps != &cell_pool[cj->sibling]; cps = &cell_pool[cps->sibling])
+ create_tasks(s, cp, cps);
+
+ /* Otherwise, at least one of the cells is not split, build a direct
+ * interaction. */
+ }else{
+ data[0] = ci-cell_pool;
+ data[1] = cj-cell_pool;
+
+ /* Create the task. */
+ tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
+ sizeof(struct cell *) * 2, 1);
+
+ /* Add the resources. */
+ qsched_addlock(s, tid, ci->res);
+ // qsched_addlock(s, tid, ci->resz);
+ // qsched_addlock(s, tid, ci->resm);
+ qsched_addlock(s, tid, cj->res);
+ // qsched_addlock(s, tid, cj->resz);
+ // qsched_addlock(s, tid, cj->resm);
+ }
+ }
+
+ }
+
+
+
+}
+
+
+__device__ void runner( int type , void *data ) {
+
+ int *idata = (int *)data;
+ int i = idata[0];
+ int j = idata[1];
+ switch ( type ) {
+ case task_type_self:
+ iact_self_direct(i);
+ break;
+ case task_type_pair:
+ iact_pair_direct(&cells[i], &cells[j]);
+ break;
+ case task_type_self_pc:
+ iact_self_pc( &cells[i], &cells[j] );
+ break;
+ default:
+ asm("trap;");
+ }
+ __syncthreads();
+}
+
+__device__ qsched_funtype function = runner;
+
+
+qsched_funtype func;
+/**
+ * @brief Set up and run a task-based Barnes-Hut N-body solver.
+ *
+ * @param N The number of random particles to use.
+ * @param runs Number of force evaluations to use as a benchmark.
+ * @param fileName Input file name. If @c NULL or an empty string, random
+ * particle positions will be used.
+ */
+void test_bh(int N, int runs, char *fileName) {
+ int i, k;
+ struct cell *root;
+ struct part *parts;
+ FILE *file;
+ struct qsched s;
+ ticks tic, toc_run, tot_setup = 0, tot_run = 0;
+ int countMultipoles = 0, countPairs = 0, countCoMs = 0;
+ struct cell *gpu_ptr_cells;
+
+ cudaFree(0);
+ if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
+ error("Failed to copy function pointer from device");
+
+ /* Initialize the scheduler. */
+ qsched_init(&s, 1, qsched_flag_none);
+
+ //Create host particle arrays.
+ if( cudaMallocHost(&parts_host, sizeof(struct part) * N) != cudaSuccess)
+ error("Failed to allocate parts array");
+
+
+
+ if( cudaMalloc(&parts_temp, sizeof(struct part) * N) != cudaSuccess)
+ error("Failed to allocate device parts array");
+ if( cudaMemcpyToSymbol(parts_cuda, &parts_temp, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+ error("Failed to set device symbol for parts array");
+
+
+ if(fileName == NULL || fileName[0] == 0) {
+ for(k = 0; k < N; k++) {
+ parts_host[k].loc[0] = ((double)rand())/ RAND_MAX;
+ parts_host[k].loc[1] = ((double)rand())/ RAND_MAX;
+ parts_host[k].loc[2] = ((double)rand())/ RAND_MAX;
+ parts_host[k].m = ((double)rand()) / RAND_MAX;
+ }
+
+ } else {
+ file = fopen(fileName, "r");
+ int tempxy;
+ if(file) {
+ for(k = 0; k < N; k++) {
+ if(fscanf(file, "%d", &tempxy) != 1)
+ error("Failed to read ID");
+ if(fscanf(file, "%lf%lf%lf", &parts_host[k].loc[0], &parts_host[k].loc[1], &parts_host[k].loc[2]) != 3)
+ error("Failed to read position of part %i.", k);
+ if(fscanf(file, "%f", &parts_host[k].m) != 1)
+ error("Failed to read mass of part %i.", k);
+ }
+ fclose(file);
+ }
+ }
+
+ /* Init the cells. */
+ root = cell_get();
+ root->loc_xy.x = 0.0;
+ root->loc_xy.y = 0.0;
+ root->loc_z = 0.0;
+ root->h = 1.0;
+ root->count = N;
+ root->parts = 0;
+ root->sibling = -1;
+ int c = root-cell_pool;
+ cell_split(root - cell_pool, &s);
+ root = &cell_pool[c];
+ int nr_leaves = 0;
+ int maxparts=0, minparts=1000000;
+ int number = 0;
+ while(c >= 0) {
+ if(cell_pool[c].count > 0)
+ {
+ number++;
+ if(cell_pool[c].res == qsched_res_none)
+ message("cell %i has no res", c);
+ //if(cell_pool[c].resz == qsched_res_none)
+ // message("cell %i has no resz", c);
+ //if(cell_pool[c].resm == qsched_res_none)
+ // message("cell %i has no resm", c);
+ }
+ if(!cell_pool[c].split) {
+ nr_leaves++;
+ if(cell_pool[c].count > maxparts)
+ {
+ maxparts = cell_pool[c].count;
+ }
+ if(cell_pool[c].count < minparts)
+ {
+ minparts = cell_pool[c].count;
+ }
+ c = cell_pool[c].sibling;
+ } else {
+ c = cell_pool[c].firstchild;
+ }
+ }
+ message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
+ message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
+
+ /* for(k = 0; k < num_cells; k++)
+ if(cell_pool[k].split > 1 )
+ printf("Split > 1\n");*/
+
+ create_tasks(&s, root, NULL);
+
+ int self = 0, pair = 0, pc = 0;
+
+ for(k = 0; k < s.count; k++)
+ {
+ if(s.tasks[k].type == task_type_self)
+ self++;
+ else if (s.tasks[k].type == task_type_pair)
+ pair++;
+ else if (s.tasks[k].type >= 0)
+ pc++;
+ }
+
+ message("total number of tasks: %i.", s.count);
+ message("total number of pair tasks: %i.", pair);
+ message("total number of self tasks: %i.", self);
+ message("total number of pc tasks: %i.", pc);
+ message("total number of cells: %i.", number);
+ message("total number of deps: %i.", s.count_deps);
+ message("total number of res: %i.", s.count_res);
+ message("total number of locks: %i.", s.count_locks);
+
+ for(k = 0; k < runs; k++) {
+ for(i = 0; i < N; ++i) {
+ parts_host[i].a[0] = 0.0;
+ parts_host[i].a[1] = 0.0;
+ parts_host[i].a[2] = 0.0;
+ }
+
+
+ /* Copy the cells to the device. */
+ if( cudaMalloc( &gpu_ptr_cells , sizeof(struct cell) * used_cells) != cudaSuccess)
+ error("Failed to allocate cells on the GPU");
+ if( cudaMemcpy( gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+ error("Failed to copy cells to the GPU");
+ if( cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell*), 0, cudaMemcpyHostToDevice) != cudaSuccess )
+ error("Failed to copy cell pointer to the GPU");
+
+double2 *com_temp;
+double *comz_temp;
+float *comm_temp;
+
+ if(cudaMalloc( &com_temp, sizeof(double2) * used_cells) != cudaSuccess)
+ error("Failed to allocate com on the GPU");
+ if( cudaMemcpy( com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+ error("failed to copy com to the GPU");
+ if( cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+ error("Failed to copy com pointer to the GPU");
+
+
+ if(cudaMalloc( &comz_temp, sizeof(double) * used_cells) != cudaSuccess)
+ error("Failed to allocate com on the GPU");
+ if( cudaMemcpy( comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+ error("failed to copy com to the GPU");
+ if( cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+ error("Failed to copy com pointer to the GPU");
+
+ if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
+ error("Failed to allocate com on the GPU");
+ if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+ error("failed to copy com to the GPU");
+ if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+ error("Failed to copy com pointer to the GPU");
+
+ /* for(i = 0; i < s.count; i++){
+ int *idata = (int*)&s.data[s.tasks[i].data];
+ if(s.tasks[i].type == task_type_self)
+ printf("Self task with data[0] = %i\n", idata[0]);
+ if(s.tasks[i].type == task_type_pair) {
+ printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
+ }
+ if(s.tasks[i].type == task_type_self_pc) {
+ printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
+ }
+ }*/
+
+ // printf("com_mass_host[152] = %f\n", com_mass_host[152]);
+
+
+ //Run code.
+// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
+ qsched_run_CUDA( &s , func );
+ qsched_print_cuda_timers(&s);
+
+ k = 0;
+ printf("%e, %e, %e, %e, %e, %e, %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
+ parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
+struct task* tasks = qsched_get_timers( &s , s.count );
+ for(i = 0; i < s.count; i++)
+ {
+ printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
+ // printf("\n");
+
+ }
+
+}
+ /* Dump the particles to a file */
+ file = fopen("particle_dump.dat", "w");
+/* fprintf(file,
+ "# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
+ "a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
+ for (k = 0; k < N; ++k)
+ fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n",
+ parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
+ parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
+ fclose(file);
+
+}
+
+
+int main(int argc, char *argv[]) {
+ int c, nr_threads;
+ int N = 1000, runs = 1;
+ char fileName[100] = {0};
+
+ /* Parse the options */
+ while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) {
+ case 'n':
+ if (sscanf(optarg, "%d", &N) != 1)
+ error("Error parsing number of particles.");
+ break;
+ case 'r':
+ if (sscanf(optarg, "%d", &runs) != 1)
+ error("Error parsing number of runs.");
+ break;
+ case 't':
+ if (sscanf(optarg, "%d", &nr_threads) != 1)
+ error("Error parsing number of threads.");
+// omp_set_num_threads(nr_threads);
+ break;
+ case 'f':
+ if (sscanf(optarg, "%s", &fileName[0]) != 1)
+ error("Error parsing file name.");
+ break;
+ case '?':
+ fprintf(stderr, "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file] "
+ "[-c Nparts] [-i Niterations] \n",
+ argv[0]);
+ fprintf(stderr, "Solves the N-body problem using a Barnes-Hut\n"
+ "tree code with N random particles read from a file in "
+ "[0,1]^3 using"
+ "nr_threads threads.\n"
+ "A test of the neighbouring cells interaction with "
+ "Nparts per cell is also run Niterations times.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /* Tree node information */
+ printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
+
+ /* Part information */
+// printf("Size of part: %zu bytes.\n", sizeof(struct part));
+
+ /* Dump arguments. */
+ if (fileName[0] == 0) {
+ message("Computing the N-body problem over %i random particles using %i "
+ "threads (%i runs).",
+ N, nr_threads, runs);
+ } else {
+ message("Computing the N-body problem over %i particles read from '%s' "
+ "using %i threads (%i runs).",
+ N, fileName, nr_threads, runs);
+ }
+
+ /* Run the BH test. */
+ test_bh(N, runs, fileName);
+}
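+
+/* A possible way to build and run this example (the nvcc flags and library
+ * name below are assumptions, not taken from the QuickSched build system):
+ *
+ *   nvcc -O3 -I.. test_bh_3.cu -lquicksched -o test_bh_3
+ *   ./test_bh_3 -n 100000 -r 1
+ *
+ * The resulting accelerations are written to particle_dump.dat.
+ */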
--
GitLab