diff --git a/README b/README index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..59ae0410860ec6e1fad951920ef78dd4c832793e 100644 --- a/README +++ b/README @@ -0,0 +1,6 @@ +Installation Instructions: + +1) Run ./autogen.sh +2) Run ./configure +3) Run make in the main directory. +4) Navigate to the src/ directory and execute the ./CUDACompile.sh script. With gcc-4.8 as the default compiler and nvcc command registered it should work. diff --git a/autogen.sh b/autogen.sh index 6334558115f4bf217877cb630ef76842e3052cb4..c513213392e44e1b736fcb9d9f39a0e0cac9cb6e 100755 --- a/autogen.sh +++ b/autogen.sh @@ -31,4 +31,3 @@ autoheader -I m4 # run automake automake --add-missing - diff --git a/examples/test_bh.cu b/examples/test_bh.cu index cfc5af5e9802a1a85e1431b793c78eee7d4b5a73..6c8f7bfc6df1d7c4dc5621cc12350da446b5734c 100644 --- a/examples/test_bh.cu +++ b/examples/test_bh.cu @@ -1,7 +1,7 @@ /******************************************************************************* * This file is part of QuickSched. * Coypright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk), - * Matthieu Schaller (matthieu.schaller@durham.ac.uk) + * Aidan Chalk (aidan.chalk@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -17,6 +17,9 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. * * *****************************************************************************/ +//#define EXACT +//#define ID +#define NO_QUADRUPOLES /* Config parameters. */ #include "../config.h" @@ -32,868 +35,892 @@ #include <omp.h> #include <fenv.h> - - /* Local includes. */ extern "C"{ #include "quicksched.h" #include "res.h" -#include <cblas.h> } #include "cuda_queue.h" - -/* Some local constants. */ -#define cell_pool_grow 1000 -#define cell_maxparts 128 -#define task_limit 1e8 -#define const_G 1 // 6.6738e-8 -#define dist_min 0.5 /* Used for legacy walk only */ -#define dist_cutoff_ratio 1.5 - -#define ICHECK -1 -#define NO_SANITY_CHECKS -#define NO_COM_AS_TASK -#define NO_COUNTERS - -/** Data structure for the particles. */ -struct part { - double x[3]; - // union { - float a[3]; - float a_legacy[3]; - float a_exact[3]; - // }; - float mass; - int id; -}; // __attribute__((aligned(64))); - -struct multipole { - double com[3]; - float mass; -}; - -struct cell_device{ - double loc[3]; - double h; - int count; - unsigned short int split,sorted; - int part_loc; /*Index of particles in part array. */ - int first_child; /* Index of first child in cell array. */ - int sibling; /* Next node.*/ - struct multipole mpole; - int res, com_tid; -} - -/** Data structure for the BH tree cell. */ -struct cell { - double loc[3]; - double h; - - int count; - unsigned short int split, sorted; - struct part *parts; - - struct cell *firstchild; /* Next node if opening */ - struct cell *sibling; /* Next node */ - -/** Global variable for the pool of allocated cells. */ - - /* We keep both CoMs and masses to make sure the comp_com calculation is - * correct (use an anonymous union to keep variable names compact). */ -// union { - - /* Information for the legacy walk */ -// struct multipole legacy; - - /* Information for the QuickShed walk */ - struct multipole mpole; -// }; - - int res, com_tid; - struct index *indices; - -} __attribute__((aligned(128))); -struct cell *cell_pool = NULL; +#define double float +#define double2 float2 +//#define float double +//#define float2 double2 /** Task types. 
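+ *
+ * task_type_self          - direct N^2 interactions between the particles of a single cell (iact_self_direct).
+ * task_type_pair          - direct interactions between the particles of two neighbouring cells (iact_pair_direct).
+ * task_type_pair_pc       - particle-multipole interactions of each child of ci with the non-neighbouring children of cj (iact_pair_pc).
+ * task_type_pair_pc_split - as task_type_pair_pc, but for a single child of ci against the children of cj (iact_pair_pc_split).
+ * task_type_com           - centre-of-mass computation; comp_com() currently runs on the host during cell_split(), so no tasks of this type are created.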
*/ enum task_type { task_type_self = 0, task_type_pair, - task_type_self_pc, + task_type_pair_pc, + task_type_pair_pc_split, task_type_com, task_type_count }; -__device__ struct part *parts; +struct cell{ +double2 loc_xy; +double loc_z; +double h; +int count; +unsigned short int split, sorted; +int parts, firstchild, sibling; +int res, com_tid; -#ifdef TODO -__device__ double *part_x0; -__device__ double *part_x1; -__device__ double *part_x2; -__device__ float *part_a0; -__device__ float *part_a1; -__device__ float *part_a2; -__device__ float *part_mass; +}; +struct part{ +#ifdef ID +int id; #endif +double loc[3]; +float a[3]; +float m; +}; -#define cuda_error(s, ...) {if(threadIdx.x == 0){fprintf( stderr , "%s:%s():%i: " s "\n" , __FILE__ , __FUNCTION__ , __LINE__ , ##__VA_ARGS__ );} __syncthreads(); asm("trap\n"); } - +#define const_G 1 +/* Requred variables to obtain cells. */ +#define cell_maxparts 128 +#define CELL_STRETCH 2 +#define INITIAL_CELLS 256 +struct cell *cell_pool = NULL; +int used_cells=0; +int num_cells = 0; +int cell_size = INITIAL_CELLS*sizeof(struct cell); + +/* Device locations for the particle values. */ +__device__ double2 *com_xy; +__device__ double *com_z; +__device__ float *com_mass; +#ifdef QUADRUPOLES +__device__ double *I_xx; +__device__ double *I_yy; +__device__ double *I_zz; +__device__ double *I_xy; +__device__ double *I_xz; +__device__ double *I_yz; +#endif +__device__ struct part *parts_cuda; +__device__ struct cell *cells; +__device__ int pc_calcs = 0; + + +/* Host locations for the particle values. */ +double2 *com_xy_host; +double *com_z_host; +float *com_mass_host; +#ifdef QUADRUPOLES +double *I_xx_host; +double *I_yy_host; +double *I_zz_host; +double *I_xy_host; +double *I_xz_host; +double *I_yz_host; +#endif +double2 *parts_pos_xy_temp; +double *parts_pos_z_temp; +float4 *parts_a_m_temp; -__device__ double atomicAdd(double* address, double val) -{ - unsigned long long int* address_as_ull = - (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); -} +struct part *parts_host; +struct part *parts_temp; -__device__ inline double __shfl_down_dbl(double var, unsigned int srcLane, int width) -{ - int2 a = *reinterpret_cast<int2*>(&var); - a.x = __shfl_down(a.x, srcLane, width); - a.y = __shfl_down(a.y, srcLane, width); - return *reinterpret_cast<double*>(&a); -} +#ifndef double +__device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) +} while (assumed != old); return __longlong_as_double(old); } +#endif -__device__ inline void reduceSumDouble( double* value) -{ - for(int mask = 16; mask > 0; mask >>= 1) - *value += __shfl_down_dbl( *value, mask, 32); -} -__device__ inline void reduceSum( float* value ) -{ - #pragma unroll - for(int mask = 16 ; mask > 0 ; mask >>= 1) - *value += __shfl_down((float)*value, mask); +__device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) { + int i, j, k; + int count_i = ci->count, count_j = cj->count; + int parts_i = ci->parts, parts_j = 
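+ /* The pair interaction below is done in two one-sided passes: the block's threads
+    first stride over cell i's particles and accumulate the pull of every particle
+    in cell j, then the roles of the two cells are swapped; each accumulated
+    acceleration is written back with atomicAdd. */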
cj->parts; + double xi[3]; + float dx[3]; + float ai[3], mj, r2, w, ir; + + /* Loop over cell i.*/ + for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) { + xi[0] = parts_cuda[i].loc[0]; + xi[1] = parts_cuda[i].loc[1]; + xi[2] = parts_cuda[i].loc[2]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0f; + } + + for(j = parts_j; j < parts_j + count_j; j++) { + r2 = 0.0f; + dx[0] = xi[0] - parts_cuda[j].loc[0]; + dx[1] = xi[1] - parts_cuda[j].loc[1]; + dx[2] = xi[2] - parts_cuda[j].loc[2]; + r2 += dx[0] * dx[0]; + r2 += dx[1] * dx[1]; + r2 += dx[2] * dx[2]; - // *value = __shfl((float)*value, 0); -} -/** - * @brief Checks whether the cells are direct neighbours ot not - */ -__device__ __forceinline__ int are_neighbours_different_size_CUDA(struct cell *ci, - struct cell *cj) { + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_cuda[j].m; + for(k = 0; k < 3; k++) { + ai[k] -= dx[k] * mj * w; + } + } + atomicAdd(&parts_cuda[i].a[0], ai[0]); + atomicAdd(&parts_cuda[i].a[1], ai[1]); + atomicAdd(&parts_cuda[i].a[2], ai[2]); + + } - int k; - float dx[3]; - double cih = ci->h, cjh = cj->h; + /* Load particles of cell i into shared memory */ + /*Loop over cell j. */ + for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) { + xi[0] = parts_cuda[i].loc[0]; + xi[1] = parts_cuda[i].loc[1]; + xi[2] = parts_cuda[i].loc[2]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0f; + } + + for(j = parts_i; j < parts_i + count_i; j++) { + r2 = 0.0f; + dx[0] = xi[0] - parts_cuda[j].loc[0]; + dx[1] = xi[1] - parts_cuda[j].loc[1]; + dx[2] = xi[2] - parts_cuda[j].loc[2]; + r2 += dx[0] * dx[0]; + r2 += dx[1] * dx[1]; + r2 += dx[2] * dx[2]; - /* Maximum allowed distance */ - float min_dist = 0.5 * (cih + cjh); - /* (Manhattan) Distance between the cells */ - for (k = 0; k < 3; k++) { - float center_i = ci->loc[k] + 0.5 * cih; - float center_j = cj->loc[k] + 0.5 * cjh; - dx[k] = fabsf(center_i - center_j); - } + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_cuda[j].m; + for(k = 0; k < 3; k++) { + ai[k] -= dx[k] * mj * w; + } + } + atomicAdd(&parts_cuda[i].a[0], ai[0]); + atomicAdd(&parts_cuda[i].a[1], ai[1]); + atomicAdd(&parts_cuda[i].a[2], ai[2]); + + } - return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); } -/** - * @brief Compute the center of mass of a given cell. - * - * @param c The #cell. - */ -__device__ void comp_com_CUDA(struct cell *c) { - int k, count = c->count; - struct cell *cp; - struct part *parts = c->parts; - double com[3] = {0.0, 0.0, 0.0}; - float mass = 0.0; +__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) { - if (c->split) { + int i; + int count = leaf->count; + int parts = leaf->parts; + int cell_j = cj - cells; + float r2; + float dx[3], ir, w; - /* Loop over the projenitors and collect the multipole information. */ if(threadIdx.x == 0) - { - for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) { - float cp_mass = cp->mpole.mass; - com[0] += cp->mpole.com[0] * cp_mass; - com[1] += cp->mpole.com[1] * cp_mass; - com[2] += cp->mpole.com[2] * cp_mass; - mass += cp_mass; - } - /* Store the COM data, if any was collected. */ - if (mass > 0.0) { - float imass = 1.0f / mass; - c->mpole.com[0] = com[0] * imass; - c->mpole.com[1] = com[1] * imass; - c->mpole.com[2] = com[2] * imass; - c->mpole.mass = mass; - } else { - c->mpole.com[0] = 0.0; - c->mpole.com[1] = 0.0; - c->mpole.com[2] = 0.0; - c->mpole.mass = 0.0; - } - } - /* Otherwise, collect the multipole from the particles. 
*/ - } else { - - for (k = threadIdx.x; k < count; k+= blockDim.x) { - float p_mass = part_mass[k]; - com[0] += part_x0[k] * p_mass; - com[1] += part_x1[k] * p_mass; - com[2] += part_x2[k] * p_mass; - mass += p_mass; - } - reduceSumDouble(com); - reduceSumDouble(&com[1]); - reduceSumDouble(&com[2]); - reduceSum(&mass); - if(threadIdx.x % 32 == 0) - { - if (mass > 0.0) { - float imass = 1.0f / mass; - atomicAdd( &c->mpole.com[0] , com[0] * imass); - atomicAdd( &c->mpole.com[1] , com[1] * imass); - atomicAdd( &c->mpole.com[2] , com[2] * imass); - atomicAdd( &c->mpole.mass , mass); - } else if(threadIdx.x == 0) { - c->mpole.com[0] = 0.0; - c->mpole.com[1] = 0.0; - c->mpole.com[2] = 0.0; - c->mpole.mass = 0.0; - } + atomicAdd(&pc_calcs, 1); + + /* int leaf_num = leaf - cells; + int cjnum = cj - cells; + if(threadIdx.x == 0 && leaf_num == 55) + printf("Calculating with %d %i whose firstchild is %i and cj->split == %i\n", (cjnum), cj->firstchild, cj->split);*/ +/* if(threadIdx.x == 0) + printf("leaf = %i\n", leaf - cells);*/ + + for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) { + r2 = 0.0; + dx[0] = com_xy[cell_j].x - parts_cuda[i].loc[0]; + r2 += dx[0] * dx[0]; + dx[1] = com_xy[cell_j].y - parts_cuda[i].loc[1]; + r2 += dx[1] * dx[1]; + dx[2] = com_z[cell_j] - parts_cuda[i].loc[2]; + r2 += dx[2] * dx[2]; + + ir = rsqrtf(r2); + w = com_mass[cell_j] * const_G * ir * ir * ir; + printf("Doing make_interact_pc\n"); +/* if(threadIdx.x == 0) + printf("part.a = %e w*dx = %e \n", parts_cuda[i].a[0], w*dx[0]);*/ + atomicAdd(&parts_cuda[i].a[0], w*dx[0]); + atomicAdd(&parts_cuda[i].a[1], w*dx[1]); + atomicAdd(&parts_cuda[i].a[2], w*dx[2]); +/* (*accels).x+= w*dx[0]; + (*accels).y+= w*dx[1]; + (*accels).z+= w*dx[2];*/ } - } - } - /** - * @brief Checks whether the cells are direct neighbours ot not. Both cells have - * to be of the same size + * @brief Checks whether the cells are direct neighbours ot not */ -__device__ __forceinline__ int are_neighbours_CUDA(struct cell *ci, struct cell *cj) { - - int k; - float dx[3]; - -#ifdef SANITY_CHECKS - if (ci->h != cj->h) - error(" Cells of different size in distance calculation."); -#endif - - /* Maximum allowed distance */ - float min_dist = ci->h; +__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) { - /* (Manhattan) Distance between the cells */ - for (k = 0; k < 3; k++) { - float center_i = ci->loc[k]; - float center_j = cj->loc[k]; - dx[k] = fabsf(center_i - center_j); - } - - return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); + float dx[3]; + float cih = ci->h, cjh = cj->h; + + float min_dist = 0.5*(cih+cjh); + + float center_i = ci->loc_xy.x + 0.5*cih; + float center_j = cj->loc_xy.x + 0.5*cjh; + dx[0] = fabsf(center_i - center_j); + center_i = ci->loc_xy.y + 0.5*cih; + center_j = cj->loc_xy.y + 0.5*cjh; + dx[1] = fabsf(center_i - center_j); + center_i = ci->loc_z + 0.5*cih; + center_j = cj->loc_z + 0.5*cjh; + dx[2] = fabsf(center_i - center_j); + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); } -/** - * @brief Compute the interactions between all particles in a cell - * and all particles in the other cell (N^2 algorithm). - * Only applies the forces to cell ci. - * - * @param ci The #cell containing the particles. 
- * @param cj The #cell containing the other particles - */ -__device__ __forceinline__ void iact_pair_direct_CUDA(struct cell_device *ci, struct cell_device *cj) -{ - int i, j, k; - int count_i = ci->count, count_j = cj->count; - //struct part *parts_i = ci->parts, *parts_j = cj->parts; - int parts_it = ci->part_loc, parts_jt = cj->part_loc; - double xi[3]; - float dx[3], ai[3], r2, w, ir; +/*__device__ __forceinline__ int are_neighbours_different_size(double cih, float ci_x, float ci_y, float ci_z, struct cell *cj) { + float dx_x, dx_y, dx_z; + double cjh = cj->h; + float min_dist = 0.5*(cih+cjh); + float center_i = ci_x + 0.5*cih; + float center_j = cj->loc_xy.x + 0.5*cjh; + dx_x = fabsf(center_i - center_j); + center_i = ci_y + 0.5*cih; + center_j = cj->loc_xy.y + 0.5*cjh; + dx_y = fabsf(center_i - center_j); + center_i = ci_z + 0.5*cih; + center_j = cj->loc_z + 0.5*cjh; + dx_z = fabsf(center_i - center_j); + return (dx_x <= min_dist) && (dx_y <= min_dist) && (dx_z <= min_dist); +}*/ + +__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) { + float dx[3]; + float min_dist = ci->h; + float center_i = ci->loc_xy.x; + float center_j = cj->loc_xy.x; + dx[0] = fabsf(center_i - center_j); + center_i = ci->loc_xy.y; + center_j = cj->loc_xy.y; + dx[1] = fabsf(center_i - center_j); + center_i = ci->loc_z; + center_j = cj->loc_z; + dx[2] = fabsf(center_i - center_j); + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} -#ifdef SANITY_CHECKS +/*__device__ __forceinline__ int are_neighbours(float cih, float ci_x, float ci_y, float ci_z, struct cell *cj) { + float dx_x, dx_y, dx_z; + float center_j = cj->loc_xy.x; + dx_x = fabsf(ci_x - center_j); + center_j = cj->loc_xy.y; + dx_y = fabsf(ci_y - center_j); + center_j = cj->loc_z; + dx_z = fabsf(ci_z - center_j); + return (dx_x <= cih) && (dx_y <= cih) && (dx_z <= cih); +}*/ + +__device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) { + return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count); +} - /* Bad stuff will happen if cell sizes are different */ - if (ci->h != cj->h) - error("Non matching cell sizes !! h_i=%f h_j=%f\n", ci->h, cj->h); +__device__ __forceinline__ void make_interact_pc_new(struct cell *child, int *mpoles, int count2) +{ +int i,j; + int count = child->count; + int parts = child->parts; +// int cell_j = cj - cells; + float r2; + float dx[3], ir, w; + float a[3]; +#ifdef QUADRUPOLES + __shared__ double I_xx_local[8]; + __shared__ double I_xy_local[8]; + __shared__ double I_xz_local[8]; + __shared__ double I_yy_local[8]; + __shared__ double I_yz_local[8]; + __shared__ double I_zz_local[8]; + + if(threadIdx.x < count2){ + i = threadIdx.x; + I_xx_local[i] = I_xx[mpoles[i]]; + I_xy_local[i] = I_xy[mpoles[i]]; + I_xz_local[i] = I_xz[mpoles[i]]; + I_yy_local[i] = I_yy[mpoles[i]]; + I_yz_local[i] = I_yz[mpoles[i]]; + I_zz_local[i] = I_zz[mpoles[i]]; + } + __syncthreads(); #endif - - /* Loop over particles in cell i */ - for(i=0; i < count_i; i+= blockDim.x) - { - /* Init the ith particle's data. */ - xi[0] = part_x0[parts_it+i]; - xi[1] = part_x1[parts_it+i]; - xi[2] = part_x2[parts_it+i]; - for (k = 0; k < 3; k++) { - ai[k] = 0.0f; - } - // mi = parts_i[i].mass; - /* Loop over particles in cell j */ - for(j = 0; j < count_j; j++) - { - /* Compute the pairwise distance. 
*/ -// for (r2 = 0.0, k = 0; k < 3; k++) { - dx[0] = xi[0] - part_x0[parts_jt+j]; - dx[1] = xi[1] - part_x1[parts_jt+j]; - dx[2] = xi[2] - part_x2[parts_jt+j]; - r2 = dx[0] * dx[0]; + for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) { + a[0] = 0.0; + a[1] = 0.0; + a[2] = 0.0; + for(j = 0; j < count2; j++){ + r2 = 0.0; + dx[0] = com_xy[mpoles[j]].x - parts_cuda[i].loc[0]; + r2 += dx[0] * dx[0]; + dx[1] = com_xy[mpoles[j]].y - parts_cuda[i].loc[1]; r2 += dx[1] * dx[1]; + dx[2] = com_z[mpoles[j]] - parts_cuda[i].loc[2]; r2 += dx[2] * dx[2]; - // } - /* Apply the gravitational acceleration. */ - ir = 1.0f / sqrtf(r2); - w = const_G * ir * ir * ir; - for(k = 0; k < 3; k++) { - ai[k] += w * dx[k] * part_mass[parts_jt+j];; - } - } - atomicAdd(&parts_a0[parts_it+i], ai[0]); - atomicAdd(&parts_a1[parts_it+i], ai[1]); - atomicAdd(&parts_a2[parts_it+i], ai[2]); - }/* Loop over particles in cell i.*/ -} - -/** - * @brief Decides whether two cells use the direct summation interaction or the - * multipole interactions - * - * @param ci The #cell. - * @param cj The other #cell. - */ -__device__ void iact_pair_CUDA(struct cell *ci, struct cell *cj) { - - struct cell *cp, *cps; - -#ifdef SANITY_CHECKS - - /* Early abort? */ - if (ci->count == 0 || cj->count == 0) error("Empty cell !"); - - /* Bad stuff will happen if cell sizes are different */ - if (ci->h != cj->h) - error("Non matching cell sizes !! h_i=%f h_j=%f\n", ci->h, cj->h); - - /* Sanity check */ - if (ci == cj) - error("The impossible has happened: pair interaction between a cell and " - "itself."); - + + ir = rsqrtf(r2); + w = com_mass[mpoles[j]] * const_G * ir * ir * ir; +#ifndef QUADRUPOLES + a[0] += w*dx[0]; + a[1] += w*dx[1]; + a[2] += w*dx[2]; +#else + float mrinv5 = w * ir * ir; + float mrinv7 = mrinv5 * ir * ir; + float D1 = -w; + float D2 = 3.f * mrinv5; + float D3 = -15.f * mrinv7; + + float q = I_xx_local[j] + I_yy_local[j] + I_zz_local[j]; + float qRx = I_xx_local[j] * dx[0] + I_xy_local[j] * dx[1] + I_xz_local[j] * dx[2]; + float qRy = I_xy_local[j] * dx[0] + I_yy_local[j] * dx[1] + I_yz_local[j] * dx[2]; + float qRz = I_xz_local[j] * dx[0] + I_yz_local[j] * dx[1] + I_zz_local[j] * dx[2]; + float qRR = qRx * dx[0] + qRy * dx[1] + qRz * dx[2]; + float C = D1 + 0.5f * D2 * q + 0.5f * D3 * qRR; + + a[0] -= C * dx[0] + D2 * qRx; + a[1] -= C * dx[1] + D2 * qRy; + a[2] -= C * dx[2] + D2 * qRz; + #endif + } +/* if(threadIdx.x == 0) + printf("part.a = %e w*dx = %e \n", parts_cuda[i].a[0], w*dx[0]);*/ + atomicAdd(&parts_cuda[i].a[0], a[0]); + atomicAdd(&parts_cuda[i].a[1], a[1]); + atomicAdd(&parts_cuda[i].a[2], a[2]); +/* (*accels).x+= w*dx[0]; + (*accels).y+= w*dx[1]; + (*accels).z+= w*dx[2];*/ + } - /* Are the cells direct neighbours? */ - if (are_neighbours_CUDA(ci, cj)) { +} - /* Are both cells split ? */ - if (ci->split && cj->split) { +__device__ __forceinline__ void iact_pair_pc(struct cell *ci, struct cell *cj) +{ +struct cell *cp, *cps; +int icells[8]; +int count = 0; - /* Let's split both cells and build all possible pairs */ - for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { - for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { + /* Let's split both cells and build all possible non-neighbouring pairs */ + for (cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) { + count = 0; + for (cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) { - /* If the cells are neighbours, recurse. 
*/ - if (are_neighbours_CUDA(cp, cps)) { - iact_pair_CUDA(cp, cps); - } - } + if ( ! are_neighbours(cp, cps) ) { + icells[count] = cps - cells; + count++; } - } else {/* Otherwise, compute the interactions at this level directly. */ - iact_pair_direct_CUDA(ci, cj); - iact_pair_direct_CUDA(cj, ci); } + make_interact_pc_new(cp, icells, count); } } -__device__ void iact_self_direct_CUDA(struct cell_device *c) +__device__ __forceinline__ void iact_pair_pc_split(struct cell *child, struct cell *cj) { - int i, j, k, count = c->count; - double xi[3] = {0.0, 0.0, 0.0}; - // float ai[3] = {0.0, 0.0, 0.0}, mj, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w; - struct part *parts = c->parts; - int partst = c->part_loc; - struct cell *cp, *cps; - -#ifdef SANITY_CHECKS - - /* Early abort? */ - if (count == 0) - { - cuda_error("Empty cell"); + struct cell *cps; + int icells[8]; + int count = 0; + for (cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) { + if(!are_neighbours(child, cps) ){ + icells[count] = cps - cells; + count++; + } } + make_interact_pc_new(child, icells, count); +} -#endif +#ifdef OLD +__device__ __forceinline__ void iact_self_pc(struct cell *c, struct cell *leaf) +{ - /* If the cell is split, interact each progeny with itself, and with - each of its siblings. */ - if(c->split) - { - for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) { - iact_self_direct_CUDA(cp); - for (cps = cp->sibling; cps != c->sibling; cps = cps->sibling) - iact_pair_CUDA(cp, cps); - } - } else { + struct cell *ci, *cj, *cp, *cps; + float leafh = leaf->h; + float3 accelerations; - /* Loop over all particles. */ - for(i = threadIdx.x; i < count; i+= blockDim.x) - { - /* Init the ith particle's data. */ - for (k = 0; k < 3; k++) { - xi[k] = parts[i].x[k]; - ai[k] = 0.0; - } -// mi = parts[i].mass; + int i; - /* Loop all particles - note this was an improvement in mdcore so we use it here. */ - for(j = 0; j < count; j++) + if(threadIdx.x < leaf->count) +{ + accelerations.x = 0.0f; + accelerations.y = 0.0f; + accelerations.z = 0.0f; + + while(c != leaf) + { + ci = NULL; + /* Loop over children of c. */ + for(cj = &cells[c->firstchild]; cj != &cells[c->sibling]; cj = &cells[cj->sibling]) + { + if(ci == NULL && is_inside(leaf, cj)) { - /* Compute the pairwise distance. */ - for (r2 = 0.0, k = 0; k < 3; k++) { - dx[k] = xi[k] - parts[j].x[k]; - r2 += dx[k] * dx[k]; - } - /* Apply the gravitational acceleration. */ - ir = 1.0f / sqrtf(r2); - w = const_G * ir * ir * ir; - mj = parts[j].mass; - for(k = 0; k < 3; k++) + ci = cj; + }else{ + //Depth first search of cj. + cp = cj; + while(cp != &cells[cj->sibling]) { - ai[k] -= w * dx[k] * mj; + if(!are_neighbours(cp, leaf)) + { +// if(!cp->split) + make_interact_pc(leaf, cp, &accelerations); +// else + // for(cps = &cells[cp->firstchild]; cps != &cells[cp->sibling]; cps = &cells[cps->sibling]) + // make_interact_pc(leaf, cps, &accelerations); + + cp = &cells[cp->sibling]; + }else{ + if(cp->split && cp->h > leafh) + cp = &cells[cp->firstchild]; + else + cp = &cells[cp->sibling]; + } } - } /* Inner particle loop. */ - /* Store the accumulated acceleration on the ith part. 
*/ - for (k = 0; k < 3; k++) atomicAdd( &parts[i].a[k] , ai[k]); - }/* Outer particle loop */ - }/* All done!*/ + } + + } -} + c = ci; -/** - * @brief Checks whether the cell leaf is a subvolume of the cell c - */ -__device__ __forceinline__ int is_inside_CUDA(struct cell *leaf, struct cell *c) { - return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count); + } + int parts = leaf->parts; + int count = leaf->count; + for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) { + atomicAdd( &parts_cuda[i].a[0] , accelerations.x); + atomicAdd( &parts_cuda[i].a[1] , accelerations.y); + atomicAdd( &parts_cuda[i].a[2] , accelerations.z); + } } - -/** - * @brief Interacts all particles in ci with the monopole in cj - */ -__device__ __forceinline__ void make_interact_pc_CUDA(struct cell *leaf, struct cell *cj) { - - int j, k, count = leaf->count; - double com[3] = {0.0, 0.0, 0.0}; - float mcom, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w; - struct part *parts = leaf->parts; - -#ifdef SANITY_CHECKS - - /* Sanity checks */ - if (leaf->count == 0) error("Empty cell!"); - - /* Sanity check. */ - if (cj->new.mass == 0.0) { - message("%e %e %e %d %p %d %p", cj->new.com[0], cj->new.com[1], - cj->new.com[2], cj->count, cj, cj->split, cj->sibling); - - for (j = 0; j < cj->count; ++j) - message("part %d mass=%e id=%d", j, cj->parts[j].mass, cj->parts[j].id); - - error("com does not seem to have been set."); - } - +} #endif - /* Init the com's data. */ - for (k = 0; k < 3; k++) com[k] = cj->mpole.com[k]; - mcom = cj->mpole.mass; - /* Loop over every particle in leaf. */ - for (j = threadIdx.x; j < count; j+=blockDim.x) { - /* Compute the pairwise distance. */ - for (r2 = 0.0, k = 0; k < 3; k++) { - dx[k] = com[k] - parts[j].x[k]; - r2 += dx[k] * dx[k]; - } - /* Apply the gravitational acceleration. */ - ir = 1.0f / sqrtf(r2); - w = mcom * const_G * ir * ir * ir; - - for (k = 0; k < 3; k++) atomicAdd( &parts[j].a[k] , w * dx[k]); - -#if ICHECK >= 0 - if (leaf->parts[j].id == ICHECK) - printf("[DEBUG] cell [%f,%f,%f] interacting with cj->loc=[%f,%f,%f] " - "m=%f h=%f\n", - leaf->loc[0], leaf->loc[1], leaf->loc[2], cj->loc[0], cj->loc[1], - cj->loc[2], mcom, cj->h); - - if (leaf->parts[j].id == ICHECK) - printf("[NEW] Interaction with monopole a=( %e %e %e ) h= %f Nj= %d m_j= " - "%f\n", - w * dx[0], w * dx[1], w * dx[2], cj->h, cj->count, mcom); -#endif - } /* loop over every particle. */ -} /** - * @brief Compute the interactions between all particles in a cell leaf - * and the center of mass of all the cells in a part of the tree - * described by ci and cj - * - * @param ci The #cell containing the particle - * @param cj The #cell containing the center of mass. + * @brief Checks whether the cells are direct neighbours ot not. Both cells have + * to be of the same size */ -__device__ __forceinline__ void iact_pair_pc_CUDA(struct cell *ci, struct cell *cj, - struct cell *leaf) { +static inline int are_neighbours_host(struct cell *ci, struct cell *cj) { - struct cell *cp, *cps; +// int k; + float dx[3]; #ifdef SANITY_CHECKS - - /* Early abort? */ - if (ci->count == 0 || cj->count == 0) error("Empty cell !"); - - /* Sanity check */ - if (ci == cj) - error("The impossible has happened: pair interaction between a cell and " - "itself."); - - /* Sanity check */ - if (!is_inside(leaf, ci)) - error("The impossible has happened: The leaf is not within ci"); - - /* Are the cells direct neighbours? */ - if (!are_neighbours(ci, cj)) error("Cells are not neighours"); - - /* Are both cells split ? 
*/ - if (!ci->split || !cj->split) error("One of the cells is not split !"); + if (ci->h != cj->h) + error(" Cells of different size in distance calculation."); #endif - /* Let's find in which subcell of ci the leaf is */ - for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { - - if (is_inside_CUDA(leaf, cp)) break; - } - - if (are_neighbours_different_size_CUDA(cp, cj)) { - - /* Now interact this subcell with all subcells of cj */ - for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { - - /* Check whether we have to recurse or can directly jump to the multipole - * calculation */ - if (are_neighbours_CUDA(cp, cps)) { - - /* We only recurse if the children are split */ - if (cp->split && cps->split) { - iact_pair_pc_CUDA(cp, cps, leaf); - } - - } else { - make_interact_pc_CUDA(leaf, cps); - } - } - } else { + /* Maximum allowed distance */ + float min_dist = ci->h; - /* If cp is not a neoghbour of cj, we can directly interact with the - * multipoles */ - for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { + /* (Manhattan) Distance between the cells */ + double2 loc1=ci->loc_xy, loc2=cj->loc_xy; + float center_i = loc1.x; + float center_j = loc2.x; + dx[0] = fabs(center_i - center_j); + center_i = loc1.y; + center_j = loc2.y; + dx[1] = fabs(center_i - center_j); + center_i = ci->loc_z; + center_j = cj->loc_z; + dx[2] = fabs(center_i - center_j); - make_interact_pc_CUDA(leaf, cps); - } - } + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); } -__device__ void iact_self_pc_CUDA(struct cell *c, struct cell *leaf) +struct cell *cell_get() { - -struct cell *cp, *cps; - - #ifdef SANITY_CHECKS - /* Early abort? */ - if (c->count == 0) error("Empty cell !"); - - if (!c->split) error("Cell is not split !"); - - #endif - -/* Find in which subcell of c the leaf is */ - for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) { - - /* Only recurse if the leaf is in this part of the tree */ - if (is_inside_CUDA(leaf, cp)) break; - } - -if (cp->split) { - - /* Recurse if the cell can be split */ - iact_self_pc_CUDA(cp, leaf); - - /* Now, interact with every other subcell */ - for (cps = c->firstchild; cps != c->sibling; cps = cps->sibling) { + if(num_cells == 0) + { + cell_pool = (struct cell*) calloc(INITIAL_CELLS, sizeof(struct cell)); + if(cell_pool == NULL) + error("Failed to allocate cell_pool"); + com_xy_host = (double2*) calloc(INITIAL_CELLS, sizeof(double2)); + if(com_xy_host == NULL) + error("Failed to allocate cell_pool"); + com_z_host = (double*) calloc(INITIAL_CELLS, sizeof(double)); + if(com_z_host == NULL) + error("Failed to allocate cell_pool"); + com_mass_host = (float*) calloc(INITIAL_CELLS, sizeof(float)); + if(com_mass_host == NULL) + error("Failed to allocate cell_pool"); + #ifdef QUADRUPOLES + I_xx_host = (double*) calloc(INITIAL_CELLS, sizeof(double)); + if(I_xx_host == NULL) + error("Failed to allocate quadrupoles"); + I_xy_host = (double*) calloc(INITIAL_CELLS, sizeof(double)); + if(I_xy_host == NULL) + error("Failed to allocate quadrupoles"); + I_xz_host = (double*) calloc(INITIAL_CELLS, sizeof(double)); + if(I_xz_host == NULL) + error("Failed to allocate quadrupoles"); + I_yy_host = (double*) calloc(INITIAL_CELLS, sizeof(double)); + if(I_yy_host == NULL) + error("Failed to allocate quadrupoles"); + I_yz_host = (double*) calloc(INITIAL_CELLS, sizeof(double)); + if(I_yz_host == NULL) + error("Failed to allocate quadrupoles"); + I_zz_host = (double*) calloc(INITIAL_CELLS, sizeof(double)); + if(I_zz_host == 
NULL) + error("Failed to allocate quadrupoles"); + #endif + num_cells = INITIAL_CELLS; + } - /* Since cp and cps will be direct neighbours it is only worth recursing - */ - /* if the cells can both be split */ - if (cp != cps && cps->split) iact_pair_pc_CUDA(cp, cps, leaf); + if(used_cells >= num_cells) + { + /* Stretch */ + struct cell *new_pool; + cell_size *= CELL_STRETCH; + new_pool = (struct cell*) calloc(num_cells*CELL_STRETCH, sizeof(struct cell)); + if(new_pool == NULL) + error("Failed to allocate new_pool"); + if(cell_pool != NULL) + memcpy(new_pool, cell_pool, num_cells*sizeof(struct cell)); + + + + double2 *tempxy = (double2*) calloc(num_cells*CELL_STRETCH, sizeof(double2)); + if(tempxy == NULL) + error("Failed to allocate tempxy"); + memcpy(tempxy, com_xy_host, sizeof(double2)*num_cells); + free(com_xy_host); + com_xy_host = tempxy; + double *tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double)); + if(tempz == NULL) + error("Failed to allocate tempz"); + memcpy(tempz, com_z_host, num_cells*sizeof(double)); + free(com_z_host); + com_z_host = tempz; + float *tempm = (float*) calloc(num_cells*CELL_STRETCH, sizeof(float)); + if(tempm == NULL) + error("Failed to allocate tempm"); + memcpy(tempm, com_mass_host, num_cells*sizeof(float)); + free(com_mass_host); + com_mass_host = tempm; + + #ifdef QUADRUPOLES + tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double)); + if(tempz == NULL) + error("Failed to allocate tempz"); + memcpy(tempz, I_xx_host, num_cells*sizeof(double)); + free(I_xx_host); + I_xx_host = tempz; + + tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double)); + if(tempz == NULL) + error("Failed to allocate tempz"); + memcpy(tempz, I_xy_host, num_cells*sizeof(double)); + free(I_xy_host); + I_xy_host = tempz; + + tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double)); + if(tempz == NULL) + error("Failed to allocate tempz"); + memcpy(tempz, I_xz_host, num_cells*sizeof(double)); + free(I_xz_host); + I_xz_host = tempz; + + tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double)); + if(tempz == NULL) + error("Failed to allocate tempz"); + memcpy(tempz, I_yy_host, num_cells*sizeof(double)); + free(I_yy_host); + I_yy_host = tempz; + + tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double)); + if(tempz == NULL) + error("Failed to allocate tempz"); + memcpy(tempz, I_yz_host, num_cells*sizeof(double)); + free(I_yz_host); + I_yz_host = tempz; + + tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double)); + if(tempz == NULL) + error("Failed to allocate tempz"); + memcpy(tempz, I_zz_host, num_cells*sizeof(double)); + free(I_zz_host); + I_zz_host = tempz; + #endif + + num_cells *= CELL_STRETCH; + free(cell_pool); + cell_pool = new_pool; + + message("Increased size of arrays"); } - } + used_cells++; + cell_pool[used_cells-1].sibling = -1; + cell_pool[used_cells-1].firstchild = -1; + cell_pool[used_cells-1].res = qsched_res_none; + return &cell_pool[used_cells-1]; } +void comp_com(struct cell *c){ + + int k, count = c->count; + int cpi; + struct cell *cp; + int parts = c->parts; + double com[3] = {0.0, 0.0, 0.0}, mass = 0.0; + +#ifdef QUADRUPOLES + double I_xx_local = 0.; + double I_yy_local = 0.; + double I_zz_local = 0.; + double I_xy_local = 0.; + double I_xz_local = 0.; + double I_yz_local = 0.; +#endif + if(c->split) { + for(cp = &cell_pool[(cpi = c->firstchild)]; cpi != c->sibling; cp = &cell_pool[(cpi = cp->sibling)]) { + float cp_mass = com_mass_host[cpi]; + com[0] += com_xy_host[cpi].x * cp_mass; + com[1] += 
com_xy_host[cpi].y * cp_mass; + com[2] += com_z_host[cpi] * cp_mass; + mass += cp_mass; + #ifdef QUADRUPOLES + I_xx_local += I_xx_host[cpi]; + I_yy_local += I_yy_host[cpi]; + I_zz_local += I_zz_host[cpi]; + I_xy_local += I_xy_host[cpi]; + I_xz_local += I_xz_host[cpi]; + I_yz_local += I_yz_host[cpi]; + #endif + } -/** - * @brief Get a #cell from the pool. - */ -struct cell *cell_get() { + /* Otherwise collect the multiple from the particles */ + } else { - struct cell *res; - int k; + for(k = parts; k < parts+count; k++) + { + float p_mass = parts_host[k].m; + com[0] += parts_host[k].loc[0] * p_mass; + com[1] += parts_host[k].loc[1] * p_mass; + com[2] += parts_host[k].loc[2] * p_mass; + mass += p_mass; + + #ifdef QUADRUPOLES + I_xx_local += parts_host[k].loc[0] * parts_host[k].loc[0] * p_mass; + I_yy_local += parts_host[k].loc[1] * parts_host[k].loc[1] * p_mass; + I_zz_local += parts_host[k].loc[2] * parts_host[k].loc[2] * p_mass; + I_xy_local += parts_host[k].loc[0] * parts_host[k].loc[1] * p_mass; + I_xz_local += parts_host[k].loc[0] * parts_host[k].loc[2] * p_mass; + I_yz_local += parts_host[k].loc[1] * parts_host[k].loc[2] * p_mass; + #endif + } + } - /* Allocate a new batch? */ - if (cell_pool == NULL) { - /* Allocate the cell array. */ - if ((cell_pool = - (struct cell *)calloc(cell_pool_grow, sizeof(struct cell))) == - NULL) - error("Failed to allocate fresh batch of cells."); + k = c - cell_pool; + /* Store the COM data, if it was collected. */ + if(mass > 0.0f) { + float imass = 1.0f/mass; + com_xy_host[k].x = com[0] * imass; + com_xy_host[k].y = com[1] * imass; + com_z_host[k] = com[2] * imass; + com_mass_host[k] = mass; + }else + { + com_xy_host[k].x = 0.0; + com_xy_host[k].y = 0.0; + com_z_host[k] = 0.0; + com_mass_host[k] = 0.0f; + } - /* Link them up via their progeny pointers. */ - for (k = 1; k < cell_pool_grow; k++) - cell_pool[k - 1].firstchild = &cell_pool[k]; - } +#ifdef QUADRUPOLES + I_xx_host[k] = I_xx_local; + I_yy_host[k] = I_yy_local; + I_zz_host[k] = I_zz_local; + I_xy_host[k] = I_xy_local; + I_xz_host[k] = I_xz_local; + I_yz_host[k] = I_yz_local; +#endif - /* Pick a cell off the pool. */ - res = cell_pool; - cell_pool = cell_pool->firstchild; - /* Clean up a few things. */ - res->res = qsched_res_none; - res->firstchild = 0; - /* Return the cell. */ - return res; } - /** * @brief Sort the parts into eight bins along the given pivots and * fill the multipoles. Also adds the hierarchical resources - * to the sched. + * to the sched (TODO). * * @param c The #cell to be split. * @param N The total number of parts. * @param s The #sched to store the resources. */ -void cell_split(struct cell *c, struct qsched *s) { - - int i, j, k, kk, count = c->count; - struct part temp, *parts = c->parts; - struct cell *cp; - int left[8], right[8]; - double pivot[3]; - static struct cell *root = NULL; - struct cell *progenitors[8]; - - /* Set the root cell. */ - if (root == NULL) { - root = c; - c->sibling = 0; - } - - /* Add a resource for this cell if it doesn't have one yet. */ - if (c->res == qsched_res_none) - c->res = qsched_addres(s, qsched_owner_none, qsched_res_none, NULL, 0 ,NULL); - -/* Attach a center-of-mass task to the cell. */ -#ifdef COM_AS_TASK - if (count > 0) - c->com_tid = qsched_addtask(s, task_type_com, task_flag_none, &c, - sizeof(struct cell *), 1); -#endif - - /* Does this cell need to be split? */ - if (count > cell_maxparts) { - - /* Mark this cell as split. */ - c->split = 1; - - /* Create the progeny. 
*/ - for (k = 0; k < 8; k++) { - progenitors[k] = cp = cell_get(); - cp->loc[0] = c->loc[0]; - cp->loc[1] = c->loc[1]; - cp->loc[2] = c->loc[2]; - cp->h = c->h * 0.5; - cp->res = qsched_addres(s, qsched_owner_none, c->res, NULL, 0, NULL); - if (k & 4) cp->loc[0] += cp->h; - if (k & 2) cp->loc[1] += cp->h; - if (k & 1) cp->loc[2] += cp->h; - } - - /* Init the pivots. */ - for (k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->h * 0.5; - - /* Split along the x-axis. */ - i = 0; - j = count - 1; - while (i <= j) { - while (i <= count - 1 && parts[i].x[0] < pivot[0]) i += 1; - while (j >= 0 && parts[j].x[0] >= pivot[0]) j -= 1; - if (i < j) { - temp = parts[i]; - parts[i] = parts[j]; - parts[j] = temp; - } - } - left[1] = i; - right[1] = count - 1; - left[0] = 0; - right[0] = j; - - /* Split along the y axis, twice. */ - for (k = 1; k >= 0; k--) { - i = left[k]; - j = right[k]; - while (i <= j) { - while (i <= right[k] && parts[i].x[1] < pivot[1]) i += 1; - while (j >= left[k] && parts[j].x[1] >= pivot[1]) j -= 1; - if (i < j) { - temp = parts[i]; - parts[i] = parts[j]; - parts[j] = temp; - } - } - left[2 * k + 1] = i; - right[2 * k + 1] = right[k]; - left[2 * k] = left[k]; - right[2 * k] = j; +void cell_split(int c, struct qsched *s) { + int i, j, k, kk, count = cell_pool[c].count; + int parts = cell_pool[c].parts; + struct part temp_part; + struct cell *cp, *cell; + int left[8], right[8]; + double pivot[3]; + static int root = -1; + int progenitors[8]; + struct part *temp_xy; + + /* Set the root cell. */ + if (root < 0) { + root = c; + cell_pool[c].sibling = -1; } - /* Split along the z axis, four times. */ - for (k = 3; k >= 0; k--) { - i = left[k]; - j = right[k]; - while (i <= j) { - while (i <= right[k] && parts[i].x[2] < pivot[2]) i += 1; - while (j >= left[k] && parts[j].x[2] >= pivot[2]) j -= 1; - if (i < j) { - temp = parts[i]; - parts[i] = parts[j]; - parts[j] = temp; - } - } - left[2 * k + 1] = i; - right[2 * k + 1] = right[k]; - left[2 * k] = left[k]; - right[2 * k] = j; - } - - /* Store the counts and offsets. */ - for (k = 0; k < 8; k++) { - progenitors[k]->count = right[k] - left[k] + 1; - progenitors[k]->parts = &c->parts[left[k]]; + if(cell_pool[c].res == qsched_res_none) + { + if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[c].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy, + sizeof(struct part) * cell_pool[c].count, parts_temp + cell_pool[c].parts); } - /* Find the first non-empty progenitor */ - for (k = 0; k < 8; k++) - if (progenitors[k]->count > 0) { - c->firstchild = progenitors[k]; - break; - } + if(count > cell_maxparts ) + { + cell_pool[c].split = 1; -#ifdef SANITY_CHECKS - if (c->firstchild == NULL) - error("Cell has been split but all progenitors have 0 particles"); -#endif + for(k = 0; k < 8; k++) + { + progenitors[k] = (cp = cell_get()) - cell_pool; + cell = &cell_pool[c]; + cp->loc_xy = cell->loc_xy; + cp->loc_z = cell->loc_z; + cp->h = cell->h*0.5; + if(k & 4) cp->loc_xy.x += cp->h; + if(k & 2) cp->loc_xy.y += cp->h; + if(k & 1) cp->loc_z += cp->h; + } - /* Prepare the pointers. 
*/ - for (k = 0; k < 8; k++) { + /* Init the pivots.*/ + pivot[0] = cell->loc_xy.x + cell->h * 0.5; + pivot[1] = cell->loc_xy.y + cell->h * 0.5; + pivot[2] = cell->loc_z + cell->h * 0.5; - /* Find the next non-empty sibling */ - for (kk = k + 1; kk < 8; ++kk) { - if (progenitors[kk]->count > 0) { - progenitors[k]->sibling = progenitors[kk]; - break; + /* Split along the x axis. */ + i = parts; + j = parts+count-1; + while(i < j) + { + while(i <= parts+count-1 && parts_host[i].loc[0] < pivot[0]) i += 1; + while(j >= parts && parts_host[j].loc[0] >= pivot[0]) j -= 1; + if(i < j){ + temp_part = parts_host[i]; + parts_host[i] = parts_host[j]; + parts_host[j] = temp_part; + } + } + left[1] = i; + right[1] = parts+count-1; + left[0] = parts; + right[0] = j; + + /* Split along the y axis twice. */ + for (k = 1; k >= 0; k--) { + i = left[k]; + j = right[k]; + while(i <= j){ + while(i <= right[k] && parts_host[i].loc[1] < pivot[1]) i += 1; + while(j >= left[k] && parts_host[j].loc[1] >= pivot[1]) j -= 1; + if(i < j) + { + temp_part = parts_host[i]; + parts_host[i] = parts_host[j]; + parts_host[j] = temp_part; + } + } + left[2*k+1] = i; + right[2*k+1] = right[k]; + left[2*k] = left[k]; + right[2*k] = j; + } + /* Split along the z axis four times.*/ + for(k = 3; k >=0; k--) + { + i = left[k]; + j = right[k]; + while(i <= j){ + while(i <= right[k] && parts_host[i].loc[2] < pivot[2]) i += 1; + while(j >= left[k] && parts_host[j].loc[2] >= pivot[2]) j -= 1; + if(i < j) + { + temp_part = parts_host[i]; + parts_host[i] = parts_host[j]; + parts_host[j] = temp_part; + } + } + left[2 * k + 1] = i; + right[2 * k + 1] = right[k]; + left[2 * k] = left[k]; + right[2 * k] = j; } - } - /* No non-empty sibling ? Go back a level */ - if (kk == 8) progenitors[k]->sibling = c->sibling; - } + /* Store the counts and offsets. */ + for(k = 0; k < 8; k++) + { + cell_pool[progenitors[k]].count = right[k]-left[k]+1; + cell_pool[progenitors[k]].parts = left[k]; + if(cell_pool[progenitors[k]].count > 0){ + if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy, + sizeof(struct part) * cell_pool[progenitors[k]].count, parts_temp + cell_pool[progenitors[k]].parts); + } + } - /* Recurse. */ - for (k = 0; k < 8; k++) - if (progenitors[k]->count > 0) cell_split(progenitors[k], s); + /* Find the first non-empty progenitor */ + for(k = 0; k < 8; k++) + { + if(cell_pool[progenitors[k]].count > 0) + { + cell->firstchild = progenitors[k]; + break; + } + } -/* Link the COM tasks. */ -#ifdef COM_AS_TASK - for (k = 0; k < 8; k++) - if (progenitors[k]->count > 0) - qsched_addunlock(s, progenitors[k]->com_tid, c->com_tid); -#endif + #ifdef SANITY_CHECKS + if(cell->firstchild == -1) + error("Cell has been split but all children have 0 parts"); + #endif - /* Otherwise, we're at a leaf, so create the cell's particle-cell task. */ - } else { - struct cell *data[2] = {root, c}; - int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data, - 2 * sizeof(struct cell *), 1); - qsched_addlock(s, tid, c->res); -#ifdef COM_AS_TASK - qsched_addunlock(s, root->com_tid, tid); -#endif - } /* does the cell need to be split? 
*/ + /*Prepare the pointers*/ + for(k = 0; k < 8; k++) + { + /* Find the next non-empty sibling */ + for(kk = k+1; kk < 8; ++kk){ + if(cell_pool[progenitors[kk]].count > 0){ + cell_pool[progenitors[k]].sibling = progenitors[kk]; + break; + } + } -/* Compute the cell's center of mass. */ -#ifndef COM_AS_TASK - comp_com(c); -#endif + /* No non-empty sibling, go back a level.*/ + if(kk == 8) cell_pool[progenitors[k]].sibling = cell->sibling; - /* Set this cell's resources ownership. */ - qsched_res_own(s, c->res, - s->nr_queues * (c->parts - root->parts) / root->count); -} + } -/** - * @brief Checks whether the cells are direct neighbours ot not. Both cells have - * to be of the same size - */ -static inline int are_neighbours(struct cell *ci, struct cell *cj) { + /* Recurse */ + for(k = 0; k < 8; k++) + if(cell_pool[progenitors[k]].count > 0) cell_split(progenitors[k], s); + + /* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */ + } /*else { - int k; - float dx[3]; +// struct cell *data[2] = {root, c}; + int data[2] = {root, c}; + int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data, + 2 * sizeof(int), 3000); + qsched_addlock(s, tid, cell_pool[c].res); + }*/ -#ifdef SANITY_CHECKS - if (ci->h != cj->h) - error(" Cells of different size in distance calculation."); +#ifndef COM_AS_TASK + comp_com(&cell_pool[c]); #endif - - /* Maximum allowed distance */ - float min_dist = ci->h; - - /* (Manhattan) Distance between the cells */ - for (k = 0; k < 3; k++) { - float center_i = ci->loc[k]; - float center_j = cj->loc[k]; - dx[k] = fabs(center_i - center_j); - } - - return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); } /** @@ -906,7 +933,8 @@ static inline int are_neighbours(struct cell *ci, struct cell *cj) { void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { qsched_task_t tid; - struct cell *data[2], *cp, *cps; + int data[2]; + struct cell *cp, *cps; #ifdef SANITY_CHECKS @@ -919,16 +947,16 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { if (cj == NULL) { /* Is this cell split and above the task limit ? */ - if (ci->split && ci->count > task_limit / ci->count) { + if (ci->split /*&& ci->count > task_limit / ci->count*/) { /* Loop over each of this cell's progeny. */ - for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { + for (cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]) { /* Make self-interaction task. */ create_tasks(s, cp, NULL); /* Make all pair-interaction tasks. */ - for (cps = cp->sibling; cps != ci->sibling; cps = cps->sibling) + for (cps = &cell_pool[cp->sibling]; cps != &cell_pool[ci->sibling]; cps = &cell_pool[cps->sibling]) create_tasks(s, cp, cps); } @@ -936,18 +964,16 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { } else { /* Set the data. */ - data[0] = ci; - data[1] = NULL; + data[0] = ci -cell_pool; + data[1] = -1; /* Create the task. */ - tid = - qsched_addtask(s, task_type_self, task_flag_none, data, - sizeof(struct cell *) * 2, ci->count * ci->count / 2); - - /* Add the resource (i.e. the cell) to the mpole task. */ + tid = qsched_addtask(s, task_type_self, task_flag_none, data, + sizeof(int) * 2, ci->count * ci->count * 0.05); + /* Add the resource (i.e. the cell) to the new task. */ qsched_addlock(s, tid, ci->res); -/* If this call might recurse, add a dependency on the cell's COM +/* Since this call might recurse, add a dependency on the cell's COM task. 
*/ #ifdef COM_AS_TASK if (ci->split) qsched_addunlock(s, ci->com_tid, tid); @@ -957,304 +983,583 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { /* Otherwise, it's a pair. */ } else { - /* Are the cells NOT neighbours ? */ - if (!are_neighbours(ci, cj)) { + /* Are both cells split and we are above the task limit ? */ + if (ci->split && cj->split ) { + + /* Let's split both cells and build all possible pairs */ + for (cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]) { + for (cps = &cell_pool[cj->firstchild]; cps != &cell_pool[cj->sibling]; cps = &cell_pool[cps->sibling]) { + + /* Recurse */ + if (are_neighbours_host(cp, cps)) { + create_tasks(s, cp, cps); + } + } + } - } else {/* Cells are direct neighbours */ + if( ci->count > 64*cell_maxparts) + { + /* Let's also build a particle-monopole task */ + for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]){ + /* Create the task. */ + data[0] = cp-cell_pool; + data[1] = cj-cell_pool; + tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data, + sizeof(int) * 2, cp->count * cj->count); + + /* Add the resource and dependance */ + qsched_addlock(s, tid, cp->res); +// qsched_addlock(s, tid, cj->res); + } + for(cp = &cell_pool[cj->firstchild]; cp != &cell_pool[cj->sibling]; cp = &cell_pool[cp->sibling]){ + /* Create the task. */ + data[0] = cp-cell_pool; + data[1] = ci-cell_pool; + tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data, + sizeof(int) * 2, cp->count * cj->count); + + /* Add the resource and dependance */ + qsched_addlock(s, tid, cp->res); +// qsched_addlock(s, tid, cj->res); + } + }else + { + data[0] = ci -cell_pool; + data[1] = cj - cell_pool; + tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count * cj->count); + /* Add the resource and dependance */ + qsched_addlock(s, tid, ci->res); + data[0] = cj - cell_pool; + data[1] = ci - cell_pool; + tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count * cj->count); + qsched_addlock(s, tid, cj->res); + } - /* Are both cells split ? */ - if (ci->split && cj->split && ci->count > task_limit / cj->count) { +#ifdef COM_AS_TASK + qsched_addunlock(s, cj->com_tid, tid); + qsched_addunlock(s, ci->com_tid, tid); +#endif - /* Let's split both cells and build all possible pairs */ - for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { - for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { - /* Recurse */ - create_tasks(s, cp, cps); - } - } - /* Otherwise, at least one of the cells is not split, build a direct - * interaction. */ - } else { + + } else { /* Otherwise, at least one of the cells is not split, build a direct + * interaction. */ - /* Set the data. */ - data[0] = ci; - data[1] = cj; + /* Set the data. */ + data[0] = ci-cell_pool; + data[1] = cj-cell_pool; + + /* Create the task. */ + tid = qsched_addtask(s, task_type_pair, task_flag_none, data, + sizeof(int) * 2, ci->count * cj->count * 0.1); + + /* Add the resources. */ + qsched_addlock(s, tid, ci->res); + qsched_addlock(s, tid, cj->res); - /* Create the task. */ - tid = qsched_addtask(s, task_type_pair, task_flag_none, data, - sizeof(struct cell *) * 2, ci->count * cj->count); + } - /* Add the resources. 
*/ - qsched_addlock(s, tid, ci->res); - qsched_addlock(s, tid, cj->res); - } - } /* Cells are direct neighbours */ } /* Otherwise it's a pair */ } + +/** + * @brief Compute the interactions between all particles in a cell. + * + * @param cellID The cell ID to compute interactions on. + */ +__device__ __forceinline__ void iact_self_direct(int cellID) { + struct cell *c = &cells[cellID]; + double xi[3] = {0.0, 0.0, 0.0}; + float ai[3] = {0.0, 0.0, 0.0 }, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w; + int parts; + int count; + int i,j,k; + + parts = c->parts; + count = c->count; + for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x) + { + xi[0] = parts_cuda[i].loc[0]; + xi[1] = parts_cuda[i].loc[1]; + xi[2] = parts_cuda[i].loc[2]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0; + } + + for(j = parts; j < parts+count; j++) + { + if(i != j){ + + /* Compute the pairwise distance. */ + r2 = 0.0; + dx[0] = xi[0] - parts_cuda[j].loc[0]; + r2 += dx[0]*dx[0]; + dx[1] = xi[1] - parts_cuda[j].loc[1]; + r2 += dx[1]*dx[1]; + dx[2] = xi[2] - parts_cuda[j].loc[2]; + r2 += dx[2]*dx[2]; + + /* Apply the gravitational acceleration. */ + + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_cuda[j].m; + for(k = 0; k < 3; k++) { + ai[k] -= w * dx[k] * mj; + } + + } + } + //Update. + atomicAdd(&parts_cuda[i].a[0],ai[0] ); + atomicAdd(&parts_cuda[i].a[1],ai[1] ); + atomicAdd(&parts_cuda[i].a[2],ai[2] ); + } + + +} + +#ifdef EXACT +__global__ void interact_exact(int count){ + + int number = blockIdx.x * blockDim.x + threadIdx.x; + int j, k; + double dx[3], ir, r2, w; + if(number < count) + { + double pix[3] = {parts_cuda[number].loc[0], parts_cuda[number].loc[1], parts_cuda[number].loc[2]}; + //float mi = parts_cuda[number].m; + for(j = 0; j < count; j++) + { + if(j != number) + { + for (r2 = 0.0, k = 0; k < 3; k++) { + dx[k] = parts_cuda[j].loc[k] - pix[k]; + r2 += dx[k] * dx[k]; + } + ir = rsqrt(r2); + w = const_G * ir * ir * ir; + + for(k = 0; k < 3; k++) + { + atomicAdd(&parts_cuda[number].a[k] , w * dx[k] * parts_cuda[j].m); + } + } + } + + } +} +#endif + +__device__ __forceinline__ void runner( int type , void *data ) { + + int *idata = (int *)data; + int i = idata[0]; + int j = idata[1]; + switch ( type ) { + case task_type_self: + iact_self_direct(i); + break; + case task_type_pair: + iact_pair_direct(&cells[i], &cells[j]); + break; + case task_type_pair_pc: + iact_pair_pc( &cells[i], &cells[j] ); + break; + case task_type_pair_pc_split: + iact_pair_pc_split(&cells[i], &cells[j]); + break; + default: + printf("Got to default?\n"); + asm("trap;"); + } + __syncthreads(); +} + +__device__ qsched_funtype function = runner; + + + + + +int calcMaxDepth(struct cell *c, int depth) +{ + struct cell *cp; + if(c->split) + { + int maxd = 0, temp; + for(cp = &cell_pool[c->firstchild]; cp != &cell_pool[c->sibling]; cp = &cell_pool[cp->sibling]) + { + temp = calcMaxDepth(cp, depth+1); + if(temp > maxd) + maxd = temp; + } + return maxd; + }else{ + return depth+1; + } +} + +#ifdef QUADRUPOLES +void update_quads() { + int k; + for( k = 0; k < used_cells; k++) + { + double imass = 1. 
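+ /* comp_com() accumulated raw mass-weighted second moments (sums of m*x*x, m*x*y, ...);
+    dividing by the cell mass and subtracting the centre-of-mass products below turns
+    them into second moments about the centre of mass, the form used by the quadrupole
+    terms in make_interact_pc_new(). */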
/ com_mass_host[k]; + I_xx_host[k] = I_xx_host[k] * imass - com_xy_host[k].x * com_xy_host[k].x; + I_xy_host[k] = I_xy_host[k] * imass - com_xy_host[k].x * com_xy_host[k].y; + I_xz_host[k] = I_xz_host[k] * imass - com_xy_host[k].x * com_z_host[k]; + I_yy_host[k] = I_yy_host[k] * imass - com_xy_host[k].y * com_xy_host[k].y; + I_yz_host[k] = I_yz_host[k] * imass - com_xy_host[k].y * com_z_host[k]; + I_zz_host[k] = I_zz_host[k] * imass - com_z_host[k] * com_z_host[k]; + } +} +#endif +qsched_funtype func; /** * @brief Set up and run a task-based Barnes-Hutt N-body solver. * * @param N The number of random particles to use. - * @param nr_threads Number of threads to use. * @param runs Number of force evaluations to use as a benchmark. * @param fileName Input file name. If @c NULL or an empty string, random * particle positions will be used. */ -void test_bh(int N, int nr_threads, int runs, char *fileName) { - +void test_bh(int N, int runs, char *fileName) { int i, k; struct cell *root; - struct part *parts; FILE *file; struct qsched s; - ticks tic, toc_run, tot_setup = 0, tot_run = 0; - int countMultipoles = 0, countPairs = 0, countCoMs = 0; + struct cell *gpu_ptr_cells; +double2 *com_temp; +double *comz_temp; +float *comm_temp; - /* Initialize the per-task type timers. */ -// for (k = 0; k < task_type_count; k++) task_timers[k] = 0; + cudaFree(0); + cudaThreadSetCacheConfig(cudaFuncCachePreferL1); + if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess) + error("Failed to copy function pointer from device"); /* Initialize the scheduler. */ qsched_init(&s, 1, qsched_flag_none); - /* Init and fill the particle array. */ - if ((parts = (struct part *)malloc(sizeof(struct part) * N)) == NULL) - error("Failed to allocate particle buffer."); - - double *partsx0, *partsx1, *partsx2; - float *partsa0, *partsa1, *partsa2, *partsmass; - - if ((partsx0 = (double*)malloc(sizeof(double)*N)) == NULL) - error("Failed to allocate particle buffer."); - if ((partsx1 = (double*)malloc(sizeof(double)*N)) == NULL) - error("Failed to allocate particle buffer."); - if ((partsx2 = (double*)malloc(sizeof(double)*N)) == NULL) - error("Failed to allocate particle buffer."); - - if ((partsa0 = (float*)malloc(sizeof(float)*N)) == NULL) - error("Failed to allocate particle buffer."); - if ((partsa1 = (float*)malloc(sizeof(float)*N)) == NULL) - error("Failed to allocate particle buffer."); - if ((partsa2 = (float*)malloc(sizeof(float)*N)) == NULL) - error("Failed to allocate particle buffer."); - if ((partsmass = (float*)malloc(sizeof(float)*N)) == NULL) - error("Failed to allocate particle buffer."); - - - - - /* If no input file was specified, generate random particle positions. */ - if (fileName == NULL || fileName[0] == 0) { - for (k = 0; k < N; k++) { - parts[k].id = k; - parts[k].x[0] = ((double)rand()) / RAND_MAX; - parts[k].x[1] = ((double)rand()) / RAND_MAX; - parts[k].x[2] = ((double)rand()) / RAND_MAX; - parts[k].mass = ((double)rand()) / RAND_MAX; - parts[k].a_legacy[0] = 0.0; - parts[k].a_legacy[1] = 0.0; - parts[k].a_legacy[2] = 0.0; + //Create host particle arrays. 
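+ /* The particle array lives in page-locked (pinned) host memory so that cell_split()
+    can obtain device-accessible pointers into it with cudaHostGetDevicePointer() when
+    registering each cell's resource; parts_temp below is a separate device-side buffer
+    that the parts_cuda symbol is made to point at. */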
+ if( cudaMallocHost(&parts_host, sizeof(struct part) * N) != cudaSuccess) + error("Failed to allocate parts array"); + + + + if( cudaMalloc(&parts_temp, sizeof(struct part) * N) != cudaSuccess) + error("Failed to allocate device parts array"); + if( cudaMemcpyToSymbol(parts_cuda, &parts_temp, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to set device symbol for parts array"); + + if(fileName == NULL || fileName[0] == 0) { + for(k = 0; k < N; k++) { + parts_host[k].loc[0] = ((double)rand())/ RAND_MAX; + parts_host[k].loc[1] = ((double)rand())/ RAND_MAX; + parts_host[k].loc[2] = ((double)rand())/ RAND_MAX; + parts_host[k].m = ((double)rand()) / RAND_MAX; + } + + } else { + file = fopen(fileName, "r"); + #ifndef ID + long long int tempxy; + #endif +// double temp; + printf("Reading input from file\n"); + if(file) { + for(k = 0; k < N; k++) { +#ifdef ID + if(fscanf(file, "%d", &parts_host[k].id) != 1) +#else + if(fscanf(file, "%Ld", &tempxy) != 1) +#endif + error("Failed to read ID"); + #ifndef double + if(fscanf(file, "%f", &parts_host[k].m) != 1) + error("Failed to read mass of part %i.", k); + if(fscanf(file, "%lf %lf %lf", &parts_host[k].loc[0], &parts_host[k].loc[1], &parts_host[k].loc[2]) != 3) + error("Failed to read position of part %i.", k); + //if(fscanf(file,"%lf %lf %lf %lf %lf %lf %lf %lf %lf", &temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp ) != 9) + // error("Failed to read extra stuff"); + #else + if(fscanf(file, "%f", &parts_host[k].m) != 1) + error("Failed to read mass of part %i.", k); + if(fscanf(file, "%f %f %f", &parts_host[k].loc[0], &parts_host[k].loc[1], &parts_host[k].loc[2]) != 3) + error("Failed to read position of part %i.", k); +// if(fscanf(file,"%f %f %f %f %f %f %f %f %f", &temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp ) != 9) + // error("Failed to read extra stuff"); + #endif + } + fclose(file); + } } - /* Otherwise, read them from a file. */ - } else { - file = fopen(fileName, "r"); - if (file) { - for (k = 0; k < N; k++) { - if (fscanf(file, "%d", &parts[k].id) != 1) - error("Failed to read ID of part %i.", k); - if (fscanf(file, "%lf%lf%lf", &parts[k].x[0], &parts[k].x[1], - &parts[k].x[2]) != - 3) - error("Failed to read position of part %i.", k); - if (fscanf(file, "%f", &parts[k].mass) != 1) - error("Failed to read mass of part %i.", k); - - partsx0[k] = parts[k].x[0]; - partsx1[k] = parts[k].x[1]; - partsx2[k] = parts[k].x[2]; - partsmass[k] = parts[k].mass; - partsa0[k] = 0.0f; - partsa1[k] = 0.0f; - partsa2[k] = 0.0f; - } - fclose(file); + /* Init the cells. 
*/ + root = cell_get(); + root->loc_xy.x = 0.0; + root->loc_xy.y = 0.0; + root->loc_z = 0.0; + root->h = 1.0; + root->count = N; + root->parts = 0; + root->sibling = -1; + int c = root-cell_pool; + cell_split(root - cell_pool, &s); + root = &cell_pool[c]; + int nr_leaves = 0; + int maxparts=0, minparts=1000000; + int number = 0; + while(c >= 0) { + if(cell_pool[c].count > 0) + { + number++; + if(cell_pool[c].res == qsched_res_none) + message("cell %i has no res", c); + } + if(!cell_pool[c].split) { + nr_leaves++; + if(cell_pool[c].count > maxparts) + { + maxparts = cell_pool[c].count; + } + if(cell_pool[c].count < minparts) + { + minparts = cell_pool[c].count; + } + c = cell_pool[c].sibling; + } else { + c = cell_pool[c].firstchild; + } } - } + message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves)); + message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts); + + create_tasks(&s, root, NULL); + #ifdef QUADRUPOLES + update_quads(); + //Copy Quadrupoles to the Device. + + if( cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess) + error("Failed to allocate Quads on the GPU"); + if(cudaMemcpy(comz_temp, I_xx_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy Quads to the GPU"); + if( cudaMemcpyToSymbol( I_xx, &comz_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy Quads pointer to the GPU"); + + if( cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess) + error("Failed to allocate Quads on the GPU"); + if(cudaMemcpy(comz_temp, I_xy_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy Quads to the GPU"); + if( cudaMemcpyToSymbol( I_xy, &comz_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy Quads pointer to the GPU"); + + if( cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess) + error("Failed to allocate Quads on the GPU"); + if(cudaMemcpy(comz_temp, I_xz_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy Quads to the GPU"); + if( cudaMemcpyToSymbol( I_xz, &comz_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy Quads pointer to the GPU"); + + if( cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess) + error("Failed to allocate Quads on the GPU"); + if(cudaMemcpy(comz_temp, I_yy_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy Quads to the GPU"); + if( cudaMemcpyToSymbol( I_yy, &comz_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy Quads pointer to the GPU"); + + if( cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess) + error("Failed to allocate Quads on the GPU"); + if(cudaMemcpy(comz_temp, I_yz_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy Quads to the GPU"); + if( cudaMemcpyToSymbol( I_yz, &comz_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy Quads pointer to the GPU"); + + if( cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess) + error("Failed to allocate Quads on the GPU"); + if(cudaMemcpy(comz_temp, I_zz_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy Quads to the GPU"); + if( cudaMemcpyToSymbol( I_zz, &comz_temp, sizeof(double*), 0, 
cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy Quads pointer to the GPU"); + #endif - /* Init the cells. */ - root = cell_get(); - root->loc[0] = 0.0; - root->loc[1] = 0.0; - root->loc[2] = 0.0; - root->h = 1.0; - root->count = N; - root->parts = parts; - cell_split(root, &s); - - /* Iterate over the cells and get the average number of particles - per leaf. */ - struct cell *c = root; - int nr_leaves = 0; - while (c != NULL) { - if (!c->split) { - nr_leaves++; - c = c->sibling; - } else { - c = c->firstchild; + int self = 0, pair = 0, pc = 0; + + for(k = 0; k < s.count; k++) + { + if(s.tasks[k].type == task_type_self) + self++; + else if (s.tasks[k].type == task_type_pair) + pair++; + else if (s.tasks[k].type >= 0) + pc++; } - } - message("Average number of parts per leaf is %f.", ((double)N) / nr_leaves); - - for(k = 0; k < N; k++) - { - partsx0[k] = parts[k].x[0]; - partsx1[k] = parts[k].x[1]; - partsx2[k] = parts[k].x[2]; - partsmass[k] = parts[k].mass; - partsa0[k] = 0.0f; - partsa1[k] = 0.0f; - partsa2[k] = 0.0f; - } -#if ICHECK > 0 - printf("----------------------------------------------------------\n"); + message("total number of tasks: %i.", s.count); + message("total number of pair tasks: %i.", pair); + message("total number of self tasks: %i.", self); + message("total number of pc tasks: %i.", pc); + message("total number of cells: %i.", number); + message("total number of deps: %i.", s.count_deps); + message("total number of res: %i.", s.count_res); + message("total number of locks: %i.", s.count_locks); + + for(k = 0; k < runs; k++) { + for(i = 0; i < N; ++i) { + parts_host[i].a[0] = 0.0; + parts_host[i].a[1] = 0.0; + parts_host[i].a[2] = 0.0; + } - /* Do a N^2 interactions calculation */ - ticks tic_exact = getticks(); - interact_exact(N, parts, ICHECK); - ticks toc_exact = getticks(); + /* Copy the cells to the device. */ + if( cudaMalloc( &gpu_ptr_cells , sizeof(struct cell) * used_cells) != cudaSuccess) + error("Failed to allocate cells on the GPU"); + if( cudaMemcpy( gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy cells to the GPU"); + if( cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell*), 0, cudaMemcpyHostToDevice) != cudaSuccess ) + error("Failed to copy cell pointer to the GPU"); - printf("Exact calculation (1 thread) took %lli (= %e) ticks\n", - toc_exact - tic_exact, (float)(toc_exact - tic_exact)); - printf("----------------------------------------------------------\n"); -#endif - /* Create the tasks. */ - tic = getticks(); - create_tasks(&s, root, NULL); - tot_setup += getticks() - tic; - - /* Dump the number of tasks. 
*/ - message("total nr of tasks: %i.", s.count); - message("total nr of deps: %i.", s.count_deps); - message("total nr of res: %i.", s.count_res); - message("total nr of locks: %i.", s.count_locks); - message("total nr of uses: %i.", s.count_uses); - int counts[task_type_count]; - for (k = 0; k < task_type_count; k++) counts[k] = 0; - for (k = 0; k < s.count; k++) counts[s.tasks[k].type] += 1; - - // char buffer[200]; - // sprintf(buffer, "timings_legacy_%d_%d.dat", cell_maxparts, nr_threads); - // FILE *fileTime = fopen(buffer, "w"); - -// fclose(fileTime); - -#if ICHECK >= 0 - message("[check] accel of part %i is [%.3e,%.3e,%.3e]", ICHECK, - root->parts[ICHECK].a[0], root->parts[ICHECK].a[1], - root->parts[ICHECK].a[2]); -#endif - printf("task counts: [ %8s %8s %8s %8s %8s ]\n", "self", "pair", "m-poles", - "direct", "CoMs"); - printf("task counts: [ %8i %8i %8i %8i %8i ] (legacy).\n", 0, 0, - countMultipoles, countPairs, countCoMs); - printf("task counts: [ "); - for (k = 0; k < task_type_count; k++) printf("%8i ", counts[k]); - printf("] (mpole).\n"); - - /* Loop over the number of runs. */ - for (k = 0; k < runs; k++) { - - for (i = 0; i < N; ++i) { - parts[i].a[0] = 0.0; - parts[i].a[1] = 0.0; - parts[i].a[2] = 0.0; - partsa0[k] = 0.0f; - partsa1[k] = 0.0f; - partsa2[k] = 0.0f; - } - - /* Execute the tasks. */ - tic = getticks(); -// qsched_run(&s, nr_threads, runner); - toc_run = getticks(); - message("%ith run took %lli (= %e) ticks...", k, toc_run - tic, - (float)(toc_run - tic)); - tot_run += toc_run - tic; - } + if(cudaMalloc( &com_temp, sizeof(double2) * used_cells) != cudaSuccess) + error("Failed to allocate com on the GPU"); + if( cudaMemcpy( com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + error("failed to copy com to the GPU"); + if( cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy com pointer to the GPU"); -// message("[check] root mass= %f %f", root->legacy.mass, root->mpole.mass); -// message("[check] root CoMx= %f %f", root->legacy.com[0], root->mpole.com[0]); -// message("[check] root CoMy= %f %f", root->legacy.com[1], root->mpole.com[1]); -// message("[check] root CoMz= %f %f", root->legacy.com[2], root->mpole.com[2]); -#if ICHECK >= 0 - for (i = 0; i < N; ++i) - if (root->parts[i].id == ICHECK) - message("[check] accel of part %i is [%.3e,%.3e,%.3e]", ICHECK, - root->parts[i].a[0], root->parts[i].a[1], root->parts[i].a[2]); -#endif - /* Dump the tasks. 
*/ - /* for ( k = 0 ; k < s.count ; k++ ) - printf( " %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , - s.tasks[k].tic , s.tasks[k].toc ); */ + if(cudaMalloc( &comz_temp, sizeof(double) * used_cells) != cudaSuccess) + error("Failed to allocate com on the GPU"); + if( cudaMemcpy( comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + error("failed to copy com to the GPU"); + if( cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy com pointer to the GPU"); + + if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess) + error("Failed to allocate com on the GPU"); + if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + error("failed to copy com to the GPU"); + if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy com pointer to the GPU"); + printf("Max depth is %i\n", calcMaxDepth(root, 0)); + + #ifdef NO_LOADS + ticks tic_loads, tic; + double itpms = 1000.0 / CPU_TPS; + tic = getticks(); + if( cudaMemcpy( parts_host, parts_temp, sizeof(struct part) * N, cudaMemcpyHostToDevice ) != cudaSuccess) + error("Failed to copy matrix to device"); + tic_loads = getticks() - tic; + printf("load %.3f \n", tic_loads * itpms ); + #endif + qsched_run_CUDA( &s , func ); + qsched_print_cuda_timers(&s); + #ifdef NO_LOADS + tic = getticks(); + if(cudaMemcpy( parts_temp, parts_host, sizeof(struct part) *N, cudaMemcpyDeviceToHost) != cudaSuccess) + error("Failed to copy from device"); + tic_loads = getticks() - tic; + printf("unload %.3f \n", tic_loads * itpms ); + #endif + + + int pcs; + if( cudaMemcpyFromSymbol( &pcs, pc_calcs, sizeof(int), 0, cudaMemcpyDeviceToHost) != cudaSuccess) + error("Failed"); + printf("pc calcs = %i\n", pcs); +struct task* tasks = qsched_get_timers( &s , s.count ); + for(i = 0; i < s.count; i++) + { + printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); + // printf("\n"); + + } - /* Dump the costs. */ - message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup, - tot_run / runs); +} + #ifdef EXACT + struct part *parts2; + parts2 = (struct part*) malloc(sizeof(struct part) * N ); + for(k = 0; k < N; k++) + { + parts2[k].loc[0] = parts_host[k].loc[0]; + parts2[k].loc[1] = parts_host[k].loc[1]; + parts2[k].loc[2] = parts_host[k].loc[2]; + parts2[k].m = parts_host[k].m; + parts2[k].a[0] = 0.0f; + parts2[k].a[1] = 0.0f; + parts2[k].a[2] = 0.0f; + } +// cudaDeviceReset(); + if(cudaMalloc(&com_temp, sizeof(struct part) * N) != cudaSuccess) + error("Couldn't allocate com_temp"); + if(cudaMemcpy(com_temp, parts2, sizeof(struct part) * N, cudaMemcpyHostToDevice) != cudaSuccess) + error("Couldn't copy part data"); + if(cudaMemcpyToSymbol(parts_cuda, &com_temp, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy part data"); + + int numblocks = N / 128 + 1; + double itpms = 1000.0 / CPU_TPS; +ticks tic, toc_run ; + tic = getticks(); + interact_exact<<<numblocks, 128>>>(N); + cudaDeviceSynchronize(); + toc_run = getticks(); + printf("Cuda kernel took %.3f ms to run exact solution\n", ((double)(toc_run - tic)) * itpms); + if( cudaMemcpy( parts2, com_temp, sizeof(struct part) * N, cudaMemcpyDeviceToHost) != cudaSuccess) + error("Failed to copy data back from device"); - /* Dump the timers. 
*/ - for (k = 0; k < qsched_timer_count; k++) - message("timer %s is %lli ticks.", qsched_timer_names[k], - s.timers[k] / runs); + printf("%e\n", parts_host[0].m); - /* Dump the per-task type timers. */ -// printf("task timers: [ "); -// for (k = 0; k < task_type_count; k++) printf("%lli ", task_timers[k] / runs); - //printf("] ticks.\n"); + k = 0; + printf("%e, %e, %e, %e, %e, %e, %e, %e, %e, %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2], + parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]); + #endif /* Dump the particles to a file */ file = fopen("particle_dump.dat", "w"); - fprintf(file, +/* fprintf(file, "# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x " - "a_legacy.y a_legacy.z a_mpole.x a_mpole.y a_mpole.z\n"); + "a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/ +#ifdef EXACT + printf("Printing exact\n"); + for(k = 0; k < N; ++k) + fprintf(file, "%i %e %e %e %e %e %e %e %e %e %e %e %e %e\n", + #ifdef ID + parts_host[k].id, + #else + k, + #endif + parts_host[k].m,parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2], + parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts_host[k].a[0], + parts_host[k].a[1], parts_host[k].a[2]); + +#else for (k = 0; k < N; ++k) - fprintf(file, "%d %e %e %e %e %e %e %e %e %e %e %e %e %e\n", parts[k].id, - parts[k].mass, parts[k].x[0], parts[k].x[1], parts[k].x[2], - parts[k].a_exact[0], parts[k].a_exact[1], parts[k].a_exact[2], - parts[k].a_legacy[0], parts[k].a_legacy[1], parts[k].a_legacy[2], - parts[k].a[0], parts[k].a[1], parts[k].a[2]); + fprintf(file, "%i %e %e %e %e %e %e %e\n", + k, parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2], + parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]); +#endif fclose(file); - - /* Clean up. */ - qsched_free(&s); - free(parts); + file = fopen("particle_pos.dat", "w"); + fprintf(file, "m x[1] x[2] x[3]\n"); + for(k = 0; k < N; k++) + fprintf(file, "%e %e %e %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2]); + + fclose(file); + cudaFreeHost(parts_host); + //free(parts_host); } -/** - * @brief Main function. - */ int main(int argc, char *argv[]) { - - int c, nr_threads; - int N = 1000, runs = 1; - char fileName[100] = {0}; - - /* Die on FP-exceptions. */ - feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); - -/* Get the number of threads. */ -#pragma omp parallel shared(nr_threads) - { - if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads(); - } - - /* Parse the options */ + int c, nr_threads; + int N = 1000, runs = 1; + char fileName[100]; + fileName[0] = 0; + /* Parse the options */ while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) { case 'n': if (sscanf(optarg, "%d", &N) != 1) @@ -1267,7 +1572,6 @@ int main(int argc, char *argv[]) { case 't': if (sscanf(optarg, "%d", &nr_threads) != 1) error("Error parsing number of threads."); - omp_set_num_threads(nr_threads); break; case 'f': if (sscanf(optarg, "%s", &fileName[0]) != 1) @@ -1304,7 +1608,5 @@ int main(int argc, char *argv[]) { } /* Run the BH test. 
*/ - test_bh(N, nr_threads, runs, fileName); - - return 0; + test_bh(N, runs, fileName); } diff --git a/examples/test_qr.cu b/examples/test_qr.cu index 5446870ae5d44da45b7dd971e633e8f042782907..c37277ab7f51b7efd9f1ac4bb26bfb6261cd4a23 100644 --- a/examples/test_qr.cu +++ b/examples/test_qr.cu @@ -7,20 +7,22 @@ #include <string.h> #include <unistd.h> #include <math.h> -#include <gperftools/profiler.h> +//#include <gperftools/profiler.h> //#define WITH_CULA -#ifdef WITH_CULA -#include <cula.h> -#endif +//#ifdef WITH_CULA +//#include <cula.h> +//#endif /* Local includes. */ extern "C"{ #include "quicksched.h" #include "res.h" +#ifdef WITH_CBLAS #include <cblas.h> +#endif } #include "cuda_queue.h" @@ -95,6 +97,7 @@ void printMatrix(float *matrix, int m, int n, int tilesize) * This function is simply for validation and is implemented naively as we know * of no implementation to retrieve Q from the tiled QR. */ +#ifdef WITH_CBLAS float* computeQ(float* HR, int size, int tilesize, float* tau, int tauNum) { float* Q = (float*)malloc(sizeof(float) * size * size); float* Qtemp = (float*)malloc(sizeof(float) * size * size); @@ -163,6 +166,7 @@ float* computeQ(float* HR, int size, int tilesize, float* tau, int tauNum) { free(temp); return Q; } +#endif float* getR(float* HR, int size) { float* R = (float*)malloc(sizeof(float) * size * size); @@ -693,7 +697,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) int data[3]; // ticks tic, tot_run = 0; #ifdef NO_LOADS - ticks tic_loads; + ticks tic_loads, tic; #endif /* Initialize the scheduler. */ qsched_init( &s , 1 , qsched_flag_none ); @@ -730,10 +734,12 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) if( cudaMalloc( &device_array , sizeof(float) * m * n * K * K ) != cudaSuccess ) error("Failed to allocate the matrix on the device"); #ifdef NO_LOADS + double itpms = 1000.0 / CPU_TPS; tic = getticks(); if( cudaMemcpy( A, device_array, sizeof(float) * m * n * K *K, cudaMemcpyHostToDevice ) != cudaSuccess) error("Failed to copy matrix to device"); tic_loads = getticks() - tic; + printf("load %.3f \n", tic_loads * itpms ); #endif if( cudaMemcpyToSymbol( GPU_matrix , &device_array , sizeof(float *) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy matrix pointer to the device"); @@ -816,14 +822,16 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) qsched_run_CUDA( &s , func ); - qsched_print_cuda_timers(&s); #ifdef NO_LOADS tic = getticks(); if( cudaMemcpy( device_array, A, sizeof(float) * m * n * K *K, cudaMemcpyDeviceToHost ) != cudaSuccess) error("Failed to copy matrix from device"); - tic_loads += getticks() - tic;double itpms = 1000.0 / CPU_TPS; - printf("%.3f\n", ((double)(tic_loads)) * itpms ); + tic_loads = getticks() - tic; + printf("unload %.3f\n", ((double)(tic_loads)) * itpms ); #endif + + qsched_print_cuda_timers(&s); + if(cudaMemcpy( tau , tau_device , sizeof(float) * m * n * K , cudaMemcpyDeviceToHost ) != cudaSuccess ) error("Failed to copy the tau data from the device."); diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu index 71d7dbebe725866fdcecbfbc40cdb7a8a1f6665b..3d4e5f3bd10d50f10df3ff9cecee6c2055525e58 100644 --- a/src/cuda_queue.cu +++ b/src/cuda_queue.cu @@ -20,11 +20,11 @@ #include "../config.h" /* Standard includes. */ +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <gperftools/profiler.h> -#include <math.h> + extern "C"{ #include "quicksched.h"