diff --git a/examples/test_bh_4.cu b/examples/test_bh_4.cu new file mode 100644 index 0000000000000000000000000000000000000000..a1bd8daf502c9ae192d069d36589db32949c89f8 --- /dev/null +++ b/examples/test_bh_4.cu @@ -0,0 +1,1368 @@ +/******************************************************************************* + * This file is part of QuickSched. + * Coypright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk), + * Aidan Chalk (aidan.chalk@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * +* *****************************************************************************/ +//#define EXACT +//#define ID + +/* Config parameters. */ +#include "../config.h" + +/* Standard includes. */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <math.h> +#include <float.h> +#include <limits.h> +#include <omp.h> +#include <fenv.h> + +/* Local includes. */ +extern "C"{ +#include "quicksched.h" +#include "res.h" +} +#include "cuda_queue.h" +//#define double float +//#define double2 float2 +//#define float double +//#define float2 double2 + +/** Task types. */ +enum task_type { + task_type_self = 0, + task_type_pair, + task_type_pair_pc, + task_type_pair_pc_split, + task_type_com, + task_type_count +}; + +struct cell{ + +double2 loc_xy; +double loc_z; +double h; +int count; +unsigned short int split, sorted; +int parts, firstchild, sibling; +int res, com_tid; + +}; +struct part{ +#ifdef ID +int id; +#endif +double loc[3]; +float a[3]; +float m; +}; + + +#define const_G 1 +/* Requred variables to obtain cells. */ +#define cell_maxparts 128 +#define CELL_STRETCH 2 +#define INITIAL_CELLS 256 +struct cell *cell_pool = NULL; +int used_cells=0; +int num_cells = 0; +int cell_size = INITIAL_CELLS*sizeof(struct cell); + +/* Device locations for the particle values. */ +__device__ double2 *com_xy; +__device__ double *com_z; +__device__ float *com_mass; +__device__ struct part *parts_cuda; +__device__ struct cell *cells; +__device__ int pc_calcs = 0; + + +/* Host locations for the particle values. 
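+ * The centre-of-mass arrays below are filled on the host while the tree is
+ * built and are uploaded to the device (through the __device__ pointers
+ * above) before each force evaluation.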
*/ +double2 *com_xy_host; +double *com_z_host; +float *com_mass_host; +double2 *parts_pos_xy_temp; +double *parts_pos_z_temp; +float4 *parts_a_m_temp; + +struct part *parts_host; +struct part *parts_temp; + +__device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) +} while (assumed != old); return __longlong_as_double(old); } + + + +__device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) { + int i, j, k; + int count_i = ci->count, count_j = cj->count; + int parts_i = ci->parts, parts_j = cj->parts; + double xi[3]; + float dx[3]; + float ai[3], mj, r2, w, ir; + + /* Loop over cell i.*/ + for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) { + xi[0] = parts_cuda[i].loc[0]; + xi[1] = parts_cuda[i].loc[1]; + xi[2] = parts_cuda[i].loc[2]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0f; + } + + for(j = parts_j; j < parts_j + count_j; j++) { + r2 = 0.0f; + dx[0] = xi[0] - parts_cuda[j].loc[0]; + dx[1] = xi[1] - parts_cuda[j].loc[1]; + dx[2] = xi[2] - parts_cuda[j].loc[2]; + r2 += dx[0] * dx[0]; + r2 += dx[1] * dx[1]; + r2 += dx[2] * dx[2]; + + + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_cuda[j].m; + for(k = 0; k < 3; k++) { + ai[k] -= dx[k] * mj * w; + } + } + atomicAdd(&parts_cuda[i].a[0], ai[0]); + atomicAdd(&parts_cuda[i].a[1], ai[1]); + atomicAdd(&parts_cuda[i].a[2], ai[2]); + + } + + /* Load particles of cell i into shared memory */ + /*Loop over cell j. */ + for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) { + xi[0] = parts_cuda[i].loc[0]; + xi[1] = parts_cuda[i].loc[1]; + xi[2] = parts_cuda[i].loc[2]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0f; + } + + for(j = parts_i; j < parts_i + count_i; j++) { + r2 = 0.0f; + dx[0] = xi[0] - parts_cuda[j].loc[0]; + dx[1] = xi[1] - parts_cuda[j].loc[1]; + dx[2] = xi[2] - parts_cuda[j].loc[2]; + r2 += dx[0] * dx[0]; + r2 += dx[1] * dx[1]; + r2 += dx[2] * dx[2]; + + + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_cuda[j].m; + for(k = 0; k < 3; k++) { + ai[k] -= dx[k] * mj * w; + } + } + atomicAdd(&parts_cuda[i].a[0], ai[0]); + atomicAdd(&parts_cuda[i].a[1], ai[1]); + atomicAdd(&parts_cuda[i].a[2], ai[2]); + + } + +} + +__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) { + + int i; + int count = leaf->count; + int parts = leaf->parts; + int cell_j = cj - cells; + float r2; + float dx[3], ir, w; + + if(threadIdx.x == 0) + atomicAdd(&pc_calcs, 1); + + /* int leaf_num = leaf - cells; + int cjnum = cj - cells; + if(threadIdx.x == 0 && leaf_num == 55) + printf("Calculating with %d %i whose firstchild is %i and cj->split == %i\n", (cjnum), cj->firstchild, cj->split);*/ +/* if(threadIdx.x == 0) + printf("leaf = %i\n", leaf - cells);*/ + + for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) { + r2 = 0.0; + dx[0] = com_xy[cell_j].x - parts_cuda[i].loc[0]; + r2 += dx[0] * dx[0]; + dx[1] = com_xy[cell_j].y - parts_cuda[i].loc[1]; + r2 += dx[1] * dx[1]; + dx[2] = com_z[cell_j] - parts_cuda[i].loc[2]; + r2 += dx[2] * dx[2]; + + ir = rsqrtf(r2); + w = com_mass[cell_j] * const_G * ir * ir * ir; +/* if(threadIdx.x == 0) + printf("part.a = %e w*dx = %e \n", parts_cuda[i].a[0], w*dx[0]);*/ + 
atomicAdd(&parts_cuda[i].a[0], w*dx[0]); + atomicAdd(&parts_cuda[i].a[1], w*dx[1]); + atomicAdd(&parts_cuda[i].a[2], w*dx[2]); +/* (*accels).x+= w*dx[0]; + (*accels).y+= w*dx[1]; + (*accels).z+= w*dx[2];*/ + } +} + +/** + * @brief Checks whether the cells are direct neighbours ot not + */ +__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) { + + float dx[3]; + float cih = ci->h, cjh = cj->h; + + float min_dist = 0.5*(cih+cjh); + + float center_i = ci->loc_xy.x + 0.5*cih; + float center_j = cj->loc_xy.x + 0.5*cjh; + dx[0] = fabsf(center_i - center_j); + center_i = ci->loc_xy.y + 0.5*cih; + center_j = cj->loc_xy.y + 0.5*cjh; + dx[1] = fabsf(center_i - center_j); + center_i = ci->loc_z + 0.5*cih; + center_j = cj->loc_z + 0.5*cjh; + dx[2] = fabsf(center_i - center_j); + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} + +/*__device__ __forceinline__ int are_neighbours_different_size(double cih, float ci_x, float ci_y, float ci_z, struct cell *cj) { + float dx_x, dx_y, dx_z; + double cjh = cj->h; + float min_dist = 0.5*(cih+cjh); + float center_i = ci_x + 0.5*cih; + float center_j = cj->loc_xy.x + 0.5*cjh; + dx_x = fabsf(center_i - center_j); + center_i = ci_y + 0.5*cih; + center_j = cj->loc_xy.y + 0.5*cjh; + dx_y = fabsf(center_i - center_j); + center_i = ci_z + 0.5*cih; + center_j = cj->loc_z + 0.5*cjh; + dx_z = fabsf(center_i - center_j); + return (dx_x <= min_dist) && (dx_y <= min_dist) && (dx_z <= min_dist); +}*/ + +__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) { + float dx[3]; + float min_dist = ci->h; + float center_i = ci->loc_xy.x; + float center_j = cj->loc_xy.x; + dx[0] = fabsf(center_i - center_j); + center_i = ci->loc_xy.y; + center_j = cj->loc_xy.y; + dx[1] = fabsf(center_i - center_j); + center_i = ci->loc_z; + center_j = cj->loc_z; + dx[2] = fabsf(center_i - center_j); + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} + +/*__device__ __forceinline__ int are_neighbours(float cih, float ci_x, float ci_y, float ci_z, struct cell *cj) { + float dx_x, dx_y, dx_z; + float center_j = cj->loc_xy.x; + dx_x = fabsf(ci_x - center_j); + center_j = cj->loc_xy.y; + dx_y = fabsf(ci_y - center_j); + center_j = cj->loc_z; + dx_z = fabsf(ci_z - center_j); + return (dx_x <= cih) && (dx_y <= cih) && (dx_z <= cih); +}*/ + +__device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) { + return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count); +} + + +__device__ __forceinline__ void make_interact_pc_new(struct cell *child, int *mpoles, int count2) +{ +int i,j; + int count = child->count; + int parts = child->parts; +// int cell_j = cj - cells; + float r2; + float dx[3], ir, w; + float a[3]; + + for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) { + a[0] = 0.0; + a[1] = 0.0; + a[2] = 0.0; + for(j = 0; j < count2; j++){ + r2 = 0.0; + dx[0] = com_xy[mpoles[j]].x - parts_cuda[i].loc[0]; + r2 += dx[0] * dx[0]; + dx[1] = com_xy[mpoles[j]].y - parts_cuda[i].loc[1]; + r2 += dx[1] * dx[1]; + dx[2] = com_z[mpoles[j]] - parts_cuda[i].loc[2]; + r2 += dx[2] * dx[2]; + + ir = rsqrtf(r2); + w = com_mass[mpoles[j]] * const_G * ir * ir * ir; + a[0] += w*dx[0]; + a[1] += w*dx[1]; + a[2] += w*dx[2]; + } +/* if(threadIdx.x == 0) + printf("part.a = %e w*dx = %e \n", parts_cuda[i].a[0], w*dx[0]);*/ + atomicAdd(&parts_cuda[i].a[0], a[0]); + atomicAdd(&parts_cuda[i].a[1], a[1]); + atomicAdd(&parts_cuda[i].a[2], a[2]); +/* (*accels).x+= w*dx[0]; + 
(*accels).y+= w*dx[1]; + (*accels).z+= w*dx[2];*/ + } + +} + +__device__ __forceinline__ void iact_pair_pc(struct cell *ci, struct cell *cj) +{ +struct cell *cp, *cps; +int icells[8]; +int count = 0; + + /* Let's split both cells and build all possible non-neighbouring pairs */ + for (cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) { + count = 0; + for (cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) { + + if ( ! are_neighbours(cp, cps) ) { + icells[count] = cps - cells; + count++; + } + } + make_interact_pc_new(cp, icells, count); + } +} + +__device__ __forceinline__ void iact_pair_pc_split(struct cell *child, struct cell *cj) +{ + struct cell *cps; + int icells[8]; + int count = 0; + for (cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) { + if(!are_neighbours(child, cps) ){ + icells[count] = cps - cells; + count++; + } + } + make_interact_pc_new(child, icells, count); +} + +#ifdef OLD +__device__ __forceinline__ void iact_self_pc(struct cell *c, struct cell *leaf) +{ + + struct cell *ci, *cj, *cp, *cps; + float leafh = leaf->h; + float3 accelerations; + + int i; + + if(threadIdx.x < leaf->count) +{ + accelerations.x = 0.0f; + accelerations.y = 0.0f; + accelerations.z = 0.0f; + + while(c != leaf) + { + ci = NULL; + /* Loop over children of c. */ + for(cj = &cells[c->firstchild]; cj != &cells[c->sibling]; cj = &cells[cj->sibling]) + { + if(ci == NULL && is_inside(leaf, cj)) + { + ci = cj; + }else{ + //Depth first search of cj. + cp = cj; + while(cp != &cells[cj->sibling]) + { + if(!are_neighbours(cp, leaf)) + { +// if(!cp->split) + make_interact_pc(leaf, cp, &accelerations); +// else + // for(cps = &cells[cp->firstchild]; cps != &cells[cp->sibling]; cps = &cells[cps->sibling]) + // make_interact_pc(leaf, cps, &accelerations); + + cp = &cells[cp->sibling]; + }else{ + if(cp->split && cp->h > leafh) + cp = &cells[cp->firstchild]; + else + cp = &cells[cp->sibling]; + } + } + } + + } + + c = ci; + + } + int parts = leaf->parts; + int count = leaf->count; + for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) { + atomicAdd( &parts_cuda[i].a[0] , accelerations.x); + atomicAdd( &parts_cuda[i].a[1] , accelerations.y); + atomicAdd( &parts_cuda[i].a[2] , accelerations.z); + } +} + +} +#endif + + + + + + +/** + * @brief Checks whether the cells are direct neighbours ot not. 
Both cells have + * to be of the same size + */ +static inline int are_neighbours_host(struct cell *ci, struct cell *cj) { + +// int k; + float dx[3]; + +#ifdef SANITY_CHECKS + if (ci->h != cj->h) + error(" Cells of different size in distance calculation."); +#endif + + /* Maximum allowed distance */ + float min_dist = ci->h; + + /* (Manhattan) Distance between the cells */ + double2 loc1=ci->loc_xy, loc2=cj->loc_xy; + float center_i = loc1.x; + float center_j = loc2.x; + dx[0] = fabs(center_i - center_j); + center_i = loc1.y; + center_j = loc2.y; + dx[1] = fabs(center_i - center_j); + center_i = ci->loc_z; + center_j = cj->loc_z; + dx[2] = fabs(center_i - center_j); + + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} + + +struct cell *cell_get() +{ + + if(num_cells == 0) + { + cell_pool = (struct cell*) calloc(INITIAL_CELLS, sizeof(struct cell)); + if(cell_pool == NULL) + error("Failed to allocate cell_pool"); + com_xy_host = (double2*) calloc(INITIAL_CELLS, sizeof(double2)); + if(com_xy_host == NULL) + error("Failed to allocate cell_pool"); + com_z_host = (double*) calloc(INITIAL_CELLS, sizeof(double)); + if(com_z_host == NULL) + error("Failed to allocate cell_pool"); + com_mass_host = (float*) calloc(INITIAL_CELLS, sizeof(float)); + if(com_mass_host == NULL) + error("Failed to allocate cell_pool"); + num_cells = INITIAL_CELLS; + } + + if(used_cells >= num_cells) + { + /* Stretch */ + struct cell *new_pool; + cell_size *= CELL_STRETCH; + new_pool = (struct cell*) calloc(num_cells*CELL_STRETCH, sizeof(struct cell)); + if(new_pool == NULL) + error("Failed to allocate new_pool"); + if(cell_pool != NULL) + memcpy(new_pool, cell_pool, num_cells*sizeof(struct cell)); + + + + double2 *tempxy = (double2*) calloc(num_cells*CELL_STRETCH, sizeof(double2)); + if(tempxy == NULL) + error("Failed to allocate tempxy"); + memcpy(tempxy, com_xy_host, sizeof(double2)*num_cells); + free(com_xy_host); + com_xy_host = tempxy; + double *tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double)); + if(tempz == NULL) + error("Failed to allocate tempz"); + memcpy(tempz, com_z_host, num_cells*sizeof(double)); + free(com_z_host); + com_z_host = tempz; + float *tempm = (float*) calloc(num_cells*CELL_STRETCH, sizeof(float)); + if(tempm == NULL) + error("Failed to allocate tempm"); + memcpy(tempm, com_mass_host, num_cells*sizeof(float)); + free(com_mass_host); + com_mass_host = tempm; + + num_cells *= CELL_STRETCH; + free(cell_pool); + cell_pool = new_pool; + + message("Increased size of arrays"); + } + used_cells++; + cell_pool[used_cells-1].sibling = -1; + cell_pool[used_cells-1].firstchild = -1; + cell_pool[used_cells-1].res = qsched_res_none; + return &cell_pool[used_cells-1]; +} + +void comp_com(struct cell *c){ + + int k, count = c->count; + int cpi; + struct cell *cp; + int parts = c->parts; + double com[3] = {0.0, 0.0, 0.0}, mass = 0.0; + + if(c->split) { + for(cp = &cell_pool[(cpi = c->firstchild)]; cpi != c->sibling; cp = &cell_pool[(cpi = cp->sibling)]) { + float cp_mass = com_mass_host[cpi]; + com[0] += com_xy_host[cpi].x * cp_mass; + com[1] += com_xy_host[cpi].y * cp_mass; + com[2] += com_z_host[cpi] * cp_mass; + mass += cp_mass; + } + + + /* Otherwise collect the multiple from the particles */ + } else { + + for(k = parts; k < parts+count; k++) + { + float p_mass = parts_host[k].m; + com[0] += parts_host[k].loc[0] * p_mass; + com[1] += parts_host[k].loc[1] * p_mass; + com[2] += parts_host[k].loc[2] * p_mass; + mass += p_mass; + } + } + + + k = c - cell_pool; + /* Store 
the COM data, if it was collected. */ + if(mass > 0.0f) { + float imass = 1.0f/mass; + com_xy_host[k].x = com[0] * imass; + com_xy_host[k].y = com[1] * imass; + com_z_host[k] = com[2] * imass; + com_mass_host[k] = mass; + }else + { + com_xy_host[k].x = 0.0; + com_xy_host[k].y = 0.0; + com_z_host[k] = 0.0; + com_mass_host[k] = 0.0f; + } + + + +} + +/** + * @brief Sort the parts into eight bins along the given pivots and + * fill the multipoles. Also adds the hierarchical resources + * to the sched (TODO). + * + * @param c The #cell to be split. + * @param N The total number of parts. + * @param s The #sched to store the resources. + */ +void cell_split(int c, struct qsched *s) { + int i, j, k, kk, count = cell_pool[c].count; + int parts = cell_pool[c].parts; + struct part temp_part; + struct cell *cp, *cell; + int left[8], right[8]; + double pivot[3]; + static int root = -1; + int progenitors[8]; + struct part *temp_xy; + + /* Set the root cell. */ + if (root < 0) { + root = c; + cell_pool[c].sibling = -1; + } + + if(cell_pool[c].res == qsched_res_none) + { + if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[c].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy, + sizeof(struct part) * cell_pool[c].count, parts_temp + cell_pool[c].parts); + } + + if(count > cell_maxparts ) + { + cell_pool[c].split = 1; + + for(k = 0; k < 8; k++) + { + progenitors[k] = (cp = cell_get()) - cell_pool; + cell = &cell_pool[c]; + cp->loc_xy = cell->loc_xy; + cp->loc_z = cell->loc_z; + cp->h = cell->h*0.5; + if(k & 4) cp->loc_xy.x += cp->h; + if(k & 2) cp->loc_xy.y += cp->h; + if(k & 1) cp->loc_z += cp->h; + } + + /* Init the pivots.*/ + pivot[0] = cell->loc_xy.x + cell->h * 0.5; + pivot[1] = cell->loc_xy.y + cell->h * 0.5; + pivot[2] = cell->loc_z + cell->h * 0.5; + + /* Split along the x axis. */ + i = parts; + j = parts+count-1; + while(i < j) + { + while(i <= parts+count-1 && parts_host[i].loc[0] < pivot[0]) i += 1; + while(j >= parts && parts_host[j].loc[0] >= pivot[0]) j -= 1; + if(i < j){ + temp_part = parts_host[i]; + parts_host[i] = parts_host[j]; + parts_host[j] = temp_part; + } + } + left[1] = i; + right[1] = parts+count-1; + left[0] = parts; + right[0] = j; + + /* Split along the y axis twice. */ + for (k = 1; k >= 0; k--) { + i = left[k]; + j = right[k]; + while(i <= j){ + while(i <= right[k] && parts_host[i].loc[1] < pivot[1]) i += 1; + while(j >= left[k] && parts_host[j].loc[1] >= pivot[1]) j -= 1; + if(i < j) + { + temp_part = parts_host[i]; + parts_host[i] = parts_host[j]; + parts_host[j] = temp_part; + } + } + left[2*k+1] = i; + right[2*k+1] = right[k]; + left[2*k] = left[k]; + right[2*k] = j; + } + /* Split along the z axis four times.*/ + for(k = 3; k >=0; k--) + { + i = left[k]; + j = right[k]; + while(i <= j){ + while(i <= right[k] && parts_host[i].loc[2] < pivot[2]) i += 1; + while(j >= left[k] && parts_host[j].loc[2] >= pivot[2]) j -= 1; + if(i < j) + { + temp_part = parts_host[i]; + parts_host[i] = parts_host[j]; + parts_host[j] = temp_part; + } + } + left[2 * k + 1] = i; + right[2 * k + 1] = right[k]; + left[2 * k] = left[k]; + right[2 * k] = j; + } + + /* Store the counts and offsets. 
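+ * After the x, y and z split passes above, left[k] and right[k] bound the
+ * particles belonging to octant k of this cell.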
*/ + for(k = 0; k < 8; k++) + { + cell_pool[progenitors[k]].count = right[k]-left[k]+1; + cell_pool[progenitors[k]].parts = left[k]; + if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy, + sizeof(struct part) * cell_pool[progenitors[k]].count, parts_temp + cell_pool[progenitors[k]].parts); + } + + /* Find the first non-empty progenitor */ + for(k = 0; k < 8; k++) + { + if(cell_pool[progenitors[k]].count > 0) + { + cell->firstchild = progenitors[k]; + break; + } + } + + #ifdef SANITY_CHECKS + if(cell->firstchild == -1) + error("Cell has been split but all children have 0 parts"); + #endif + + /*Prepare the pointers*/ + for(k = 0; k < 8; k++) + { + /* Find the next non-empty sibling */ + for(kk = k+1; kk < 8; ++kk){ + if(cell_pool[progenitors[kk]].count > 0){ + cell_pool[progenitors[k]].sibling = progenitors[kk]; + break; + } + } + + /* No non-empty sibling, go back a level.*/ + if(kk == 8) cell_pool[progenitors[k]].sibling = cell->sibling; + + } + + /* Recurse */ + for(k = 0; k < 8; k++) + if(cell_pool[progenitors[k]].count > 0) cell_split(progenitors[k], s); + + /* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */ + } /*else { + +// struct cell *data[2] = {root, c}; + int data[2] = {root, c}; + int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data, + 2 * sizeof(int), 3000); + qsched_addlock(s, tid, cell_pool[c].res); + }*/ + +#ifndef COM_AS_TASK + comp_com(&cell_pool[c]); +#endif +} + +/** + * @brief Create the tasks for the cell pair/self. + * + * @param s The #sched in which to create the tasks. + * @param ci The first #cell. + * @param cj The second #cell. + */ +void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { + + qsched_task_t tid; + int data[2]; + struct cell *cp, *cps; + +#ifdef SANITY_CHECKS + + /* If either cell is empty, stop. */ + if (ci->count == 0 || (cj != NULL && cj->count == 0)) error("Empty cell !"); + +#endif + + /* Single cell? */ + if (cj == NULL) { + + /* Is this cell split and above the task limit ? */ + if (ci->split /*&& ci->count > task_limit / ci->count*/) { + + /* Loop over each of this cell's progeny. */ + for (cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]) { + + /* Make self-interaction task. */ + create_tasks(s, cp, NULL); + + /* Make all pair-interaction tasks. */ + for (cps = &cell_pool[cp->sibling]; cps != &cell_pool[ci->sibling]; cps = &cell_pool[cps->sibling]) + create_tasks(s, cp, cps); + } + + /* Otherwise, add a self-interaction task. */ + } else { + + /* Set the data. */ + data[0] = ci -cell_pool; + data[1] = -1; + + /* Create the task. */ + tid = qsched_addtask(s, task_type_self, task_flag_none, data, + sizeof(int) * 2, ci->count * ci->count / 2); + + /* Add the resource (i.e. the cell) to the new task. */ + qsched_addlock(s, tid, ci->res); + +/* Since this call might recurse, add a dependency on the cell's COM + task. */ +#ifdef COM_AS_TASK + if (ci->split) qsched_addunlock(s, ci->com_tid, tid); +#endif + } + + /* Otherwise, it's a pair. */ + } else { + + /* Are both cells split and we are above the task limit ? 
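+ * If so, recurse on pairs of children that are neighbours and let the
+ * particle-cell (monopole) tasks created below handle the well-separated ones.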
*/ + if (ci->split && cj->split ) { + + /* Let's split both cells and build all possible pairs */ + for (cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]) { + for (cps = &cell_pool[cj->firstchild]; cps != &cell_pool[cj->sibling]; cps = &cell_pool[cps->sibling]) { + + /* Recurse */ + if (are_neighbours_host(cp, cps)) { + create_tasks(s, cp, cps); + } + } + } + + if( 0 && ci->count > 64*cell_maxparts) + { + /* Let's also build a particle-monopole task */ + for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]){ + /* Create the task. */ + data[0] = cp-cell_pool; + data[1] = cj-cell_pool; + tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data, + sizeof(int) * 2, ci->count + cj->count); + + /* Add the resource and dependance */ + qsched_addlock(s, tid, cp->res); +// qsched_addlock(s, tid, cj->res); + } + for(cp = &cell_pool[cj->firstchild]; cp != &cell_pool[cj->sibling]; cp = &cell_pool[cp->sibling]){ + /* Create the task. */ + data[0] = cp-cell_pool; + data[1] = ci-cell_pool; + tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data, + sizeof(int) * 2, ci->count + cj->count); + + /* Add the resource and dependance */ + qsched_addlock(s, tid, cp->res); +// qsched_addlock(s, tid, cj->res); + } + }else + { + data[0] = ci -cell_pool; + data[1] = cj - cell_pool; + tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count + cj->count); + /* Add the resource and dependance */ + qsched_addlock(s, tid, ci->res); + data[0] = cj - cell_pool; + data[1] = ci - cell_pool; + tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count + cj->count); + qsched_addlock(s, tid, cj->res); + } + +#ifdef COM_AS_TASK + qsched_addunlock(s, cj->com_tid, tid); + qsched_addunlock(s, ci->com_tid, tid); +#endif + + + } else { /* Otherwise, at least one of the cells is not split, build a direct + * interaction. */ + + /* Set the data. */ + data[0] = ci-cell_pool; + data[1] = cj-cell_pool; + + /* Create the task. */ + tid = qsched_addtask(s, task_type_pair, task_flag_none, data, + sizeof(int) * 2, ci->count * cj->count); + + /* Add the resources. */ + qsched_addlock(s, tid, ci->res); + qsched_addlock(s, tid, cj->res); + + } + + } /* Otherwise it's a pair */ +} + + +/** + * @brief Compute the interactions between all particles in a cell. + * + * @param cellID The cell ID to compute interactions on. + */ +__device__ __forceinline__ void iact_self_direct(int cellID) { + struct cell *c = &cells[cellID]; + double xi[3] = {0.0, 0.0, 0.0}; + float ai[3] = {0.0, 0.0, 0.0 }, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w; + int parts; + int count; + int i,j,k; + + parts = c->parts; + count = c->count; + for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x) + { + xi[0] = parts_cuda[i].loc[0]; + xi[1] = parts_cuda[i].loc[1]; + xi[2] = parts_cuda[i].loc[2]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0; + } + + for(j = parts; j < parts+count; j++) + { + if(i != j){ + + /* Compute the pairwise distance. */ + r2 = 0.0; + dx[0] = xi[0] - parts_cuda[j].loc[0]; + r2 += dx[0]*dx[0]; + dx[1] = xi[1] - parts_cuda[j].loc[1]; + r2 += dx[1]*dx[1]; + dx[2] = xi[2] - parts_cuda[j].loc[2]; + r2 += dx[2]*dx[2]; + + /* Apply the gravitational acceleration. */ + + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_cuda[j].m; + for(k = 0; k < 3; k++) { + ai[k] -= w * dx[k] * mj; + } + + } + } + //Update. 
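+      // Accumulate the per-particle sum into global memory with atomic adds.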
+ atomicAdd(&parts_cuda[i].a[0],ai[0] ); + atomicAdd(&parts_cuda[i].a[1],ai[1] ); + atomicAdd(&parts_cuda[i].a[2],ai[2] ); + } + + +} + +#ifdef EXACT +__global__ void interact_exact(int count){ + + int number = blockIdx.x * blockDim.x + threadIdx.x; + int i, j, k; + double dx[3], ir, r2, w; + if(number < count) + { + double pix[3] = {parts_cuda[number].loc[0], parts_cuda[number].loc[1], parts_cuda[number].loc[2]}; + float mi = parts_cuda[number].m; + for(j = 0; j < count; j++) + { + if(j != number) + { + for (r2 = 0.0, k = 0; k < 3; k++) { + dx[k] = parts_cuda[j].loc[k] - pix[k]; + r2 += dx[k] * dx[k]; + } + ir = rsqrt(r2); + w = const_G * ir * ir * ir; + + for(k = 0; k < 3; k++) + { + atomicAdd(&parts_cuda[number].a[k] , w * dx[k] * parts_cuda[j].m); + } + } + } + + } +} +#endif + +__device__ __forceinline__ void runner( int type , void *data ) { + + int *idata = (int *)data; + int i = idata[0]; + int j = idata[1]; + switch ( type ) { + case task_type_self: + iact_self_direct(i); + break; + case task_type_pair: + iact_pair_direct(&cells[i], &cells[j]); + break; + case task_type_pair_pc: + iact_pair_pc( &cells[i], &cells[j] ); + break; + case task_type_pair_pc_split: + iact_pair_pc_split(&cells[i], &cells[j]); + break; + default: + printf("Got to default?\n"); + asm("trap;"); + } + __syncthreads(); +} + +__device__ qsched_funtype function = runner; + + + + + +int calcMaxDepth(struct cell *c, int depth) +{ + struct cell *cp; + if(c->split) + { + int maxd = 0, temp; + for(cp = &cell_pool[c->firstchild]; cp != &cell_pool[c->sibling]; cp = &cell_pool[cp->sibling]) + { + temp = calcMaxDepth(cp, depth+1); + if(temp > maxd) + maxd = temp; + } + return maxd; + }else{ + return depth+1; + } +} + + +qsched_funtype func; +/** + * @brief Set up and run a task-based Barnes-Hutt N-body solver. + * + * @param N The number of random particles to use. + * @param runs Number of force evaluations to use as a benchmark. + * @param fileName Input file name. If @c NULL or an empty string, random + * particle positions will be used. + */ +void test_bh(int N, int runs, char *fileName) { + int i, k; + struct cell *root; + FILE *file; + struct qsched s; + struct cell *gpu_ptr_cells; +double2 *com_temp; +double *comz_temp; +float *comm_temp; + + cudaFree(0); + cudaThreadSetCacheConfig(cudaFuncCachePreferL1); + if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess) + error("Failed to copy function pointer from device"); + + /* Initialize the scheduler. */ + qsched_init(&s, 1, qsched_flag_none); + + //Create host particle arrays. 
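+  // Pinned (page-locked) host memory, so the particle array can also be
+  // addressed from the device via cudaHostGetDevicePointer() in cell_split().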
+ if( cudaMallocHost(&parts_host, sizeof(struct part) * N) != cudaSuccess) + error("Failed to allocate parts array"); + + + + if( cudaMalloc(&parts_temp, sizeof(struct part) * N) != cudaSuccess) + error("Failed to allocate device parts array"); + if( cudaMemcpyToSymbol(parts_cuda, &parts_temp, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to set device symbol for parts array"); + + if(fileName == NULL || fileName[0] == 0) { + for(k = 0; k < N; k++) { + parts_host[k].loc[0] = ((double)rand())/ RAND_MAX; + parts_host[k].loc[1] = ((double)rand())/ RAND_MAX; + parts_host[k].loc[2] = ((double)rand())/ RAND_MAX; + parts_host[k].m = ((double)rand()) / RAND_MAX; + } + + } else { + file = fopen(fileName, "r"); + #ifndef ID + int tempxy; + #endif + double temp; + printf("Reading input from file\n"); + if(file) { + for(k = 0; k < N; k++) { +#ifdef ID + if(fscanf(file, "%d", &parts_host[k].id) != 1) +#else + if(fscanf(file, "%d", &tempxy) != 1) +#endif + error("Failed to read ID"); + if(fscanf(file, "%f", &parts_host[k].m) != 1) + error("Failed to read mass of part %i.", k); + if(fscanf(file, "%lf%lf%lf", &parts_host[k].loc[0], &parts_host[k].loc[1], &parts_host[k].loc[2]) != 3) + error("Failed to read position of part %i.", k); + if(fscanf(file,"%lf %lf %lf %lf %lf %lf %lf %lf %lf", &temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp ) != 9) + error("Failed to read extra stuff"); + } + fclose(file); + } + } + + /* Init the cells. */ + root = cell_get(); + root->loc_xy.x = 0.0; + root->loc_xy.y = 0.0; + root->loc_z = 0.0; + root->h = 1.0; + root->count = N; + root->parts = 0; + root->sibling = -1; + int c = root-cell_pool; + cell_split(root - cell_pool, &s); + root = &cell_pool[c]; + int nr_leaves = 0; + int maxparts=0, minparts=1000000; + int number = 0; + while(c >= 0) { + if(cell_pool[c].count > 0) + { + number++; + if(cell_pool[c].res == qsched_res_none) + message("cell %i has no res", c); + } + if(!cell_pool[c].split) { + nr_leaves++; + if(cell_pool[c].count > maxparts) + { + maxparts = cell_pool[c].count; + } + if(cell_pool[c].count < minparts) + { + minparts = cell_pool[c].count; + } + c = cell_pool[c].sibling; + } else { + c = cell_pool[c].firstchild; + } + } + message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves)); + message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts); + + create_tasks(&s, root, NULL); + + int self = 0, pair = 0, pc = 0; + + for(k = 0; k < s.count; k++) + { + if(s.tasks[k].type == task_type_self) + self++; + else if (s.tasks[k].type == task_type_pair) + pair++; + else if (s.tasks[k].type >= 0) + pc++; + } + + message("total number of tasks: %i.", s.count); + message("total number of pair tasks: %i.", pair); + message("total number of self tasks: %i.", self); + message("total number of pc tasks: %i.", pc); + message("total number of cells: %i.", number); + message("total number of deps: %i.", s.count_deps); + message("total number of res: %i.", s.count_res); + message("total number of locks: %i.", s.count_locks); + + for(k = 0; k < runs; k++) { + for(i = 0; i < N; ++i) { + parts_host[i].a[0] = 0.0; + parts_host[i].a[1] = 0.0; + parts_host[i].a[2] = 0.0; + } + + + /* Copy the cells to the device. 
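+ * The cell array, the three centre-of-mass arrays and their __device__
+ * pointers are re-uploaded on every run.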
*/ + if( cudaMalloc( &gpu_ptr_cells , sizeof(struct cell) * used_cells) != cudaSuccess) + error("Failed to allocate cells on the GPU"); + if( cudaMemcpy( gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy cells to the GPU"); + if( cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell*), 0, cudaMemcpyHostToDevice) != cudaSuccess ) + error("Failed to copy cell pointer to the GPU"); + + + + if(cudaMalloc( &com_temp, sizeof(double2) * used_cells) != cudaSuccess) + error("Failed to allocate com on the GPU"); + if( cudaMemcpy( com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + error("failed to copy com to the GPU"); + if( cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy com pointer to the GPU"); + + + if(cudaMalloc( &comz_temp, sizeof(double) * used_cells) != cudaSuccess) + error("Failed to allocate com on the GPU"); + if( cudaMemcpy( comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + error("failed to copy com to the GPU"); + if( cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy com pointer to the GPU"); + + if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess) + error("Failed to allocate com on the GPU"); + if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + error("failed to copy com to the GPU"); + if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy com pointer to the GPU"); + printf("Max depth is %i\n", calcMaxDepth(root, 0)); + qsched_run_CUDA( &s , func ); + qsched_print_cuda_timers(&s); + + int pcs; + if( cudaMemcpyFromSymbol( &pcs, pc_calcs, sizeof(int), 0, cudaMemcpyDeviceToHost) != cudaSuccess) + error("Failed"); + printf("pc calcs = %i\n", pcs); +/*struct task* tasks = qsched_get_timers( &s , s.count ); + for(i = 0; i < s.count; i++) + { + printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); + // printf("\n"); + + }*/ + +} + #ifdef EXACT + struct part *parts2; + parts2 = (struct part*) malloc(sizeof(struct part) * N ); + for(k = 0; k < N; k++) + { + parts2[k].loc[0] = parts_host[k].loc[0]; + parts2[k].loc[1] = parts_host[k].loc[1]; + parts2[k].loc[2] = parts_host[k].loc[2]; + parts2[k].m = parts_host[k].m; + parts2[k].a[0] = 0.0f; + parts2[k].a[1] = 0.0f; + parts2[k].a[2] = 0.0f; + } +// cudaDeviceReset(); + if(cudaMalloc(&com_temp, sizeof(struct part) * N) != cudaSuccess) + error("Couldn't allocate com_temp"); + if(cudaMemcpy(com_temp, parts2, sizeof(struct part) * N, cudaMemcpyHostToDevice) != cudaSuccess) + error("Couldn't copy part data"); + if(cudaMemcpyToSymbol(parts_cuda, &com_temp, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy part data"); + + int numblocks = N / 128 + 1; + double itpms = 1000.0 / CPU_TPS; +ticks tic, toc_run ; + tic = getticks(); + interact_exact<<<numblocks, 128>>>(N); + cudaDeviceSynchronize(); + toc_run = getticks(); + printf("Cuda kernel took %.3f ms to run exact solution\n", ((double)(toc_run - tic)) * itpms); + if( cudaMemcpy( parts2, com_temp, sizeof(struct part) * N, cudaMemcpyDeviceToHost) != cudaSuccess) + error("Failed to copy data back from device"); + + printf("%e\n", parts_host[0].m); + + + k = 0; + 
printf("%e, %e, %e, %e, %e, %e, %e, %e, %e, %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2], + parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]); + #endif + /* Dump the particles to a file */ + file = fopen("particle_dump.dat", "w"); +/* fprintf(file, + "# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x " + "a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/ +#ifdef EXACT + printf("Printing exact\n"); + for(k = 0; k < N; ++k) + fprintf(file, "%i %e %e %e %e %e %e %e %e %e %e %e %e %e\n", + #ifdef ID + parts_host[k].id, + #else + k, + #endif + parts_host[k].m,parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2], + parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts_host[k].a[0], + parts_host[k].a[1], parts_host[k].a[2]); + +#else + for (k = 0; k < N; ++k) + fprintf(file, "%i %e %e %e %e %e %e %e\n", + k, parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2], + parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]); +#endif + fclose(file); + file = fopen("particle_pos.dat", "w"); + fprintf(file, "m x[1] x[2] x[3]\n"); + for(k = 0; k < N; k++) + fprintf(file, "%e %e %e %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2]); + + fclose(file); + cudaFreeHost(parts_host); + //free(parts_host); +} + + +int main(int argc, char *argv[]) { + int c, nr_threads; + int N = 1000, runs = 1; + char fileName[100] = {0}; + + /* Parse the options */ + while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) { + case 'n': + if (sscanf(optarg, "%d", &N) != 1) + error("Error parsing number of particles."); + break; + case 'r': + if (sscanf(optarg, "%d", &runs) != 1) + error("Error parsing number of runs."); + break; + case 't': + if (sscanf(optarg, "%d", &nr_threads) != 1) + error("Error parsing number of threads."); + break; + case 'f': + if (sscanf(optarg, "%s", &fileName[0]) != 1) + error("Error parsing file name."); + break; + case '?': + fprintf(stderr, "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file] " + "[-c Nparts] [-i Niterations] \n", + argv[0]); + fprintf(stderr, "Solves the N-body problem using a Barnes-Hut\n" + "tree code with N random particles read from a file in " + "[0,1]^3 using" + "nr_threads threads.\n" + "A test of the neighbouring cells interaction with " + "Nparts per cell is also run Niterations times.\n"); + exit(EXIT_FAILURE); + } + + /* Tree node information */ + printf("Size of cell: %zu bytes.\n", sizeof(struct cell)); + + /* Part information */ + printf("Size of part: %zu bytes.\n", sizeof(struct part)); + + /* Dump arguments. */ + if (fileName[0] == 0) { + message("Computing the N-body problem over %i random particles using %i " + "threads (%i runs).", + N, nr_threads, runs); + } else { + message("Computing the N-body problem over %i particles read from '%s' " + "using %i threads (%i runs).", + N, fileName, nr_threads, runs); + } + + /* Run the BH test. */ + test_bh(N, runs, fileName); +}