Commit 342961e3 authored by aidan

Added test_bh_4, which contains the final version of the Barnes-Hut code. Everything seems to work correctly.

parent 1614dd27
/*******************************************************************************
* This file is part of QuickSched.
* Copyright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
* Aidan Chalk (aidan.chalk@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* *****************************************************************************/
//#define EXACT
//#define ID
/* Config parameters. */
#include "../config.h"
/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <omp.h>
#include <fenv.h>
/* Local includes. */
extern "C"{
#include "quicksched.h"
#include "res.h"
}
#include "cuda_queue.h"
//#define double float
//#define double2 float2
//#define float double
//#define float2 double2
/** Task types. */
enum task_type {
task_type_self = 0,
task_type_pair,
task_type_pair_pc,
task_type_pair_pc_split,
task_type_com,
task_type_count
};
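/* A cell in the flat tree: loc_xy/loc_z is the lower corner, h the edge
 * length, parts the index of its first particle and count the number of
 * particles. firstchild/sibling are indices into the cell pool (-1 when
 * unset), res is the QuickSched resource and com_tid the COM task id. */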
struct cell{
double2 loc_xy;
double loc_z;
double h;
int count;
unsigned short int split, sorted;
int parts, firstchild, sibling;
int res, com_tid;
};
struct part{
#ifdef ID
int id;
#endif
double loc[3];
float a[3];
float m;
};
#define const_G 1
/* Required variables to obtain cells. */
#define cell_maxparts 128
#define CELL_STRETCH 2
#define INITIAL_CELLS 256
struct cell *cell_pool = NULL;
int used_cells=0;
int num_cells = 0;
int cell_size = INITIAL_CELLS*sizeof(struct cell);
/* Device locations for the particle values. */
__device__ double2 *com_xy;
__device__ double *com_z;
__device__ float *com_mass;
__device__ struct part *parts_cuda;
__device__ struct cell *cells;
__device__ int pc_calcs = 0;
/* Host locations for the particle values. */
double2 *com_xy_host;
double *com_z_host;
float *com_mass_host;
double2 *parts_pos_xy_temp;
double *parts_pos_z_temp;
float4 *parts_a_m_temp;
struct part *parts_host;
struct part *parts_temp;
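/* Double-precision atomicAdd implemented with a compare-and-swap loop;
 * native double atomicAdd only exists on devices of compute capability
 * 6.0 and above. */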
__device__ double atomicAdd(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
/* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
} while (assumed != old);
return __longlong_as_double(old);
}
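/**
 * @brief Direct particle-particle interactions between two distinct cells.
 *
 * Each thread strides over the particles of one cell, accumulates the
 * force from every particle of the other cell in registers, and flushes
 * the result with one atomicAdd per component; the second loop repeats
 * this with the roles of ci and cj swapped.
 */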
__device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
int i, j, k;
int count_i = ci->count, count_j = cj->count;
int parts_i = ci->parts, parts_j = cj->parts;
double xi[3];
float dx[3];
float ai[3], mj, r2, w, ir;
/* Loop over cell i.*/
for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
xi[0] = parts_cuda[i].loc[0];
xi[1] = parts_cuda[i].loc[1];
xi[2] = parts_cuda[i].loc[2];
for(k = 0; k < 3; k++) {
ai[k] = 0.0f;
}
for(j = parts_j; j < parts_j + count_j; j++) {
r2 = 0.0f;
dx[0] = xi[0] - parts_cuda[j].loc[0];
dx[1] = xi[1] - parts_cuda[j].loc[1];
dx[2] = xi[2] - parts_cuda[j].loc[2];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_cuda[j].m;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
}
atomicAdd(&parts_cuda[i].a[0], ai[0]);
atomicAdd(&parts_cuda[i].a[1], ai[1]);
atomicAdd(&parts_cuda[i].a[2], ai[2]);
}
/* Second pass: loop over cell j, accumulating forces from cell i. */
for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
xi[0] = parts_cuda[i].loc[0];
xi[1] = parts_cuda[i].loc[1];
xi[2] = parts_cuda[i].loc[2];
for(k = 0; k < 3; k++) {
ai[k] = 0.0f;
}
for(j = parts_i; j < parts_i + count_i; j++) {
r2 = 0.0f;
dx[0] = xi[0] - parts_cuda[j].loc[0];
dx[1] = xi[1] - parts_cuda[j].loc[1];
dx[2] = xi[2] - parts_cuda[j].loc[2];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_cuda[j].m;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
}
atomicAdd(&parts_cuda[i].a[0], ai[0]);
atomicAdd(&parts_cuda[i].a[1], ai[1]);
atomicAdd(&parts_cuda[i].a[2], ai[2]);
}
}
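/**
 * @brief Particle-monopole interaction: apply the centre of mass of cell
 * cj to every particle of the leaf, one particle per thread. The
 * pc_calcs counter is only used for statistics.
 */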
__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
int i;
int count = leaf->count;
int parts = leaf->parts;
int cell_j = cj - cells;
float r2;
float dx[3], ir, w;
if(threadIdx.x == 0)
atomicAdd(&pc_calcs, 1);
/* int leaf_num = leaf - cells;
int cjnum = cj - cells;
if(threadIdx.x == 0 && leaf_num == 55)
printf("Calculating with %d %i whose firstchild is %i and cj->split == %i\n", (cjnum), cj->firstchild, cj->split);*/
/* if(threadIdx.x == 0)
printf("leaf = %i\n", leaf - cells);*/
for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
r2 = 0.0;
dx[0] = com_xy[cell_j].x - parts_cuda[i].loc[0];
r2 += dx[0] * dx[0];
dx[1] = com_xy[cell_j].y - parts_cuda[i].loc[1];
r2 += dx[1] * dx[1];
dx[2] = com_z[cell_j] - parts_cuda[i].loc[2];
r2 += dx[2] * dx[2];
ir = rsqrtf(r2);
w = com_mass[cell_j] * const_G * ir * ir * ir;
/* if(threadIdx.x == 0)
printf("part.a = %e w*dx = %e \n", parts_cuda[i].a[0], w*dx[0]);*/
atomicAdd(&parts_cuda[i].a[0], w*dx[0]);
atomicAdd(&parts_cuda[i].a[1], w*dx[1]);
atomicAdd(&parts_cuda[i].a[2], w*dx[2]);
/* (*accels).x+= w*dx[0];
(*accels).y+= w*dx[1];
(*accels).z+= w*dx[2];*/
}
}
/**
* @brief Checks whether the cells are direct neighbours or not
*/
__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) {
float dx[3];
float cih = ci->h, cjh = cj->h;
float min_dist = 0.5*(cih+cjh);
float center_i = ci->loc_xy.x + 0.5*cih;
float center_j = cj->loc_xy.x + 0.5*cjh;
dx[0] = fabsf(center_i - center_j);
center_i = ci->loc_xy.y + 0.5*cih;
center_j = cj->loc_xy.y + 0.5*cjh;
dx[1] = fabsf(center_i - center_j);
center_i = ci->loc_z + 0.5*cih;
center_j = cj->loc_z + 0.5*cjh;
dx[2] = fabsf(center_i - center_j);
return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
/*__device__ __forceinline__ int are_neighbours_different_size(double cih, float ci_x, float ci_y, float ci_z, struct cell *cj) {
float dx_x, dx_y, dx_z;
double cjh = cj->h;
float min_dist = 0.5*(cih+cjh);
float center_i = ci_x + 0.5*cih;
float center_j = cj->loc_xy.x + 0.5*cjh;
dx_x = fabsf(center_i - center_j);
center_i = ci_y + 0.5*cih;
center_j = cj->loc_xy.y + 0.5*cjh;
dx_y = fabsf(center_i - center_j);
center_i = ci_z + 0.5*cih;
center_j = cj->loc_z + 0.5*cjh;
dx_z = fabsf(center_i - center_j);
return (dx_x <= min_dist) && (dx_y <= min_dist) && (dx_z <= min_dist);
}*/
__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) {
float dx[3];
float min_dist = ci->h;
float center_i = ci->loc_xy.x;
float center_j = cj->loc_xy.x;
dx[0] = fabsf(center_i - center_j);
center_i = ci->loc_xy.y;
center_j = cj->loc_xy.y;
dx[1] = fabsf(center_i - center_j);
center_i = ci->loc_z;
center_j = cj->loc_z;
dx[2] = fabsf(center_i - center_j);
return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
/*__device__ __forceinline__ int are_neighbours(float cih, float ci_x, float ci_y, float ci_z, struct cell *cj) {
float dx_x, dx_y, dx_z;
float center_j = cj->loc_xy.x;
dx_x = fabsf(ci_x - center_j);
center_j = cj->loc_xy.y;
dx_y = fabsf(ci_y - center_j);
center_j = cj->loc_z;
dx_z = fabsf(ci_z - center_j);
return (dx_x <= cih) && (dx_y <= cih) && (dx_z <= cih);
}*/
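/* A leaf lies inside c iff its particle range is contained in c's. */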
__device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count);
}
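/**
 * @brief Interact the particles of a cell with a list of multipoles,
 * accumulating all monopole contributions in registers before a single
 * atomicAdd per component.
 *
 * @param child The #cell whose particles are updated.
 * @param mpoles Indices of the cells acting as monopoles.
 * @param count2 Number of entries in mpoles.
 */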
__device__ __forceinline__ void make_interact_pc_new(struct cell *child, int *mpoles, int count2)
{
int i,j;
int count = child->count;
int parts = child->parts;
// int cell_j = cj - cells;
float r2;
float dx[3], ir, w;
float a[3];
for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
a[0] = 0.0;
a[1] = 0.0;
a[2] = 0.0;
for(j = 0; j < count2; j++){
r2 = 0.0;
dx[0] = com_xy[mpoles[j]].x - parts_cuda[i].loc[0];
r2 += dx[0] * dx[0];
dx[1] = com_xy[mpoles[j]].y - parts_cuda[i].loc[1];
r2 += dx[1] * dx[1];
dx[2] = com_z[mpoles[j]] - parts_cuda[i].loc[2];
r2 += dx[2] * dx[2];
ir = rsqrtf(r2);
w = com_mass[mpoles[j]] * const_G * ir * ir * ir;
a[0] += w*dx[0];
a[1] += w*dx[1];
a[2] += w*dx[2];
}
/* if(threadIdx.x == 0)
printf("part.a = %e w*dx = %e \n", parts_cuda[i].a[0], w*dx[0]);*/
atomicAdd(&parts_cuda[i].a[0], a[0]);
atomicAdd(&parts_cuda[i].a[1], a[1]);
atomicAdd(&parts_cuda[i].a[2], a[2]);
/* (*accels).x+= w*dx[0];
(*accels).y+= w*dx[1];
(*accels).z+= w*dx[2];*/
}
}
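/**
 * @brief For each child of ci, collect the children of cj that are not
 * direct neighbours (at most 8) and interact them as monopoles.
 */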
__device__ __forceinline__ void iact_pair_pc(struct cell *ci, struct cell *cj)
{
struct cell *cp, *cps;
int icells[8];
int count = 0;
/* Let's split both cells and build all possible non-neighbouring pairs */
for (cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
count = 0;
for (cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
if ( ! are_neighbours(cp, cps) ) {
icells[count] = cps - cells;
count++;
}
}
make_interact_pc_new(cp, icells, count);
}
}
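/**
 * @brief As iact_pair_pc, but for a single child of the first cell; the
 * split over the children is done on the host when the task is created.
 */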
__device__ __forceinline__ void iact_pair_pc_split(struct cell *child, struct cell *cj)
{
struct cell *cps;
int icells[8];
int count = 0;
for (cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
if(!are_neighbours(child, cps) ){
icells[count] = cps - cells;
count++;
}
}
make_interact_pc_new(child, icells, count);
}
#ifdef OLD
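/* Legacy single-leaf tree walk, kept for reference; note it still calls
 * the old three-argument make_interact_pc and will not compile if OLD is
 * defined. */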
__device__ __forceinline__ void iact_self_pc(struct cell *c, struct cell *leaf)
{
struct cell *ci, *cj, *cp, *cps;
float leafh = leaf->h;
float3 accelerations;
int i;
if(threadIdx.x < leaf->count)
{
accelerations.x = 0.0f;
accelerations.y = 0.0f;
accelerations.z = 0.0f;
while(c != leaf)
{
ci = NULL;
/* Loop over children of c. */
for(cj = &cells[c->firstchild]; cj != &cells[c->sibling]; cj = &cells[cj->sibling])
{
if(ci == NULL && is_inside(leaf, cj))
{
ci = cj;
}else{
//Depth first search of cj.
cp = cj;
while(cp != &cells[cj->sibling])
{
if(!are_neighbours(cp, leaf))
{
// if(!cp->split)
make_interact_pc(leaf, cp, &accelerations);
// else
// for(cps = &cells[cp->firstchild]; cps != &cells[cp->sibling]; cps = &cells[cps->sibling])
// make_interact_pc(leaf, cps, &accelerations);
cp = &cells[cp->sibling];
}else{
if(cp->split && cp->h > leafh)
cp = &cells[cp->firstchild];
else
cp = &cells[cp->sibling];
}
}
}
}
c = ci;
}
int parts = leaf->parts;
int count = leaf->count;
for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
atomicAdd( &parts_cuda[i].a[0] , accelerations.x);
atomicAdd( &parts_cuda[i].a[1] , accelerations.y);
atomicAdd( &parts_cuda[i].a[2] , accelerations.z);
}
}
}
#endif
/**
* @brief Checks whether the cells are direct neighbours or not. Both cells have
* to be of the same size
*/
static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
// int k;
float dx[3];
#ifdef SANITY_CHECKS
if (ci->h != cj->h)
error(" Cells of different size in distance calculation.");
#endif
/* Maximum allowed distance */
float min_dist = ci->h;
/* (Manhattan) Distance between the cells */
double2 loc1=ci->loc_xy, loc2=cj->loc_xy;
float center_i = loc1.x;
float center_j = loc2.x;
dx[0] = fabs(center_i - center_j);
center_i = loc1.y;
center_j = loc2.y;
dx[1] = fabs(center_i - center_j);
center_i = ci->loc_z;
center_j = cj->loc_z;
dx[2] = fabs(center_i - center_j);
return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
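/**
 * @brief Fetch a new cell from the global pool, growing the pool and the
 * matching centre-of-mass arrays by CELL_STRETCH when it is exhausted.
 */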
struct cell *cell_get()
{
if(num_cells == 0)
{
cell_pool = (struct cell*) calloc(INITIAL_CELLS, sizeof(struct cell));
if(cell_pool == NULL)
error("Failed to allocate cell_pool");
com_xy_host = (double2*) calloc(INITIAL_CELLS, sizeof(double2));
if(com_xy_host == NULL)
error("Failed to allocate cell_pool");
com_z_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
if(com_z_host == NULL)
error("Failed to allocate cell_pool");
com_mass_host = (float*) calloc(INITIAL_CELLS, sizeof(float));
if(com_mass_host == NULL)
error("Failed to allocate cell_pool");
num_cells = INITIAL_CELLS;
}
if(used_cells >= num_cells)
{
/* Stretch */
struct cell *new_pool;
cell_size *= CELL_STRETCH;
new_pool = (struct cell*) calloc(num_cells*CELL_STRETCH, sizeof(struct cell));
if(new_pool == NULL)
error("Failed to allocate new_pool");
if(cell_pool != NULL)
memcpy(new_pool, cell_pool, num_cells*sizeof(struct cell));
double2 *tempxy = (double2*) calloc(num_cells*CELL_STRETCH, sizeof(double2));
if(tempxy == NULL)
error("Failed to allocate tempxy");
memcpy(tempxy, com_xy_host, sizeof(double2)*num_cells);
free(com_xy_host);
com_xy_host = tempxy;
double *tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
if(tempz == NULL)
error("Failed to allocate tempz");
memcpy(tempz, com_z_host, num_cells*sizeof(double));
free(com_z_host);
com_z_host = tempz;
float *tempm = (float*) calloc(num_cells*CELL_STRETCH, sizeof(float));
if(tempm == NULL)
error("Failed to allocate tempm");
memcpy(tempm, com_mass_host, num_cells*sizeof(float));
free(com_mass_host);
com_mass_host = tempm;
num_cells *= CELL_STRETCH;
free(cell_pool);
cell_pool = new_pool;
message("Increased size of arrays");
}
used_cells++;
cell_pool[used_cells-1].sibling = -1;
cell_pool[used_cells-1].firstchild = -1;
cell_pool[used_cells-1].res = qsched_res_none;
return &cell_pool[used_cells-1];
}
void comp_com(struct cell *c){
int k, count = c->count;
int cpi;
struct cell *cp;
int parts = c->parts;
double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
if(c->split) {
for(cp = &cell_pool[(cpi = c->firstchild)]; cpi != c->sibling; cp = &cell_pool[(cpi = cp->sibling)]) {
float cp_mass = com_mass_host[cpi];
com[0] += com_xy_host[cpi].x * cp_mass;
com[1] += com_xy_host[cpi].y * cp_mass;
com[2] += com_z_host[cpi] * cp_mass;
mass += cp_mass;
}
/* Otherwise collect the multipole from the particles */
} else {
for(k = parts; k < parts+count; k++)
{
float p_mass = parts_host[k].m;
com[0] += parts_host[k].loc[0] * p_mass;
com[1] += parts_host[k].loc[1] * p_mass;
com[2] += parts_host[k].loc[2] * p_mass;
mass += p_mass;
}
}
k = c - cell_pool;
/* Store the COM data, if it was collected. */
if(mass > 0.0f) {
float imass = 1.0f/mass;
com_xy_host[k].x = com[0] * imass;
com_xy_host[k].y = com[1] * imass;
com_z_host[k] = com[2] * imass;
com_mass_host[k] = mass;
}else
{
com_xy_host[k].x = 0.0;
com_xy_host[k].y = 0.0;
com_z_host[k] = 0.0;
com_mass_host[k] = 0.0f;
}
}
/**
* @brief Sort the parts into eight bins along the given pivots and
* fill the multipoles. Also adds the hierarchical resources
* to the sched (TODO).
*
* @param c Index in cell_pool of the cell to be split.
* @param s The #qsched in which to store the resources.
*/
void cell_split(int c, struct qsched *s) {
int i, j, k, kk, count = cell_pool[c].count;
int parts = cell_pool[c].parts;
struct part temp_part;
struct cell *cp, *cell;
int left[8], right[8];
double pivot[3];
static int root = -1;
int progenitors[8];
struct part *temp_xy;
/* Set the root cell. */
if (root < 0) {
root = c;
cell_pool[c].sibling = -1;
}
if(cell_pool[c].res == qsched_res_none)
{
if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[c].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
sizeof(struct part) * cell_pool[c].count, parts_temp + cell_pool[c].parts);
}
if(count > cell_maxparts )
{
cell_pool[c].split = 1;
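/* Create the eight progeny: bits 2, 1 and 0 of k select the upper half
 * of the parent along x, y and z respectively. cell is re-fetched each
 * iteration since cell_get() may reallocate the pool. */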
for(k = 0; k < 8; k++)
{
progenitors[k] = (cp = cell_get()) - cell_pool;
cell = &cell_pool[c];
cp->loc_xy = cell->loc_xy;
cp->loc_z = cell->loc_z;
cp->h = cell->h*0.5;
if(k & 4) cp->loc_xy.x += cp->h;
if(k & 2) cp->loc_xy.y += cp->h;
if(k & 1) cp->loc_z += cp->h;
}
/* Init the pivots.*/
pivot[0] = cell->loc_xy.x + cell->h * 0.5;
pivot[1] = cell->loc_xy.y + cell->h * 0.5;
pivot[2] = cell->loc_z + cell->h * 0.5;
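/* The particle range is now partitioned in place, Hoare-style, along x,
 * then y, then z, yielding the eight octant sub-ranges in left/right. */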
/* Split along the x axis. */
i = parts;
j = parts+count-1;
while(i < j)
{
while(i <= parts+count-1 && parts_host[i].loc[0] < pivot[0]) i += 1;
while(j >= parts && parts_host[j].loc[0] >= pivot[0]) j -= 1;
if(i < j){
temp_part = parts_host[i];
parts_host[i] = parts_host[j];
parts_host[j] = temp_part;
}
}
left[1] = i;
right[1] = parts+count-1;
left[0] = parts;
right[0] = j;
/* Split along the y axis twice. */
for (k = 1; k >= 0; k--) {
i = left[k];
j = right[k];
while(i <= j){
while(i <= right[k] && parts_host[i].loc[1] < pivot[1]) i += 1;
while(j >= left[k] && parts_host[j].loc[1] >= pivot[1]) j -= 1;
if(i < j)
{
temp_part = parts_host[i];
parts_host[i] = parts_host[j];
parts_host[j] = temp_part;
}
}
left[2*k+1] = i;
right[2*k+1] = right[k];
left[2*k] = left[k];
right[2*k] = j;
}
/* Split along the z axis four times.*/
for(k = 3; k >=0; k--)
{
i = left[k];
j = right[k];
while(i <= j){
while(i <= right[k] && parts_host[i].loc[2] < pivot[2]) i += 1;
while(j >= left[k] && parts_host[j].loc[2] >= pivot[2]) j -= 1;
if(i < j)
{
temp_part = parts_host[i];
parts_host[i] = parts_host[j];
parts_host[j] = temp_part;
}
}
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
}
/* Store the counts and offsets. */
for(k = 0; k < 8; k++)
{
cell_pool[progenitors[k]].count = right[k]-left[k]+1;
cell_pool[progenitors[k]].parts = left[k];
if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy,
sizeof(struct part) * cell_pool[progenitors[k]].count, parts_temp + cell_pool[progenitors[k]].parts);
}
/* Find the first non-empty progenitor */
for(k = 0; k < 8; k++)
{
if(cell_pool[progenitors[k]].count > 0)
{
cell->firstchild = progenitors[k];
break;
}
}
#ifdef SANITY_CHECKS
if(cell->firstchild == -1)
error("Cell has been split but all children have 0 parts");
#endif
/*Prepare the pointers*/
for(k = 0; k < 8; k++)
{
/* Find the next non-empty sibling */
for(kk = k+1; kk < 8; ++kk){
if(cell_pool[progenitors[kk]].count > 0){
cell_pool[progenitors[k]].sibling = progenitors[kk];
break;
}
}
/* No non-empty sibling, go back a level.*/
if(kk == 8) cell_pool[progenitors[k]].sibling = cell->sibling;
}
/* Recurse */
for(k = 0; k < 8; k++)
if(cell_pool[progenitors[k]].count > 0) cell_split(progenitors[k], s);
/* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */
} /*else {
// struct cell *data[2] = {root, c};
int data[2] = {root, c};
int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
2 * sizeof(int), 3000);
qsched_addlock(s, tid, cell_pool[c].res);
}*/
#ifndef COM_AS_TASK
comp_com(&cell_pool[c]);
#endif
}
/**
* @brief Create the tasks for the cell pair/self.
*
* @param s The #sched in which to create the tasks.
* @param ci The first #cell.
* @param cj The second #cell.
*/
void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
qsched_task_t tid;
int data[2];
struct cell *cp, *cps;
#ifdef SANITY_CHECKS
/* If either cell is empty, stop. */
if (ci->count == 0 || (cj != NULL && cj->count == 0)) error("Empty cell !");
#endif
/* Single cell? */
if (cj == NULL) {
/* Is this cell split and above the task limit ? */
if (ci->split /*&& ci->count > task_limit / ci->count*/) {
/* Loop over each of this cell's progeny. */
for (cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]) {
/* Make self-interaction task. */
create_tasks(s, cp, NULL);
/* Make all pair-interaction tasks. */
for (cps = &cell_pool[cp->sibling]; cps != &cell_pool[ci->sibling]; cps = &cell_pool[cps->sibling])
create_tasks(s, cp, cps);
}
/* Otherwise, add a self-interaction task. */
} else {
/* Set the data. */
data[0] = ci -cell_pool;
data[1] = -1;
/* Create the task. */
tid = qsched_addtask(s, task_type_self, task_flag_none, data,
sizeof(int) * 2, ci->count * ci->count / 2);
/* Add the resource (i.e. the cell) to the new task. */
qsched_addlock(s, tid, ci->res);
/* Since this call might recurse, add a dependency on the cell's COM
task. */
#ifdef COM_AS_TASK
if (ci->split) qsched_addunlock(s, ci->com_tid, tid);
#endif
}
/* Otherwise, it's a pair. */
} else {
/* Are both cells split and we are above the task limit ? */
if (ci->split && cj->split ) {
/* Let's split both cells and build all possible pairs */
for (cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]) {
for (cps = &cell_pool[cj->firstchild]; cps != &cell_pool[cj->sibling]; cps = &cell_pool[cps->sibling]) {
/* Recurse */
if (are_neighbours_host(cp, cps)) {
create_tasks(s, cp, cps);
}
}
}
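/* The per-child particle-monopole path below is currently disabled
 * (note the 0 &&); all non-neighbouring pairs go through the plain
 * task_type_pair_pc tasks in the else branch. */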
if( 0 && ci->count > 64*cell_maxparts)
{
/* Let's also build a particle-monopole task */
for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]){
/* Create the task. */
data[0] = cp-cell_pool;
data[1] = cj-cell_pool;
tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data,
sizeof(int) * 2, ci->count + cj->count);
/* Add the resource and dependance */
qsched_addlock(s, tid, cp->res);
// qsched_addlock(s, tid, cj->res);
}
for(cp = &cell_pool[cj->firstchild]; cp != &cell_pool[cj->sibling]; cp = &cell_pool[cp->sibling]){
/* Create the task. */
data[0] = cp-cell_pool;
data[1] = ci-cell_pool;
tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data,
sizeof(int) * 2, ci->count + cj->count);
/* Add the resource and dependance */
qsched_addlock(s, tid, cp->res);
// qsched_addlock(s, tid, cj->res);
}
}else
{
data[0] = ci -cell_pool;
data[1] = cj - cell_pool;
tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count + cj->count);
/* Add the resource and dependance */
qsched_addlock(s, tid, ci->res);
data[0] = cj - cell_pool;
data[1] = ci - cell_pool;
tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count + cj->count);
qsched_addlock(s, tid, cj->res);
}
#ifdef COM_AS_TASK
qsched_addunlock(s, cj->com_tid, tid);
qsched_addunlock(s, ci->com_tid, tid);
#endif
} else { /* Otherwise, at least one of the cells is not split, build a direct
* interaction. */
/* Set the data. */
data[0] = ci-cell_pool;
data[1] = cj-cell_pool;
/* Create the task. */
tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
sizeof(int) * 2, ci->count * cj->count);
/* Add the resources. */
qsched_addlock(s, tid, ci->res);
qsched_addlock(s, tid, cj->res);
}
} /* Otherwise it's a pair */
}
/**
* @brief Compute the interactions between all particles in a cell.
*
* @param cellID The cell ID to compute interactions on.
*/
__device__ __forceinline__ void iact_self_direct(int cellID) {
struct cell *c = &cells[cellID];
double xi[3] = {0.0, 0.0, 0.0};
float ai[3] = {0.0, 0.0, 0.0 }, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
int parts;
int count;
int i,j,k;
parts = c->parts;
count = c->count;
for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
{
xi[0] = parts_cuda[i].loc[0];
xi[1] = parts_cuda[i].loc[1];
xi[2] = parts_cuda[i].loc[2];
for(k = 0; k < 3; k++) {
ai[k] = 0.0;
}
for(j = parts; j < parts+count; j++)
{
if(i != j){
/* Compute the pairwise distance. */
r2 = 0.0;
dx[0] = xi[0] - parts_cuda[j].loc[0];
r2 += dx[0]*dx[0];
dx[1] = xi[1] - parts_cuda[j].loc[1];
r2 += dx[1]*dx[1];
dx[2] = xi[2] - parts_cuda[j].loc[2];
r2 += dx[2]*dx[2];
/* Apply the gravitational acceleration. */
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_cuda[j].m;
for(k = 0; k < 3; k++) {
ai[k] -= w * dx[k] * mj;
}
}
}
//Update.
atomicAdd(&parts_cuda[i].a[0],ai[0] );
atomicAdd(&parts_cuda[i].a[1],ai[1] );
atomicAdd(&parts_cuda[i].a[2],ai[2] );
}
}
#ifdef EXACT
__global__ void interact_exact(int count){
int number = blockIdx.x * blockDim.x + threadIdx.x;
int i, j, k;
double dx[3], ir, r2, w;
if(number < count)
{
double pix[3] = {parts_cuda[number].loc[0], parts_cuda[number].loc[1], parts_cuda[number].loc[2]};
float mi = parts_cuda[number].m;
for(j = 0; j < count; j++)
{
if(j != number)
{
for (r2 = 0.0, k = 0; k < 3; k++) {
dx[k] = parts_cuda[j].loc[k] - pix[k];
r2 += dx[k] * dx[k];
}
ir = rsqrt(r2);
w = const_G * ir * ir * ir;
for(k = 0; k < 3; k++)
{
atomicAdd(&parts_cuda[number].a[k] , w * dx[k] * parts_cuda[j].m);
}
}
}
}
}
#endif
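/**
 * @brief Device-side task dispatch: decode the two cell indices stored in
 * the task payload and call the matching interaction function.
 */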
__device__ __forceinline__ void runner( int type , void *data ) {
int *idata = (int *)data;
int i = idata[0];
int j = idata[1];
switch ( type ) {
case task_type_self:
iact_self_direct(i);
break;
case task_type_pair:
iact_pair_direct(&cells[i], &cells[j]);
break;
case task_type_pair_pc:
iact_pair_pc( &cells[i], &cells[j] );
break;
case task_type_pair_pc_split:
iact_pair_pc_split(&cells[i], &cells[j]);
break;
default:
printf("Got to default?\n");
asm("trap;");
}
__syncthreads();
}
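/* Device-resident pointer to the runner; copied back to the host in
 * test_bh() and passed to qsched_run_CUDA(). */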
__device__ qsched_funtype function = runner;
int calcMaxDepth(struct cell *c, int depth)
{
struct cell *cp;
if(c->split)
{
int maxd = 0, temp;
for(cp = &cell_pool[c->firstchild]; cp != &cell_pool[c->sibling]; cp = &cell_pool[cp->sibling])
{
temp = calcMaxDepth(cp, depth+1);
if(temp > maxd)
maxd = temp;
}
return maxd;
}else{
return depth+1;
}
}
qsched_funtype func;
/**
* @brief Set up and run a task-based Barnes-Hut N-body solver.
*
* @param N The number of random particles to use.
* @param runs Number of force evaluations to use as a benchmark.
* @param fileName Input file name. If @c NULL or an empty string, random
* particle positions will be used.
*/
void test_bh(int N, int runs, char *fileName) {
int i, k;
struct cell *root;
FILE *file;
struct qsched s;
struct cell *gpu_ptr_cells;
double2 *com_temp;
double *comz_temp;
float *comm_temp;
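/* Force CUDA context creation and prefer L1 cache over shared memory. */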
cudaFree(0);
cudaThreadSetCacheConfig(cudaFuncCachePreferL1);
if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
error("Failed to copy function pointer from device");
/* Initialize the scheduler. */
qsched_init(&s, 1, qsched_flag_none);
//Create host particle arrays.
if( cudaMallocHost(&parts_host, sizeof(struct part) * N) != cudaSuccess)
error("Failed to allocate parts array");
if( cudaMalloc(&parts_temp, sizeof(struct part) * N) != cudaSuccess)
error("Failed to allocate device parts array");
if( cudaMemcpyToSymbol(parts_cuda, &parts_temp, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to set device symbol for parts array");
if(fileName == NULL || fileName[0] == 0) {
for(k = 0; k < N; k++) {
parts_host[k].loc[0] = ((double)rand())/ RAND_MAX;
parts_host[k].loc[1] = ((double)rand())/ RAND_MAX;
parts_host[k].loc[2] = ((double)rand())/ RAND_MAX;
parts_host[k].m = ((double)rand()) / RAND_MAX;
}
} else {
file = fopen(fileName, "r");
#ifndef ID
int tempxy;
#endif
double temp;
printf("Reading input from file\n");
if(file) {
for(k = 0; k < N; k++) {
#ifdef ID
if(fscanf(file, "%d", &parts_host[k].id) != 1)
#else
if(fscanf(file, "%d", &tempxy) != 1)
#endif
error("Failed to read ID");
if(fscanf(file, "%f", &parts_host[k].m) != 1)
error("Failed to read mass of part %i.", k);
if(fscanf(file, "%lf%lf%lf", &parts_host[k].loc[0], &parts_host[k].loc[1], &parts_host[k].loc[2]) != 3)
error("Failed to read position of part %i.", k);
if(fscanf(file,"%lf %lf %lf %lf %lf %lf %lf %lf %lf", &temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp ) != 9)
error("Failed to read extra stuff");
}
fclose(file);
} else {
error("Failed to open input file '%s'.", fileName);
}
}
/* Init the cells. */
root = cell_get();
root->loc_xy.x = 0.0;
root->loc_xy.y = 0.0;
root->loc_z = 0.0;
root->h = 1.0;
root->count = N;
root->parts = 0;
root->sibling = -1;
int c = root-cell_pool;
cell_split(root - cell_pool, &s);
root = &cell_pool[c];
int nr_leaves = 0;
int maxparts=0, minparts=1000000;
int number = 0;
while(c >= 0) {
if(cell_pool[c].count > 0)
{
number++;
if(cell_pool[c].res == qsched_res_none)
message("cell %i has no res", c);
}
if(!cell_pool[c].split) {
nr_leaves++;
if(cell_pool[c].count > maxparts)
{
maxparts = cell_pool[c].count;
}
if(cell_pool[c].count < minparts)
{
minparts = cell_pool[c].count;
}
c = cell_pool[c].sibling;
} else {
c = cell_pool[c].firstchild;
}
}
message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
create_tasks(&s, root, NULL);
int self = 0, pair = 0, pc = 0;
for(k = 0; k < s.count; k++)
{
if(s.tasks[k].type == task_type_self)
self++;
else if (s.tasks[k].type == task_type_pair)
pair++;
else if (s.tasks[k].type >= 0)
pc++;
}
message("total number of tasks: %i.", s.count);
message("total number of pair tasks: %i.", pair);
message("total number of self tasks: %i.", self);
message("total number of pc tasks: %i.", pc);
message("total number of cells: %i.", number);
message("total number of deps: %i.", s.count_deps);
message("total number of res: %i.", s.count_res);
message("total number of locks: %i.", s.count_locks);
for(k = 0; k < runs; k++) {
for(i = 0; i < N; ++i) {
parts_host[i].a[0] = 0.0;
parts_host[i].a[1] = 0.0;
parts_host[i].a[2] = 0.0;
}
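/* NB: the device buffers allocated below are re-created on every run and
 * never freed, so the loop leaks device memory when runs > 1. */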
/* Copy the cells to the device. */
if( cudaMalloc( &gpu_ptr_cells , sizeof(struct cell) * used_cells) != cudaSuccess)
error("Failed to allocate cells on the GPU");
if( cudaMemcpy( gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy cells to the GPU");
if( cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell*), 0, cudaMemcpyHostToDevice) != cudaSuccess )
error("Failed to copy cell pointer to the GPU");
if(cudaMalloc( &com_temp, sizeof(double2) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
if(cudaMalloc( &comz_temp, sizeof(double) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
printf("Max depth is %i\n", calcMaxDepth(root, 0));
qsched_run_CUDA( &s , func );
qsched_print_cuda_timers(&s);
int pcs;
if( cudaMemcpyFromSymbol( &pcs, pc_calcs, sizeof(int), 0, cudaMemcpyDeviceToHost) != cudaSuccess)
error("Failed");
printf("pc calcs = %i\n", pcs);
/*struct task* tasks = qsched_get_timers( &s , s.count );
for(i = 0; i < s.count; i++)
{
printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
// printf("\n");
}*/
}
#ifdef EXACT
struct part *parts2;
parts2 = (struct part*) malloc(sizeof(struct part) * N );
for(k = 0; k < N; k++)
{
parts2[k].loc[0] = parts_host[k].loc[0];
parts2[k].loc[1] = parts_host[k].loc[1];
parts2[k].loc[2] = parts_host[k].loc[2];
parts2[k].m = parts_host[k].m;
parts2[k].a[0] = 0.0f;
parts2[k].a[1] = 0.0f;
parts2[k].a[2] = 0.0f;
}
// cudaDeviceReset();
struct part *parts_exact;
if(cudaMalloc(&parts_exact, sizeof(struct part) * N) != cudaSuccess)
error("Couldn't allocate parts_exact");
if(cudaMemcpy(parts_exact, parts2, sizeof(struct part) * N, cudaMemcpyHostToDevice) != cudaSuccess)
error("Couldn't copy part data");
if(cudaMemcpyToSymbol(parts_cuda, &parts_exact, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy part data");
int numblocks = N / 128 + 1;
double itpms = 1000.0 / CPU_TPS;
ticks tic, toc_run ;
tic = getticks();
interact_exact<<<numblocks, 128>>>(N);
cudaDeviceSynchronize();
toc_run = getticks();
printf("Cuda kernel took %.3f ms to run exact solution\n", ((double)(toc_run - tic)) * itpms);
if( cudaMemcpy( parts2, parts_exact, sizeof(struct part) * N, cudaMemcpyDeviceToHost) != cudaSuccess)
error("Failed to copy data back from device");
printf("%e\n", parts_host[0].m);
k = 0;
printf("%e, %e, %e, %e, %e, %e, %e, %e, %e, %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
#endif
/* Dump the particles to a file */
file = fopen("particle_dump.dat", "w");
/* fprintf(file,
"# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
"a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
#ifdef EXACT
printf("Printing exact\n");
for(k = 0; k < N; ++k)
fprintf(file, "%i %e %e %e %e %e %e %e %e %e %e %e %e %e\n",
#ifdef ID
parts_host[k].id,
#else
k,
#endif
parts_host[k].m,parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts_host[k].a[0],
parts_host[k].a[1], parts_host[k].a[2]);
#else
for (k = 0; k < N; ++k)
fprintf(file, "%i %e %e %e %e %e %e %e\n",
k, parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
#endif
fclose(file);
file = fopen("particle_pos.dat", "w");
fprintf(file, "m x[1] x[2] x[3]\n");
for(k = 0; k < N; k++)
fprintf(file, "%e %e %e %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2]);
fclose(file);
cudaFreeHost(parts_host);
//free(parts_host);
}
int main(int argc, char *argv[]) {
int c, nr_threads = 0;
int N = 1000, runs = 1;
char fileName[100] = {0};
/* Parse the options */
while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) {
case 'n':
if (sscanf(optarg, "%d", &N) != 1)
error("Error parsing number of particles.");
break;
case 'r':
if (sscanf(optarg, "%d", &runs) != 1)
error("Error parsing number of runs.");
break;
case 't':
if (sscanf(optarg, "%d", &nr_threads) != 1)
error("Error parsing number of threads.");
break;
case 'f':
if (sscanf(optarg, "%s", &fileName[0]) != 1)
error("Error parsing file name.");
break;
case '?':
fprintf(stderr, "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file] "
"[-c Nparts] [-i Niterations] \n",
argv[0]);
fprintf(stderr, "Solves the N-body problem using a Barnes-Hut\n"
"tree code with N random particles read from a file in "
"[0,1]^3 using"
"nr_threads threads.\n"
"A test of the neighbouring cells interaction with "
"Nparts per cell is also run Niterations times.\n");
exit(EXIT_FAILURE);
}
/* Tree node information */
printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
/* Part information */
printf("Size of part: %zu bytes.\n", sizeof(struct part));
/* Dump arguments. */
if (fileName[0] == 0) {
message("Computing the N-body problem over %i random particles using %i "
"threads (%i runs).",
N, nr_threads, runs);
} else {
message("Computing the N-body problem over %i particles read from '%s' "
"using %i threads (%i runs).",
N, fileName, nr_threads, runs);
}
/* Run the BH test. */
test_bh(N, runs, fileName);
}