diff --git a/examples/test_bh.cu b/examples/test_bh.cu new file mode 100644 index 0000000000000000000000000000000000000000..f269cd47bf8de54327dbac53a1acef6fbef05821 --- /dev/null +++ b/examples/test_bh.cu @@ -0,0 +1,1294 @@ +/******************************************************************************* + * This file is part of QuickSched. + * Coypright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk), + * Matthieu Schaller (matthieu.schaller@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * +* *****************************************************************************/ + +/* Config parameters. */ +#include "../config.h" + +/* Standard includes. */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <math.h> +#include <float.h> +#include <limits.h> +#include <omp.h> +#include <fenv.h> + + + +/* Local includes. */ +extern "C"{ +#include "quicksched.h" +#include "res.h" +#include <cblas.h> +} +#include "cuda_queue.h" + +/* Some local constants. */ +#define cell_pool_grow 1000 +#define cell_maxparts 128 +#define task_limit 1e8 +#define const_G 1 // 6.6738e-8 +#define dist_min 0.5 /* Used for legacy walk only */ +#define dist_cutoff_ratio 1.5 + +#define ICHECK -1 +#define NO_SANITY_CHECKS +#define NO_COM_AS_TASK +#define NO_COUNTERS + +/** Data structure for the particles. */ +struct part { + double x[3]; + // union { + float a[3]; + float a_legacy[3]; + float a_exact[3]; + // }; + float mass; + int id; +}; // __attribute__((aligned(64))); + +struct multipole { + double com[3]; + float mass; +}; + +struct cell_device{ + double loc[3]; + double h; + int count; + unsigned short int split,sorted; + int part_loc; /*Index of particles in part array. */ + int first_child; /* Index of first child in cell array. */ + int sibling; /* Next node.*/ + struct multipole mpole; + int res, com_tid; +} + +/** Data structure for the BH tree cell. */ +struct cell { + double loc[3]; + double h; + + int count; + unsigned short int split, sorted; + struct part *parts; + + struct cell *firstchild; /* Next node if opening */ + struct cell *sibling; /* Next node */ + +/** Global variable for the pool of allocated cells. */ + + /* We keep both CoMs and masses to make sure the comp_com calculation is + * correct (use an anonymous union to keep variable names compact). */ +// union { + + /* Information for the legacy walk */ +// struct multipole legacy; + + /* Information for the QuickShed walk */ + struct multipole mpole; +// }; + + int res, com_tid; + struct index *indices; + +} __attribute__((aligned(128))); +struct cell *cell_pool = NULL; + +/** Task types. 
*/ +enum task_type { + task_type_self = 0, + task_type_pair, + task_type_self_pc, + task_type_com, + task_type_count +}; + +__device__ struct part *parts; + + +#ifdef TODO +__device__ double *part_x1; +__device__ double *part_x2; +__device__ double *part_x3; +__device__ float *part_a1; +__device__ float *part_a2; +__device__ float *part_a3; +__device__ float *part_mass; +#endif + + +#define cuda_error(s, ...) {if(threadIdx.x == 0){fprintf( stderr , "%s:%s():%i: " s "\n" , __FILE__ , __FUNCTION__ , __LINE__ , ##__VA_ARGS__ );} __syncthreads(); asm("trap\n"); } + + +__device__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = + (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +__device__ inline double __shfl_down_dbl(double var, unsigned int srcLane, int width) +{ + int2 a = *reinterpret_cast<int2*>(&var); + a.x = __shfl_down(a.x, srcLane, width); + a.y = __shfl_down(a.y, srcLane, width); + return *reinterpret_cast<double*>(&a); +} + +__device__ inline void reduceSumDouble( double* value) +{ + for(int mask = 16; mask > 0; mask >>= 1) + *value += __shfl_down_dbl( *value, mask, 32); +} + +__device__ inline void reduceSum( float* value ) +{ + #pragma unroll + for(int mask = 16 ; mask > 0 ; mask >>= 1) + *value += __shfl_down((float)*value, mask); + + // *value = __shfl((float)*value, 0); +} + +/** + * @brief Checks whether the cells are direct neighbours ot not + */ +__device__ __forceinline__ int are_neighbours_different_size_CUDA(struct cell *ci, + struct cell *cj) { + + int k; + float dx[3]; + double cih = ci->h, cjh = cj->h; + + /* Maximum allowed distance */ + float min_dist = 0.5 * (cih + cjh); + + /* (Manhattan) Distance between the cells */ + for (k = 0; k < 3; k++) { + float center_i = ci->loc[k] + 0.5 * cih; + float center_j = cj->loc[k] + 0.5 * cjh; + dx[k] = fabsf(center_i - center_j); + } + + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} +/** + * @brief Compute the center of mass of a given cell. + * + * @param c The #cell. + */ +__device__ void comp_com_CUDA(struct cell *c) { + + int k, count = c->count; + struct cell *cp; + struct part *parts = c->parts; + double com[3] = {0.0, 0.0, 0.0}; + float mass = 0.0; + + if (c->split) { + + /* Loop over the projenitors and collect the multipole information. */ + if(threadIdx.x == 0) + { + for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) { + float cp_mass = cp->mpole.mass; + com[0] += cp->mpole.com[0] * cp_mass; + com[1] += cp->mpole.com[1] * cp_mass; + com[2] += cp->mpole.com[2] * cp_mass; + mass += cp_mass; + } + /* Store the COM data, if any was collected. */ + if (mass > 0.0) { + float imass = 1.0f / mass; + c->mpole.com[0] = com[0] * imass; + c->mpole.com[1] = com[1] * imass; + c->mpole.com[2] = com[2] * imass; + c->mpole.mass = mass; + } else { + c->mpole.com[0] = 0.0; + c->mpole.com[1] = 0.0; + c->mpole.com[2] = 0.0; + c->mpole.mass = 0.0; + } + } + /* Otherwise, collect the multipole from the particles. 
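Each thread accumulates a strided partial sum over its share of the particles, the warp-shuffle reductions below combine the partials within each warp, and lane 0 of every warp commits its result to the cell's multipole with atomicAdd.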
*/ + } else { + + for (k = threadIdx.x; k < count; k+= blockDim.x) { + float p_mass = parts[k].mass; + com[0] += parts[k].x[0] * p_mass; + com[1] += parts[k].x[1] * p_mass; + com[2] += parts[k].x[2] * p_mass; + mass += p_mass; + } + reduceSumDouble(com); + reduceSumDouble(&com[1]); + reduceSumDouble(&com[2]); + reduceSum(&mass); + if(threadIdx.x % 32 == 0) + { + if (mass > 0.0) { + float imass = 1.0f / mass; + atomicAdd( &c->mpole.com[0] , com[0] * imass); + atomicAdd( &c->mpole.com[1] , com[1] * imass); + atomicAdd( &c->mpole.com[2] , com[2] * imass); + atomicAdd( &c->mpole.mass , mass); + } else if(threadIdx.x == 0) { + c->mpole.com[0] = 0.0; + c->mpole.com[1] = 0.0; + c->mpole.com[2] = 0.0; + c->mpole.mass = 0.0; + } + } + } + +} + + +/** + * @brief Checks whether the cells are direct neighbours ot not. Both cells have + * to be of the same size + */ +__device__ __forceinline__ int are_neighbours_CUDA(struct cell *ci, struct cell *cj) { + + int k; + float dx[3]; + +#ifdef SANITY_CHECKS + if (ci->h != cj->h) + error(" Cells of different size in distance calculation."); +#endif + + /* Maximum allowed distance */ + float min_dist = ci->h; + + /* (Manhattan) Distance between the cells */ + for (k = 0; k < 3; k++) { + float center_i = ci->loc[k]; + float center_j = cj->loc[k]; + dx[k] = fabsf(center_i - center_j); + } + + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} + +/** + * @brief Compute the interactions between all particles in a cell + * and all particles in the other cell (N^2 algorithm). + * Only applies the forces to cell ci. + * + * @param ci The #cell containing the particles. + * @param cj The #cell containing the other particles + */ +__device__ __forceinline__ void iact_pair_direct_CUDA(struct cell *ci, struct cell *cj) +{ + int i, j, k; + int count_i = ci->count, count_j = cj->count; + struct part *parts_i = ci->parts, *parts_j = cj->parts; + double xi[3]; + float dx[3], ai[3], r2, w, ir; + +#ifdef SANITY_CHECKS + + /* Bad stuff will happen if cell sizes are different */ + if (ci->h != cj->h) + error("Non matching cell sizes !! h_i=%f h_j=%f\n", ci->h, cj->h); + +#endif + + /* Loop over particles in cell i */ + for(i=0; i < count_i; i+= blockDim.x) + { + /* Init the ith particle's data. */ + for (k = 0; k < 3; k++) { + xi[k] = parts_i[i].x[k]; + ai[k] = 0.0f; + } + // mi = parts_i[i].mass; + /* Loop over particles in cell j */ + for(j = 0; j < count_j; j++) + { + /* Compute the pairwise distance. */ + for (r2 = 0.0, k = 0; k < 3; k++) { + dx[k] = xi[k] - parts_j[j].x[k]; + r2 += dx[k] * dx[k]; + } + /* Apply the gravitational acceleration. */ + ir = 1.0f / sqrtf(r2); + w = const_G * ir * ir * ir; + for(k = 0; k < 3; k++) { + ai[k] = w * dx[k] * parts_j[j].mass; + } + } + + for (k = 0; k < 3; k++) atomicAdd( &parts[i].a[k] , ai[k]); + }/* Loop over particles in cell i.*/ +} + +/** + * @brief Decides whether two cells use the direct summation interaction or the + * multipole interactions + * + * @param ci The #cell. + * @param cj The other #cell. + */ +__device__ void iact_pair_CUDA(struct cell *ci, struct cell *cj) { + + struct cell *cp, *cps; + +#ifdef SANITY_CHECKS + + /* Early abort? */ + if (ci->count == 0 || cj->count == 0) error("Empty cell !"); + + /* Bad stuff will happen if cell sizes are different */ + if (ci->h != cj->h) + error("Non matching cell sizes !! 
h_i=%f h_j=%f\n", ci->h, cj->h); + + /* Sanity check */ + if (ci == cj) + error("The impossible has happened: pair interaction between a cell and " + "itself."); + +#endif + + /* Are the cells direct neighbours? */ + if (are_neighbours_CUDA(ci, cj)) { + + /* Are both cells split ? */ + if (ci->split && cj->split) { + + /* Let's split both cells and build all possible pairs */ + for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { + for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { + + /* If the cells are neighbours, recurse. */ + if (are_neighbours_CUDA(cp, cps)) { + iact_pair_CUDA(cp, cps); + } + } + } + } else {/* Otherwise, compute the interactions at this level directly. */ + iact_pair_direct_CUDA(ci, cj); + iact_pair_direct_CUDA(cj, ci); + } + } +} + +__device__ void iact_self_direct_CUDA(struct cell *c) +{ + int i, j, k, count = c->count; + double xi[3] = {0.0, 0.0, 0.0}; + float ai[3] = {0.0, 0.0, 0.0}, mj, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w; + struct part *parts = c->parts; + struct cell *cp, *cps; + +#ifdef SANITY_CHECKS + + /* Early abort? */ + if (count == 0) + { + cuda_error("Empty cell"); + } + +#endif + + /* If the cell is split, interact each progeny with itself, and with + each of its siblings. */ + if(c->split) + { + for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) { + iact_self_direct_CUDA(cp); + for (cps = cp->sibling; cps != c->sibling; cps = cps->sibling) + iact_pair_CUDA(cp, cps); + } + } else { + + /* Loop over all particles. */ + for(i = threadIdx.x; i < count; i+= blockDim.x) + { + /* Init the ith particle's data. */ + for (k = 0; k < 3; k++) { + xi[k] = parts[i].x[k]; + ai[k] = 0.0; + } +// mi = parts[i].mass; + + /* Loop all particles - note this was an improvement in quicksched so we use it here. */ + for(j = 0; j < count; j++) + { + /* Compute the pairwise distance. */ + for (r2 = 0.0, k = 0; k < 3; k++) { + dx[k] = xi[k] - parts[j].x[k]; + r2 += dx[k] * dx[k]; + } + /* Apply the gravitational acceleration. */ + ir = 1.0f / sqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts[j].mass; + for(k = 0; k < 3; k++) + { + ai[k] -= w * dx[k] * mj; + } + } /* Inner particle loop. */ + /* Store the accumulated acceleration on the ith part. */ + for (k = 0; k < 3; k++) atomicAdd( &parts[i].a[k] , ai[k]); + }/* Outer particle loop */ + }/* All done!*/ + +} + +/** + * @brief Checks whether the cell leaf is a subvolume of the cell c + */ +__device__ __forceinline__ int is_inside_CUDA(struct cell *leaf, struct cell *c) { + return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count); +} + + +/** + * @brief Interacts all particles in ci with the monopole in cj + */ +__device__ __forceinline__ void make_interact_pc_CUDA(struct cell *leaf, struct cell *cj) { + + int j, k, count = leaf->count; + double com[3] = {0.0, 0.0, 0.0}; + float mcom, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w; + struct part *parts = leaf->parts; + +#ifdef SANITY_CHECKS + + /* Sanity checks */ + if (leaf->count == 0) error("Empty cell!"); + + /* Sanity check. */ + if (cj->new.mass == 0.0) { + message("%e %e %e %d %p %d %p", cj->new.com[0], cj->new.com[1], + cj->new.com[2], cj->count, cj, cj->split, cj->sibling); + + for (j = 0; j < cj->count; ++j) + message("part %d mass=%e id=%d", j, cj->parts[j].mass, cj->parts[j].id); + + error("com does not seem to have been set."); + } + +#endif + + /* Init the com's data. */ + for (k = 0; k < 3; k++) com[k] = cj->mpole.com[k]; + mcom = cj->mpole.mass; + + /* Loop over every particle in leaf. 
*/ + for (j = threadIdx.x; j < count; j+=blockDim.x) { + + /* Compute the pairwise distance. */ + for (r2 = 0.0, k = 0; k < 3; k++) { + dx[k] = com[k] - parts[j].x[k]; + r2 += dx[k] * dx[k]; + } + + /* Apply the gravitational acceleration. */ + ir = 1.0f / sqrtf(r2); + w = mcom * const_G * ir * ir * ir; + + for (k = 0; k < 3; k++) atomicAdd( &parts[j].a[k] , w * dx[k]); + +#if ICHECK >= 0 + if (leaf->parts[j].id == ICHECK) + printf("[DEBUG] cell [%f,%f,%f] interacting with cj->loc=[%f,%f,%f] " + "m=%f h=%f\n", + leaf->loc[0], leaf->loc[1], leaf->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], mcom, cj->h); + + if (leaf->parts[j].id == ICHECK) + printf("[NEW] Interaction with monopole a=( %e %e %e ) h= %f Nj= %d m_j= " + "%f\n", + w * dx[0], w * dx[1], w * dx[2], cj->h, cj->count, mcom); +#endif + + } /* loop over every particle. */ +} + +/** + * @brief Compute the interactions between all particles in a cell leaf + * and the center of mass of all the cells in a part of the tree + * described by ci and cj + * + * @param ci The #cell containing the particle + * @param cj The #cell containing the center of mass. + */ +__device__ __forceinline__ void iact_pair_pc_CUDA(struct cell *ci, struct cell *cj, + struct cell *leaf) { + + struct cell *cp, *cps; + +#ifdef SANITY_CHECKS + + /* Early abort? */ + if (ci->count == 0 || cj->count == 0) error("Empty cell !"); + + /* Sanity check */ + if (ci == cj) + error("The impossible has happened: pair interaction between a cell and " + "itself."); + + /* Sanity check */ + if (!is_inside(leaf, ci)) + error("The impossible has happened: The leaf is not within ci"); + + /* Are the cells direct neighbours? */ + if (!are_neighbours(ci, cj)) error("Cells are not neighours"); + + /* Are both cells split ? */ + if (!ci->split || !cj->split) error("One of the cells is not split !"); +#endif + + /* Let's find in which subcell of ci the leaf is */ + for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { + + if (is_inside_CUDA(leaf, cp)) break; + } + + if (are_neighbours_different_size_CUDA(cp, cj)) { + + /* Now interact this subcell with all subcells of cj */ + for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { + + /* Check whether we have to recurse or can directly jump to the multipole + * calculation */ + if (are_neighbours_CUDA(cp, cps)) { + + /* We only recurse if the children are split */ + if (cp->split && cps->split) { + iact_pair_pc_CUDA(cp, cps, leaf); + } + + } else { + make_interact_pc_CUDA(leaf, cps); + } + } + } else { + + /* If cp is not a neoghbour of cj, we can directly interact with the + * multipoles */ + for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { + + make_interact_pc_CUDA(leaf, cps); + } + } +} + + +__device__ void iact_self_pc_CUDA(struct cell *c, struct cell *leaf) +{ + +struct cell *cp, *cps; + + #ifdef SANITY_CHECKS + + /* Early abort? 
*/ + if (c->count == 0) error("Empty cell !"); + + if (!c->split) error("Cell is not split !"); + + #endif + +/* Find in which subcell of c the leaf is */ + for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) { + + /* Only recurse if the leaf is in this part of the tree */ + if (is_inside_CUDA(leaf, cp)) break; + } + +if (cp->split) { + + /* Recurse if the cell can be split */ + iact_self_pc_CUDA(cp, leaf); + + /* Now, interact with every other subcell */ + for (cps = c->firstchild; cps != c->sibling; cps = cps->sibling) { + + /* Since cp and cps will be direct neighbours it is only worth recursing + */ + /* if the cells can both be split */ + if (cp != cps && cps->split) iact_pair_pc_CUDA(cp, cps, leaf); + } + } +} + + + + +/** + * @brief Get a #cell from the pool. + */ +struct cell *cell_get() { + + struct cell *res; + int k; + + /* Allocate a new batch? */ + if (cell_pool == NULL) { + + /* Allocate the cell array. */ + if ((cell_pool = + (struct cell *)calloc(cell_pool_grow, sizeof(struct cell))) == + NULL) + error("Failed to allocate fresh batch of cells."); + + /* Link them up via their progeny pointers. */ + for (k = 1; k < cell_pool_grow; k++) + cell_pool[k - 1].firstchild = &cell_pool[k]; + } + + /* Pick a cell off the pool. */ + res = cell_pool; + cell_pool = cell_pool->firstchild; + + /* Clean up a few things. */ + res->res = qsched_res_none; + res->firstchild = 0; + + /* Return the cell. */ + return res; +} + + +/** + * @brief Sort the parts into eight bins along the given pivots and + * fill the multipoles. Also adds the hierarchical resources + * to the sched. + * + * @param c The #cell to be split. + * @param N The total number of parts. + * @param s The #sched to store the resources. + */ +void cell_split(struct cell *c, struct qsched *s) { + + int i, j, k, kk, count = c->count; + struct part temp, *parts = c->parts; + struct cell *cp; + int left[8], right[8]; + double pivot[3]; + static struct cell *root = NULL; + struct cell *progenitors[8]; + + /* Set the root cell. */ + if (root == NULL) { + root = c; + c->sibling = 0; + } + + /* Add a resource for this cell if it doesn't have one yet. */ + if (c->res == qsched_res_none) + c->res = qsched_addres(s, qsched_owner_none, qsched_res_none, NULL, 0 ,NULL); + +/* Attach a center-of-mass task to the cell. */ +#ifdef COM_AS_TASK + if (count > 0) + c->com_tid = qsched_addtask(s, task_type_com, task_flag_none, &c, + sizeof(struct cell *), 1); +#endif + + /* Does this cell need to be split? */ + if (count > cell_maxparts) { + + /* Mark this cell as split. */ + c->split = 1; + + /* Create the progeny. */ + for (k = 0; k < 8; k++) { + progenitors[k] = cp = cell_get(); + cp->loc[0] = c->loc[0]; + cp->loc[1] = c->loc[1]; + cp->loc[2] = c->loc[2]; + cp->h = c->h * 0.5; + cp->res = qsched_addres(s, qsched_owner_none, c->res, NULL, 0, NULL); + if (k & 4) cp->loc[0] += cp->h; + if (k & 2) cp->loc[1] += cp->h; + if (k & 1) cp->loc[2] += cp->h; + } + + /* Init the pivots. */ + for (k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->h * 0.5; + + /* Split along the x-axis. */ + i = 0; + j = count - 1; + while (i <= j) { + while (i <= count - 1 && parts[i].x[0] < pivot[0]) i += 1; + while (j >= 0 && parts[j].x[0] >= pivot[0]) j -= 1; + if (i < j) { + temp = parts[i]; + parts[i] = parts[j]; + parts[j] = temp; + } + } + left[1] = i; + right[1] = count - 1; + left[0] = 0; + right[0] = j; + + /* Split along the y axis, twice. 
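This is done once for each of the two x-halves produced above, leaving four bins that the z splits below refine into the eight octants.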
*/ + for (k = 1; k >= 0; k--) { + i = left[k]; + j = right[k]; + while (i <= j) { + while (i <= right[k] && parts[i].x[1] < pivot[1]) i += 1; + while (j >= left[k] && parts[j].x[1] >= pivot[1]) j -= 1; + if (i < j) { + temp = parts[i]; + parts[i] = parts[j]; + parts[j] = temp; + } + } + left[2 * k + 1] = i; + right[2 * k + 1] = right[k]; + left[2 * k] = left[k]; + right[2 * k] = j; + } + + /* Split along the z axis, four times. */ + for (k = 3; k >= 0; k--) { + i = left[k]; + j = right[k]; + while (i <= j) { + while (i <= right[k] && parts[i].x[2] < pivot[2]) i += 1; + while (j >= left[k] && parts[j].x[2] >= pivot[2]) j -= 1; + if (i < j) { + temp = parts[i]; + parts[i] = parts[j]; + parts[j] = temp; + } + } + left[2 * k + 1] = i; + right[2 * k + 1] = right[k]; + left[2 * k] = left[k]; + right[2 * k] = j; + } + + /* Store the counts and offsets. */ + for (k = 0; k < 8; k++) { + progenitors[k]->count = right[k] - left[k] + 1; + progenitors[k]->parts = &c->parts[left[k]]; + } + + /* Find the first non-empty progenitor */ + for (k = 0; k < 8; k++) + if (progenitors[k]->count > 0) { + c->firstchild = progenitors[k]; + break; + } + +#ifdef SANITY_CHECKS + if (c->firstchild == NULL) + error("Cell has been split but all progenitors have 0 particles"); +#endif + + /* Prepare the pointers. */ + for (k = 0; k < 8; k++) { + + /* Find the next non-empty sibling */ + for (kk = k + 1; kk < 8; ++kk) { + if (progenitors[kk]->count > 0) { + progenitors[k]->sibling = progenitors[kk]; + break; + } + } + + /* No non-empty sibling ? Go back a level */ + if (kk == 8) progenitors[k]->sibling = c->sibling; + } + + /* Recurse. */ + for (k = 0; k < 8; k++) + if (progenitors[k]->count > 0) cell_split(progenitors[k], s); + +/* Link the COM tasks. */ +#ifdef COM_AS_TASK + for (k = 0; k < 8; k++) + if (progenitors[k]->count > 0) + qsched_addunlock(s, progenitors[k]->com_tid, c->com_tid); +#endif + + /* Otherwise, we're at a leaf, so create the cell's particle-cell task. */ + } else { + struct cell *data[2] = {root, c}; + int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data, + 2 * sizeof(struct cell *), 1); + qsched_addlock(s, tid, c->res); +#ifdef COM_AS_TASK + qsched_addunlock(s, root->com_tid, tid); +#endif + } /* does the cell need to be split? */ + +/* Compute the cell's center of mass. */ +#ifndef COM_AS_TASK + comp_com(c); +#endif + + /* Set this cell's resources ownership. */ + qsched_res_own(s, c->res, + s->nr_queues * (c->parts - root->parts) / root->count); +} + +/** + * @brief Checks whether the cells are direct neighbours ot not. Both cells have + * to be of the same size + */ +static inline int are_neighbours(struct cell *ci, struct cell *cj) { + + int k; + float dx[3]; + +#ifdef SANITY_CHECKS + if (ci->h != cj->h) + error(" Cells of different size in distance calculation."); +#endif + + /* Maximum allowed distance */ + float min_dist = ci->h; + + /* (Manhattan) Distance between the cells */ + for (k = 0; k < 3; k++) { + float center_i = ci->loc[k]; + float center_j = cj->loc[k]; + dx[k] = fabs(center_i - center_j); + } + + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} + +/** + * @brief Create the tasks for the cell pair/self. + * + * @param s The #sched in which to create the tasks. + * @param ci The first #cell. + * @param cj The second #cell. + */ +void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { + + qsched_task_t tid; + struct cell *data[2], *cp, *cps; + +#ifdef SANITY_CHECKS + + /* If either cell is empty, stop. 
*/ + if (ci->count == 0 || (cj != NULL && cj->count == 0)) error("Empty cell !"); + +#endif + + /* Single cell? */ + if (cj == NULL) { + + /* Is this cell split and above the task limit ? */ + if (ci->split && ci->count > task_limit / ci->count) { + + /* Loop over each of this cell's progeny. */ + for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { + + /* Make self-interaction task. */ + create_tasks(s, cp, NULL); + + /* Make all pair-interaction tasks. */ + for (cps = cp->sibling; cps != ci->sibling; cps = cps->sibling) + create_tasks(s, cp, cps); + } + + /* Otherwise, add a self-interaction task. */ + } else { + + /* Set the data. */ + data[0] = ci; + data[1] = NULL; + + /* Create the task. */ + tid = + qsched_addtask(s, task_type_self, task_flag_none, data, + sizeof(struct cell *) * 2, ci->count * ci->count / 2); + + /* Add the resource (i.e. the cell) to the mpole task. */ + qsched_addlock(s, tid, ci->res); + +/* If this call might recurse, add a dependency on the cell's COM + task. */ +#ifdef COM_AS_TASK + if (ci->split) qsched_addunlock(s, ci->com_tid, tid); +#endif + } + + /* Otherwise, it's a pair. */ + } else { + + /* Are the cells NOT neighbours ? */ + if (!are_neighbours(ci, cj)) { + + } else {/* Cells are direct neighbours */ + + /* Are both cells split ? */ + if (ci->split && cj->split && ci->count > task_limit / cj->count) { + + /* Let's split both cells and build all possible pairs */ + for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { + for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { + /* Recurse */ + create_tasks(s, cp, cps); + } + } + /* Otherwise, at least one of the cells is not split, build a direct + * interaction. */ + } else { + + /* Set the data. */ + data[0] = ci; + data[1] = cj; + + /* Create the task. */ + tid = qsched_addtask(s, task_type_pair, task_flag_none, data, + sizeof(struct cell *) * 2, ci->count * cj->count); + + /* Add the resources. */ + qsched_addlock(s, tid, ci->res); + qsched_addlock(s, tid, cj->res); + } + } /* Cells are direct neighbours */ + } /* Otherwise it's a pair */ +} + +/** + * @brief Set up and run a task-based Barnes-Hutt N-body solver. + * + * @param N The number of random particles to use. + * @param nr_threads Number of threads to use. + * @param runs Number of force evaluations to use as a benchmark. + * @param fileName Input file name. If @c NULL or an empty string, random + * particle positions will be used. + */ +void test_bh(int N, int nr_threads, int runs, char *fileName) { + + int i, k; + struct cell *root; + struct part *parts; + FILE *file; + struct qsched s; + ticks tic, toc_run, tot_setup = 0, tot_run = 0; + int countMultipoles = 0, countPairs = 0, countCoMs = 0; + + /* Initialize the per-task type timers. */ +// for (k = 0; k < task_type_count; k++) task_timers[k] = 0; + + /* Initialize the scheduler. */ + qsched_init(&s, 1, qsched_flag_none); + + /* Init and fill the particle array. 
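Besides the array-of-structs part buffer, the positions, accelerations and masses are also mirrored into flat per-component arrays (partsx0..2, partsa0..2, partsmass).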
*/ + if ((parts = (struct part *)malloc(sizeof(struct part) * N)) == NULL) + error("Failed to allocate particle buffer."); + + double *partsx0, *partsx1, *partsx2; + float *partsa0, *partsa1, *partsa2, *partsmass; + + if ((partsx0 = (double*)malloc(sizeof(double)*N)) == NULL) + error("Failed to allocate particle buffer."); + if ((partsx1 = (double*)malloc(sizeof(double)*N)) == NULL) + error("Failed to allocate particle buffer."); + if ((partsx2 = (double*)malloc(sizeof(double)*N)) == NULL) + error("Failed to allocate particle buffer."); + + if ((partsa0 = (float*)malloc(sizeof(float)*N)) == NULL) + error("Failed to allocate particle buffer."); + if ((partsa1 = (float*)malloc(sizeof(float)*N)) == NULL) + error("Failed to allocate particle buffer."); + if ((partsa2 = (float*)malloc(sizeof(float)*N)) == NULL) + error("Failed to allocate particle buffer."); + if ((partsmass = (float*)malloc(sizeof(float)*N)) == NULL) + error("Failed to allocate particle buffer."); + + + + + /* If no input file was specified, generate random particle positions. */ + if (fileName == NULL || fileName[0] == 0) { + for (k = 0; k < N; k++) { + parts[k].id = k; + parts[k].x[0] = ((double)rand()) / RAND_MAX; + parts[k].x[1] = ((double)rand()) / RAND_MAX; + parts[k].x[2] = ((double)rand()) / RAND_MAX; + parts[k].mass = ((double)rand()) / RAND_MAX; + parts[k].a_legacy[0] = 0.0; + parts[k].a_legacy[1] = 0.0; + parts[k].a_legacy[2] = 0.0; + partsx0[k] = parts[k].x[0]; + partsx1[k] = parts[k].x[1]; + partsx2[k] = parts[k].x[2]; + partsmass[k] = parts[k].mass; + partsa0[k] = 0.0f; + partsa1[k] = 0.0f; + partsa2[k] = 0.0f; + } + + /* Otherwise, read them from a file. */ + } else { + file = fopen(fileName, "r"); + if (file) { + for (k = 0; k < N; k++) { + if (fscanf(file, "%d", &parts[k].id) != 1) + error("Failed to read ID of part %i.", k); + if (fscanf(file, "%lf%lf%lf", &parts[k].x[0], &parts[k].x[1], + &parts[k].x[2]) != + 3) + error("Failed to read position of part %i.", k); + if (fscanf(file, "%f", &parts[k].mass) != 1) + error("Failed to read mass of part %i.", k); + + partsx0[k] = parts[k].x[0]; + partsx1[k] = parts[k].x[1]; + partsx2[k] = parts[k].x[2]; + partsmass[k] = parts[k].mass; + partsa0[k] = 0.0f; + partsa1[k] = 0.0f; + partsa2[k] = 0.0f; + } + fclose(file); + } + } + + /* Init the cells. */ + root = cell_get(); + root->loc[0] = 0.0; + root->loc[1] = 0.0; + root->loc[2] = 0.0; + root->h = 1.0; + root->count = N; + root->parts = parts; + cell_split(root, &s); + + /* Iterate over the cells and get the average number of particles + per leaf. */ + struct cell *c = root; + int nr_leaves = 0; + while (c != NULL) { + if (!c->split) { + nr_leaves++; + c = c->sibling; + } else { + c = c->firstchild; + } + } + message("Average number of parts per leaf is %f.", ((double)N) / nr_leaves); + +#if ICHECK > 0 + printf("----------------------------------------------------------\n"); + + /* Do a N^2 interactions calculation */ + + ticks tic_exact = getticks(); + interact_exact(N, parts, ICHECK); + ticks toc_exact = getticks(); + + printf("Exact calculation (1 thread) took %lli (= %e) ticks\n", + toc_exact - tic_exact, (float)(toc_exact - tic_exact)); + + printf("----------------------------------------------------------\n"); +#endif + + /* Create the tasks. */ + tic = getticks(); + create_tasks(&s, root, NULL); + tot_setup += getticks() - tic; + + /* Dump the number of tasks. 
*/ + message("total nr of tasks: %i.", s.count); + message("total nr of deps: %i.", s.count_deps); + message("total nr of res: %i.", s.count_res); + message("total nr of locks: %i.", s.count_locks); + message("total nr of uses: %i.", s.count_uses); + int counts[task_type_count]; + for (k = 0; k < task_type_count; k++) counts[k] = 0; + for (k = 0; k < s.count; k++) counts[s.tasks[k].type] += 1; + + // char buffer[200]; + // sprintf(buffer, "timings_legacy_%d_%d.dat", cell_maxparts, nr_threads); + // FILE *fileTime = fopen(buffer, "w"); + +// fclose(fileTime); + +#if ICHECK >= 0 + message("[check] accel of part %i is [%.3e,%.3e,%.3e]", ICHECK, + root->parts[ICHECK].a[0], root->parts[ICHECK].a[1], + root->parts[ICHECK].a[2]); +#endif + printf("task counts: [ %8s %8s %8s %8s %8s ]\n", "self", "pair", "m-poles", + "direct", "CoMs"); + printf("task counts: [ %8i %8i %8i %8i %8i ] (legacy).\n", 0, 0, + countMultipoles, countPairs, countCoMs); + printf("task counts: [ "); + for (k = 0; k < task_type_count; k++) printf("%8i ", counts[k]); + printf("] (mpole).\n"); + + /* Loop over the number of runs. */ + for (k = 0; k < runs; k++) { + + for (i = 0; i < N; ++i) { + parts[i].a[0] = 0.0; + parts[i].a[1] = 0.0; + parts[i].a[2] = 0.0; + } + + /* Execute the tasks. */ + tic = getticks(); +// qsched_run(&s, nr_threads, runner); + toc_run = getticks(); + message("%ith run took %lli (= %e) ticks...", k, toc_run - tic, + (float)(toc_run - tic)); + tot_run += toc_run - tic; + } + +// message("[check] root mass= %f %f", root->legacy.mass, root->mpole.mass); +// message("[check] root CoMx= %f %f", root->legacy.com[0], root->mpole.com[0]); +// message("[check] root CoMy= %f %f", root->legacy.com[1], root->mpole.com[1]); +// message("[check] root CoMz= %f %f", root->legacy.com[2], root->mpole.com[2]); +#if ICHECK >= 0 + for (i = 0; i < N; ++i) + if (root->parts[i].id == ICHECK) + message("[check] accel of part %i is [%.3e,%.3e,%.3e]", ICHECK, + root->parts[i].a[0], root->parts[i].a[1], root->parts[i].a[2]); +#endif + + /* Dump the tasks. */ + /* for ( k = 0 ; k < s.count ; k++ ) + printf( " %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , + s.tasks[k].tic , s.tasks[k].toc ); */ + + /* Dump the costs. */ + message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup, + tot_run / runs); + + /* Dump the timers. */ + for (k = 0; k < qsched_timer_count; k++) + message("timer %s is %lli ticks.", qsched_timer_names[k], + s.timers[k] / runs); + + /* Dump the per-task type timers. */ +// printf("task timers: [ "); +// for (k = 0; k < task_type_count; k++) printf("%lli ", task_timers[k] / runs); + //printf("] ticks.\n"); + + /* Dump the particles to a file */ + file = fopen("particle_dump.dat", "w"); + fprintf(file, + "# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x " + "a_legacy.y a_legacy.z a_mpole.x a_mpole.y a_mpole.z\n"); + for (k = 0; k < N; ++k) + fprintf(file, "%d %e %e %e %e %e %e %e %e %e %e %e %e %e\n", parts[k].id, + parts[k].mass, parts[k].x[0], parts[k].x[1], parts[k].x[2], + parts[k].a_exact[0], parts[k].a_exact[1], parts[k].a_exact[2], + parts[k].a_legacy[0], parts[k].a_legacy[1], parts[k].a_legacy[2], + parts[k].a[0], parts[k].a[1], parts[k].a[2]); + fclose(file); + + /* Clean up. */ + qsched_free(&s); + free(parts); +} + +/** + * @brief Main function. + */ + +int main(int argc, char *argv[]) { + + int c, nr_threads; + int N = 1000, runs = 1; + char fileName[100] = {0}; + + /* Die on FP-exceptions. */ + feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); + +/* Get the number of threads. 
*/ +#pragma omp parallel shared(nr_threads) + { + if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads(); + } + + /* Parse the options */ + while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) { + case 'n': + if (sscanf(optarg, "%d", &N) != 1) + error("Error parsing number of particles."); + break; + case 'r': + if (sscanf(optarg, "%d", &runs) != 1) + error("Error parsing number of runs."); + break; + case 't': + if (sscanf(optarg, "%d", &nr_threads) != 1) + error("Error parsing number of threads."); + omp_set_num_threads(nr_threads); + break; + case 'f': + if (sscanf(optarg, "%s", &fileName[0]) != 1) + error("Error parsing file name."); + break; + case '?': + fprintf(stderr, "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file] " + "[-c Nparts] [-i Niterations] \n", + argv[0]); + fprintf(stderr, "Solves the N-body problem using a Barnes-Hut\n" + "tree code with N random particles read from a file in " + "[0,1]^3 using" + "nr_threads threads.\n" + "A test of the neighbouring cells interaction with " + "Nparts per cell is also run Niterations times.\n"); + exit(EXIT_FAILURE); + } + + /* Tree node information */ + printf("Size of cell: %zu bytes.\n", sizeof(struct cell)); + + /* Part information */ + printf("Size of part: %zu bytes.\n", sizeof(struct part)); + + /* Dump arguments. */ + if (fileName[0] == 0) { + message("Computing the N-body problem over %i random particles using %i " + "threads (%i runs).", + N, nr_threads, runs); + } else { + message("Computing the N-body problem over %i particles read from '%s' " + "using %i threads (%i runs).", + N, fileName, nr_threads, runs); + } + + /* Run the BH test. */ + test_bh(N, nr_threads, runs, fileName); + + return 0; +} diff --git a/examples/test_qr.cu b/examples/test_qr.cu index 6f075d4b58b3b2c03c6d2912747102c543f3d333..b93b68da5c38ab5958648aa7be0d7f2fe35f2712 100644 --- a/examples/test_qr.cu +++ b/examples/test_qr.cu @@ -892,7 +892,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) qsched_run_CUDA( &s , func ); - + qsched_print_cuda_timers(&s); #ifdef NO_LOADS tic = getticks(); if( cudaMemcpy( device_array, A, sizeof(float) * m * n * K *K, cudaMemcpyDeviceToHost ) != cudaSuccess) @@ -906,7 +906,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) // printMatrix(tileToColumn(A, m*n*K*K, m, n, K), m, n); - /*float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); +/* float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); float *Q = computeQ(tempMatrix, m*K, K, tau, m); float *R = getR(tempMatrix, m*K); cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, @@ -931,7 +931,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) // printMatrix(A, m, n); // printTileMatrix(A, m , n); struct task* tasks = qsched_get_timers( &s , s.count ); -/* for(i = 0; i < s.count; i++) + for(i = 0; i < s.count; i++) { printf("%i %lli %lli %i ", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); if(tasks[i].type < 0) @@ -946,7 +946,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) }else printf("0 0"); printf("\n"); - }*/ + } free(tasks); cudaDeviceReset(); } diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu index 5a684959016050b6e3d39bb77ce76c051fb0023d..971255fb4f2e72f9475d9ba5fc107a4f2fd43067 100644 --- a/src/cuda_queue.cu +++ b/src/cuda_queue.cu @@ -37,6 +37,30 @@ extern "C"{ #define type_ghost -102 +enum {timers_load_q=0, timers_unload_q, timers_queue, timers_doload, timers_dounload, timers_doother, timers_count}; +#ifdef TIMINGS 
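+/* Device-side timing: TIMER_TIC records clock() at the start of a phase and TIMER_TOC(N) has thread 0 of the block atomically add the elapsed cycles to cuda_timers[N]; when TIMINGS is not defined the macros expand to nothing. */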
+#ifdef TIMER_TIC +#undef TIMER_TIC +#endif +#ifdef TIMER_TOC +#undef TIMER_TOC +#endif +#define TIMER_TIC int timer = clock(); +#define TIMER_TOC(N) if(threadIdx.x == 0){atomicAdd( &cuda_timers[N], clock()-timer);} +#define TIMER_TIC_ND timer = clock(); +__device__ unsigned long long int cuda_timers[timers_count]; +#else +#ifdef TIMER_TIC +#undef TIMER_TIC +#endif +#ifdef TIMER_TOC +#undef TIMER_TOC +#endif +#define TIMER_TIC +#define TIMER_TOC(N) +#define TIMER_TIC_ND +#endif + /*Define if conflicts are enabled or not.*/ //#define GPU_locks @@ -356,7 +380,15 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) { return tid; } +#ifdef PRIQ + + +__device__ int runner_cuda_gettask_priority ( struct queue_cuda *q ) { + return runner_cuda_gettask(q); + + } +#endif #else @@ -372,7 +404,6 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) { * This routine blocks until a valid task is picked up, or the * specified queue is empty. */ -#ifndef PRIQ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) { int tid = -1; @@ -397,7 +428,7 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) { return tid; } -#endif + #ifdef PRIQ __device__ int get_best_task(struct queue_cuda *q ) @@ -436,7 +467,6 @@ __device__ int get_best_task(struct queue_cuda *q ) ind2 %= cuda_queue_size; /* Loop until there is a valid task at that index. */ while ( tot_num_tasks > 0 && ( tid1 = q->data[ind1] ) < 0); - // atomicAdd((int*) &tot_num_tasks, -1); while ( tot_num_tasks > 0 && ( tid2 = q->data[ind2] ) < 0); if(tid1 < 0 && tid2 < 0) @@ -446,55 +476,6 @@ __device__ int get_best_task(struct queue_cuda *q ) pri1 = tasks_cuda[tid1].weight; pri2 = tasks_cuda[tid2].weight; - - int res1, res2; - if( (tid1 >= 0 && tasks_cuda[tid1].type == type_load) ) - { - res1 = cuda_trymultilock(&res_cuda[tasks_cuda[tid1].locks[0]].lock); - //printf("%i %i\n", res1, res_cuda[tasks_cuda[tid1].locks[0]].lock); - } - else - res1 = 1; - - if( tid2 >= 0 && tasks_cuda[tid2].type == type_load && (tid1 >= 0 && tasks_cuda[tid1].type != type_load )) - res2 = cuda_trymultilock(&res_cuda[tasks_cuda[tid2].locks[0]].lock); - else - res2 = 1; - - if(res1 == 0 && res2 == 0) - { - cuda_multiunlock(&res_cuda[tasks_cuda[tid1].locks[0]].lock); - q->data[ind1] = -1; - q->data[ind2] = -1; - cuda_queue_puttask( q, tid1); - cuda_queue_puttask( q, tid2); - return -1; - } - - if(res1 == 0) - { - cuda_multiunlock(&res_cuda[tasks_cuda[tid1].locks[0]].lock); - q->data[ind1] = -1; - q->data[ind2] = -1; - atomicAdd((int*) &tot_num_tasks, -1); - cuda_queue_puttask( q, tid1); - return tid2; - } - - if(res2 == 0) - { - cuda_multiunlock(&res_cuda[tasks_cuda[tid2].locks[0]].lock); - q->data[ind1] = -1; - q->data[ind2] = -1; - atomicAdd((int*) &tot_num_tasks, -1); - cuda_queue_puttask( q, tid2); - return tid1; - } - - - - - if(pri1 >= pri2) { @@ -518,7 +499,7 @@ __device__ int get_best_task(struct queue_cuda *q ) } -__device__ int runner_cuda_gettask ( struct queue_cuda *q ) { +__device__ int runner_cuda_gettask_priority ( struct queue_cuda *q ) { int tid = -1; if( atomicAdd((int*)&q->nr_avail_tasks, -1) <= 0) @@ -563,28 +544,39 @@ __global__ void qsched_device_kernel ( ) if ( threadIdx.x == 0 ) { tid = -1; /* Highest priority queue, holds the unload tasks. */ -#ifndef PRIQ #ifndef NO_LOADS if(cuda_queues[2].nr_avail_tasks > 0 ) - tid = runner_cuda_gettask( &cuda_queues[2] ); + { + TIMER_TIC + tid = runner_cuda_gettask( &cuda_queues[2] ); + TIMER_TOC(timers_unload_q) + } #endif - /* Middle priority queue, contains user-specifed tasks. 
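When PRIQ is defined the fetch below uses the weight-based runner_cuda_gettask_priority() (get_best_task() compares the weights of two candidate queue slots and hands back the heavier task); otherwise the plain FIFO runner_cuda_gettask() is used.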
*/ +#ifdef PRIQ + if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 ) + { + TIMER_TIC + tid = runner_cuda_gettask_priority ( &cuda_queues[0]); + TIMER_TOC(timers_queue) + } +#else if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 ) + { + TIMER_TIC tid = runner_cuda_gettask ( &cuda_queues[0]); + TIMER_TOC(timers_queue) + } +#endif #ifndef NO_LOADS /* Low priority queue, contains the load tasks. */ - if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 /*&& blockIdx.x < 1*/ ) -{ + if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 /*&& blockIdx.x < 12*/ ) + { + TIMER_TIC tid = runner_cuda_gettask ( &cuda_queues[1]); - -} -#endif - -#else - if(tot_num_tasks > 0 && cuda_queues[0].nr_avail_tasks > 0) - tid = runner_cuda_gettask ( &cuda_queues[0] ); + TIMER_TOC(timers_load_q) + } #endif } @@ -614,23 +606,25 @@ __global__ void qsched_device_kernel ( ) /* Pick task type to do, if its not load, unload or ghost use the user supplied function. */ if( tasks_cuda[tid].type == type_load ) { + TIMER_TIC int *d = (int*)&data_cuda[tasks_cuda[tid].data]; src = (int*)res_cuda[d[0]].data; dest = (int*)res_cuda[d[0]].gpu_data; cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, tid); - #ifdef PRIQ - __syncthreads(); - if(threadIdx.x == 0) - cuda_multiunlock(&res_cuda[tasks_cuda[tid].locks[0]].lock); - #endif + TIMER_TOC(timers_doload) }else if( tasks_cuda[tid].type == type_unload ) { + TIMER_TIC int *d = (int*)&data_cuda[tasks_cuda[tid].data]; src = (int*)res_cuda[d[0]].gpu_data; dest = (int*)res_cuda[d[0]].data; cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, d[0]); + TIMER_TOC(timers_dounload) }else if (tasks_cuda[tid].type != type_ghost ){ + TIMER_TIC fun(tasks_cuda[tid].type , &data_cuda[tasks_cuda[tid].data]); + __syncthreads(); + TIMER_TOC(timers_doother) } __syncthreads(); /*Stop the task clock*/ @@ -648,16 +642,12 @@ __global__ void qsched_device_kernel ( ) if( atomicSub( &tasks_cuda[tasks_cuda[tid].unlocks[i]].wait , 1 ) == 1 && !( tasks_cuda[tasks_cuda[tid].unlocks[i]].flags & task_flag_skip )) { /* Place unloads into highest priority queue, any other task goes to normal priority queue. 
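The same routing is used whether or not PRIQ is defined.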
Load tasks are never unlocked.*/ - #ifdef PRIQ - cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] ); - #else if(tasks_cuda[tasks_cuda[tid].unlocks[i]].type != type_unload) { cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] ); } else cuda_queue_puttask( &cuda_queues[2] , tasks_cuda[tid].unlocks[i] ); - #endif } } } @@ -712,10 +702,10 @@ int maxVal( int *array, int size ) int i, maxi=-3200000; for (i=0; i<size; i++) { - if (array[i]>maxi) - { - maxi=array[i]; - } + if (array[i]>maxi) + { + maxi=array[i]; + } } return maxi ; } @@ -725,10 +715,10 @@ int minVal( int *array, int size ) int i, maxi=3200000; for (i=0; i<size; i++) { - if (array[i]<maxi) - { - maxi=array[i]; - } + if (array[i]<maxi) + { + maxi=array[i]; + } } return maxi ; } @@ -1202,7 +1192,7 @@ for(i = sorted[s->count_res]; i >= 0; i-- ) } toc_run = getticks(); -// message( "Sorting took %.3f ms" , ((double)(toc_run - tic)) * itpms ); +// message( "Sorting took %.3f ms" , ((double)(toc_run - tic)) * itpms ); tic = getticks(); /* If nothing overlaps create tasks.*/ @@ -1220,7 +1210,7 @@ for( i = sorted[s->count_res]; i < s->count_res; i++ ) toc_run = getticks(); tic = getticks(); -// message( "Creating load tasks took %.3f ms" , ((double)(toc_run - tic)) * itpms ); +// message( "Creating load tasks took %.3f ms" , ((double)(toc_run - tic)) * itpms ); /* Check all resources have load tasks - if not give parents (recursively)*/ @@ -1603,7 +1593,7 @@ toc2 += getticks() - tic; }*/ //qsched_prepare_deps( s ); //printf("Number dependencies = %i\n", s->count_deps); -#ifdef PRIQ +/*#ifdef PRIQ int PCI_res; PCI_res = qsched_addres(s , qsched_owner_none , qsched_res_none , NULL, 0 , NULL); s->res[PCI_res].lock = PCIEX; @@ -1614,9 +1604,9 @@ for(i = 0; i < s->count; i++) qsched_addlock(s, i, PCI_res); } } -#endif +#endif*/ toc_run = getticks(); - //message( "Setting up dependencies took %.3f ms" , toc2 * itpms ); + //message( "Setting up dependencies took %.3f ms" , toc2 * itpms ); //error("Got to here"); } @@ -1757,7 +1747,7 @@ ticks tic, toc_run ; qsched_prepare_loads(s); #endif toc_run = getticks(); -// message( "prepare_loads took %.3f ms" , ((double)(toc_run - tic)) * itpms ); +// message( "prepare_loads took %.3f ms" , ((double)(toc_run - tic)) * itpms ); @@ -2013,21 +2003,16 @@ if( cudaMemcpyToSymbol ( tasks_cuda, &cuda_t , sizeof(struct task *) , 0 , cud error("Failed to copy task pointer to the device."); /* Initialize the queues. */ -#ifdef PRIQ -int nr_queues= 1, qsize; -#else int nr_queues= 3,qsize; -#endif int *data2; struct queue_cuda queues[ cuda_numqueues ]; -#ifdef PRIQ +/*#ifdef PRIQ qsize = max(2*s->count, 512); if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy queue size to the device."); - /* Allocate a temporary buffer for the queue data. */ if ( ( data = (int *)malloc( sizeof(int) * qsize ) ) == NULL ) error("Failed to allocate data buffer."); if( ( data2 = (int *) malloc( sizeof(int) * qsize ) ) == NULL ) @@ -2075,7 +2060,6 @@ qsize = max(2*s->count, 512); for ( i = queues[0].count ; i < qsize ; i++ ) data[i] = -1; - /* Allocate and copy the data. 
*/ if ( cudaMalloc( &queues[0].data , sizeof(int) * qsize ) != cudaSuccess ) error("Failed to allocate queue data on the device."); if ( cudaMemcpy( (void *)queues[0].data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) @@ -2084,24 +2068,21 @@ qsize = max(2*s->count, 512); for ( k = 0; k < qsize; k++ ) data[k] = -1; - /* Allocate and copy the recyling data. */ if ( cudaMalloc( &queues[0].rec_data , sizeof(int) * qsize ) != cudaSuccess ) error("Failed to allocate queue data on the device."); if ( cudaMemcpy( (void *)queues[0].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy queue data pointer to the device"); - /* Set some other values. */ queues[0].first = 0; queues[0].last = queues[0].count; queues[0].nr_avail_tasks = queues[0].last; queues[0].rec_count = 0; queues[0].count = s->count; - /* Copy the queue structures to the device. */ if ( cudaMemcpyToSymbol( cuda_queues , &queues , sizeof(struct queue_cuda) * nr_queues , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy the queues to the device"); -#else +#else*/ qsize = max(2*s->count / nr_queues, 256); if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy queue size to the device."); @@ -2212,7 +2193,6 @@ qsize = max(2*s->count, 512); /* Copy the queue structures to the device. */ if ( cudaMemcpyToSymbol( cuda_queues , &queues , sizeof(struct queue_cuda) * nr_queues , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy the queues to the device"); -#endif /* Clean up. */ free( tid ); @@ -2227,6 +2207,23 @@ qsize = max(2*s->count, 512); } +void qsched_print_cuda_timers(struct qsched *s) +{ +#ifdef TIMINGS +unsigned long long int timers[timers_count]; +int i; +cudaMemcpyFromSymbol(timers,cuda_timers, sizeof(unsigned long long int)*timers_count, 0 ,cudaMemcpyDeviceToHost ); +printf("timers: "); +for(i = 0; i < timers_count; i++) +{ + printf("%llu ", timers[i]); +} +printf("\n"); +#endif +} + + + struct task* qsched_get_timers( struct qsched *s, int numtasks ) { struct task *gpu_tasks = NULL, *cuda_tasks; @@ -2259,6 +2256,15 @@ struct task* qsched_get_timers( struct qsched *s, int numtasks ) void qsched_run_CUDA ( struct qsched *s, qsched_funtype func) { #ifdef WITH_CUDA + #ifdef TIMINGS + int timers[timers_count]; + int k; + for ( k = 0 ; k < timers_count ; k++ ) + timers[k] = 0; + cudaMemcpyToSymbol(cuda_timers , timers , sizeof(int) * timers_count , 0 , cudaMemcpyHostToDevice); + #endif + cudaEvent_t startEvent, stopEvent; + float time; //ProfilerStart("/home/aidan/quicksched-code/examples/profiler.out"); double itpms = 1000.0 / CPU_TPS; ticks tic, toc_run ; @@ -2266,19 +2272,24 @@ void qsched_run_CUDA ( struct qsched *s, qsched_funtype func) { qsched_prepare_cuda( s ); toc_run = getticks(); printf("%.3f ", ((double)(toc_run - tic)) * itpms ); -// message( "prepare_cuda took %.3f ms" , ((double)(toc_run - tic)) * itpms ); + cudaEventCreate(&startEvent); + cudaEventCreate(&stopEvent); +// message( "prepare_cuda took %.3f ms" , ((double)(toc_run - tic)) * itpms ); cudaMemcpyToSymbol( fun , &func , sizeof(qsched_funtype)); tic = getticks(); // ProfilerStop(); cudaMemcpyToSymbol( tot_num_tasks, &s->count, sizeof(int) ); + cudaEventRecord(startEvent,0); qsched_device_kernel<<<128, 128 >>> ( ); + cudaEventRecord(stopEvent,0); if( cudaDeviceSynchronize() != cudaSuccess ) error("Failed to execute kernel:%s", cudaGetErrorString(cudaPeekAtLastError())); 
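+ /* Kernel runtime is measured with the cudaEvent pair recorded around the launch; cudaEventElapsedTime() below converts it to milliseconds once the device has synchronised. */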
toc_run = getticks(); + cudaEventElapsedTime(&time, startEvent, stopEvent); #ifdef NO_LOADS - printf("%.3f ", ((double)(toc_run - tic)) * itpms ); + printf("%.3f ", time ); #else - printf("%.3f\n", ((double)(toc_run - tic)) * itpms ); + printf("%.3f\n", time ); #endif // message( "run_CUDA took %.3f ms" , ((double)(toc_run - tic)) * itpms ); @@ -2289,3 +2300,23 @@ void qsched_run_CUDA ( struct qsched *s, qsched_funtype func) { } + +#ifdef TIMER_TIC +#undef TIMER_TIC +#endif +#ifdef TIMER_TOC +#undef TIMER_TOC +#endif +#ifdef TIMER_TIC2 +#undef TIMER_TIC2 +#endif +#ifdef TIMERS + #define TIMER_TIC ticks __tic = getticks(); + #define TIMER_TIC2 __tic = getticks(); + #define TIMER_TOC(s,tid) atomic_add( &s->timers[tid] , getticks() - __tic ); +#else + #define TIMER_TIC + #define TIMER_TIC2 + #define TIMER_TOC +#endif + diff --git a/src/cuda_queue.h b/src/cuda_queue.h index b548f8ca512c7844b35609e85cc933f60a27acc7..f2aed4836688b0c11738c3fd51d35e7e0a1335fc 100644 --- a/src/cuda_queue.h +++ b/src/cuda_queue.h @@ -47,3 +47,4 @@ struct queue_cuda { void qsched_run_CUDA ( struct qsched *s, qsched_funtype fun ); extern "C" void qsched_prepare_cuda ( struct qsched *s ); struct task* qsched_get_timers( struct qsched *s, int numtasks ); +void qsched_print_cuda_timers(struct qsched *s);
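
For reference, the host-side sequence that exercises the new timing hooks is the one test_qr.cu now follows: run the scheduler on the device, then dump the accumulated device timers and, if wanted, the per-task records. A minimal sketch, assuming the library was built with -DWITH_CUDA and -DTIMINGS, and that `func` is the user-supplied qsched_funtype task function (task and resource setup is elided):

    struct qsched s;
    qsched_init(&s, 1, qsched_flag_none);
    /* ... add resources and tasks to s ... */
    qsched_run_CUDA(&s, func);        /* runs qsched_device_kernel and prints the cudaEvent kernel time */
    qsched_print_cuda_timers(&s);     /* dumps the cuda_timers[] counters collected via TIMER_TIC/TIMER_TOC */
    struct task *t = qsched_get_timers(&s, s.count);  /* per-task records copied back from the device */
    /* ... inspect t[i].tic, t[i].toc, t[i].blockID ... */
    free(t);
    qsched_free(&s);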