Commit f0ab955c authored by aidan

Code cleanup

parent 320d4466
@@ -40,6 +40,7 @@ extern "C"{
 }
 #include "cuda_queue.h"
 /** Task types. */
 enum task_type {
 task_type_self = 0,
@@ -59,12 +60,12 @@ unsigned short int split, sorted;
 int parts, firstchild, sibling;
 int res, resz, resm, com_tid;
-};//__attribute__((aligned(64)));
+};
 #define const_G 1
 /* Required variables to obtain cells. */
-#define cell_maxparts 128
+#define cell_maxparts 256
 #define CELL_STRETCH 2
 #define INITIAL_CELLS 256
 struct cell *cell_pool = NULL;
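Note: cell_maxparts doubles from 128 to 256 in the same commit that deletes the per-block __shared__ staging buffers in iact_pair_direct and iact_self_direct below. At the new size those buffers would have cost cell_maxparts * (sizeof(double2) + sizeof(double) + sizeof(float4)) = 256 * (16 + 8 + 16) = 10240 bytes of shared memory per block; with them gone, the constant merely caps how many particles a leaf may hold before cell_split() subdivides it.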
@@ -101,20 +102,7 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
 int count_i = ci->count, count_j = cj->count;
 int parts_i = ci->parts, parts_j = cj->parts;
 double xi[3];
-float dx[3], ai[3], mi, mj, r2, w, ir;
+float dx[3], ai[3], mj, r2, w, ir;
-__shared__ double2 parts_xy[cell_maxparts];
-__shared__ double parts_z[cell_maxparts];
-__shared__ float4 parts_am[cell_maxparts];
-/*if(threadIdx.x == 0)
-printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split,
-cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
-/* Load particles of cell j into shared memory */
-/*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
-parts_xy[j] = parts_pos_xy[k];
-parts_z[j] = parts_pos_z[k];
-parts_am[j] = parts_a_m[k];
-}*/
 /* Loop over cell i.*/
 for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
@@ -124,7 +112,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
 for(k = 0; k < 3; k++) {
 ai[k] = 0.0f;
 }
-mi = parts_a_m[i].w;
 for(j = parts_j; j < parts_j + count_j; j++) {
 r2 = 0.0f;
@@ -136,16 +123,12 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
 r2 += dx[2] * dx[2];
-// ir = 1.0f / sqrtf(r2);
 ir = rsqrtf(r2);
 w = const_G * ir * ir * ir;
 mj = parts_a_m[j].w;
 for(k = 0; k < 3; k++) {
 ai[k] -= dx[k] * mj * w;
 }
-// atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
-// atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
-// atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
 }
 atomicAdd(&parts_a_m[i].x, ai[0]);
 atomicAdd(&parts_a_m[i].y, ai[1]);
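Note: for orientation, the surviving code path amounts to the following one-sided direct summation. This is a minimal sketch assembled from the hunks above, not a verbatim copy; parts_pos_xy, parts_pos_z and parts_a_m are the file's global particle arrays (mass in the .w slot of parts_a_m), and the sign convention of dx is assumed:

    __device__ void iact_pair_direct_sketch(struct cell *ci, struct cell *cj) {
        int i, j, k;
        int parts_i = ci->parts, count_i = ci->count;
        int parts_j = cj->parts, count_j = cj->count;
        double xi[3];
        float dx[3], ai[3], mj, r2, ir, w;
        /* Each thread owns a strided subset of cell i's particles. */
        for (i = parts_i + threadIdx.x; i < parts_i + count_i; i += blockDim.x) {
            xi[0] = parts_pos_xy[i].x;
            xi[1] = parts_pos_xy[i].y;
            xi[2] = parts_pos_z[i];
            for (k = 0; k < 3; k++) ai[k] = 0.0f;
            for (j = parts_j; j < parts_j + count_j; j++) {
                dx[0] = (float)(xi[0] - parts_pos_xy[j].x);  /* assumed convention */
                dx[1] = (float)(xi[1] - parts_pos_xy[j].y);
                dx[2] = (float)(xi[2] - parts_pos_z[j]);
                r2 = dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2];
                ir = rsqrtf(r2);              /* hardware reciprocal square root */
                w = const_G * ir * ir * ir;   /* G / r^3 */
                mj = parts_a_m[j].w;          /* mass sits in the .w component */
                for (k = 0; k < 3; k++) ai[k] -= dx[k] * mj * w;
            }
            /* Atomics: other blocks may update the same particles. */
            atomicAdd(&parts_a_m[i].x, ai[0]);
            atomicAdd(&parts_a_m[i].y, ai[1]);
            atomicAdd(&parts_a_m[i].z, ai[2]);
        }
    }

With the symmetric update of cell j (the commented-out atomicAdds on parts_a_m[j]) deleted, the i-particle's own mass mi is never read, which is why the declaration loses it; cell j's particles get their accelerations when the loop below runs with the roles of i and j swapped.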
@@ -154,11 +137,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
 }
 /* Load particles of cell i into shared memory */
-/*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
-parts_xy[j] = parts_pos_xy[k];
-parts_z[j] = parts_pos_z[k];
-parts_am[j] = parts_a_m[k];
-}*/
 /*Loop over cell j. */
 for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
 xi[0] = parts_pos_xy[i].x;
@@ -167,7 +145,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
 for(k = 0; k < 3; k++) {
 ai[k] = 0.0f;
 }
-mi = parts_a_m[i].w;
 for(j = parts_i; j < parts_i + count_i; j++) {
 r2 = 0.0f;
@@ -196,29 +173,15 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
 __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
-int i, k;
+int i;
 double2 j_com_xy;
 double j_com_z;
 float j_com_mass;
 int count = leaf->count;
 int parts = leaf->parts;
 int cell_j = cj - cells;
-int temp;
 float r2, dx[3], ir, w;
-// if(cell_j < 0)
-// {
-// if(threadIdx.x == 0)
-// printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
-// __syncthreads();
-// asm("trap;");
-// }
-// if(threadIdx.x == 0)
-// printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
-temp = cell_j;
 /* Init the com's data.*/
 j_com_xy = com_xy[cell_j];
@@ -236,18 +199,10 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
 ir = rsqrtf(r2);
 w = j_com_mass * const_G * ir * ir * ir;
-/* __threadfence();
-if(!isfinite(w * dx[0])){
-printf("Error in make_interact_pc, j_com_mass = %f, cell_j = %i, temp = %i, i = %i, threadIdx.x=%i\n", j_com_mass, cell_j, temp, i, threadIdx.x); asm("trap;");}
-if(!isfinite(w * dx[1])){
-printf("Error in make_interact_pc\n"); asm("trap;");}
-if(!isfinite(w * dx[2])){
-printf("Error in make_interact_pc\n"); asm("trap;");}*/
 atomicAdd( &parts_a_m[i].x , w * dx[0]);
 atomicAdd( &parts_a_m[i].y , w * dx[1]);
 atomicAdd( &parts_a_m[i].z , w * dx[2]);
 }
-//__syncthreads();
 }
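Note: make_interact_pc is the monopole (particle-cell) interaction: the whole of cell j is collapsed into its centre of mass, so each particle i of the leaf receives

    a_i += const_G * m_com * (x_com - x_i) / |x_com - x_i|^3

which is exactly w = j_com_mass * const_G * ir*ir*ir followed by the three atomicAdds of w * dx[k], provided dx here is taken as x_com - x_i. The sign convention is then the opposite of the pair kernel above, which accumulates ai[k] -= dx[k] * mj * w with dx = x_i - x_j; under those assumed conventions both kernels produce an attractive force.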
 /**
@@ -255,7 +210,6 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
 */
 __device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) {
-int k;
 float dx[3];
 double cih = ci->h, cjh = cj->h;
@@ -274,7 +228,7 @@ __device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, st
 }
 __device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) {
-int k;
 float dx[3];
 float min_dist = ci->h;
 float center_i = ci->loc_xy.x;
@@ -296,13 +250,6 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
 __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
 struct cell *cp ,*cps;
-int leafnum = leaf - cells;
-//if(threadIdx.x == 0 && leafnum == 23)
-// printf("cj = %i\n", cj - cells);
-// printf("%i\n", leafnum);
-// if(threadIdx.x == 0)
-/// printf("ci = %i, cj = %i, leaf = %i\n", ci - cells, cj - cells, leaf - cells);
 for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
 if(is_inside(leaf, cp)) break;
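Note: this loop also documents the tree layout: children are linked by indices rather than pointers. A cell's children start at cells[c->firstchild] and its subtree ends where cells[c->sibling] begins, so visiting every direct child of a cell c is a chain walk (a sketch; visit() is a hypothetical placeholder):

    for (struct cell *cp = &cells[c->firstchild];  /* first child of c */
         cp != &cells[c->sibling];                 /* first cell after c's subtree */
         cp = &cells[cp->sibling])                 /* skip over cp's own subtree */
        visit(cp);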
@@ -318,16 +265,12 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
 }
 } else {
 make_interact_pc(leaf, cps);
-// if(threadIdx.x == 0 && leafnum == 23)
-// printf("leafnum = %i with cps = %i here\n", leafnum, cps - cells);
 __syncthreads();
 }
 }
 }else{
 for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
-// if(threadIdx.x == 0 && leafnum == 23)
-// printf("leafnum = %i with cps = %i\n", leafnum, cps - cells);
 make_interact_pc(leaf, cps);
 }
@@ -347,17 +290,6 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
 struct cell *cp, *cps;
-/*if(leaf->split)
-{
-printf("Leaf split = 1, oh dear.");
-asm("trap;");
-}
-if(c->split > 1)
-{
-printf("Cell had split > 1\n");
-asm("trap;");
-}*/
 /* Find the subcell of c the leaf is in.*/
 /*cp = c;
@@ -402,31 +334,13 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
 __device__ void iact_self_direct(int cellID) {
 struct cell *c = &cells[cellID];
 double xi[3] = {0.0, 0.0, 0.0};
-float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
+float ai[3] = {0.0, 0.0, 0.0 }, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
-__shared__ double2 parts_xy[cell_maxparts];
-__shared__ double parts_z[cell_maxparts];
-__shared__ float4 parts_am[cell_maxparts];
 int parts;
 int count;
 int i,j,k;
-//if(threadIdx.x == 0)
-// printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
-//If cell is split, interact each child with itself, and with each of its siblings.
-/*if(c->split) {
-//TODO
-} else {*/
 parts = c->parts;
 count = c->count;
-int z = threadIdx.x;
-/* Load particle data into shared memory*/
-/*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
-parts_xy[z] = parts_pos_xy[k];
-parts_z[z] = parts_pos_z[k];
-parts_am[z] = parts_a_m[k];
-}
-__syncthreads();*/
 for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
 {
 xi[0] = parts_pos_xy[i].x;
@@ -435,9 +349,7 @@ __device__ void iact_self_direct(int cellID) {
 for(k = 0; k < 3; k++) {
 ai[k] = 0.0;
 }
-mi = parts_a_m[i].w;
-//for(j = i+1; j!= i; j = (j+1)%count)
 for(j = parts; j < parts+count; j++)
 {
 if(i != j){
@@ -453,7 +365,6 @@ __device__ void iact_self_direct(int cellID) {
 /* Apply the gravitational acceleration. */
-//ir = 1.0f / sqrtf(r2);
 ir = rsqrtf(r2);
 w = const_G * ir * ir * ir;
 mj = parts_a_m[j].w;
@@ -479,7 +390,7 @@ __device__ void iact_self_direct(int cellID) {
 */
 static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
-int k;
+// int k;
 float dx[3];
 #ifdef SANITY_CHECKS
@@ -508,7 +419,7 @@ static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
 struct cell *cell_get()
 {
-struct cell *res;
+// struct cell *res;
 if(num_cells == 0)
 {
@@ -645,9 +556,7 @@ void cell_split(int c, struct qsched *s) {
 int left[8], right[8];
 double pivot[3];
 static int root = -1;
-// struct cell *progenitors[8];
 int progenitors[8];
-int c1 = c;
 double2 *temp_xy;
 double *temp_z;
 float4 *temp_a_m;
@@ -662,7 +571,6 @@ void cell_split(int c, struct qsched *s) {
 {
 if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[c].parts], 0) != cudaSuccess )
 error("Failed to get host device pointer.");
-// printf("tempxy = %p\n", temp_xy);
 cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
 sizeof(double2) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts);
 }
@@ -680,7 +588,6 @@ void cell_split(int c, struct qsched *s) {
 cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m,
 sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts);
 }
-// error("Cell has no resource");*///TODO
 if(count > cell_maxparts )
 {
@@ -836,13 +743,9 @@ void cell_split(int c, struct qsched *s) {
 /* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */
 } else {
-// struct cell *data[2] = {root, c};
 int data[2] = {root, c};
 int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
 2 * sizeof(int), 3000);
-/*qsched_adduse(s, tid, cell_pool[root].res);
-qsched_adduse(s, tid, cell_pool[root].resz);
-qsched_adduse(s, tid, cell_pool[root].resm);*/
 qsched_addlock(s, tid, cell_pool[c].res);
 qsched_addlock(s, tid, cell_pool[c].resz);
 qsched_addlock(s, tid, cell_pool[c].resm);
@@ -864,8 +767,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
 qsched_task_t tid;
 int data[2];
-struct cell /**data[2],*/ *cp, *cps;
+struct cell *cp, *cps;
-int cpi;
 if(cj == NULL)
@@ -967,11 +869,8 @@ qsched_funtype func;
 void test_bh(int N, int runs, char *fileName) {
 int i, k;
 struct cell *root;
-struct part *parts;
 FILE *file;
 struct qsched s;
-ticks tic, toc_run, tot_setup = 0, tot_run = 0;
-int countMultipoles = 0, countPairs = 0, countCoMs = 0;
 struct cell *gpu_ptr_cells;
 cudaFree(0);
@@ -1142,23 +1041,6 @@ float *comm_temp;
 if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
 error("Failed to copy com pointer to the GPU");
-/* for(i = 0; i < s.count; i++){
-int *idata = (int*)&s.data[s.tasks[i].data];
-if(s.tasks[i].type == task_type_self)
-printf("Self task with data[0] = %i\n", idata[0]);
-if(s.tasks[i].type == task_type_pair) {
-printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
-}
-if(s.tasks[i].type == task_type_self_pc) {
-printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
-}
-}*/
-// printf("com_mass_host[152] = %f\n", com_mass_host[152]);
-//Run code.
-// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
 qsched_run_CUDA( &s , func );
 qsched_print_cuda_timers(&s);
@@ -1169,16 +1051,12 @@ struct task* tasks = qsched_get_timers( &s , s.count );
 for(i = 0; i < s.count; i++)
 {
 printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
-// printf("\n");
 }
 }
 /* Dump the particles to a file */
 file = fopen("particle_dump.dat", "w");
-/* fprintf(file,
-"# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
-"a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
 for (k = 0; k < N; ++k)
 fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n",
 parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
@@ -1228,9 +1106,6 @@ int main(int argc, char *argv[]) {
 /* Tree node information */
 printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
-/* Part information */
-// printf("Size of part: %zu bytes.\n", sizeof(struct part));
 /* Dump arguments. */
 if (fileName[0] == 0) {
 message("Computing the N-body problem over %i random particles using %i "
...
@@ -67,7 +67,6 @@ __device__ void quarter(float *data)
 __global__ void Manual(float *src)
 {
-int i;
 int datas = blockIdx.x;
 cuda_data[datas*1000+threadIdx.x] = src[datas*1000+threadIdx.x];
@@ -109,13 +108,13 @@ __global__ void Setup()
 int main ( int argc , char *argv[] ) {
-float *array, *cuda_array, *cuda_array2, *device_array;
+float *array, *cuda_array, *device_array;
 int i, k=0;
 qsched_funtype func;
 struct qsched s;
 qsched_task_t *tid;
 qsched_res_t *rid;
-ticks tic, toc_run, tot_setup, tot_run = 0;
+ticks tic, toc_run, tot_run = 0;
 qsched_init( &s , 1 , qsched_flag_none );
 cudaDeviceReset();
 cudaSetDevice(0);
@@ -134,11 +133,6 @@ int main ( int argc , char *argv[] ) {
 array[i] = (float)i;
 }
-// if(cudaHostGetDevicePointer(&cuda_array , &array[k*1000] , 0) != cudaSuccess)
-//error("Failed to get device pointer for data: %s", cudaGetErrorString(cudaPeekAtLastError()));
-// if ( cudaMallocHost(&array, M*sizeof(float)) != cudaSuccess )
-// error("Failed to allocate array");
 /* Initialize the scheduler. */
@@ -193,16 +187,9 @@ printf("Starting second run\n");
 printf("Second run complete\n");
 struct task *completed_tasks = qsched_get_timers( &s, s.count );
-/* tic = getticks();
-qsched_run_CUDA( &s , func );
-toc_run = getticks();
-message( "qsched_run_CUDA took %lli ticks..." , toc_run - tic );
-tot_run += toc_run - tic;*/
-// if( cudaFreeHost( array) != cudaSuccess)
-// error("Failed to free array");
 cudaDeviceReset();
@@ -215,8 +202,6 @@ cudaDeviceReset();
 error("Failed to get device pointer for data: %s",cudaGetErrorString(cudaPeekAtLastError()));
 if( cudaMalloc(&device_array , M*sizeof(float) ) != cudaSuccess )
 error("Failed to allocate device array: %s", cudaGetErrorString(cudaPeekAtLastError()));
-/*if( cudaMemcpy( &device_array , array , M*sizeof(float), cudaMemcpyHostToDevice ) != cudaSuccess )
-error("Failed to copy device array: %s", cudaGetErrorString(cudaPeekAtLastError()));*/
 if( cudaMemcpyToSymbol( cuda_data , &device_array,sizeof(float*), 0 , cudaMemcpyHostToDevice) != cudaSuccess )
 error("Failed to copy array pointer to device: %s", cudaGetErrorString(cudaPeekAtLastError()));
 tic = getticks();
@@ -229,6 +214,4 @@ cudaDeviceReset();
 if(array[i] != ((float)i)*((float)i)*0.5f)
 printf("%i wrong, %.3f != %.3f\n", i, array[i], ((float)i)*((float)i)*0.5f );
-//printf("%.3f\n", array[2]);
 }
@@ -22,11 +22,8 @@ extern "C"{
 int g_size;
 enum task_types { task_SGEQRF , task_SLARFT , task_STSQRF , task_SSSRFT} ;
-//#define TID threadIdx.x
 #define numthreads 128
-//#define tilesize 32
 #define cuda_maxtasks 1000000
-//#define CO(x,y,ldm) (((y)*ldm) + (x))
 #define WARPS 4
 __device__ float *GPU_matrix;
@@ -145,7 +142,7 @@ float* generateColumnMatrix(int size, unsigned long int m_z)
 {
 float* matrix;
-cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size/**size*/);
+cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size);
 if(code != cudaSuccess)
 printf("%s size = %i g_size = %i\n", cudaGetErrorString(code),size, g_size);
 else
@@ -211,8 +208,6 @@ __device__ inline void reduceSumMultiWarp( float* value )
 }
 __threadfence();
 *value = stuff[group * 32];
-//printf("Not supported.\n");
-//asm("trap;");
 #endif
 }
@@ -236,7 +231,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
 int i, j;
 float norm=0.0, sign, u1, tau, z;
 int TID = threadIdx.x % 32;
-//int set = threadIdx.x / 32;
 float w;
 /*Find the householder vector for each row. */
 for(i = 0; i < tilesize; i++)
@@ -253,7 +247,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
 else
 sign = 1;
-// sign = __shfl(sign, i, 32);
 norm = sqrt(norm);
 if(TID >= i )
 w = cornerTile[ i*tilesize + TID ];
@@ -262,13 +255,8 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
-// if(TID==i)
 u1 = cornerTile[i*tilesize + i] - sign*norm;
-// __syncthreads();
-// u1 = __shfl(u1, i, 32);
-// if(i==1)
-// printf("%.3f\n", u1);
 if(u1 != 0.0)
 {
 if(TID > i)
@@ -285,8 +273,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
 else
 tau = 0.0;
-//if(TID == 0)
-// printf("%.3f\n", sign);
 /*Store the below diagonal vector */
 if(TID > i)
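Note: SGEQRF builds, one column i at a time, a Householder reflector H_i = I - tau * w * w^T, normalised so that w[i] = 1; the sub-diagonal part of w is written over the eliminated column and tau is stored in tauMatrix. Under the usual convention the quantities in these hunks are

    norm = ||A[i:, i]||_2
    u1   = A[i][i] - sign * norm      (sign = +/-1, picked from the diagonal entry)
    w[k] = A[k][i] / u1  for k > i,   w[i] = 1

with tau chosen so that H_i zeroes column i below the diagonal.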
@@ -358,17 +344,9 @@ void __device__ SLARFT(volatile float* cornerTile, volatile float* rowTile, int
 /* Below Diagonal!*/
 /*Compute w'*A_j*/
 z = w * rowTile[j*tilesize+TID];
-//for(n = i; n < tileSize; n++)
-//{
-// z = z + w[n] * rowTile[j*tileSize+n];
-//}
 reduceSumMultiWarp(&z);
 if(TID >= i)
 rowTile[j*tilesize + TID] = rowTile[j*tilesize + TID] - tauMatrix[(kk*tilesize+i)*tauNum + kk] * w * z;
-//for(n = i; n < tileSize; n++)
-//{
-// rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z;
-//}
 }
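Note: in matrix terms, SLARFT applies the reflector stored by SGEQRF to each trailing column A_j of the row tile:

    A_j <- (I - tau * w * w^T) * A_j  =  A_j - tau * w * (w^T * A_j)

One warp lane owns one row of the tile (TID), so z = w^T * A_j is formed with the warp-wide reduceSumMultiWarp and the rank-1 update is a single line per lane. The deleted loops are the scalar version of the same update, redundant once the reduction became warp-parallel.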
@@ -393,16 +371,14 @@ void __device__ SLARFT(volatile float* cornerTile, volatile float* rowTile, int
 */
 __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, int tilesize, int ii, int kk, volatile float* tauMatrix, int tauNum )
 {
-int i, j, n;
+int i, j;
 float norm=0.0, sign, u1, tau, z;
 float w, wupper;
 int TID = threadIdx.x % 32;
-//int set = threadIdx.x / 32;
 /* For each column compute the householder vector. */
 for(i = 0; i < tilesize; i++)
 {
-// norm = 0.0;
 wupper = cornerTile[i*tilesize + i];
@@ -446,33 +422,6 @@ __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, in
 columnTile[i*tilesize+TID] = w;
 if(threadIdx.x == 0)
 tauMatrix[(kk*tilesize+i)*tauNum+ii] = tau;
-/* Apply to each row to the right.*/
-// for(j = i; j < tilesize; j++)
-// {
-/* Find w'*A_j, w is 0s except for first value with upper tile.*/
-// z = 1.0 * cornerTile[ j*tilesize+i ];
-// for(n = 0; n < tilesize; n++)
-// {
-// z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ];
-// }
-/* Apply to upper tile.*/
-// cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z;
-// for(n = i+1; n < tilesize; n++)
-// {
-// cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z;
-// }
-/* Apply to lower tile.*/
-// for(n = 0; n < tilesize; n++)
-// {
-// columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z;
-// }
-// }
-/* Store w*/
-// for(j = 0; j < tilesize; j++){
-// columnTile[ i*tilesize+j ] = w[tilesize+j];
-// }
-// tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau;
 }
 }
@@ -495,7 +444,7 @@ __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, in
 */
 __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,volatile float* rowTile, int tilesize, int ii, int jj, int kk,volatile float* tauMatrix, int tauNum )
 {
-int i, j, n;
+int i, j;
 float z;
 float w;
 int TID = threadIdx.x % 32;
@@ -520,30 +469,6 @@ __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,vo
 cornerTile[j*tilesize+TID] = cornerTile[j*tilesize+TID] - tau*w*z;
 }
 __syncthreads();
-/*for(j = 0; j < i; j++)
-w[j] = 0.0;
-w[i] = 1.0;
-for(j = i+1; j < tilesize; j++)
-w[j] = 0.0;
-for(j = 0; j < tilesize; j++)
-w[j+tilesize] = columnTile[i*tilesize +j];*/
-/* Apply householder vector (w) to the tiles.*/
-/* for(j = 0; j < tilesize; j++)
-{
-z = 0.0;*/
-/* Compute w' * A_j */
-/* for(n = 0; n < tilesize; n++)
-{
-z += w[n] * rowTile[j*tilesize+n];
-z += w[n + tilesize] * cornerTile[j*tilesize+n];
-}
-for(n = 0; n < tilesize; n++)
-{
-rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z;
-cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z;
-}
-}*/
 }
 }
@@ -551,37 +476,25 @@ __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,vo
 __device__ void runner ( int type , void *data ) {
 __shared__ volatile float blockCache[(32*32) + 128];
-volatile float *workVector;
-workVector = blockCache + (32*32);
 /* Decode the task data. */
 int *idata = (int *)data;
 int i = idata[0];
 int j = idata[1];
 int k = idata[2];
-// int z;
-// double buff[ 2*K*K ];
 /* Decode and execute the task. */
 switch ( type ) {
 case task_SGEQRF:
-//if(threadIdx.x == 0)
-// printf("SGEQRF %i %i\n", k, (k*cuda_m+k)*32*32);
 if(threadIdx.x < 32)
 SGEQRF( &GPU_matrix[(k*cuda_m+k)*32*32], 32, GPU_tau, k, cuda_m);
 break;
 case task_SLARFT:
-// if(threadIdx.x == 0)
-// printf("SLARFT %i %i %i\n", k,j, (j*cuda_m+k)*32*32);
 SLARFT( &GPU_matrix[(k*cuda_m +k)*32*32], &GPU_matrix[(j*cuda_m+k)*32*32], 32, j, k, GPU_tau, cuda_m);
 break;
 case task_STSQRF:
-// if(threadIdx.x == 0)
-// printf("STSQRF %i %i %i\n", k,i, (k*cuda_m+i)*32*32);
 if(threadIdx.x < 32)
 STSQRF( &GPU_matrix[(k*cuda_m+k)*32*32], &GPU_matrix[(k*cuda_m + i)*32*32], 32, i, k, GPU_tau, cuda_m);
 break;
 case task_SSSRFT:
-// if(threadIdx.x == 0)
-// printf("SSSRFT %i %i %i %i\n", k,j, i, (j*cuda_m+i)*32*32);
 SSSRFT( &GPU_matrix[(j*cuda_m+i)*32*32],&GPU_matrix[(k*cuda_m + i)*32*32],&GPU_matrix[(j*cuda_m+k)*32*32], 32, i, j, k, GPU_tau, cuda_m );
 break;
 default:
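Note: the runner decodes each task payload as three tile indices (i, j, k) and addresses the matrix at tile granularity: every 32x32 tile is a contiguous block of 32*32 = 1024 floats, and tiles are ordered column-major over a grid with cuda_m tile rows. A helper naming the convention (a sketch; the file indexes GPU_matrix directly and tile_at is my name):

    /* Tile (row, col) of a tiled matrix with m tile rows: each 32x32 tile
       is a contiguous 1024-float block, tiles ordered column-major. */
    __device__ static float *tile_at(float *matrix, int row, int col, int m) {
        return &matrix[(col * m + row) * 32 * 32];
    }

With that reading, the SLARFT case above takes the diagonal tile tile_at(GPU_matrix, k, k, cuda_m) and updates tile_at(GPU_matrix, k, j, cuda_m).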
@@ -630,8 +543,6 @@ float* columnToTile( float* columnMatrix, int size , int m , int n )
 cudaMallocHost(&TileMatrix, sizeof(float) * size );
 if(TileMatrix == NULL)
 error("failed to allocate TileMatrix");
-int rows = m*32;
-int columns = n*32;
 int i,j,k,l;
 for( i = 0; i < n ; i++ )
@@ -691,8 +602,6 @@ float* createIdentity(int m, int n)
 cudaMallocHost(&Matrix, sizeof(float) * m*n*32*32);
 if(Matrix == NULL)
 error("Failed to allocate Matrix");
-int rows = m*32;
-int columns = n*32;
 int i, j;
 memset ( Matrix, 0, sizeof(float)*m*n*32*32 );
@@ -729,24 +638,6 @@ void printMatrix(float* Matrix, int m, int n)
 }
-/*void printTileMatrix(float* Matrix, int m, int n)
-{
-int i,j;
-float *tempMatrix, *tempMatrix2;
-for(i = 0; i < m*32; i++)
-{
-int tiled = i/32;
-tempMatrix = &Matrix[tiled*32*32];
-for(j = 0; j < n*32; j++)
-{
-int tile = j/32;
-tempMatrix2 = &tempMatrix[tile*32*32*m + i%32];
-printf(" %.1f ", tempMatrix2[j*32]);
-//printf(" %.1f,%i ", tempMatrix2[j*32], &tempMatrix2[j*32]-Matrix);
-}
-printf("\n");
-}
-}*/
 qsched_funtype func;
@@ -759,7 +650,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 qsched_task_t *tid, tid_new;
 qsched_res_t *rid;
 int data[3];
-ticks tic, toc_run, tot_setup, tot_run = 0;
+// ticks tic, tot_run = 0;
 #ifdef NO_LOADS
 ticks tic_loads;
 #endif
@@ -776,22 +667,15 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 /* Allocate and fill the original matrix. */
 if(cudaMallocHost(&A, sizeof(float) * m * n * K * K ) != cudaSuccess ||
-cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess /*||
-cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K ) != cudaSuccess*/ )
+cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess )
 error("Failed to allocate matrices.");
 g_size = g_size + sizeof(float) * m * n * K * K + sizeof(float) * m * n * K;
-//cudaFreeHost(A_orig);
-// for ( k = 0 ; k < m * n * K * K ; k++ )
-// A_orig[k] = 2.0f*((float)rand()) / RAND_MAX - 1.0f;
 A_orig = generateColumnMatrix(m*n*K*K, 35532);
-// printMatrix(A_orig, m, n);
 float *temp = columnToTile(A_orig, m * n * K *K, m , n);
 cudaFreeHost(A_orig);
 A_orig = temp;
 temp = tileToColumn(A_orig, m*n*K*K, m , n, K);
-// printMatrix(temp, m, n);
-// printTileMatrix(A_orig, m, n);
 memcpy( A , A_orig , sizeof(float) * m * n * K * K );
 bzero( tau , sizeof(float) * m * n * K );
@@ -814,7 +698,6 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 error("Failed to copy matrix pointer to the device");
 /* Allocate and init the task ID and resource ID matrix. */
-tic = getticks();
 /* Allocate and init the task ID and resource ID matrix. */
 if( cudaMallocHost(&tid , sizeof(qsched_task_t) * m*n ) != cudaSuccess )
 error("Failed to allocate tid");
@@ -860,7 +743,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 data[0] = i; data[1] = k; data[2] = k;
 tid_new = qsched_addtask( &s , task_STSQRF , task_flag_none , data , sizeof(int)*3 , 3 );
 qsched_addlock(&s, tid_new, rid[k * m + i]);
-qsched_adduse(&s, tid_new, rid[k * m + k]);
+qsched_addlock(&s, tid_new, rid[k * m + k]);
 qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
 if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
 tid[k * m + i] = tid_new;
@@ -871,7 +754,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 tid_new = qsched_addtask( &s , task_SSSRFT , task_flag_none , data , sizeof(int)*3 , 5 );
 qsched_addlock(&s, tid_new, rid[j * m + i]);
 qsched_adduse(&s, tid_new, rid[k * m + i]);
-qsched_adduse(&s, tid_new, rid[j * m + k]);
+qsched_addlock(&s, tid_new, rid[j * m + k]);
 qsched_addunlock(&s, tid[k * m + i], tid_new);
 qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
 if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
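Note: the two adduse -> addlock switches here and in the STSQRF hunk above are the one semantic change in this file. In QuickSched, qsched_adduse registers shared (concurrent-reader style) access to a resource, while qsched_addlock claims it exclusively. STSQRF rewrites the diagonal corner tile rid[k * m + k] in place, and SSSRFT updates the row tile rid[j * m + k] (visible in the scalar rowTile update deleted from the kernel above), so registering those tiles as mere uses would have allowed two tasks to write the same tile concurrently; taking locks serialises the writers.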
@@ -905,15 +788,12 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
-// printMatrix(tileToColumn(A, m*n*K*K, m, n, K), m, n);
-/* float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
 float *Q = computeQ(tempMatrix, m*K, K, tau, m);
 float *R = getR(tempMatrix, m*K);
 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
 m*K, R, m*K, 0.0, tempMatrix, m*K);
 free(Q);
-// printMatrix(tempMatrix, m, n);
-// printf("\n\n\n\n");
 Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
 for(i = 0; i < m * n * K * K; i++)
 {
@@ -922,15 +802,16 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i],
 tempMatrix[i]);
 }
+printf("Checked for correctness\n");
 free(tempMatrix);
 free(Q);
-free(R);*/
+free(R);
 // cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K, cudaMemcpyHostToDevice);
 // A = tileToColumn(A,m * n * K * K, m, n, K);
 // printMatrix(A, m, n);
 // printTileMatrix(A, m , n);
-struct task* tasks = qsched_get_timers( &s , s.count );
+/* struct task* tasks = qsched_get_timers( &s , s.count );
 for(i = 0; i < s.count; i++)
 {
 printf("%i %lli %lli %i ", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
@@ -947,7 +828,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 printf("0 0");
 printf("\n");
 }
-free(tasks);
+free(tasks);*/
 cudaDeviceReset();
 }
@@ -956,7 +837,6 @@ void test()
 {
 int m = 2, n = 2;
 float* Matrix = createIdentity(m,n);
-//printMatrix(Matrix, m, n);
 float* MatrixTile = columnToTile(Matrix, m*n*32*32, m, n);
 printMatrix(&MatrixTile[0], 1, 1);
 printf("\n \n \n");
@@ -966,7 +846,6 @@ void test()
 printf("\n \n \n");
 printMatrix(&MatrixTile[3*32*32], 1, 1);
 printf("\n \n \n");
-//printTileMatrix(MatrixTile, m, n);
 free(Matrix);
 Matrix = tileToColumn(MatrixTile, m*n*32*32, m , n, 32);
 printMatrix(Matrix, m, n);
@@ -988,11 +867,9 @@ __global__ void SGEQRF_test(float* cornerTile)
 for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
 {
-// printf("Copying value %i\n", i);
 tile[i] = cornerTile[i];
 }
-// printf("blockDim.x = %i\n", blockDim.x);
 __syncthreads();
 if(threadIdx.x < 32)
 SGEQRF(tile, 32, tau, 0, 1);
@@ -1029,7 +906,6 @@ __global__ void SLARFT_test(float* cornerTile, float* rowTile)
 SGEQRF(tile, 32, tau, 0, 2);
 __syncthreads();
-// if(threadIdx.x < 32)
 SLARFT(tile, tile2, 32, 1, 0, tau, 2);
 __syncthreads();
 for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
@@ -1165,12 +1041,6 @@ int main ( int argc , char *argv[] ) {
 int c, nr_threads=128;
 int M = 4, N = 4, runs = 1, K = 32;
-/* Get the number of threads. */
-//#pragma omp parallel shared(nr_threads)
-// {
-// if ( omp_get_thread_num() == 0 )
-// nr_threads = omp_get_num_threads();
-// }
 /* Parse the options */
 while ( ( c = getopt( argc , argv , "m:n:k:r:t:D" ) ) != -1 )
@@ -1194,7 +1064,6 @@ int main ( int argc , char *argv[] ) {
 case 't':
 if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
 error( "Error parsing number of threads." );
-//omp_set_num_threads( nr_threads );
 break;
 case 'D':
 runTests();
@@ -1206,9 +1075,6 @@ int main ( int argc , char *argv[] ) {
 exit( EXIT_FAILURE );
 }
-/* Dump arguments. */
-// message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
-// 32*M , 32*N , nr_threads , runs );
 test_qr( M , N , K , nr_threads , runs );
...
 #!/bin/bash
 FLAGS2="-Xcompiler=-fsanitize=address -Xcompiler=-fno-omit-frame-pointer"
 DEBUG_FLAGS="-G -DDEBUG_GPU"
-FLAGS="-O3 -g -G -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
+FLAGS="-O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
 # -DGPU_locks -Xptxas -dlcm=cg -Xptxas="-v""
 # -DNO_LOADS
@@ -36,3 +36,7 @@ cd ../examples
 /home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh_2.cu -lprofiler
 /home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_2.o ../src/.libs/libquicksched_cuda.a -o test_bh_2 -lprofiler
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh_3.cu -lprofiler
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_3.o ../src/.libs/libquicksched_cuda.a -o test_bh_3 -lprofiler
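Note: the only change to the release FLAGS is dropping -G, nvcc's device-debug switch, which disables most device-code optimisation; -lineinfo is kept so profilers can still map SASS back to source lines, and DEBUG_FLAGS above still carries -G for debug builds. The added lines compile and link the new test_bh_3 example the same way as test_bh_2.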
@@ -163,8 +163,9 @@ struct qsched {
 /* Function prototypes. */
 /* Internal functions. */
+void qsched_quicksort ( int *data , int *ind , int N , int min , int max );
 void qsched_sort ( int *data , int *ind , int N , int min , int max );
-void qsched_sort_rec ( int *data , int *ind , int N , int min , int max );
+//void qsched_sort_rec ( int *data , int *ind , int N , int min , int max );
 struct task *qsched_gettask ( struct qsched *s , int qid );
 void qsched_done ( struct qsched *s , struct task *t );
 void *qsched_getdata( struct qsched *s , struct task *t );
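Note: the prototype swap suggests the recursive helper was replaced by (or renamed to) an explicit quicksort over the parallel data/ind arrays. Purely as a hypothetical sketch of what such a signature implies, sorting ind alongside its integer keys in data (this is not claimed to be the library's actual implementation):

    void pair_quicksort(int *data, int *ind, int lo, int hi) {
        if (lo >= hi) return;
        int pivot = data[(lo + hi) / 2], i = lo, j = hi;
        while (i <= j) {                      /* Hoare-style partition */
            while (data[i] < pivot) i++;
            while (data[j] > pivot) j--;
            if (i <= j) {
                int t = data[i]; data[i] = data[j]; data[j] = t;
                t = ind[i]; ind[i] = ind[j]; ind[j] = t;
                i++; j--;
            }
        }
        pair_quicksort(data, ind, lo, j);
        pair_quicksort(data, ind, i, hi);
    }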
...