Commit f0ab955c authored by aidan

Code cleanup

parent 320d4466
@@ -40,6 +40,7 @@ extern "C"{
}
#include "cuda_queue.h"
/** Task types. */
enum task_type {
task_type_self = 0,
@@ -59,12 +60,12 @@ unsigned short int split, sorted;
int parts, firstchild, sibling;
int res, resz, resm, com_tid;
};//__attribute__((aligned(64)));
};
#define const_G 1
/* Required variables to obtain cells. */
#define cell_maxparts 128
#define cell_maxparts 256
#define CELL_STRETCH 2
#define INITIAL_CELLS 256
struct cell *cell_pool = NULL;
@@ -101,20 +102,7 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
int count_i = ci->count, count_j = cj->count;
int parts_i = ci->parts, parts_j = cj->parts;
double xi[3];
float dx[3], ai[3], mi, mj, r2, w, ir;
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
__shared__ float4 parts_am[cell_maxparts];
/*if(threadIdx.x == 0)
printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split,
cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
/* Load particles of cell j into shared memory */
/*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
}*/
float dx[3], ai[3], mj, r2, w, ir;
/* Loop over cell i.*/
for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
@@ -124,7 +112,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
for(k = 0; k < 3; k++) {
ai[k] = 0.0f;
}
mi = parts_a_m[i].w;
for(j = parts_j; j < parts_j + count_j; j++) {
r2 = 0.0f;
@@ -136,16 +123,12 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
r2 += dx[2] * dx[2];
// ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_a_m[j].w;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
// atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
// atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
// atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
}
atomicAdd(&parts_a_m[i].x, ai[0]);
atomicAdd(&parts_a_m[i].y, ai[1]);
@@ -154,11 +137,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
/* Load particles of cell i into shared memory */
/*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
}*/
/*Loop over cell j. */
for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
xi[0] = parts_pos_xy[i].x;
@@ -167,7 +145,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
for(k = 0; k < 3; k++) {
ai[k] = 0.0f;
}
mi = parts_a_m[i].w;
for(j = parts_i; j < parts_i + count_i; j++) {
r2 = 0.0f;
@@ -196,29 +173,15 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
int i, k;
int i;
double2 j_com_xy;
double j_com_z;
float j_com_mass;
int count = leaf->count;
int parts = leaf->parts;
int cell_j = cj - cells;
int temp;
float r2, dx[3], ir, w;
// if(cell_j < 0)
// {
// if(threadIdx.x == 0)
// printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
// __syncthreads();
// asm("trap;");
// }
// if(threadIdx.x == 0)
// printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
temp = cell_j;
/* Init the com's data.*/
j_com_xy = com_xy[cell_j];
@@ -236,18 +199,10 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
ir = rsqrtf(r2);
w = j_com_mass * const_G * ir * ir * ir;
/* __threadfence();
if(!isfinite(w * dx[0])){
printf("Error in make_interact_pc, j_com_mass = %f, cell_j = %i, temp = %i, i = %i, threadIdx.x=%i\n", j_com_mass, cell_j, temp, i, threadIdx.x); asm("trap;");}
if(!isfinite(w * dx[1])){
printf("Error in make_interact_pc\n"); asm("trap;");}
if(!isfinite(w * dx[2])){
printf("Error in make_interact_pc\n"); asm("trap;");}*/
atomicAdd( &parts_a_m[i].x , w * dx[0]);
atomicAdd( &parts_a_m[i].y , w * dx[1]);
atomicAdd( &parts_a_m[i].z , w * dx[2]);
}
//__syncthreads();
}
/**
@@ -255,7 +210,6 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
*/
__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) {
int k;
float dx[3];
double cih = ci->h, cjh = cj->h;
@@ -274,7 +228,7 @@ __device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, st
}
__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) {
int k;
float dx[3];
float min_dist = ci->h;
float center_i = ci->loc_xy.x;
@@ -296,13 +250,6 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
struct cell *cp ,*cps;
int leafnum = leaf - cells;
//if(threadIdx.x == 0 && leafnum == 23)
// printf("cj = %i\n", cj - cells);
// printf("%i\n", leafnum);
// if(threadIdx.x == 0)
/// printf("ci = %i, cj = %i, leaf = %i\n", ci - cells, cj - cells, leaf - cells);
for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
@@ -318,16 +265,12 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
}
} else {
make_interact_pc(leaf, cps);
// if(threadIdx.x == 0 && leafnum == 23)
// printf("leafnum = %i with cps = %i here\n", leafnum, cps - cells);
__syncthreads();
}
}
}else{
for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
// if(threadIdx.x == 0 && leafnum == 23)
// printf("leafnum = %i with cps = %i\n", leafnum, cps - cells);
make_interact_pc(leaf, cps);
}
@@ -347,17 +290,6 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
struct cell *cp, *cps;
/*if(leaf->split)
{
printf("Leaf split = 1, oh dear.");
asm("trap;");
}
if(c->split > 1)
{
printf("Cell had split > 1\n");
asm("trap;");
}*/
/* Find the subcell of c the leaf is in.*/
/*cp = c;
@@ -402,31 +334,13 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
__device__ void iact_self_direct(int cellID) {
struct cell *c = &cells[cellID];
double xi[3] = {0.0, 0.0, 0.0};
float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
__shared__ float4 parts_am[cell_maxparts];
float ai[3] = {0.0, 0.0, 0.0 }, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
int parts;
int count;
int i,j,k;
//if(threadIdx.x == 0)
// printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
//If cell is split, interact each child with itself, and with each of its siblings.
/*if(c->split) {
//TODO
} else {*/
parts = c->parts;
count = c->count;
int z = threadIdx.x;
/* Load particle data into shared memory*/
/*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
parts_xy[z] = parts_pos_xy[k];
parts_z[z] = parts_pos_z[k];
parts_am[z] = parts_a_m[k];
}
__syncthreads();*/
for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
{
xi[0] = parts_pos_xy[i].x;
@@ -435,9 +349,7 @@ __device__ void iact_self_direct(int cellID) {
for(k = 0; k < 3; k++) {
ai[k] = 0.0;
}
mi = parts_a_m[i].w;
//for(j = i+1; j!= i; j = (j+1)%count)
for(j = parts; j < parts+count; j++)
{
if(i != j){
@@ -453,7 +365,6 @@ __device__ void iact_self_direct(int cellID) {
/* Apply the gravitational acceleration. */
//ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_a_m[j].w;
@@ -479,7 +390,7 @@ __device__ void iact_self_direct(int cellID) {
*/
static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
int k;
// int k;
float dx[3];
#ifdef SANITY_CHECKS
@@ -508,7 +419,7 @@ static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
struct cell *cell_get()
{
struct cell *res;
// struct cell *res;
if(num_cells == 0)
{
@@ -645,9 +556,7 @@ void cell_split(int c, struct qsched *s) {
int left[8], right[8];
double pivot[3];
static int root = -1;
// struct cell *progenitors[8];
int progenitors[8];
int c1 = c;
double2 *temp_xy;
double *temp_z;
float4 *temp_a_m;
@@ -662,7 +571,6 @@ void cell_split(int c, struct qsched *s) {
{
if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[c].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
// printf("tempxy = %p\n", temp_xy);
cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
sizeof(double2) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts);
}
@@ -680,7 +588,6 @@ void cell_split(int c, struct qsched *s) {
cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m,
sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts);
}
// error("Cell has no resource");*///TODO
if(count > cell_maxparts )
{
@@ -836,13 +743,9 @@ void cell_split(int c, struct qsched *s) {
/* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */
} else {
// struct cell *data[2] = {root, c};
int data[2] = {root, c};
int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
2 * sizeof(int), 3000);
/*qsched_adduse(s, tid, cell_pool[root].res);
qsched_adduse(s, tid, cell_pool[root].resz);
qsched_adduse(s, tid, cell_pool[root].resm);*/
qsched_addlock(s, tid, cell_pool[c].res);
qsched_addlock(s, tid, cell_pool[c].resz);
qsched_addlock(s, tid, cell_pool[c].resm);
@@ -864,8 +767,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
qsched_task_t tid;
int data[2];
struct cell /**data[2],*/ *cp, *cps;
int cpi;
struct cell *cp, *cps;
if(cj == NULL)
@@ -967,11 +869,8 @@ qsched_funtype func;
void test_bh(int N, int runs, char *fileName) {
int i, k;
struct cell *root;
struct part *parts;
FILE *file;
struct qsched s;
ticks tic, toc_run, tot_setup = 0, tot_run = 0;
int countMultipoles = 0, countPairs = 0, countCoMs = 0;
struct cell *gpu_ptr_cells;
cudaFree(0);
@@ -1142,23 +1041,6 @@ float *comm_temp;
if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
/* for(i = 0; i < s.count; i++){
int *idata = (int*)&s.data[s.tasks[i].data];
if(s.tasks[i].type == task_type_self)
printf("Self task with data[0] = %i\n", idata[0]);
if(s.tasks[i].type == task_type_pair) {
printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
}
if(s.tasks[i].type == task_type_self_pc) {
printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
}
}*/
// printf("com_mass_host[152] = %f\n", com_mass_host[152]);
//Run code.
// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
qsched_run_CUDA( &s , func );
qsched_print_cuda_timers(&s);
@@ -1169,16 +1051,12 @@ struct task* tasks = qsched_get_timers( &s , s.count );
for(i = 0; i < s.count; i++)
{
printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
// printf("\n");
}
}
/* Dump the particles to a file */
file = fopen("particle_dump.dat", "w");
/* fprintf(file,
"# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
"a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
for (k = 0; k < N; ++k)
fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n",
parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
@@ -1228,9 +1106,6 @@ int main(int argc, char *argv[]) {
/* Tree node information */
printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
/* Part information */
// printf("Size of part: %zu bytes.\n", sizeof(struct part));
/* Dump arguments. */
if (fileName[0] == 0) {
message("Computing the N-body problem over %i random particles using %i "
This diff is collapsed.
@@ -67,7 +67,6 @@ __device__ void quarter(float *data)
__global__ void Manual(float *src)
{
int i;
int datas = blockIdx.x;
cuda_data[datas*1000+threadIdx.x] = src[datas*1000+threadIdx.x];
@@ -109,13 +108,13 @@ __global__ void Setup()
int main ( int argc , char *argv[] ) {
float *array, *cuda_array, *cuda_array2, *device_array;
float *array, *cuda_array, *device_array;
int i, k=0;
qsched_funtype func;
struct qsched s;
qsched_task_t *tid;
qsched_res_t *rid;
ticks tic, toc_run, tot_setup, tot_run = 0;
ticks tic, toc_run, tot_run = 0;
qsched_init( &s , 1 , qsched_flag_none );
cudaDeviceReset();
cudaSetDevice(0);
@@ -134,11 +133,6 @@ int main ( int argc , char *argv[] ) {
array[i] = (float)i;
}
// if(cudaHostGetDevicePointer(&cuda_array , &array[k*1000] , 0) != cudaSuccess)
//error("Failed to get device pointer for data: %s", cudaGetErrorString(cudaPeekAtLastError()));
// if ( cudaMallocHost(&array, M*sizeof(float)) != cudaSuccess )
// error("Failed to allocate array");
/* Initialize the scheduler. */
@@ -193,16 +187,9 @@ printf("Starting second run\n");
printf("Second run complete\n");
struct task *completed_tasks = qsched_get_timers( &s, s.count );
/* tic = getticks();
qsched_run_CUDA( &s , func );
toc_run = getticks();
message( "qsched_run_CUDA took %lli ticks..." , toc_run - tic );
tot_run += toc_run - tic;*/
// if( cudaFreeHost( array) != cudaSuccess)
// error("Failed to free array");
cudaDeviceReset();
@@ -215,8 +202,6 @@ cudaDeviceReset();
error("Failed to get device pointer for data: %s",cudaGetErrorString(cudaPeekAtLastError()));
if( cudaMalloc(&device_array , M*sizeof(float) ) != cudaSuccess )
error("Failed to allocate device array: %s", cudaGetErrorString(cudaPeekAtLastError()));
/*if( cudaMemcpy( &device_array , array , M*sizeof(float), cudaMemcpyHostToDevice ) != cudaSuccess )
error("Failed to copy device array: %s", cudaGetErrorString(cudaPeekAtLastError()));*/
if( cudaMemcpyToSymbol( cuda_data , &device_array,sizeof(float*), 0 , cudaMemcpyHostToDevice) != cudaSuccess )
error("Failed to copy array pointer to device: %s", cudaGetErrorString(cudaPeekAtLastError()));
tic = getticks();
@@ -229,6 +214,4 @@ cudaDeviceReset();
if(array[i] != ((float)i)*((float)i)*0.5f)
printf("%i wrong, %.3f != %.3f\n", i, array[i], ((float)i)*((float)i)*0.5f );
//printf("%.3f\n", array[2]);
}
@@ -22,11 +22,8 @@ extern "C"{
int g_size;
enum task_types { task_SGEQRF , task_SLARFT , task_STSQRF , task_SSSRFT} ;
//#define TID threadIdx.x
#define numthreads 128
//#define tilesize 32
#define cuda_maxtasks 1000000
//#define CO(x,y,ldm) (((y)*ldm) + (x))
#define WARPS 4
__device__ float *GPU_matrix;
@@ -145,7 +142,7 @@ float* generateColumnMatrix(int size, unsigned long int m_z)
{
float* matrix;
cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size/**size*/);
cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size);
if(code != cudaSuccess)
printf("%s size = %i g_size = %i\n", cudaGetErrorString(code),size, g_size);
else
@@ -211,8 +208,6 @@ __device__ inline void reduceSumMultiWarp( float* value )
}
__threadfence();
*value = stuff[group * 32];
//printf("Not supported.\n");
//asm("trap;");
#endif
}
@@ -236,7 +231,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
int i, j;
float norm=0.0, sign, u1, tau, z;
int TID = threadIdx.x % 32;
//int set = threadIdx.x / 32;
float w;
/*Find the householder vector for each row. */
for(i = 0; i < tilesize; i++)
@@ -253,7 +247,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
else
sign = 1;
// sign = __shfl(sign, i, 32);
norm = sqrt(norm);
if(TID >= i )
w = cornerTile[ i*tilesize + TID ];
@@ -262,13 +255,8 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
// if(TID==i)
u1 = cornerTile[i*tilesize + i] - sign*norm;
// __syncthreads();
// u1 = __shfl(u1, i, 32);
// if(i==1)
// printf("%.3f\n", u1);
if(u1 != 0.0)
{
if(TID > i)
@@ -285,8 +273,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
else
tau = 0.0;
//if(TID == 0)
// printf("%.3f\n", sign);
/*Store the below diagonal vector */
if(TID > i)
@@ -358,17 +344,9 @@ void __device__ SLARFT(volatile float* cornerTile, volatile float* rowTile, int
/* Below Diagonal!*/
/*Compute w'*A_j*/
z = w * rowTile[j*tilesize+TID];
//for(n = i; n < tileSize; n++)
//{
// z = z + w[n] * rowTile[j*tileSize+n];
//}
reduceSumMultiWarp(&z);
if(TID >= i)
rowTile[j*tilesize + TID] = rowTile[j*tilesize + TID] - tauMatrix[(kk*tilesize+i)*tauNum + kk] * w * z;
//for(n = i; n < tileSize; n++)
//{
// rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z;
//}
}
......@@ -393,16 +371,14 @@ void __device__ SLARFT(volatile float* cornerTile, volatile float* rowTile, int
*/
__device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, int tilesize, int ii, int kk, volatile float* tauMatrix, int tauNum )
{
int i, j, n;
int i, j;
float norm=0.0, sign, u1, tau, z;
float w, wupper;
int TID = threadIdx.x % 32;
//int set = threadIdx.x / 32;
/* For each column compute the householder vector. */
for(i = 0; i < tilesize; i++)
{
// norm = 0.0;
wupper = cornerTile[i*tilesize + i];
@@ -446,33 +422,6 @@ __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, in
columnTile[i*tilesize+TID] = w;
if(threadIdx.x == 0)
tauMatrix[(kk*tilesize+i)*tauNum+ii] = tau;
/* Apply to each row to the right.*/
// for(j = i; j < tilesize; j++)
// {
/* Find w'*A_j, w is 0s except for first value with upper tile.*/
// z = 1.0 * cornerTile[ j*tilesize+i ];
// for(n = 0; n < tilesize; n++)
// {
// z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ];
// }
/* Apply to upper tile.*/
// cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z;
// for(n = i+1; n < tilesize; n++)
// {
// cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z;
// }
/* Apply to lower tile.*/
// for(n = 0; n < tilesize; n++)
// {
// columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z;
// }
// }
/* Store w*/
// for(j = 0; j < tilesize; j++){
// columnTile[ i*tilesize+j ] = w[tilesize+j];
// }
// tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau;
}
}
@@ -495,7 +444,7 @@ __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, in
*/
__device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,volatile float* rowTile, int tilesize, int ii, int jj, int kk,volatile float* tauMatrix, int tauNum )
{
int i, j, n;
int i, j;
float z;
float w;
int TID = threadIdx.x % 32;
@@ -520,30 +469,6 @@ __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,vo
cornerTile[j*tilesize+TID] = cornerTile[j*tilesize+TID] - tau*w*z;
}
__syncthreads();
/*for(j = 0; j < i; j++)
w[j] = 0.0;
w[i] = 1.0;
for(j = i+1; j < tilesize; j++)
w[j] = 0.0;
for(j = 0; j < tilesize; j++)
w[j+tilesize] = columnTile[i*tilesize +j];*/
/* Apply householder vector (w) to the tiles.*/
/* for(j = 0; j < tilesize; j++)
{
z = 0.0;*/
/* Compute w' * A_j */
/* for(n = 0; n < tilesize; n++)
{
z += w[n] * rowTile[j*tilesize+n];
z += w[n + tilesize] * cornerTile[j*tilesize+n];
}
for(n = 0; n < tilesize; n++)
{
rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z;
cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z;
}
}*/
}
}
@@ -551,37 +476,25 @@ __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,vo
__device__ void runner ( int type , void *data ) {
__shared__ volatile float blockCache[(32*32) + 128];
volatile float *workVector;
workVector = blockCache + (32*32);
/* Decode the task data. */
int *idata = (int *)data;
int i = idata[0];
int j = idata[1];
int k = idata[2];
// int z;
// double buff[ 2*K*K ];
/* Decode and execute the task. */
switch ( type ) {
case task_SGEQRF:
//if(threadIdx.x == 0)
// printf("SGEQRF %i %i\n", k, (k*cuda_m+k)*32*32);
if(threadIdx.x < 32)
SGEQRF( &GPU_matrix[(k*cuda_m+k)*32*32], 32, GPU_tau, k, cuda_m);
break;
case task_SLARFT:
// if(threadIdx.x == 0)
// printf("SLARFT %i %i %i\n", k,j, (j*cuda_m+k)*32*32);
SLARFT( &GPU_matrix[(k*cuda_m +k)*32*32], &GPU_matrix[(j*cuda_m+k)*32*32], 32, j, k, GPU_tau, cuda_m);
break;
case task_STSQRF:
// if(threadIdx.x == 0)
// printf("STSQRF %i %i %i\n", k,i, (k*cuda_m+i)*32*32);
if(threadIdx.x < 32)
STSQRF( &GPU_matrix[(k*cuda_m+k)*32*32], &GPU_matrix[(k*cuda_m + i)*32*32], 32, i, k, GPU_tau, cuda_m);
break;
case task_SSSRFT:
// if(threadIdx.x == 0)
// printf("SSSRFT %i %i %i %i\n", k,j, i, (j*cuda_m+i)*32*32);
SSSRFT( &GPU_matrix[(j*cuda_m+i)*32*32],&GPU_matrix[(k*cuda_m + i)*32*32],&GPU_matrix[(j*cuda_m+k)*32*32], 32, i, j, k, GPU_tau, cuda_m );
break;
default:
@@ -630,8 +543,6 @@ float* columnToTile( float* columnMatrix, int size , int m , int n )
cudaMallocHost(&TileMatrix, sizeof(float) * size );
if(TileMatrix == NULL)
error("failed to allocate TileMatrix");
int rows = m*32;
int columns = n*32;
int i,j,k,l;
for( i = 0; i < n ; i++ )
@@ -691,8 +602,6 @@ float* createIdentity(int m, int n)
cudaMallocHost(&Matrix, sizeof(float) * m*n*32*32);
if(Matrix == NULL)
error("Failed to allocate Matrix");
int rows = m*32;
int columns = n*32;
int i, j;
memset ( Matrix, 0, sizeof(float)*m*n*32*32 );
@@ -729,24 +638,6 @@ void printMatrix(float* Matrix, int m, int n)
}
/*void printTileMatrix(float* Matrix, int m, int n)
{
int i,j;
float *tempMatrix, *tempMatrix2;
for(i = 0; i < m*32; i++)
{
int tiled = i/32;
tempMatrix = &Matrix[tiled*32*32];
for(j = 0; j < n*32; j++)
{
int tile = j/32;
tempMatrix2 = &tempMatrix[tile*32*32*m + i%32];
printf(" %.1f ", tempMatrix2[j*32]);
//printf(" %.1f,%i ", tempMatrix2[j*32], &tempMatrix2[j*32]-Matrix);
}
printf("\n");
}
}*/
qsched_funtype func;
@@ -759,7 +650,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
qsched_task_t *tid, tid_new;
qsched_res_t *rid;
int data[3];
ticks tic, toc_run, tot_setup, tot_run = 0;
// ticks tic, tot_run = 0;
#ifdef NO_LOADS
ticks tic_loads;
#endif
@@ -776,22 +667,15 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
/* Allocate and fill the original matrix. */
if(cudaMallocHost(&A, sizeof(float) * m * n * K * K ) != cudaSuccess ||
cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess /*||
cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K ) != cudaSuccess*/ )
cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess )
error("Failed to allocate matrices.");
g_size = g_size + sizeof(float) * m * n * K * K + sizeof(float) * m * n * K;
//cudaFreeHost(A_orig);
// for ( k = 0 ; k < m * n * K * K ; k++ )
// A_orig[k] = 2.0f*((float)rand()) / RAND_MAX - 1.0f;
A_orig = generateColumnMatrix(m*n*K*K, 35532);
// printMatrix(A_orig, m, n);
float *temp = columnToTile(A_orig, m * n * K *K, m , n);
cudaFreeHost(A_orig);
A_orig = temp;
temp = tileToColumn(A_orig, m*n*K*K, m , n, K);
// printMatrix(temp, m, n);
// printTileMatrix(A_orig, m, n);
memcpy( A , A_orig , sizeof(float) * m * n * K * K );
bzero( tau , sizeof(float) * m * n * K );
@@ -814,7 +698,6 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
error("Failed to copy matrix pointer to the device");
/* Allocate and init the task ID and resource ID matrix. */
tic = getticks();
/* Allocate and init the task ID and resource ID matrix. */
if( cudaMallocHost(&tid , sizeof(qsched_task_t) * m*n ) != cudaSuccess )
error("Failed to allocate tid");
@@ -860,7 +743,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
data[0] = i; data[1] = k; data[2] = k;
tid_new = qsched_addtask( &s , task_STSQRF , task_flag_none , data , sizeof(int)*3 , 3 );
qsched_addlock(&s, tid_new, rid[k * m + i]);
qsched_adduse(&s, tid_new, rid[k * m + k]);
qsched_addlock(&s, tid_new, rid[k * m + k]);
qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
tid[k * m + i] = tid_new;
@@ -871,7 +754,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
tid_new = qsched_addtask( &s , task_SSSRFT , task_flag_none , data , sizeof(int)*3 , 5 );
qsched_addlock(&s, tid_new, rid[j * m + i]);
qsched_adduse(&s, tid_new, rid[k * m + i]);
qsched_adduse(&s, tid_new, rid[j * m + k]);
qsched_addlock(&s, tid_new, rid[j * m + k]);
qsched_addunlock(&s, tid[k * m + i], tid_new);
qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
@@ -905,15 +788,12 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
// printMatrix(tileToColumn(A, m*n*K*K, m, n, K), m, n);
/* float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
float *Q = computeQ(tempMatrix, m*K, K, tau, m);
float *R = getR(tempMatrix, m*K);
cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
m*K, R, m*K, 0.0, tempMatrix, m*K);
free(Q);
// printMatrix(tempMatrix, m, n);
// printf("\n\n\n\n");
Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
for(i = 0; i < m * n * K * K; i++)
{
@@ -922,15 +802,16 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i],
tempMatrix[i]);
}
printf("Checked for correctness\n");
free(tempMatrix);
free(Q);
free(R);*/
free(R);
// cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K, cudaMemcpyHostToDevice);
// A = tileToColumn(A,m * n * K * K, m, n, K);
// printMatrix(A, m, n);
// printTileMatrix(A, m , n);
struct task* tasks = qsched_get_timers( &s , s.count );
/* struct task* tasks = qsched_get_timers( &s , s.count );
for(i = 0; i < s.count; i++)
{
printf("%i %lli %lli %i ", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
@@ -947,7 +828,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
printf("0 0");
printf("\n");
}
free(tasks);
free(tasks);*/
cudaDeviceReset();
}
@@ -956,7 +837,6 @@ void test()
{
int m = 2, n = 2;
float* Matrix = createIdentity(m,n);
//printMatrix(Matrix, m, n);
float* MatrixTile = columnToTile(Matrix, m*n*32*32, m, n);
printMatrix(&MatrixTile[0], 1, 1);
printf("\n \n \n");
@@ -966,7 +846,6 @@ void test()
printf("\n \n \n");
printMatrix(&MatrixTile[3*32*32], 1, 1);
printf("\n \n \n");
//printTileMatrix(MatrixTile, m, n);
free(Matrix);
Matrix = tileToColumn(MatrixTile, m*n*32*32, m , n, 32);
printMatrix(Matrix, m, n);
@@ -988,11 +867,9 @@ __global__ void SGEQRF_test(float* cornerTile)
for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
{
// printf("Copying value %i\n", i);
tile[i] = cornerTile[i];
}
// printf("blockDim.x = %i\n", blockDim.x);
__syncthreads();
if(threadIdx.x < 32)
SGEQRF(tile, 32, tau, 0, 1);
@@ -1029,7 +906,6 @@ __global__ void SLARFT_test(float* cornerTile, float* rowTile)
SGEQRF(tile, 32, tau, 0, 2);
__syncthreads();
// if(threadIdx.x < 32)
SLARFT(tile, tile2, 32, 1, 0, tau, 2);
__syncthreads();
for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
@@ -1165,12 +1041,6 @@ int main ( int argc , char *argv[] ) {
int c, nr_threads=128;
int M = 4, N = 4, runs = 1, K = 32;
/* Get the number of threads. */
//#pragma omp parallel shared(nr_threads)
// {
// if ( omp_get_thread_num() == 0 )
// nr_threads = omp_get_num_threads();
// }
/* Parse the options */
while ( ( c = getopt( argc , argv , "m:n:k:r:t:D" ) ) != -1 )
@@ -1194,7 +1064,6 @@ int main ( int argc , char *argv[] ) {
case 't':
if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
error( "Error parsing number of threads." );
//omp_set_num_threads( nr_threads );
break;
case 'D':
runTests();
@@ -1206,9 +1075,6 @@ int main ( int argc , char *argv[] ) {
exit( EXIT_FAILURE );
}
/* Dump arguments. */
// message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
// 32*M , 32*N , nr_threads , runs );
test_qr( M , N , K , nr_threads , runs );
#!/bin/bash
FLAGS2="-Xcompiler=-fsanitize=address -Xcompiler=-fno-omit-frame-pointer"
DEBUG_FLAGS="-G -DDEBUG_GPU"
FLAGS="-O3 -g -G -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
FLAGS="-O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
# -DGPU_locks -Xptxas -dlcm=cg -Xptxas="-v""
# -DNO_LOADS
@@ -36,3 +36,7 @@ cd ../examples
/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh_2.cu -lprofiler
/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_2.o ../src/.libs/libquicksched_cuda.a -o test_bh_2 -lprofiler
/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh_3.cu -lprofiler
/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_3.o ../src/.libs/libquicksched_cuda.a -o test_bh_3 -lprofiler
This diff is collapsed.
@@ -163,8 +163,9 @@ struct qsched {
/* Function prototypes. */
/* Internal functions. */
void qsched_quicksort ( int *data , int *ind , int N , int min , int max );
void qsched_sort ( int *data , int *ind , int N , int min , int max );
void qsched_sort_rec ( int *data , int *ind , int N , int min , int max );
//void qsched_sort_rec ( int *data , int *ind , int N , int min , int max );
struct task *qsched_gettask ( struct qsched *s , int qid );
void qsched_done ( struct qsched *s , struct task *t );
void *qsched_getdata( struct qsched *s , struct task *t );