Commit d2c38d96 authored by Aidan Chalk

Added a file that wasn't uploaded

parent 563c738e
/* Config parameters. */
#include "../config.h"
/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
/* Local includes. */
extern "C"{
#include "quicksched.h"
}
#include "cuda_queue.h"
enum task_types { task_SGEQRF , task_SLARFT , task_STSQRF , task_SSSRFT} ;
#define TID threadIdx.x
#define numthreads 128
#define tilesize 32
#define cuda_maxtasks 1000000
#define CO(x,y,ldm) (((y)*ldm) + (x))
#define WARPS 4
__device__ float *GPU_matrix;
__device__ float *GPU_tau;
__device__ int cuda_m;
__device__ int cuda_n;
/**
 * @brief Performs a reduction sum on a vector, storing the result in v[0].
 * @param sumVector SMEM The array to compute over.
 * @param startN The number of elements in the vector.
 *
 * Computes a binary reduction to find the sum of the elements in the vector,
 * storing the result in the first element of the vector.
 * Should only be called with blockDim.x <= 1 warp (32 threads).
 */
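/* Example (startN = 8): v = {1,2,3,4,5,6,7,8} becomes {6,8,10,12,..} after
the n = 4 pass, {16,20,..} after n = 2, and v[0] holds the total 36 after
the final n = 1 pass. */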
__device__ void reduceSum (volatile float* sumVector,
char startN)
{
/* Start with n = startN/2 */
char n = startN >> 1;
while(n > 0)
{
/* Use first n threads to add up to 2n elements. */
if(TID < n) sumVector[TID] = sumVector[TID] + sumVector[TID + n];
/* Divide n by 2 for next iteration. */
n = n >> 1;
}
}
__device__ inline void reduceSumMultiWarp( float* value )
{
#if __CUDA_ARCH__ >= 300
/* Butterfly reduction: after the loop every lane of the warp holds the
full sum, so no separate broadcast is needed. */
#pragma unroll
for(int mask = 16 ; mask > 0 ; mask >>= 1)
*value += __shfl_xor(*value, mask);
#else
/* Pre-Kepler fallback: stage the values in shared memory and reduce each
warp within its own 32-element slice. */
__shared__ volatile float stuff[32*WARPS];
int tid = TID % 32;
int group = TID / 32;
stuff[group*32 + tid] = *value;
char n = 32 >> 1;
while(n > 0)
{
if(tid < n) stuff[group*32 + tid] += stuff[group*32 + tid + n];
n = n >> 1;
}
*value = stuff[group*32];
#endif
}
/* Routines for the tiled QR decomposition.*/
/**
 *
 * @brief Computes the QR decomposition of a tile.
 *
 * @param cornerTile A pointer to the tile for which the decomposition is computed.
 * @param tileSize The number of elements on a row/column of the tile.
 * @param tauMatrix A pointer to the tau Matrix.
 * @param k The value of k for the QR decomposition.
 * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
 * @param w SMEM A tileSize-element workspace for the householder vector.
 *
 * Note: 32 threads max.
 */
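/* Per column i this follows the householder QR step of Golub & Van Loan:
norm = ||A(i:,i)||, sign = -sign(A(i,i)), u1 = A(i,i) - sign*norm,
v = A(i:,i)/u1 with v(i) = 1, tau = -sign*u1/norm, and each trailing
column is updated as A_j = A_j - tau*v*(v'*A_j). */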
__device__ void SGEQRF(float* __restrict__ cornerTile, int tileSize, float* __restrict__ tauMatrix, int k, int tauNum, float* w)
{
int i, j;
float norm=0.0, sign, u1, tau, z;
/* TID (threadIdx.x) must be < 32: this routine runs on a single warp. */
/* Find the householder vector for each column. */
for(i = 0; i < tileSize; i++)
{
/* Sum the squares of the column elements on and below the diagonal. */
if(TID >= i)
w[ TID ] = cornerTile[ i*tileSize + TID ] * cornerTile[ i*tileSize + TID ];
else
w[ TID ] = 0.0;
reduceSumMultiWarp( &w[TID] );
if(cornerTile[i*tileSize + i] >= 0.0)
sign = -1;
else
sign = 1;
norm = sqrt(w[0]);
u1 = cornerTile[i*tileSize + i] - sign*norm;
/* Reload the column below the diagonal and scale by u1 to form the
householder vector. */
if(TID > i)
w[ TID ] = (u1 != 0.0) ? cornerTile[ i*tileSize + TID ] / u1 : 0.0;
else
w[ TID ] = 0.0;
if(norm != 0.0)
tau = -sign*u1/norm;
else
tau = 0.0;
/*Store the below diagonal vector */
if(TID > i)
cornerTile[ i*tileSize + TID ] = w[TID];
if(TID == 0)
cornerTile[ i*tileSize + i ] = sign*norm;
if(TID == i)
w[i] = 1.0;
/* Apply the householder transformation to the rest of the tile, for everything to the right of the diagonal. */
for(j = i+1; j < tileSize; j++)
{
/*Compute w'*A_j*/
if(TID >= i)
z = cornerTile[j*tileSize + TID] * w[TID];
else
z = 0;
reduceSumMultiWarp(&z);
/* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
if(TID >= i)
cornerTile[j*tileSize + TID] = cornerTile[j*tileSize + TID] - tau*w[TID]*z;
}
/* Store tau. We're on k*tileSize+ith row. kth column.*/
if(TID == 0)
tauMatrix[(k*tileSize+i)*tauNum+k] = tau;
}
}
/**
 *
 * @brief Applies the householder factorisation of the corner to the row tile.
 *
 * @param cornerTile A pointer to the tile in which the householder vectors are stored.
 * @param rowTile The tile to which the householder vectors are applied.
 * @param tileSize The number of elements on a row/column of the tile.
 * @param jj The value of j for the QR decomposition.
 * @param kk The value of k for the QR decomposition.
 * @param tauMatrix A pointer to the tau Matrix.
 * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
 * @param w A tileSize-element workspace for the householder vector.
 */
__device__ void SLARFT(float* cornerTile, float* rowTile, int tileSize, int jj, int kk, float* tauMatrix, int tauNum, float* w)
{
int i, j, n;
float z;
/* Serial variant: for each householder vector in the corner tile, apply
A_j = A_j - tau*v*(v'*A_j) to every column of the row tile. */
for(i = 0; i < tileSize; i++)
{
/*Get w for row i */
for(j = i; j < tileSize; j++)
{
w[j] = cornerTile[i*tileSize + j];
}
w[i] = 1.0;
/* Apply to the row Tile */
for(j = 0; j < tileSize; j++)
{
z=0.0;
/* Below Diagonal!*/
/*Compute w'*A_j*/
for(n = i; n < tileSize; n++)
{
z = z + w[n] * rowTile[j*tileSize+n];
}
for(n = i; n < tileSize; n++)
{
rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z;
}
}
}
}
/**
 * @brief Computes a householder vector for a single tile QR decomposition, storing the result in a shared array.
 * @param matelem The TIDth element of the column to be used for calculation.
 * @param hhVector SMEM The array used to calculate and store the vector.
 * @param k The timestep at which this is called, i.e. where the diagonal falls.
 *
 * Calculates a householder vector for the vector passed as elements into a shared array.
 * Used only by runner_cuda_SGEQRF.
 */
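/* The vector produced is v = x/(x[k] + sign(x[k])*||x||), with v[k] = 1 and
v[0..k-1] = 0, where x is the column gathered from matelem. */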
__device__ void calcHH (float matelem,
volatile float hhVector[],
int k)
{
/* To store the value of 1/(v[k] + sign(v[k])*||v||). */
float localdiv;
/* Stores the sign of v[k] */
int sign;
/* Read vectors into shared memory from below the diagonal. */
if(TID >= k)
hhVector[TID] = matelem;
/* Above the diagonal, set to 0. */
if(TID < k)
hhVector[TID] = 0;
/* Square each element to find ||v|| = sqrt(sum(v[i]^2)) */
hhVector[TID] *= hhVector[TID];
/* Reduce to find sum of squares. */
reduceSum(hhVector, 32);
/* Set localdiv to equal ||v|| */
localdiv = sqrt(hhVector[0]);
/* According to Householder algorithm in Golub & Van Loan. */
if(localdiv != 0.0)
{
/* Load shared to communicate v[k] to all threads. */
hhVector[TID] = matelem;
/* Calculate the sign of v[k] locally. */
sign = hhVector[k] >= 0 ? 1 : -1;
/* Compute and store localdiv = (||v||)*sign(v[k]) */
localdiv *= sign;
/* Compute and store localdiv = v[k] + (||v||*sign(v[k])) */
localdiv += hhVector[k];
/* Finally compute and store localdiv = 1/(v[k] + ||v||*sign(v[k])) */
localdiv = 1.0f/localdiv;
}
/* If norm is 0, do not change v. */
else
localdiv = 1.0f;
/* Compute and store in shared memory v = v/(v[k] + ||v||*sign(v[k])) */
if(TID < k)
hhVector[TID] = 0.0f;
/* v[k]/v[k] = 1 */
if(TID == k)
hhVector[TID] = 1.0f;
if(TID > k)
hhVector[TID] = matelem * localdiv;
}
/**
 * @brief Applies a householder vector to the submatrix starting at (k,k), also inserts the kth tau value into tau[k].
 * @param blockCache SMEM The 1024-element array storing the full tile to which the vector will be applied.
 * @param matTau The tau vector.
 * @param workingVector SMEM Scratch array (unused in the register-based version below).
 * @param k Defines the submatrix.
 * @param hhVectorelem Element of the cumulatively-stored householder vector to apply.
 *
 * Works by first calculating the kth tau,
 * then proceeding to apply the vector to successive columns of the matrix A_j
 * with the formula A_j = A_j - tau[k]*v*(v'A_j).
 * The kth tau is then stored into memory.
 */
__device__ void applyHH(volatile float blockCache[],
float* matTau,
volatile float workingVector[],
int k,
float hhVectorelem)
{
float ktau,
/* Stores the value of ktau*v*v'*A_j */
z;
int j;
float reg;
/* Square the householder vector element to start finding v'v. */
reg = hhVectorelem * hhVectorelem;
/* Warp-wide reduction to compute v'v; every lane receives the sum. */
reduceSumMultiWarp(&reg);
/* Finally compute ktau = 2/v'v */
ktau = 2.0f / reg;
/* For the columns of the submatrix starting at (k,k). */
for(j = k; j < 32; j ++)
{
/* Per-lane products for v'*A_j. */
reg = blockCache[TID+(j*32)] * hhVectorelem;
/* Warp-wide reduction to compute v'*A_j. */
reduceSumMultiWarp(&reg);
/* Compute z = ktau*v[tid]*(v'A_j) */
z = hhVectorelem * ktau * reg;
/* Apply with A_j[tid] = A_j[tid] - tau*v[tid]*(v'A_j) */
blockCache[TID + (j*32)] -= z;
}
/* Store the essential (below diagonal) portion of householder vector below the diagonal in the block at column k. */
if(TID > k)
blockCache[TID + k*32] = hhVectorelem;
/* Store the kth tau. */
if(TID == 0)
matTau[k] = ktau;
}
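/**
 * @brief Task wrapper: loads tile (i,j) into shared memory, factorises it
 * column by column with calcHH/applyHH, and writes R and the essential
 * householder vectors back to global memory.
 */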
__device__ void runner_cuda_SGEQRF( int i, int j, int k, volatile float workingVector[], volatile float blockCache[] )
{
int kk, jj, refCache;
refCache = TID;
float* matrix = &GPU_matrix[j*cuda_m*32*32 + i*32*32];
float* matTau = &GPU_tau[j*cuda_m*32 + i*32 ];
for(jj = 0; jj < 32; jj ++)//load block into shared memory
{
blockCache[refCache] = matrix[refCache];
refCache += 32;
}
for(kk = 0; kk < 32; kk ++)
{
//calculate the kth hh vector from the kth column of the TIDth row of the matrix
calcHH (blockCache[TID + (kk*32)],//(tid, k)
workingVector,
kk);
//calculate the application of the hhvector along row TID
applyHH (blockCache,
matTau,
workingVector,
kk,
workingVector[TID]);
}
//copy row back
refCache = TID;
for(jj = 0; jj < 32; jj ++)//unload block from shared memory
{
matrix[refCache] = blockCache[refCache];
refCache += 32;
}
__threadfence();
}
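/**
 * @brief Task wrapper: applies the householder vectors stored in diagonal
 * tile (k,k) to tile (i,j), with the columns distributed across the warps.
 */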
__device__ void runner_cuda_SLARFT (int i, int j, int k, volatile float tau[], volatile float blockCache[] )
{
int jj, kk, ii,
tid = TID%32,
group = TID/32;
__shared__ volatile float groupCache[32*WARPS];
float* blockV = &GPU_matrix[k*cuda_m*32*32 + k*32*32];
float* blockA = &GPU_matrix[j*cuda_m*32*32 + i*32*32];
float* blockTau = &GPU_tau[ i*cuda_m*32 + i*32 ];
// cacheCol is a column in the groupCache
volatile float *cacheCol = groupCache + (group*32);
float alpha,
belem;
__syncthreads();
/* Load tau Vector */
if(TID < 32)
tau[TID] = blockTau[TID];
/* Load Vectors */
for(jj = group; jj < 32; jj += WARPS)
{
if(tid < jj)
blockCache[tid + (jj*32)] = 0.0f;
else if(tid == jj)
blockCache[tid + (jj*32)] = 1.0f;
else if(tid > jj)
blockCache[tid + (jj*32)] = blockV[tid + (jj*32)];
}
__syncthreads();
/* Compute b_j -= tau*v*v'b_j, for all vectors in blockCached V */
for(jj = group; jj < 32; jj += WARPS)
{
/* Read the column into registers. */
belem = blockA[tid + (jj*32)];
/* For each vector in block of vectors. */
for(kk = 0; kk < 32; kk++)
{
/* Compute alpha = v'*b_j with a partial tree reduction in shared memory. */
cacheCol[tid] = blockCache[tid + (kk*32)] * belem;
if(tid < 16)
cacheCol[tid] += cacheCol[tid+16];
if(tid < 8)
cacheCol[tid] += cacheCol[tid+8];
alpha = cacheCol[0];
for(ii = 1; ii < 8; ii ++)
alpha += cacheCol[ii];
/* Compute alpha = tau * v_tid * alpha */
alpha *= tau[kk];
alpha *= blockCache[tid + (kk*32)];
/* Compute belem -= alpha */
belem -= alpha;
}
blockA[tid + (jj*32)] = belem;
}
__threadfence();
}
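/**
 * @brief Applies one householder vector, spanning a diagonal tile and a tile
 * below it, to one column of the coupled tile pair.
 * @param topElem The thread's element of the column in the upper (diagonal) tile.
 * @param lowElem The thread's element of the column in the lower tile.
 * @param k Index of the householder vector to apply.
 * @param tau SMEM The tau values.
 * @param blockCache SMEM The essential (lower) parts of the householder vectors.
 */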
__device__ void applyOneHHVectD (float* topElem,
float* lowElem,
int k,
volatile float tau[],
volatile float blockCache[])
{
float alpha;
float reg;
int tid = TID % 32;
/* Compute alpha = v'*b_j: the top half of v is e_k, so only the kth top
element contributes; the lower half is read from blockCache. */
if(tid == k)
reg = *topElem;
else
reg = 0.0f;
reg += *lowElem * blockCache[tid + (k*32)];
reduceSumMultiWarp(&reg);
alpha = reg;
/* Multiply by tau */
alpha *= tau[k];
/* Apply to the top element (v is 1 there for the kth thread). */
if(tid == k)
*topElem -= alpha;
/* For lower element. */
alpha *= blockCache[tid + (k*32)];
*lowElem -= alpha;
}
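/**
 * @brief Computes a householder vector for the kth column of a coupled tile
 * pair, where the upper tile is already triangular, and stores the essential
 * (lower) part in column k of blockCache.
 * @param topElem The thread's element of the column in the upper tile.
 * @param lowElem The thread's element of the column in the lower tile.
 * @param k The column being factorised.
 * @param blockCache SMEM Storage for the essential parts of the vectors.
 */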
__device__ void calcDoubleHHWY (float topElem,
float lowElem,
int k,
volatile float blockCache[])
{
/* Calculate v_k. */
int i;
float sign,
topSum = 0,
lowSum = 0,
alpha;
__shared__ volatile float first;
if(TID == k)
first = topElem;
topSum = first * first;
/* Sum squares of V to compute norm, using column of BlockCache as working space. */
blockCache[TID + (k*32)] = lowElem * lowElem;
for(i = 0; i < 32; i ++)
lowSum += blockCache[i + (k*32)];
alpha = sqrt(topSum + lowSum);
sign = first >= 0.f ? 1.f : -1.f;
if(alpha != 0.0f)
{
/* Zeroth element is = first + sign*norm */
alpha = first + sign * alpha;
alpha = 1.0f/alpha;
}
else
alpha = 1.0f;
lowElem *= alpha;
blockCache[TID + (k*32)] = lowElem;
}
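/**
 * @brief Task wrapper: computes the coupled QR factorisation of diagonal
 * tile (k,k) stacked on tile (i,j), storing the essential householder
 * vectors in the lower tile and the tau values in GPU_tau.
 */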
__device__ void runner_cuda_STSQRF ( int i, int j, int k, volatile float tauVect[], volatile float blockCache[])
{
/* Idea: for each column j of (AB)^T,
apply HH vectors 0...j-1,
compute new HH vector for column j,
store essential part in blockCache
Uses one block cache, one vector storage. */
float* blockA = &GPU_matrix[k*cuda_m*32*32 + k*32*32];
float* blockB = &GPU_matrix[j*cuda_m*32*32 + i*32*32];
float* blockTau = &GPU_tau[j*cuda_m*32 + i*32];
int tid = TID % 32;
int group = TID/32;
float topElem,
lowElem,
tau;
for(j = 0; j < 32; j ++)
{
if(TID < 32 )
{
/* Load the jth column of the coupled pair; entries of the upper tile
below its diagonal are taken as zero. */
if(TID <= j)
topElem = blockA[TID + (j*32)];
if(TID > j)
topElem = 0.f;
lowElem = blockB[TID + (j*32)];
/* Compute the householder vector for column j and store its essential
part in column j of blockCache. */
calcDoubleHHWY (topElem,
lowElem,
j,
blockCache);
/* Compute new tau = 2/v'v */
tau = 1.0f;
for(i = 0; i < 32; i ++)
tau += blockCache[i + (j*32)] * blockCache[i + (j*32)];
tau = 2.0f/tau;
if(TID == j)
tauVect[j] = tau;
}
__syncthreads();
/* Apply new vector to column. */
for(k = j+group ; k < 32; k +=WARPS )
{
if(k != j)
{
if(tid <= k )
topElem = blockA[tid + k*32];
if( tid > k )
topElem = 0.f;
lowElem = blockB[tid + k*32];
}
applyOneHHVectD (&topElem, &lowElem,
j,
tauVect,
blockCache);
if(tid <= k)
blockA[tid + k*32] = topElem;
blockB[tid + k*32] = lowElem;
}
__syncthreads();
}
/* Write back lower block, containing householder Vectors. */
if(TID < 32)
{
for(j = 0; j < 32; j ++)
blockB[TID + (j*32)] = blockCache[TID + (j*32)];
blockTau[TID] = tauVect[TID];
}
__threadfence();
}
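/**
 * @brief Task wrapper: applies the householder vectors produced by STSQRF
 * for the tile pair (k,k)/(i,k) to the coupled column tiles (k,j) and (i,j).
 */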
__device__ void runner_cuda_SSSRFT( int i, int j, int k, volatile float tau[], volatile float blockCache[] )
{
__shared__ volatile float workV[WARPS*32];
__syncthreads();
float aelem, belem,
alpha;
float* blockV = &GPU_matrix[k*cuda_m*32*32 + i*32*32];
float* blockA = &GPU_matrix[j*cuda_m*32*32 + k*32*32];
float* blockB = &GPU_matrix[j*cuda_m*32*32 + i*32*32];
float* blockTau = &GPU_tau[k*cuda_m*32 + i*32];
int tid, group,
refMat, refCache;
tid = TID%32;
group = TID/32;
refMat = tid + group*32;
refCache = tid + group*32;
if(TID < 32)
tau[TID] = blockTau[TID];
__syncthreads();
/* Load the essential HH vector block into shared cache. */
for(j = group; j < 32; j += WARPS)
{
blockCache[refCache] = blockV[refMat];
refMat += WARPS*32;
refCache += WARPS*32;
__syncthreads();
}
__syncthreads();
/* For each column of the result. */
for(j = group; j < 32; j += WARPS)
{
/* Load the elements of the column to process. */
aelem = blockA[tid + (j*32)];
belem = blockB[tid + (j*32)];
/* Compute and apply b_j = b_j - tau*vv'b_j
for each Householder vector 1..32. */
for(k = 0; k < 32; k ++)
{
/* Load the kth element into all threads. */
workV[tid + (group*32)] = aelem;
/* Store this as alpha. */
alpha = workV[k + (group*32)];
/* Load components of v'b_j to sum. */
workV[tid + (group*32)] = belem * blockCache[tid + (k*32)];
if(tid < 16)
workV[tid + (group*32)] += workV[16 + tid + (group*32)];
if(tid < 8)
workV[tid + (group*32)] += workV[8 + tid + (group*32)];
/* Compute v'b_j */
for(i = 0; i < 8; i ++)
alpha += workV[i + (group*32)];
/* Multiply by kth tau. */
alpha *= tau[k];
/* Compute b_j -= alpha*v
If kth thread in group, v is 1 at aelem. */
if(tid == k) aelem -= alpha;
/* Compute b_j -= alpha * v for lower half. */
belem -= alpha * blockCache[tid + (k*32)];
__syncthreads();
}
/* put the elements back. */
blockA[tid + (j*32)] = aelem;
blockB[tid + (j*32)] = belem;
}
__syncthreads();
}
__device__ void runner ( int type , void *data ) {
__shared__ volatile float blockCache[(32*32) + 128];
volatile float *workVector;
workVector = blockCache + (32*32);
/* Decode the task data. */
int *idata = (int *)data;
int i = idata[0], j = idata[1], k = idata[2];
/* Decode and execute the task. */
switch ( type ) {
case task_SGEQRF:
if(threadIdx.x < 32)
runner_cuda_SGEQRF(i, j, k, workVector, blockCache);
if(threadIdx.x == 0){
printf("SGEQRF: %i %i %i + %i\n",i, j , k, clock());
printf("tau = ");
for(k = 0; k < cuda_m * cuda_n * 32 ; k++)
{
printf("%.3f ", GPU_tau[k]);
}
printf("\n");}
break;
case task_SLARFT:
if(threadIdx.x == 0)
printf("SLARFT: %i %i %i + %i\n",i,j,k, clock());
//runner_cuda_SLARFT(i, j, k, workVector, blockCache);
break;
case task_STSQRF:
if(threadIdx.x == 0)
printf("STSQRF: %i %i %i + %i\n",i,j,k, clock());
//runner_cuda_STSQRF(i, j, k, workVector, blockCache);
break;
case task_SSSRFT:
if(threadIdx.x == 0)
printf("SSSRFT: %i %i %i + %i\n",i,j,k, clock());
//runner_cuda_SSSRFT(i, j, k, workVector, blockCache);
break;
default:
asm("trap;");
}
}
__device__ qsched_funtype function = runner;
__global__ void Setup()
{
printf("%i\n", function);
}
float* generateMatrix( int m, int n)
{
float* Matrix;
cudaMallocHost(&Matrix, sizeof(float) * m*n*32*32);
if(Matrix == NULL)
error("Failed to allocate Matrix");
int i, j;
memset ( Matrix, 0, sizeof(float)*m*n*32*32 );
for(i = 0; i < n*32; i++)
{
for(j = 0; j < m*32; j++)
{
/* Integer division: the value depends only on the tile indices. */
float value = i/32 + j/32;
if(j/32 > 0)
{
value += 2.f;
}
Matrix[i*m*32 + j] = value;
}
}
return Matrix;
}
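/* Layout note: in the column-major matrix, element (r,c) lives at index
c*(m*32) + r; in the tiled matrix it lives in tile (r/32, c/32), which is
itself stored column-major, at offset (c%32)*32 + (r%32). columnToTile and
tileToColumn convert between the two layouts. */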
float* columnToTile( float* columnMatrix, int size , int m , int n )
{
float* TileMatrix;
cudaMallocHost(&TileMatrix, sizeof(float) * size );
if(TileMatrix == NULL)
error("failed to allocate TileMatrix");
int i,j,k,l;
for( i = 0; i < n ; i++ )
{
for( j = 0; j < m; j++ )
{
/* Destination tile (j,i) in the tiled matrix. */
float *tilePos = &TileMatrix[i*m*32*32 + j*32*32];
for(k = 0; k < 32; k++)
{
/* kth column of tile (j,i) in the column-major source. */
float *tileStart = &columnMatrix[i*m*32*32 + k*m*32 + j*32];
for( l = 0; l < 32; l++ )
{
tilePos[k*32 + l] = tileStart[l];
}
}
}
}
return TileMatrix;
}
float* tileToColumn( float* tileMatrix, int size, int m , int n )
{
float* ColumnMatrix;
ColumnMatrix = (float*) malloc(sizeof(float) * size );
if(ColumnMatrix == NULL)
error("failed to allocate ColumnMatrix");
int i,j,k,l;
for( i = 0; i < n ; i++ )
{
for(j = 0; j < m ; j++ )
{
/* Tile on ith column is at i*m*32*32.*/
/* Tile on jth is at j*32*32 */
float *tile = &tileMatrix[i*m*32*32 + j*32*32];
/* Column starts at same position as tile. */
/* Row j*32.*/
float *tilePos = &ColumnMatrix[i*m*32*32 + j*32];
for( k = 0; k < 32; k++ )
{
for(l=0; l < 32; l++)
{
tilePos[l] = tile[l];
}
/* Next 32 elements are the position of the tile in the next column.*/
tile = &tile[32];
/* Move to the j*32th position in the next column. */
tilePos = &tilePos[32*m];
}
}
}
return ColumnMatrix;
}
float* createIdentity(int m, int n)
{
float* Matrix;
cudaMallocHost(&Matrix, sizeof(float) * m*n*32*32);
if(Matrix == NULL)
error("Failed to allocate Matrix");
int i, j;
memset ( Matrix, 0, sizeof(float)*m*n*32*32 );
for(i = 0; i < n*32; i++)
{
for(j = 0; j < m*32; j++)
{
if(i==j)
{
Matrix[i*m*32 + j] = 1.f;
}else
{
Matrix[i*m*32 + j] = 0.f;
}
}
}
return Matrix;
}
void printMatrix(float* Matrix, int m, int n)
{
int i, j;
for(i=0; i < m*32; i++)
{
for(j = 0; j < n*32; j++)
{
printf(" %.1f ", Matrix[j*m*32 +i]);
}
printf("\n");
}
}
/*void printTileMatrix(float* Matrix, int m, int n)
{
int i,j;
float *tempMatrix, *tempMatrix2;
for(i = 0; i < m*32; i++)
{
int tiled = i/32;
tempMatrix = &Matrix[tiled*32*32];
for(j = 0; j < n*32; j++)
{
int tile = j/32;
tempMatrix2 = &tempMatrix[tile*32*32*m + i%32];
printf(" %.1f ", tempMatrix2[j*32]);
//printf(" %.1f,%i ", tempMatrix2[j*32], &tempMatrix2[j*32]-Matrix);
}
printf("\n");
}
}*/
qsched_funtype func;
void test_qr(int m , int n , int K , int nr_threads , int runs)
{
/* Note: the device kernels assume a fixed tile size of 32; K is only used
in the host-side allocation sizes. */
int k, j, i;
float *A, *A_orig, *tau, *tau_device, *device_array, *temp_device_array;
struct qsched s;
qsched_task_t *tid, tid_new;
qsched_res_t *rid;
int data[3];
ticks tic, toc_run, tot_setup, tot_run = 0;
/* Initialize the scheduler. */
qsched_init( &s , 1 , qsched_flag_none );
cudaDeviceReset();
cudaSetDevice(0);
Setup<<<1,1>>>();
if(cudaDeviceSynchronize() != cudaSuccess)
error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError()));
/* Allocate and fill the original matrix. */
if(cudaMallocHost(&A, sizeof(float) * m * n * K * K ) != cudaSuccess ||
cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess ||
cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K ) != cudaSuccess )
error("Failed to allocate matrices.");
cudaFreeHost(A_orig);
// for ( k = 0 ; k < m * n * K * K ; k++ )
// A_orig[k] = 2.0f*((float)rand()) / RAND_MAX - 1.0f;
A_orig = generateMatrix(m, n);
float *temp = columnToTile(A_orig, m * n * K *K, m , n);
cudaFreeHost(A_orig);
A_orig = temp;
temp = tileToColumn(A_orig, m*n*K*K, m , n);
// printMatrix(temp, m, n);
// printTileMatrix(A_orig, m, n);
memcpy( A , A_orig , sizeof(float) * m * n * K * K );
bzero( tau , sizeof(float) * m * n * K );
if( cudaMalloc(&tau_device, sizeof(float) * m * n * K ) != cudaSuccess )
error("Failed to allocate tau on the device");
if(cudaMemcpy( tau_device , tau , sizeof(float) * m * n * K , cudaMemcpyHostToDevice ) != cudaSuccess )
error("Failed to copy the tau data to the device.");
if( cudaMemcpyToSymbol ( GPU_tau, &tau_device , sizeof(float *), 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
error("Failed to copy locks pointer to the device.");
if( cudaMalloc( &device_array , sizeof(float) * m * n * K * K ) != cudaSuccess )
error("Failed to allocate the matrix on the device");
if( cudaMemcpyToSymbol( GPU_matrix , &device_array , sizeof(float *) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
error("Failed to copy matrix pointer to the device");
tic = getticks();
/* Allocate and init the task ID and resource ID matrix. */
if( cudaMallocHost(&tid , sizeof(qsched_task_t) * m*n ) != cudaSuccess )
error("Failed to allocate tid");
if( cudaMallocHost(&rid , sizeof(qsched_task_t) * m*n) != cudaSuccess)
error("Failed to allocate rid");
for ( k = 0 ; k < m * n ; k++ ) {
tid[k] = qsched_task_none;
if( cudaHostGetDevicePointer(&temp_device_array , &A[k*32*32] , 0) != cudaSuccess)
error("Failed to get device pointer to matrix");
rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none , &A[k*32*32], sizeof(float) * 32 * 32, device_array + k * 32 *32);
}
/* Build the tasks. */
for ( k = 0 ; k < m && k < n ; k++ ) {
/* Add kth corner task. */
data[0] = k; data[1] = k; data[2] = k;
tid_new = qsched_addtask( &s , task_SGEQRF , task_flag_none , data , sizeof(int)*3 , 2 );
qsched_addlock( &s , tid_new , rid[ k*m + k ] );
if ( tid[ k*m + k ] != -1 )
qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
tid[ k*m + k ] = tid_new;
/* Add column tasks on kth row. */
for ( j = k+1 ; j < n ; j++ ) {
data[0] = k; data[1] = j; data[2] = k;
tid_new = qsched_addtask( &s , task_SLARFT , task_flag_none , data , sizeof(int)*3 , 3 );
qsched_addlock( &s , tid_new , rid[ j*m + k ] );
qsched_adduse( &s , tid_new , rid[ k*m + k ] );
qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
if ( tid[ j*m + k ] != -1 )
qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
tid[ j*m + k ] = tid_new;
}
/* For each following row... */
for ( i = k+1 ; i < m ; i++ ) {
/* Add the row tasks for the kth column. */
data[0] = i; data[1] = k; data[2] = k;
tid_new = qsched_addtask( &s , task_STSQRF , task_flag_none , data , sizeof(int)*3 , 3 );
qsched_addlock( &s , tid_new , rid[ k*m + i ] );
qsched_adduse( &s , tid_new , rid[ k*m + k ] );
qsched_addunlock( &s , tid[ k*m + (i-1) ] , tid_new );
if ( tid[ k*m + i ] != -1 )
qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
tid[ k*m + i ] = tid_new;
/* Add the inner tasks. */
for ( j = k+1 ; j < n ; j++ ) {
data[0] = i; data[1] = j; data[2] = k;
tid_new = qsched_addtask( &s , task_SSSRFT , task_flag_none , data , sizeof(int)*3 , 5 );
qsched_addlock( &s , tid_new , rid[ j*m + i ] );
qsched_adduse( &s , tid_new , rid[ k*m + i ] );
qsched_adduse( &s , tid_new , rid[ j*m + k ] );
qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
if ( tid[ j*m + i ] != -1 )
qsched_addunlock( &s , tid[ j*m + i ] , tid_new );
tid[ j*m + i ] = tid_new;
}
}
} /* build the tasks. */
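/* For a 2x2 tile matrix this builds the dependency chain
SGEQRF(0,0) -> { SLARFT on tile (0,1), STSQRF on tile (1,0) }
-> SSSRFT on tile (1,1) -> SGEQRF(1,1). */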
if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
error("Failed to copy function pointer from device");
if( cudaMemcpyToSymbol( cuda_m, &m, sizeof(int), 0, cudaMemcpyHostToDevice ) != cudaSuccess )
error("Failed to copy m to the device");
if( cudaMemcpyToSymbol( cuda_n, &n, sizeof(int), 0, cudaMemcpyHostToDevice ) != cudaSuccess )
error("Failed to copy n to the device");
qsched_run_CUDA( &s , func );
cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K, cudaMemcpyDeviceToHost);
A = tileToColumn(A,m * n * K * K, m, n);
printMatrix(A, m, n);
// printTileMatrix(A, m , n);
}
void test()
{
int m = 2, n = 2;
float* Matrix = createIdentity(m,n);
//printMatrix(Matrix, m, n);
float* MatrixTile = columnToTile(Matrix, m*n*32*32, m, n);
printMatrix(&MatrixTile[0], 1, 1);
printf("\n \n \n");
printMatrix(&MatrixTile[32*32], 1, 1);
printf("\n \n \n");
printMatrix(&MatrixTile[64*32], 1, 1);
printf("\n \n \n");
printMatrix(&MatrixTile[3*32*32], 1, 1);
printf("\n \n \n");
//printTileMatrix(MatrixTile, m, n);
/* Matrix and MatrixTile were allocated with cudaMallocHost, so they must be
freed with cudaFreeHost; tileToColumn's result uses plain malloc. */
cudaFreeHost(Matrix);
Matrix = tileToColumn(MatrixTile, m*n*32*32, m , n);
printMatrix(Matrix, m, n);
cudaFreeHost(MatrixTile);
free(Matrix);
}
/**
* @brief Main function.
*/
int main ( int argc , char *argv[] ) {
int c, nr_threads=128;
int M = 4, N = 4, runs = 1, K = 32;
/* Get the number of threads. */
//#pragma omp parallel shared(nr_threads)
// {
// if ( omp_get_thread_num() == 0 )
// nr_threads = omp_get_num_threads();
// }
/* Parse the options */
while ( ( c = getopt( argc , argv , "m:n:k:r:t:D" ) ) != -1 )
switch( c ) {
case 'm':
if ( sscanf( optarg , "%d" , &M ) != 1 )
error( "Error parsing dimension M." );
break;
case 'n':
if ( sscanf( optarg , "%d" , &N ) != 1 )
error( "Error parsing dimension N." );
break;
case 'k':
if ( sscanf( optarg , "%d" , &K ) != 1 )
error( "Error parsing tile size." );
break;
case 'r':
if ( sscanf( optarg , "%d" , &runs ) != 1 )
error( "Error parsing number of runs." );
break;
case 't':
if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
error( "Error parsing number of threads." );
//omp_set_num_threads( nr_threads );
break;
case 'D':
test();
exit( EXIT_SUCCESS );
case '?':
fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] );
fprintf( stderr , "Computes the tiled QR decomposition of an MxN tiled\n"
"matrix using nr_threads threads.\n" );
exit( EXIT_FAILURE );
}
/* Dump arguments. */
message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
32*M , 32*N , nr_threads , runs );
test_qr( M , N , K , nr_threads , runs );
return EXIT_SUCCESS;
}