diff --git a/examples/test_qr.cu b/examples/test_qr.cu
new file mode 100644
index 0000000000000000000000000000000000000000..270c4ca35cb12113f0fed475842f8abef68d04b2
--- /dev/null
+++ b/examples/test_qr.cu
@@ -0,0 +1,1176 @@
+/* Config parameters. */
+#include "../config.h"
+
+/* Standard includes. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+
+/* Local includes. */
+extern "C"{
+#include "quicksched.h"
+}
+#include "cuda_queue.h"
+
+enum task_types { task_SGEQRF , task_SLARFT , task_STSQRF , task_SSSRFT };
+
+#define TID threadIdx.x
+#define numthreads 128
+#define tilesize 32
+#define cuda_maxtasks 1000000
+#define CO(x,y,ldm) (((y)*ldm) + (x))
+#define WARPS 4
+
+__device__ float *GPU_matrix;
+__device__ float *GPU_tau;
+__device__ int cuda_m;
+__device__ int cuda_n;
+
+
+/**
+ * @brief Performs a reduction sum on a vector, storing the result in v[0].
+ * @param sumVector SMEM The array to compute over.
+ * @param startN The number of elements in the vector.
+ *
+ * Computes a binary-tree reduction to find the sum of the elements in the
+ * vector, storing the result in the first element. Should only be called
+ * with blockDim.x <= 1 warp, so no explicit synchronization is needed.
+ */
+__device__ void reduceSum ( volatile float* sumVector , char startN ) {
+
+    /* Start with n = startN/2. */
+    char n = startN >> 1;
+
+    while ( n > 0 ) {
+        /* Use the first n threads to add up to 2n elements. */
+        if ( TID < n ) sumVector[TID] = sumVector[TID] + sumVector[TID + n];
+
+        /* Halve n for the next iteration. */
+        n = n >> 1;
+    }
+}
+
+/**
+ * @brief Warp-level reduction sum, leaving the result in every lane.
+ * @param value Pointer to this thread's contribution; overwritten with the sum.
+ */
+__device__ inline void reduceSumMultiWarp ( float* value ) {
+
+    #if __CUDA_ARCH__ >= 300
+        /* Butterfly reduction over the warp using shuffles. */
+        #pragma unroll
+        for ( int mask = 16 ; mask > 0 ; mask >>= 1 )
+            *value += __shfl_xor( *value , mask );
+
+        /* Broadcast lane 0's result to all lanes. */
+        *value = __shfl( *value , 0 );
+    #else
+        /* Fallback via shared memory, one 32-element slice per warp. */
+        __shared__ volatile float stuff[ 32*WARPS ];
+        int tid = TID % 32;
+        int group = TID / 32;
+        stuff[ group*32 + tid ] = *value;
+        char n = 32 >> 1;
+        while ( n > 0 ) {
+            if ( tid < n ) stuff[ group*32 + tid ] += stuff[ group*32 + tid + n ];
+            n = n >> 1;
+        }
+        *value = stuff[ group*32 ];
+    #endif
+}
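+
+/* A minimal sanity-check for the reductions above (a sketch only; this
+   hypothetical check_reduce kernel is not used by the tasks below). Every
+   thread contributes 1.0f, so when launched with a 32-thread block each
+   lane should end up holding 32.0f. */
+__global__ void check_reduce ( float *out ) {
+    float v = 1.0f;
+    reduceSumMultiWarp( &v );
+    /* Each lane of the warp now holds the full sum. */
+    out[TID] = v;
+}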
+
+
+/* Routines for the tiled QR decomposition. */
+
+/**
+ * @brief Computes the QR decomposition of a tile.
+ *
+ * @param cornerTile A pointer to the tile for which the decomposition is computed.
+ * @param tileSize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau matrix.
+ * @param k The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix
+ *        (this is equal to the number of tiles on each column).
+ * @param w A 32-element scratch vector for the Householder vector.
+ *
+ * Note: requires exactly 32 threads, one per row of the tile.
+ */
+__device__ void SGEQRF ( float* __restrict__ cornerTile , int tileSize ,
+                         float* __restrict__ tauMatrix , int k , int tauNum ,
+                         float* w ) {
+
+    int i, j;
+    float norm, sign, u1, tau, z;
+
+    /* Find the Householder vector for each column. */
+    for ( i = 0 ; i < tileSize ; i++ ) {
+
+        /* Compute the squared norm of the column below the diagonal. */
+        if ( TID >= i )
+            w[TID] = cornerTile[ i*tileSize + TID ] * cornerTile[ i*tileSize + TID ];
+        else
+            w[TID] = 0.0f;
+        reduceSumMultiWarp( &w[TID] );
+        norm = sqrt( w[0] );
+
+        if ( cornerTile[ i*tileSize + i ] >= 0.0f )
+            sign = -1;
+        else
+            sign = 1;
+
+        /* Scale the column below the diagonal by u1 to get the essential
+           part of the Householder vector. */
+        u1 = cornerTile[ i*tileSize + i ] - sign*norm;
+        if ( u1 != 0.0f ) {
+            if ( TID > i )
+                w[TID] = cornerTile[ i*tileSize + TID ] / u1;
+        } else {
+            if ( TID > i )
+                w[TID] = 0.0f;
+        }
+
+        if ( norm != 0.0f )
+            tau = -sign*u1/norm;
+        else
+            tau = 0.0f;
+
+        /* Store the below-diagonal part of the vector in the tile. */
+        if ( TID > i )
+            cornerTile[ i*tileSize + TID ] = w[TID];
+        if ( TID == 0 )
+            cornerTile[ i*tileSize + i ] = sign*norm;
+        if ( TID == i )
+            w[i] = 1.0f;
+
+        /* Apply the Householder transformation to the rest of the tile,
+           i.e. to everything to the right of column i. */
+        for ( j = i+1 ; j < tileSize ; j++ ) {
+            /* Compute z = w'*A_j. */
+            if ( TID >= i )
+                z = cornerTile[ j*tileSize + TID ] * w[TID];
+            else
+                z = 0.0f;
+            reduceSumMultiWarp( &z );
+            /* Tile(m,n) = Tile(m,n) - tau*w[n]*(w'A_j). */
+            if ( TID >= i )
+                cornerTile[ j*tileSize + TID ] -= tau*w[TID]*z;
+        }
+
+        /* Store tau: we are on row k*tileSize+i, column k. */
+        if ( TID == 0 )
+            tauMatrix[ (k*tileSize + i)*tauNum + k ] = tau;
+    }
+}
+
+
+/**
+ * @brief Applies the Householder factorisation of the corner tile to the
+ *        row tile.
+ *
+ * @param cornerTile A pointer to the tile in which the Householder vectors are stored.
+ * @param rowTile The tile to which the transformation is applied.
+ * @param tileSize The number of elements on a row/column of the tile.
+ * @param jj The value of j for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauMatrix A pointer to the tau matrix.
+ * @param tauNum The number of tau values stored for each row of the matrix
+ *        (this is equal to the number of tiles on each column).
+ * @param w A scratch vector for the Householder vector.
+ */
+__device__ void SLARFT ( float* cornerTile , float* rowTile , int tileSize ,
+                         int jj , int kk , float* tauMatrix , int tauNum ,
+                         float* w ) {
+
+    int i, j, n;
+    float z;
+
+    /* For each Householder vector stored in the corner tile... */
+    for ( i = 0 ; i < tileSize ; i++ ) {
+
+        /* Get w for column i. */
+        for ( j = i ; j < tileSize ; j++ )
+            w[j] = cornerTile[ i*tileSize + j ];
+        w[i] = 1.0f;
+
+        /* Apply the reflector to each column of the row tile. */
+        for ( j = 0 ; j < tileSize ; j++ ) {
+            /* Compute z = w'*A_j, using the below-diagonal part of w. */
+            z = 0.0f;
+            for ( n = i ; n < tileSize ; n++ )
+                z += w[n] * rowTile[ j*tileSize + n ];
+            /* A_j = A_j - tau*w*(w'A_j). */
+            for ( n = i ; n < tileSize ; n++ )
+                rowTile[ j*tileSize + n ] -= tauMatrix[ (kk*tileSize + i)*tauNum + kk ] * w[n] * z;
+        }
+    }
+}
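+
+/* Host-side reference for a single reflector application, matching the
+   update used by SGEQRF/SLARFT above: A_j -= tau * w * (w'A_j), with
+   w[i] = 1 and the essential part of w taken from column i of the
+   factored tile. A sketch for validating device results on the CPU; the
+   hypothetical helper applyReflectorHost is not used elsewhere. */
+void applyReflectorHost ( float *factoredTile , float *column , int i , float tau ) {
+    int n;
+    float w, z = column[i];
+    /* z = w'*A_j, with w[i] = 1. */
+    for ( n = i+1 ; n < 32 ; n++ )
+        z += factoredTile[ i*32 + n ] * column[n];
+    /* A_j -= tau * w * z. */
+    for ( n = i ; n < 32 ; n++ ) {
+        w = ( n == i ) ? 1.0f : factoredTile[ i*32 + n ];
+        column[n] -= tau * w * z;
+    }
+}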
+
+
+/**
+ * @brief Computes a Householder vector for a single-tile QR decomposition,
+ *        storing the result in a shared array.
+ * @param matelem The TIDth element of the column to be used for the calculation.
+ * @param hhVector SMEM The array used to calculate and store the vector.
+ * @param k The timestep at which this is called, i.e. where the diagonal falls.
+ *
+ * Calculates a Householder vector from the column elements passed in by
+ * each thread. Used only by runner_cuda_SGEQRF.
+ */
+__device__ void calcHH ( float matelem , volatile float hhVector[] , int k ) {
+
+    /* Will store 1/(v[k] + sign(v[k])*||v||). */
+    float localdiv;
+    /* Stores the sign of v[k]. */
+    int sign;
+
+    /* Read the vector into shared memory from below the diagonal... */
+    if ( TID >= k )
+        hhVector[TID] = matelem;
+    /* ...and set it to 0 above the diagonal. */
+    if ( TID < k )
+        hhVector[TID] = 0;
+
+    /* Square each element to find ||v|| = sqrt(sum(v[i]^2)). */
+    hhVector[TID] *= hhVector[TID];
+
+    /* Reduce to find the sum of squares. */
+    reduceSum( hhVector , 32 );
+
+    /* Set localdiv to ||v||. */
+    localdiv = sqrt( hhVector[0] );
+
+    /* Following the Householder algorithm in Golub & Van Loan. */
+    if ( localdiv != 0.0f ) {
+        /* Reload shared memory to communicate v[k] to all threads. */
+        hhVector[TID] = matelem;
+
+        /* Calculate the sign of v[k] locally. */
+        sign = hhVector[k] >= 0 ? 1 : -1;
+
+        /* Compute localdiv = (||v||)*sign(v[k])... */
+        localdiv *= sign;
+        /* ...then localdiv = v[k] + ||v||*sign(v[k])... */
+        localdiv += hhVector[k];
+        /* ...and finally localdiv = 1/(v[k] + ||v||*sign(v[k])). */
+        localdiv = 1.0f/localdiv;
+    }
+    /* If the norm is 0, do not change v. */
+    else
+        localdiv = 1.0f;
+
+    /* Store v = v/(v[k] + ||v||*sign(v[k])) in shared memory, with the
+       kth element normalised to v[k]/v[k] = 1. */
+    if ( TID < k )
+        hhVector[TID] = 0.0f;
+    if ( TID == k )
+        hhVector[TID] = 1.0f;
+    if ( TID > k )
+        hhVector[TID] = matelem * localdiv;
+}
+
+/**
+ * @brief Applies a Householder vector to the submatrix starting at (k,k),
+ *        and stores the kth tau value in tau[k].
+ * @param blockCache SMEM The 1024-element array storing the full tile to
+ *        which the vector is applied.
+ * @param matTau The tau vector.
+ * @param workingVector SMEM Scratch array (unused on architectures where
+ *        the warp-shuffle reduction is available).
+ * @param k Defines the submatrix.
+ * @param hhVectorelem This thread's element of the cumulatively-stored
+ *        Householder vector to apply.
+ *
+ * Works by first calculating the kth tau as 2/v'v, then applying the vector
+ * to successive columns of the submatrix with A_j = A_j - tau[k]*v*(v'A_j),
+ * and finally storing tau[k].
+ */
+__device__ void applyHH ( volatile float blockCache[] , float* matTau ,
+                          volatile float workingVector[] , int k ,
+                          float hhVectorelem ) {
+
+    float ktau, z, reg;
+    int j;
+
+    /* Square this thread's element of the Householder vector and reduce
+       over the warp to compute v'v. */
+    reg = hhVectorelem * hhVectorelem;
+    reduceSumMultiWarp( &reg );
+
+    /* Compute ktau = 2/v'v. */
+    ktau = 2.0f / reg;
+
+    /* For the columns of the submatrix starting at (k,k)... */
+    for ( j = k ; j < 32 ; j++ ) {
+
+        /* Reduce over the warp to compute v'*A_j. */
+        reg = blockCache[ TID + j*32 ] * hhVectorelem;
+        reduceSumMultiWarp( &reg );
+
+        /* Compute z = ktau*v[tid]*(v'A_j). */
+        z = hhVectorelem * ktau * reg;
+
+        /* Apply with A_j[tid] = A_j[tid] - ktau*v[tid]*(v'A_j). */
+        blockCache[ TID + j*32 ] -= z;
+    }
+
+    /* Store the essential (below-diagonal) part of the Householder vector
+       below the diagonal in column k of the block. */
+    if ( TID > k )
+        blockCache[ TID + k*32 ] = hhVectorelem;
+
+    /* Store the kth tau. */
+    if ( TID == 0 )
+        matTau[k] = ktau;
+}
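+
+/* Illustrative single-tile driver for calcHH/applyHH (a sketch; this
+   hypothetical qr_single_tile kernel is not part of the task graph).
+   Launch with exactly 32 threads, a column-major 32x32 tile and a
+   32-element tau array. */
+__global__ void qr_single_tile ( float *tile , float *tau ) {
+
+    __shared__ volatile float blockCache[ 32*32 ];
+    __shared__ volatile float workingVector[ 32 ];
+    int k, j;
+
+    /* Load the tile into shared memory, one column per iteration. */
+    for ( j = 0 ; j < 32 ; j++ )
+        blockCache[ TID + j*32 ] = tile[ TID + j*32 ];
+
+    /* Zero out each column in turn. */
+    for ( k = 0 ; k < 32 ; k++ ) {
+        calcHH( blockCache[ TID + k*32 ] , workingVector , k );
+        applyHH( blockCache , tau , workingVector , k , workingVector[TID] );
+    }
+
+    /* Write back R and the essential Householder vectors. */
+    for ( j = 0 ; j < 32 ; j++ )
+        tile[ TID + j*32 ] = blockCache[ TID + j*32 ];
+}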
+
+
+__device__ void runner_cuda_SGEQRF ( int i , int j , int k , volatile float workingVector[] , volatile float blockCache[] ) {
+
+    int kk, jj, refCache = TID;
+    float* matrix = &GPU_matrix[ j*cuda_m*32*32 + i*32*32 ];
+    float* matTau = &GPU_tau[ j*cuda_m*32 + i*32 ];
+
+    /* Load the tile into shared memory. */
+    for ( jj = 0 ; jj < 32 ; jj++ ) {
+        blockCache[refCache] = matrix[refCache];
+        refCache += 32;
+    }
+
+    for ( kk = 0 ; kk < 32 ; kk++ ) {
+        /* Calculate the kth Householder vector from the kth column. */
+        calcHH( blockCache[ TID + kk*32 ] , workingVector , kk );
+
+        /* Apply it to the trailing submatrix, row TID. */
+        applyHH( blockCache , matTau , workingVector , kk , workingVector[TID] );
+    }
+
+    /* Unload the tile from shared memory. */
+    refCache = TID;
+    for ( jj = 0 ; jj < 32 ; jj++ ) {
+        matrix[refCache] = blockCache[refCache];
+        refCache += 32;
+    }
+    __threadfence();
+}
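+
+/* The runners above and below all locate tile (i,j) of the m x n tile
+   grid at the same offset into GPU_matrix; a small hypothetical helper
+   that makes this indexing explicit (not used by the code itself): */
+__device__ inline float *tile_ptr ( float *base , int i , int j ) {
+    return &base[ j*cuda_m*32*32 + i*32*32 ];
+}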
+
+__device__ void runner_cuda_SLARFT ( int i , int j , int k , volatile float tau[] , volatile float blockCache[] ) {
+
+    int jj, kk, ii,
+        tid = TID % 32,
+        group = TID / 32;
+
+    __shared__ volatile float groupCache[ 32*WARPS ];
+    float* blockV = &GPU_matrix[ k*cuda_m*32*32 + k*32*32 ];
+    float* blockA = &GPU_matrix[ j*cuda_m*32*32 + i*32*32 ];
+    float* blockTau = &GPU_tau[ k*cuda_m*32 + k*32 ];
+    /* cacheCol is this warp's column in the groupCache. */
+    volatile float *cacheCol = groupCache + group*32;
+
+    float alpha, belem;
+
+    __syncthreads();
+
+    /* Load the tau vector. */
+    if ( TID < 32 )
+        tau[TID] = blockTau[TID];
+
+    /* Load the Householder vectors, restoring the implicit unit diagonal
+       and the zeros above it. */
+    for ( jj = group ; jj < 32 ; jj += WARPS ) {
+        if ( tid < jj )
+            blockCache[ tid + jj*32 ] = 0.0f;
+        else if ( tid == jj )
+            blockCache[ tid + jj*32 ] = 1.0f;
+        else
+            blockCache[ tid + jj*32 ] = blockV[ tid + jj*32 ];
+    }
+    __syncthreads();
+
+    /* Compute b_j -= tau*v*(v'b_j) for all vectors in the cached block. */
+    for ( jj = group ; jj < 32 ; jj += WARPS ) {
+
+        /* Read the column into registers. */
+        belem = blockA[ tid + jj*32 ];
+        if ( jj == 0 )
+            printf( "%.3f %i\n" , belem , TID );
+
+        /* For each vector in the block of vectors... */
+        for ( kk = 0 ; kk < 32 ; kk++ ) {
+
+            /* Compute alpha = v'*b_j. */
+            cacheCol[tid] = blockCache[ tid + kk*32 ] * belem;
+            if ( group == 0 && jj == 0 && kk == 0 )
+                printf( "%.3f %i %.3f %.3f\n" , belem , TID , blockCache[ tid + kk*32 ] , cacheCol[tid] );
+
+            if ( tid < 16 )
+                cacheCol[tid] += cacheCol[ tid + 16 ];
+            if ( tid < 8 )
+                cacheCol[tid] += cacheCol[ tid + 8 ];
+            alpha = cacheCol[0];
+            for ( ii = 1 ; ii < 8 ; ii++ )
+                alpha += cacheCol[ii];
+
+            /* Compute alpha = tau * v_tid * alpha. */
+            alpha *= tau[kk];
+            if ( TID == 0 && kk == 0 )
+                printf( "tau[kk] = %.3f\n" , tau[kk] );
+            alpha *= blockCache[ tid + kk*32 ];
+            if ( jj == 0 && kk == 0 )
+                printf( "%.3f\n" , alpha );
+
+            /* Compute belem -= alpha. */
+            belem -= alpha;
+        }
+        blockA[ tid + jj*32 ] = belem;
+    }
+    __threadfence();
+}
+
+__device__ void applyOneHHVectD ( float* topElem , float* lowElem , int k ,
+                                  volatile float tau[] , volatile float blockCache[] ) {
+
+    float alpha, reg;
+    int tid = TID % 32;
+
+    /* Compute alpha = v'*(a, b), where the top part of v is e_k. */
+    if ( tid == k )
+        reg = *topElem;
+    else
+        reg = 0.0f;
+    reg += *lowElem * blockCache[ tid + k*32 ];
+    reduceSumMultiWarp( &reg );
+    alpha = reg;
+
+    /* Multiply by tau. */
+    alpha *= tau[k];
+
+    /* Apply to the top element... */
+    if ( tid == k )
+        *topElem -= alpha;
+
+    /* ...and to the lower element. */
+    alpha *= blockCache[ tid + k*32 ];
+    *lowElem -= alpha;
+}
+
+__device__ void calcDoubleHHWY ( float topElem , float lowElem , int k ,
+                                 volatile float blockCache[] ) {
+
+    /* Calculate v_k for the stacked pair of tiles. */
+    int i;
+    float sign,
+          topSum = 0,
+          lowSum = 0,
+          alpha;
+
+    __shared__ volatile float first;
+
+    /* Broadcast the diagonal element; the rest of the top column is
+       already upper-triangular and does not enter the norm. */
+    if ( TID == k )
+        first = topElem;
+    topSum = first * first;
+
+    /* Sum the squares of the lower column to compute the norm, using
+       column k of the blockCache as working space. */
+    blockCache[ TID + k*32 ] = lowElem * lowElem;
+    for ( i = 0 ; i < 32 ; i++ )
+        lowSum += blockCache[ i + k*32 ];
+
+    alpha = sqrt( topSum + lowSum );
+    sign = first >= 0.f ? 1.f : -1.f;
+
+    if ( alpha != 0.0f ) {
+        /* The zeroth element of v is first + sign*norm; scale by its inverse. */
+        alpha = first + sign * alpha;
+        alpha = 1.0f/alpha;
+    }
+    else
+        alpha = 1.0f;
+
+    /* Store the essential part of the vector in column k of the cache. */
+    lowElem *= alpha;
+    blockCache[ TID + k*32 ] = lowElem;
+}
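+
+/* Host-side reference for the stacked update used by STSQRF/SSSRFT:
+   given the essential lower part v (32 elements) of a reflector whose
+   top part is e_k, update the stacked column (a, b) in place as
+   (a, b) -= tau * v_full * (v_full'*(a, b)), matching applyOneHHVectD.
+   A sketch for checking results on the CPU; this hypothetical helper is
+   not called by the example itself. */
+void applyStackedReflectorHost ( float *a , float *b , float *v , float tau , int k ) {
+    int i;
+    float alpha = a[k];                 /* The top part of v is e_k. */
+    for ( i = 0 ; i < 32 ; i++ )
+        alpha += v[i] * b[i];
+    alpha *= tau;
+    a[k] -= alpha;
+    for ( i = 0 ; i < 32 ; i++ )
+        b[i] -= alpha * v[i];
+}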
+
+__device__ void runner_cuda_STSQRF ( int i , int j , int k , volatile float tauVect[] , volatile float blockCache[] ) {
+
+    /* Idea: for each column j of (A B)^T, apply the Householder vectors
+       0..j-1, compute the new Householder vector for column j, and store
+       its essential part in the blockCache.
+       Uses one block cache and one vector storage. */
+    float* blockA = &GPU_matrix[ k*cuda_m*32*32 + k*32*32 ];
+    float* blockB = &GPU_matrix[ j*cuda_m*32*32 + i*32*32 ];
+    float* blockTau = &GPU_tau[ j*cuda_m*32 + i*32 ];
+    int tid = TID % 32;
+    int group = TID / 32;
+    float topElem, lowElem, tau;
+
+    /* The task indices are no longer needed; i, j and k are reused as
+       loop counters below. */
+    for ( j = 0 ; j < 32 ; j++ ) {
+
+        /* The first warp computes the new vector for column j. */
+        if ( TID < 32 ) {
+            if ( TID <= j )
+                topElem = blockA[ TID + j*32 ];
+            else
+                topElem = 0.f;
+
+            lowElem = blockB[ TID + j*32 ];
+
+            calcDoubleHHWY( topElem , lowElem , j , blockCache );
+
+            /* Compute the new tau = 2/v'v, where the top part of v is e_j. */
+            tau = 1.0f;
+            for ( i = 0 ; i < 32 ; i++ )
+                tau += blockCache[ i + j*32 ] * blockCache[ i + j*32 ];
+            tau = 2.0f/tau;
+
+            if ( TID == j )
+                tauVect[j] = tau;
+        }
+        __syncthreads();
+
+        /* Apply the new vector to the trailing columns, one column per warp. */
+        for ( k = j+group ; k < 32 ; k += WARPS ) {
+            if ( k != j ) {
+                if ( tid <= k )
+                    topElem = blockA[ tid + k*32 ];
+                else
+                    topElem = 0.f;
+
+                lowElem = blockB[ tid + k*32 ];
+            }
+            applyOneHHVectD( &topElem , &lowElem , j , tauVect , blockCache );
+
+            if ( tid <= k )
+                blockA[ tid + k*32 ] = topElem;
+            blockB[ tid + k*32 ] = lowElem;
+        }
+        __syncthreads();
+    }
+
+    /* Write back the lower block, containing the Householder vectors,
+       along with the tau values. */
+    if ( TID < 32 ) {
+        for ( j = 0 ; j < 32 ; j++ )
+            blockB[ TID + j*32 ] = blockCache[ TID + j*32 ];
+
+        blockTau[TID] = tauVect[TID];
+    }
+    __threadfence();
+}
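+
+/**
+ * @brief Applies the Householder vectors computed by STSQRF on tile
+ *        column k to the stacked pair of tiles (k,j) (top) and (i,j)
+ *        (bottom), using the same update as applyOneHHVectD but with
+ *        all WARPS warps working on different columns.
+ */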
+__device__ void runner_cuda_SSSRFT ( int i , int j , int k , volatile float tau[] , volatile float blockCache[] ) {
+
+    __shared__ volatile float workV[ WARPS*32 ];
+
+    float aelem, belem, alpha;
+    float* blockV = &GPU_matrix[ k*cuda_m*32*32 + i*32*32 ];
+    float* blockA = &GPU_matrix[ j*cuda_m*32*32 + k*32*32 ];
+    float* blockB = &GPU_matrix[ j*cuda_m*32*32 + i*32*32 ];
+    float* blockTau = &GPU_tau[ k*cuda_m*32 + i*32 ];
+
+    int tid = TID % 32, group = TID / 32;
+    int refMat = tid + group*32, refCache = tid + group*32;
+
+    __syncthreads();
+    if ( TID < 32 )
+        tau[TID] = blockTau[TID];
+    __syncthreads();
+
+    /* Load the essential HH vector block into the shared cache, each
+       warp loading every WARPSth column. */
+    for ( j = group ; j < 32 ; j += WARPS ) {
+        blockCache[refCache] = blockV[refMat];
+        refMat += WARPS*32;
+        refCache += WARPS*32;
+    }
+    __syncthreads();
+
+    /* For each column of the result... */
+    for ( j = group ; j < 32 ; j += WARPS ) {
+
+        /* Load the elements of the column to process. */
+        aelem = blockA[ tid + j*32 ];
+        belem = blockB[ tid + j*32 ];
+
+        /* Compute and apply b_j = b_j - tau*vv'b_j for each of the
+           Householder vectors 1..32. */
+        for ( k = 0 ; k < 32 ; k++ ) {
+
+            /* Broadcast the kth element of a_j to all threads of the
+               warp; this is the top tile's contribution, where v is e_k. */
+            workV[ tid + group*32 ] = aelem;
+            alpha = workV[ k + group*32 ];
+
+            /* Load the components of v'b_j to sum. */
+            workV[ tid + group*32 ] = belem * blockCache[ tid + k*32 ];
+            if ( tid < 16 )
+                workV[ tid + group*32 ] += workV[ 16 + tid + group*32 ];
+            if ( tid < 8 )
+                workV[ tid + group*32 ] += workV[ 8 + tid + group*32 ];
+            /* Compute v'b_j. */
+            for ( i = 0 ; i < 8 ; i++ )
+                alpha += workV[ i + group*32 ];
+
+            /* Multiply by the kth tau. */
+            alpha *= tau[k];
+
+            /* Compute b_j -= alpha*v; for the top tile, v is 1 only at
+               the kth element. */
+            if ( tid == k ) aelem -= alpha;
+
+            /* Compute b_j -= alpha*v for the lower half. */
+            belem -= alpha * blockCache[ tid + k*32 ];
+        }
+
+        /* Put the elements back. */
+        blockA[ tid + j*32 ] = aelem;
+        blockB[ tid + j*32 ] = belem;
+    }
+    __syncthreads();
+}
+
+__device__ void runner ( int type , void *data ) {
+
+    __shared__ volatile float blockCache[ (32*32) + 128 ];
+    volatile float *workVector = blockCache + (32*32);
+
+    /* Decode the task data. */
+    int *idata = (int *)data;
+    int i = idata[0], j = idata[1], k = idata[2];
+
+    /* Decode and execute the task. */
+    switch ( type ) {
+        case task_SGEQRF:
+            if ( threadIdx.x < 32 )
+                runner_cuda_SGEQRF( i , j , k , workVector , blockCache );
+            if ( threadIdx.x == 0 ) {
+                printf( "SGEQRF: %i %i %i + %i\n" , i , j , k , (int)clock() );
+                printf( "tau = " );
+                for ( k = 0 ; k < cuda_m * cuda_n * 32 ; k++ )
+                    printf( "%.3f " , GPU_tau[k] );
+                printf( "\n" );
+            }
+            break;
+        case task_SLARFT:
+            if ( threadIdx.x == 0 )
+                printf( "SLARFT: %i %i %i + %i\n" , i , j , k , (int)clock() );
+            /* runner_cuda_SLARFT( i , j , k , workVector , blockCache ); */
+            break;
+        case task_STSQRF:
+            if ( threadIdx.x == 0 )
+                printf( "STSQRF: %i %i %i + %i\n" , i , j , k , (int)clock() );
+            /* runner_cuda_STSQRF( i , j , k , workVector , blockCache ); */
+            break;
+        case task_SSSRFT:
+            if ( threadIdx.x == 0 )
+                printf( "SSSRFT: %i %i %i + %i\n" , i , j , k , (int)clock() );
+            /* runner_cuda_SSSRFT( i , j , k , workVector , blockCache ); */
+            break;
+        default:
+            asm("trap;");
+    }
+}
+
+__device__ qsched_funtype function = runner;
+
+__global__ void Setup () {
+    /* Print the device-side runner pointer as a sanity check. */
+    printf( "%p\n" , (void *)function );
+}
+
+
+float* generateMatrix ( int m , int n ) {
+
+    float* Matrix;
+    cudaMallocHost( &Matrix , sizeof(float) * m*n*32*32 );
+    if ( Matrix == NULL )
+        error("Failed to allocate Matrix.");
+    int i, j;
+    memset( Matrix , 0 , sizeof(float)*m*n*32*32 );
+
+    /* Fill the column-major matrix with values derived from the tile
+       indices, so that tiles are easy to tell apart when printed. */
+    for ( i = 0 ; i < n*32 ; i++ ) {
+        for ( j = 0 ; j < m*32 ; j++ ) {
+            float value = i/32 + j/32;
+            if ( j/32 > 0 )
+                value += 2.f;
+            Matrix[ i*m*32 + j ] = value;
+        }
+    }
+    return Matrix;
+}
+
+
+float* columnToTile ( float* columnMatrix , int size , int m , int n ) {
+
+    float* TileMatrix;
+    cudaMallocHost( &TileMatrix , sizeof(float) * size );
+    if ( TileMatrix == NULL )
+        error("Failed to allocate TileMatrix.");
+    int i, j, k, l;
+
+    for ( i = 0 ; i < n ; i++ ) {
+        for ( j = 0 ; j < m ; j++ ) {
+            float *tilePos = &TileMatrix[ i*m*32*32 + j*32*32 ];
+            for ( k = 0 ; k < 32 ; k++ ) {
+                float *tileStart = &columnMatrix[ i*m*32*32 + k*m*32 + j*32 ];
+                for ( l = 0 ; l < 32 ; l++ )
+                    tilePos[ k*32 + l ] = tileStart[l];
+            }
+        }
+    }
+
+    return TileMatrix;
+}
+
+float* tileToColumn ( float* tileMatrix , int size , int m , int n ) {
+
+    float* ColumnMatrix;
+    ColumnMatrix = (float*) malloc( sizeof(float) * size );
+    if ( ColumnMatrix == NULL )
+        error("Failed to allocate ColumnMatrix.");
+    int i, j, k, l;
+
+    for ( i = 0 ; i < n ; i++ ) {
+        for ( j = 0 ; j < m ; j++ ) {
+            /* The tile on the ith tile column, jth tile row is at
+               (i*m + j)*32*32. */
+            float *tile = &tileMatrix[ i*m*32*32 + j*32*32 ];
+            /* The corresponding column-major block starts at row j*32. */
+            float *tilePos = &ColumnMatrix[ i*m*32*32 + j*32 ];
+            for ( k = 0 ; k < 32 ; k++ ) {
+                for ( l = 0 ; l < 32 ; l++ )
+                    tilePos[l] = tile[l];
+                /* The next 32 elements of the tile belong to the next column. */
+                tile = &tile[32];
+                /* Move to the j*32th position in the next column. */
+                tilePos = &tilePos[ 32*m ];
+            }
+        }
+    }
+    return ColumnMatrix;
+}
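+
+/* A small hypothetical helper making the layout conversion explicit:
+   element (r, c) of the column-major matrix, which lives at c*m*32 + r,
+   maps to the tiled layout as below. Useful when checking
+   columnToTile/tileToColumn by hand; not used by the code itself. */
+int tiledIndex ( int r , int c , int m ) {
+    int ti = r / 32, tj = c / 32;        /* Tile coordinates. */
+    return ( tj*m + ti )*32*32 + ( c % 32 )*32 + ( r % 32 );
+}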
+
+
+float* createIdentity ( int m , int n ) {
+
+    float* Matrix;
+    cudaMallocHost( &Matrix , sizeof(float) * m*n*32*32 );
+    if ( Matrix == NULL )
+        error("Failed to allocate Matrix.");
+    int i;
+    memset( Matrix , 0 , sizeof(float)*m*n*32*32 );
+
+    /* Set the diagonal; everything else is already zero. */
+    for ( i = 0 ; i < n*32 && i < m*32 ; i++ )
+        Matrix[ i*m*32 + i ] = 1.f;
+    return Matrix;
+}
+
+void printMatrix ( float* Matrix , int m , int n ) {
+
+    int i, j;
+    for ( i = 0 ; i < m*32 ; i++ ) {
+        for ( j = 0 ; j < n*32 ; j++ )
+            printf( " %.1f " , Matrix[ j*m*32 + i ] );
+        printf( "\n" );
+    }
+}
+
+
+/*void printTileMatrix ( float* Matrix , int m , int n ) {
+
+    int i, j;
+    float *tempMatrix, *tempMatrix2;
+    for ( i = 0 ; i < m*32 ; i++ ) {
+        int tiled = i/32;
+        tempMatrix = &Matrix[ tiled*32*32 ];
+        for ( j = 0 ; j < n*32 ; j++ ) {
+            int tile = j/32;
+            tempMatrix2 = &tempMatrix[ tile*32*32*m + i%32 ];
+            printf( " %.1f " , tempMatrix2[ j*32 ] );
+        }
+        printf( "\n" );
+    }
+}*/
+
+
+qsched_funtype func;
+
+void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
+
+    int k, j, i;
+    float *A, *A_orig, *tau, *tau_device, *device_array;
+    struct qsched s;
+    qsched_task_t *tid, tid_new;
+    qsched_res_t *rid;
+    int data[3];
+    ticks tic, toc_run, tot_setup, tot_run = 0;
+
+    /* Initialize the scheduler. */
+    qsched_init( &s , 1 , qsched_flag_none );
+
+    cudaDeviceReset();
+    cudaSetDevice(0);
+    Setup<<<1,1>>>();
+    if ( cudaDeviceSynchronize() != cudaSuccess )
+        error("Setup failed: %s", cudaGetErrorString(cudaPeekAtLastError()));
+
+    /* Allocate and fill the original matrix. */
+    if ( cudaMallocHost(&A, sizeof(float) * m * n * K * K) != cudaSuccess ||
+         cudaMallocHost(&tau, sizeof(float) * m * n * K) != cudaSuccess )
+        error("Failed to allocate matrices.");
+    /* for ( k = 0 ; k < m * n * K * K ; k++ )
+        A_orig[k] = 2.0f*((float)rand()) / RAND_MAX - 1.0f; */
+    A_orig = generateMatrix( m , n );
+    float *temp = columnToTile( A_orig , m * n * K * K , m , n );
+    cudaFreeHost( A_orig );
+    A_orig = temp;
+    /* temp = tileToColumn( A_orig , m*n*K*K , m , n );
+       printMatrix( temp , m , n );
+       printTileMatrix( A_orig , m , n ); */
+    memcpy( A , A_orig , sizeof(float) * m * n * K * K );
+    bzero( tau , sizeof(float) * m * n * K );
+
+    if ( cudaMalloc( &tau_device , sizeof(float) * m * n * K ) != cudaSuccess )
+        error("Failed to allocate tau on the device.");
+    if ( cudaMemcpy( tau_device , tau , sizeof(float) * m * n * K , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy the tau data to the device.");
+    if ( cudaMemcpyToSymbol( GPU_tau , &tau_device , sizeof(float *) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy the tau pointer to the device.");
+
+    if ( cudaMalloc( &device_array , sizeof(float) * m * n * K * K ) != cudaSuccess )
+        error("Failed to allocate the matrix on the device.");
+    if ( cudaMemcpyToSymbol( GPU_matrix , &device_array , sizeof(float *) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy the matrix pointer to the device.");
+
+    tic = getticks();
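+
+    /* The task graph built below is the standard tiled-QR DAG: for each
+       step k, SGEQRF on tile (k,k) unlocks the SLARFT tasks along row k
+       and the first STSQRF on column k; each STSQRF(i,k) unlocks the
+       next STSQRF(i+1,k) and the SSSRFT tasks on row i; and each
+       SSSRFT(i,j) depends on both STSQRF(i,k) and SLARFT(k,j). */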
+
+    /* Allocate and init the task ID and resource ID matrices. */
+    if ( cudaMallocHost( &tid , sizeof(qsched_task_t) * m*n ) != cudaSuccess )
+        error("Failed to allocate tid.");
+    if ( cudaMallocHost( &rid , sizeof(qsched_res_t) * m*n ) != cudaSuccess )
+        error("Failed to allocate rid.");
+    for ( k = 0 ; k < m * n ; k++ ) {
+        tid[k] = qsched_task_none;
+        rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none ,
+                                &A[ k*32*32 ] , sizeof(float) * 32 * 32 ,
+                                device_array + k*32*32 );
+    }
+
+    /* Build the tasks. */
+    for ( k = 0 ; k < m && k < n ; k++ ) {
+
+        /* Add the kth corner task. */
+        data[0] = k; data[1] = k; data[2] = k;
+        tid_new = qsched_addtask( &s , task_SGEQRF , task_flag_none , data , sizeof(int)*3 , 2 );
+        qsched_addlock( &s , tid_new , rid[ k*m + k ] );
+        if ( tid[ k*m + k ] != -1 )
+            qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
+        tid[ k*m + k ] = tid_new;
+
+        /* Add the column tasks on the kth row. */
+        for ( j = k+1 ; j < n ; j++ ) {
+            data[0] = k; data[1] = j; data[2] = k;
+            tid_new = qsched_addtask( &s , task_SLARFT , task_flag_none , data , sizeof(int)*3 , 3 );
+            qsched_addlock( &s , tid_new , rid[ j*m + k ] );
+            qsched_adduse( &s , tid_new , rid[ k*m + k ] );
+            qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
+            if ( tid[ j*m + k ] != -1 )
+                qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
+            tid[ j*m + k ] = tid_new;
+        }
+
+        /* For each following row... */
+        for ( i = k+1 ; i < m ; i++ ) {
+
+            /* Add the row tasks for the kth column. */
+            data[0] = i; data[1] = k; data[2] = k;
+            tid_new = qsched_addtask( &s , task_STSQRF , task_flag_none , data , sizeof(int)*3 , 3 );
+            qsched_addlock( &s , tid_new , rid[ k*m + i ] );
+            qsched_adduse( &s , tid_new , rid[ k*m + k ] );
+            qsched_addunlock( &s , tid[ k*m + (i-1) ] , tid_new );
+            if ( tid[ k*m + i ] != -1 )
+                qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
+            tid[ k*m + i ] = tid_new;
+
+            /* Add the inner tasks. */
+            for ( j = k+1 ; j < n ; j++ ) {
+                data[0] = i; data[1] = j; data[2] = k;
+                tid_new = qsched_addtask( &s , task_SSSRFT , task_flag_none , data , sizeof(int)*3 , 5 );
+                qsched_addlock( &s , tid_new , rid[ j*m + i ] );
+                qsched_adduse( &s , tid_new , rid[ k*m + i ] );
+                qsched_adduse( &s , tid_new , rid[ j*m + k ] );
+                qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
+                qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
+                if ( tid[ j*m + i ] != -1 )
+                    qsched_addunlock( &s , tid[ j*m + i ] , tid_new );
+                tid[ j*m + i ] = tid_new;
+            }
+        }
+
+    } /* Build the tasks. */
+
+    if ( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess )
+        error("Failed to copy the function pointer from the device.");
+
+    if ( cudaMemcpyToSymbol( cuda_m , &m , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy m to the device.");
+    if ( cudaMemcpyToSymbol( cuda_n , &n , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy n to the device.");
+
+    qsched_run_CUDA( &s , func );
+
+    /* Fetch the result back from the device and print it. */
+    cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K , cudaMemcpyDeviceToHost );
+    A = tileToColumn( A , m * n * K * K , m , n );
+    printMatrix( A , m , n );
+    /* printTileMatrix( A , m , n ); */
+}
+
+
+void test () {
+
+    int m = 2, n = 2;
+    float* Matrix = createIdentity( m , n );
+    /* printMatrix( Matrix , m , n ); */
+    float* MatrixTile = columnToTile( Matrix , m*n*32*32 , m , n );
+    printMatrix( &MatrixTile[ 0 ] , 1 , 1 );
+    printf("\n \n \n");
+    printMatrix( &MatrixTile[ 32*32 ] , 1 , 1 );
+    printf("\n \n \n");
+    printMatrix( &MatrixTile[ 2*32*32 ] , 1 , 1 );
+    printf("\n \n \n");
+    printMatrix( &MatrixTile[ 3*32*32 ] , 1 , 1 );
+    printf("\n \n \n");
+    /* printTileMatrix( MatrixTile , m , n ); */
+    cudaFreeHost( Matrix );
+    Matrix = tileToColumn( MatrixTile , m*n*32*32 , m , n );
+    printMatrix( Matrix , m , n );
+    cudaFreeHost( MatrixTile );
+    free( Matrix );
+}
+
+/**
+ * @brief Main function.
+ */
+int main ( int argc , char *argv[] ) {
+
+    int c, nr_threads = 128;
+    int M = 4, N = 4, runs = 1, K = 32;
+
+    /* Parse the options. */
+    while ( ( c = getopt( argc , argv , "m:n:k:r:t:D" ) ) != -1 )
+        switch ( c ) {
+            case 'm':
+                if ( sscanf( optarg , "%d" , &M ) != 1 )
+                    error( "Error parsing dimension M." );
+                break;
+            case 'n':
+                if ( sscanf( optarg , "%d" , &N ) != 1 )
+                    error( "Error parsing dimension N." );
+                break;
+            case 'k':
+                if ( sscanf( optarg , "%d" , &K ) != 1 )
+                    error( "Error parsing tile size." );
+                break;
+            case 'r':
+                if ( sscanf( optarg , "%d" , &runs ) != 1 )
+                    error( "Error parsing number of runs." );
+                break;
+            case 't':
+                if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
+                    error( "Error parsing number of threads." );
+                break;
+            case 'D':
+                test();
+                exit( EXIT_SUCCESS );
+            case '?':
+                fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] );
+                fprintf( stderr , "Computes the tiled QR decomposition of an MxN tiled\n"
+                                  "matrix using nr_threads threads.\n" );
+                exit( EXIT_FAILURE );
+        }
+
+    /* Dump arguments. */
+    message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
+             32*M , 32*N , nr_threads , runs );
+
+    test_qr( M , N , K , nr_threads , runs );
+
+}