SWIFT / QuickSched · Commit d2c38d96
Authored 10 years ago by Aidan Chalk
Added a file that wasn't uploaded
Parent: 563c738e
1 changed file: examples/test_qr.cu (new file, mode 100644), 1176 additions, 0 deletions
/* Config parameters. */
#include "../config.h"

/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>

/* Local includes. */
extern "C" {
#include "quicksched.h"
}
#include "cuda_queue.h"
enum task_types {
    task_SGEQRF,
    task_SLARFT,
    task_STSQRF,
    task_SSSRFT
};

#define TID threadIdx.x
#define numthreads 128
#define tilesize 32
#define cuda_maxtasks 1000000
#define CO(x,y,ldm) (((y)*ldm) + (x))
#define WARPS 4
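/* For illustration (not in the original): CO maps (row, column) to a
   column-major offset, e.g. CO(3, 2, 32) == (2 * 32) + 3 == 67, i.e. row 3 of
   column 2 in a 32x32 tile with leading dimension ldm = 32. */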
__device__ float *GPU_matrix;
__device__ float *GPU_tau;
__device__ int cuda_m;
__device__ int cuda_n;
/**
 * @brief Performs a reduction sum on a vector, storing the result in v[0].
 * @param sumVector SMEM The array to compute over.
 * @param startN The number of elements in the vector.
 *
 * Computes a binary reduction to find the sum of the elements in the vector,
 * storing the result in the first element of the vector.
 * Should be called with blockDim.x <= 1 warp (32 threads).
 */
__device__ void reduceSum(volatile float *sumVector, char startN) {
    /* Start with n = startN/2. */
    char n = startN >> 1;
    while (n > 0) {
        /* Use the first n threads to add up to 2n elements. */
        if (TID < n)
            sumVector[TID] = sumVector[TID] + sumVector[TID + n];
        /* Divide n by 2 for the next iteration. */
        n = n >> 1;
    }
}
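/* For illustration only (a host-side sketch, not part of the device code):
   the same halving pattern run serially. With startN = 8 and
   v = {1,2,3,4,5,6,7,8} the passes give {6,8,10,12,...}, {16,20,...},
   {36,...}, leaving the total 36 in v[0]. */
static void reduceSumHost(float *v, int startN) {
    for (int n = startN >> 1; n > 0; n >>= 1)
        for (int t = 0; t < n; t++) /* t plays the role of a thread with TID < n */
            v[t] += v[t + n];
}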
__device__ inline void reduceSumMultiWarp(float *value) {
#if __CUDA_ARCH__ >= 300
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1)
        *value += __shfl_xor((float)*value, mask);
    *value = __shfl((float)*value, 0);
#else
    __shared__ float stuff[32 * WARPS];
    int tid = TID % 32;
    int group = TID / 32;
    /* Each warp reduces its own 32-element segment of the shared buffer;
       the segment must first be seeded with this thread's value. */
    stuff[group * 32 + tid] = *value;
    char n = 32 >> 1;
    while (n > 0) {
        if (tid < n)
            stuff[group * 32 + tid] += stuff[group * 32 + tid + n];
        n = n >> 1;
    }
    __threadfence();
    *value = stuff[group * 32];
    //printf("Not supported.\n");
    //asm("trap;");
#endif
}
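/* For illustration only: on CUDA 9+ the warp-shuffle intrinsics take an
   explicit lane mask, so the butterfly reduction above would be written with
   __shfl_xor_sync/__shfl_sync. A sketch, not part of the original file: */
#if 0
__device__ inline void reduceSumMultiWarpSync(float *value) {
    for (int mask = 16; mask > 0; mask >>= 1)
        *value += __shfl_xor_sync(0xffffffff, *value, mask);
    *value = __shfl_sync(0xffffffff, *value, 0);
}
#endif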
/* Routines for the tiled QR decomposition. */
/**
 * @brief Computes the QR decomposition of a tile.
 *
 * @param cornerTile A pointer to the tile for which the decomposition is computed.
 * @param tileSize The number of elements on a row/column of the tile.
 * @param tauMatrix A pointer to the tau matrix.
 * @param k The value of k for the QR decomposition.
 * @param tauNum The number of tau values stored for each row of the matrix
 *        (this is equal to the number of tiles on each column).
 * @param w Per-thread scratch array of tileSize floats.
 *
 * Note: 32 threads max.
 */
__device__ void SGEQRF(float *__restrict__ cornerTile, int tileSize,
                       float *__restrict__ tauMatrix, int k, int tauNum,
                       float *w) {
    int i, j;
    float norm = 0.0, sign, u1, tau, z;
    /* TID (threadIdx.x) is assumed to be < 32 here; see runner(). */

    /* Find the householder vector for each row. */
    for (i = 0; i < tileSize; i++) {
        norm = 0.0;
        if (TID >= i)
            w[TID] = cornerTile[i * tileSize + TID] * cornerTile[i * tileSize + TID];
        else
            w[TID] = 0.0;
        reduceSumMultiWarp(&w[TID]);
        if (cornerTile[i * tileSize + i] >= 0.0)
            sign = -1;
        else
            sign = 1;
        norm = sqrt(w[0]);
        if (TID == 0)
            w[0] = 0.0;
        u1 = cornerTile[i * tileSize + i] - sign * norm;
        if (u1 != 0.0) {
            if (TID > i)
                w[TID] = w[TID] / u1;
        } else {
            if (TID > i)
                w[TID] = 0.0;
        }
        if (norm != 0.0)
            tau = -sign * u1 / norm;
        else
            tau = 0.0;
        /* Store the below-diagonal vector. */
        if (TID > i)
            cornerTile[i * tileSize + TID] = w[TID];
        if (TID == 0)
            cornerTile[i * tileSize + i] = sign * norm;
        if (TID == i)
            w[i] = 1.0;
        /* Apply the householder transformation to the rest of the tile,
           for everything to the right of the diagonal. */
        for (j = i + 1; j < tileSize; j++) {
            /* Compute w'*A_j. */
            if (TID >= i)
                z = cornerTile[j * tileSize + TID] * w[TID];
            else
                z = 0;
            reduceSumMultiWarp(&z);
            /* Tile(m,n) = Tile(m,n) - tau*w[n]*w'A_j. */
            if (TID >= i)
                cornerTile[j * tileSize + TID] =
                    cornerTile[j * tileSize + TID] - tau * w[TID] * z;
        }
        /* Store tau. We're on the (k*tileSize + i)th row, kth column. */
        if (TID == 0)
            tauMatrix[(k * tileSize + i) * tauNum + k] = tau;
    }
}
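/* A minimal serial sketch (hypothetical helper, illustration only) of the
   reflector SGEQRF builds per column: given x = the column from the diagonal
   down, it produces v (implicit leading 1) and tau with the same sign
   convention as the kernel above. */
static void householderVector(const float *x, int n, float *v, float *tau) {
    float norm = 0.0f;
    for (int r = 0; r < n; r++) norm += x[r] * x[r]; /* ||x||^2 */
    norm = sqrtf(norm);
    float sign = (x[0] >= 0.0f) ? -1.0f : 1.0f;      /* as in SGEQRF above */
    float u1 = x[0] - sign * norm;
    v[0] = 1.0f;                                     /* implicit leading 1 */
    for (int r = 1; r < n; r++) v[r] = (u1 != 0.0f) ? x[r] / u1 : 0.0f;
    *tau = (norm != 0.0f) ? -sign * u1 / norm : 0.0f;
}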
/**
 * @brief Applies the householder factorisation of the corner to the row tile.
 *
 * @param cornerTile A pointer to the tile in which the householder vectors are stored.
 * @param rowTile The tile to which the householder is applied.
 * @param tileSize The number of elements on a row/column of the tile.
 * @param jj The value of j for the QR decomposition.
 * @param kk The value of k for the QR decomposition.
 * @param tauMatrix A pointer to the tau matrix.
 * @param tauNum The number of tau values stored for each row of the matrix
 *        (this is equal to the number of tiles on each column).
 * @param w Scratch array of tileSize floats.
 */
void SLARFT(float *cornerTile, float *rowTile, int tileSize, int jj, int kk,
            float *tauMatrix, int tauNum, float *w) {
    int i, j, n;
    float z;

    /* For each row in the corner tile. */
    for (i = 0; i < tileSize; i++) {
        /* Get w for row i. */
        for (j = i; j < tileSize; j++) {
            w[j] = cornerTile[i * tileSize + j];
        }
        w[i] = 1.0;
        /* Apply to the row tile. */
        for (j = 0; j < tileSize; j++) {
            z = 0.0;
            /* Below the diagonal: compute w'*A_j. */
            for (n = i; n < tileSize; n++) {
                z = z + w[n] * rowTile[j * tileSize + n];
            }
            for (n = i; n < tileSize; n++) {
                rowTile[j * tileSize + n] = rowTile[j * tileSize + n]
                    - tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z;
            }
        }
    }
}
/**
 * @brief Computes a householder vector for a single-tile QR decomposition,
 *        storing the result in a shared array.
 * @param matelem The TIDth element of the column to be used for the calculation.
 * @param hhVector SMEM The array used to calculate and store the vector.
 * @param k The timestep at which this is called, i.e. where the diagonal falls.
 *
 * Calculates a householder vector for the vector passed as elements into a shared array.
 * Used only by runner_cuda_SGEQRF.
 */
__device__ void calcHH(float matelem, volatile float hhVector[], int k) {
    /* To store the value of 1/(v[k] + sign(v[k])*||v||). */
    float localdiv;
    /* Stores the sign of v[k]. */
    int sign;

    /* Read vectors into shared memory from below the diagonal. */
    if (TID >= k)
        hhVector[TID] = matelem;
    /* Above the diagonal, set to 0. */
    if (TID < k)
        hhVector[TID] = 0;
    /* Square each element to find ||v|| = sqrt(sum(v[i]^2)). */
    hhVector[TID] *= hhVector[TID];
    /* Reduce to find the sum of squares. */
    reduceSum(hhVector, 32);
    /* Set localdiv to equal ||v||. */
    localdiv = sqrt(hhVector[0]);

    /* According to the Householder algorithm in Golub & Van Loan. */
    if (localdiv != 0.0) {
        /* Load shared to communicate v[k] to all threads. */
        hhVector[TID] = matelem;
        /* Calculate the sign of v[k] locally. */
        sign = hhVector[k] >= 0 ? 1 : -1;
        /* Compute and store localdiv = ||v||*sign(v[k]). */
        localdiv *= sign;
        /* Compute and store localdiv = v[k] + ||v||*sign(v[k]). */
        localdiv += hhVector[k];
        /* Finally compute and store localdiv = 1/(v[k] + ||v||*sign(v[k])). */
        localdiv = 1.0f / localdiv;
    }
    /* If the norm is 0, do not change v. */
    else
        localdiv = 1.0f;

    /* Compute and store in shared memory v = v/(v[k] + ||v||*sign(v[k])). */
    if (TID < k)
        hhVector[TID] = 0.0f;
    /* v[k]/v[k] = 1. */
    if (TID == k)
        hhVector[TID] = 1.0f;
    if (TID > k)
        hhVector[TID] = matelem * localdiv;
}
/**
 * @brief Applies a householder vector to the submatrix starting at (k,k),
 *        and inserts the kth tau value into tau[k].
 * @param blockCache SMEM The 1024-element array storing the full tile to which the vector will be applied.
 * @param matTau The tau vector.
 * @param workingVector SMEM The array used in the computation as a scratchpad.
 * @param k Defines the submatrix.
 * @param hhVectorelem Element of the cumulatively-stored householder vector to apply.
 *
 * Works by first calculating the kth tau,
 * then proceeding to apply the vector to successive columns of the matrix A_j
 * with the formula A_j = A_j - tau[k]*v*(v'A_j).
 * The kth tau is then stored into memory.
 */
__device__ void applyHH(volatile float blockCache[], float *matTau,
                        volatile float workingVector[], int k,
                        float hhVectorelem) {
    float ktau, z; /* z stores the value of ktau*v*v'*A_j. */
    int j;
    float reg;

    /* Read the householder vector element to calculate ktau = 2/v'v. */
    reg /*workingVector[TID]*/ = hhVectorelem;
    /* Square the elements to start finding v'v. */
    reg *= reg;
    /* Perform a reduction to compute v'v. */
    //reduceSum(workingVector, 32);
    reduceSumMultiWarp(&reg);
    /* Finally compute ktau = 2/v'v. */
    ktau = 2.0 / reg /*workingVector[0]*/;

    /* For the columns of the submatrix starting at (k,k). */
    for (j = k; j < 32; j++) {
        /* Fill the working value to calculate v'*A_j. */
        reg = blockCache[TID + (j * 32)] * hhVectorelem;
        /* Perform a reduction to compute v'*A_j. */
        //reduceSum(workingVector, 32);
        reduceSumMultiWarp(&reg);
        /* Compute and store z = ktau*v[tid]. */
        z = hhVectorelem * ktau;
        /* Compute and store z = (v'A_j)*(ktau*v[tid]). */
        z *= reg /*workingVector[0]*/;
        /* Apply with A_j[tid] = A_j[tid] - v'A_j*ktau*v[tid]. */
        blockCache[TID + (j * 32)] -= z;
    }
    /* Store the essential (below diagonal) portion of the householder vector
       below the diagonal in the block at column k. */
    if (TID > k)
        blockCache[TID + k * 32] = hhVectorelem;
    /* Store the kth tau. */
    if (TID == 0)
        matTau[k] = ktau;
}
__device__ void runner_cuda_SGEQRF(int i, int j, int k,
                                   volatile float workingVector[],
                                   volatile float blockCache[]) {
    int kk, jj, refCache;
    refCache = TID;
    float *matrix = &GPU_matrix[j * cuda_m * 32 * 32 + i * 32 * 32];
    float *matTau = &GPU_tau[j * cuda_m * 32 + i * 32];

    /* Load the block into shared memory. */
    for (jj = 0; jj < 32; jj++) {
        blockCache[refCache] = matrix[refCache];
        refCache += 32;
    }
    for (kk = 0; kk < 32; kk++) {
        /* Calculate the kth hh vector from the kth column of the TIDth row of the matrix. */
        calcHH(blockCache[TID + (kk * 32)], /* (tid, k) */
               workingVector, kk);
        /* Calculate the application of the hh vector along row TID. */
        applyHH(blockCache, matTau, workingVector, kk, workingVector[TID]);
    }
    /* Copy the block back from shared memory. */
    refCache = TID;
    for (jj = 0; jj < 32; jj++) {
        matrix[refCache] = blockCache[refCache];
        refCache += 32;
    }
    __threadfence();
}
__device__ void runner_cuda_SLARFT(int i, int j, int k, volatile float tau[],
                                   volatile float blockCache[]) {
    int jj, kk, ii, tid = TID % 32, group = TID / 32;
    __shared__ volatile float groupCache[32 * 4];
    float *blockV = &GPU_matrix[k * cuda_m * 32 * 32 + k * 32 * 32];
    float *blockA = &GPU_matrix[j * cuda_m * 32 * 32 + i * 32 * 32];
    float *blockTau = &GPU_tau[i * cuda_m * 32 + i * 32];
    /* cacheCol is a column in the groupCache. */
    volatile float *cacheCol = groupCache + (group * 32);
    float alpha, belem;

    __syncthreads();
    /* Load the tau vector. */
    if (TID < 32)
        tau[TID] = blockTau[TID];
    /* Load the vectors. */
    for (jj = group; jj < 32; jj += WARPS) {
        if (tid < jj)
            blockCache[tid + (jj * 32)] = 0.0f;
        else if (tid == jj)
            blockCache[tid + (jj * 32)] = 1.0f;
        else if (tid > jj)
            blockCache[tid + (jj * 32)] = blockV[tid + (jj * 32)];
        //__syncthreads();
    }
    __syncthreads();

    /* Compute b_j -= tau*v*v'b_j, for all vectors in the cached block V. */
    for (jj = group; jj < 32; jj += WARPS) {
        /* Read the column into registers. */
        belem = blockA[tid + (jj * 32)];
        if (jj == 0)
            printf("%.3f %i\n", belem, TID); /* debug output */
        /* For each vector in the block of vectors. */
        for (kk = 0; kk < 32; kk++) {
            /* Compute alpha = v'*b_j. */
            cacheCol[tid] = blockCache[tid + (kk * 32)] * belem;
            if (group == 0 && jj == 0 && kk == 0)
                printf("%.3f %i %.3f %.3f\n", belem, TID,
                       blockCache[tid + kk * 32], cacheCol[tid]); /* debug output */
            if (tid < 16)
                cacheCol[tid] += cacheCol[tid + 16];
            if (tid < 8)
                cacheCol[tid] += cacheCol[tid + 8];
            alpha = cacheCol[0];
            for (ii = 1; ii < 8; ii++)
                alpha += cacheCol[ii];
            /* Compute alpha = tau * v_tid * alpha. */
            alpha *= tau[kk];
            if (TID == 0 && kk == 0)
                printf("tau[kk] = %.3f\n", tau[kk]); /* debug output */
            alpha *= blockCache[tid + (kk * 32)];
            if (jj == 0 && kk == 0)
                printf("%.3f\n", alpha); /* debug output */
            /* Compute belem -= alpha. */
            belem -= alpha;
            //__syncthreads();
        }
        blockA[tid + (jj * 32)] = belem;
        //__syncthreads();
    }
    __threadfence();
}
__device__ void applyOneHHVectD(float *topElem, float *lowElem, int k,
                                volatile float tau[],
                                volatile float blockCache[]) {
    float alpha;
    __shared__ volatile float workV[32];
    float reg;
    int tid = TID % 32;

    /* Compute the dot product of the stacked vector with the reflector:
       the top part contributes only its kth element (implicit 1 in v). */
    if (tid == k)
        reg /*workV[TID]*/ = *topElem;
    if (tid != k)
        reg /*workV[TID]*/ = 0.0f;
    reg /*workV[TID]*/ += *lowElem * blockCache[tid + (k * 32)];
    //reduceSum(workV, 32);
    reduceSumMultiWarp(&reg);
    alpha = reg /*workV[0]*/;
    /* Multiply by tau. */
    alpha *= tau[k];
    /* Compute alpha *= a_tid,j*v_tid,k. */
    if (tid == k)
        *topElem -= alpha;
    /* For the lower element. */
    alpha *= blockCache[tid + (k * 32)];
    *lowElem -= alpha;
}
__device__ void calcDoubleHHWY(float topElem, float lowElem, int k,
                               volatile float blockCache[]) {
    /* Calculate v_k. */
    int sign, i;
    float topSum = 0, lowSum = 0, alpha;
    __shared__ volatile float first;

    if (TID == k)
        first = topElem;
    topSum = first * first;
    /* Sum the squares of v to compute the norm, using a column of blockCache
       as working space. */
    blockCache[TID + (k * 32)] = lowElem * lowElem;
    for (i = 0; i < 32; i++)
        lowSum += blockCache[i + (k * 32)];
    alpha = sqrt(topSum + lowSum);
    sign = first >= 0.f ? 1.f : -1.f;
    if (alpha != 0.0f) {
        /* The zeroth element is first + sign*norm. */
        alpha = first + sign * alpha;
        alpha = 1.0f / alpha;
    } else
        alpha = 1.0f;
    //topElem *= alpha;
    lowElem *= alpha;
    blockCache[TID + (k * 32)] = lowElem;
}
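/* Serial sketch (hypothetical helper, illustration only) of the same stacked
   ("double") reflector for column k: the top part contributes only its
   diagonal entry, the bottom part is a full column of tile B, and the
   essential part written to blockCache is low[i] * alpha. */
static float doubleHHHost(float top, const float *low, int n, float *vLow) {
    float norm2 = top * top;
    for (int i = 0; i < n; i++) norm2 += low[i] * low[i];
    float sign = (top >= 0.0f) ? 1.0f : -1.0f;
    float alpha = top + sign * sqrtf(norm2); /* zeroth element: first + sign*norm */
    alpha = (alpha != 0.0f) ? 1.0f / alpha : 1.0f;
    for (int i = 0; i < n; i++) vLow[i] = low[i] * alpha; /* essential part */
    return alpha;
}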
__device__ void runner_cuda_STSQRF(int i, int j, int k,
                                   volatile float tauVect[],
                                   volatile float blockCache[]) {
    /* Idea: for each column j of (AB)^T,
       apply HH vectors 0...j-1,
       compute the new HH vector for column j,
       store the essential part in blockCache.
       Uses one block cache and one vector storage. */
    float *blockA = &GPU_matrix[k * cuda_m * 32 * 32 + k * 32 * 32];
    float *blockB = &GPU_matrix[j * cuda_m * 32 * 32 + i * 32 * 32];
    float *blockTau = &GPU_tau[j * cuda_m * 32 + i * 32];
    int tid = TID % 32;
    int group = TID / 32;
    float topElem, lowElem, tau;

    for (j = 0; j < 32; j++) {
        if (TID < 32) {
            if (TID <= j)
                topElem = blockA[TID + (j * 32)];
            if (TID > j)
                topElem = 0.f;
            lowElem = blockB[TID + (j * 32)];
            //for(k = 0; k < j; k++)
            //{
            //    /* Compute b_tid,j = b_tid,j - tau*vv'*b_tid,j */
            //    applyOneHHVectD(&topElem, &lowElem, k, tauVect, blockCache);
            //}
            calcDoubleHHWY(topElem, lowElem, j, blockCache);
            /* Compute the new tau = 2/v'v; the implicit top element of v is 1. */
            tau = 1.0f;
            for (i = 0; i < 32; i++)
                tau += blockCache[i + (j * 32)] * blockCache[i + (j * 32)];
            tau = 2.0f / tau;
            if (TID == j)
                tauVect[j] = tau;
        }
        __syncthreads();
        /* Apply the new vector to the remaining columns. */
        for (k = j + group; k < 32; k += WARPS) {
            if (k != j) {
                if (tid <= k)
                    topElem = blockA[tid + k * 32];
                if (tid > k)
                    topElem = 0.f;
                lowElem = blockB[tid + k * 32];
            }
            applyOneHHVectD(&topElem, &lowElem, j, tauVect, blockCache);
            if (tid <= k)
                blockA[tid + k * 32] = topElem;
            blockB[tid + k * 32] = lowElem;
            //__syncthreads();
        }
        __syncthreads();
        /* Write back. */
        //if(TID <= j)
        //    blockA[TID + (j*ldm)] = topElem;
    }
    /* Write back the lower block, containing the householder vectors. */
    if (TID < 32) {
        for (j = 0; j < 32; j++)
            blockB[TID + (j * 32)] = blockCache[TID + (j * 32)];
        blockTau[TID] = tauVect[TID];
    }
    __threadfence();
}
__device__ void runner_cuda_SSSRFT(int i, int j, int k, volatile float tau[],
                                   volatile float blockCache[]) {
    __shared__ volatile float workV[WARPS * 32];
    __syncthreads();
    float aelem, belem, alpha;
    float *blockV = &GPU_matrix[k * cuda_m * 32 * 32 + i * 32 * 32];
    float *blockA = &GPU_matrix[j * cuda_m * 32 * 32 + k * 32 * 32];
    float *blockB = &GPU_matrix[j * cuda_m * 32 * 32 + i * 32 * 32];
    float *blockTau = &GPU_tau[k * cuda_m * 32 + i * 32];
    int tid, group, refMat, refCache;
    tid = TID % 32;
    group = TID / 32;
    refMat = tid + group * 32;
    refCache = tid + group * 32;

    if (TID < 32)
        tau[TID] = blockTau[TID];
    __syncthreads();

    /* Load the essential HH vector block into the shared cache. */
    for (j = group; j < 32; j += WARPS) {
        blockCache[refCache] = blockV[refMat];
        refMat += WARPS * 32;
        refCache += WARPS * 32;
        __syncthreads();
    }
    __syncthreads();

    /* For each column of the result. */
    for (j = group; j < 32; j += WARPS) {
        //if(tid == 0) printf("%d: column %d.\n", group, j);
        /* Load the elements of the column to process. */
        aelem = blockA[tid + (j * 32)];
        belem = blockB[tid + (j * 32)];
        /* Compute and apply b_j = b_j - tau*vv'b_j
           for each Householder vector 1..32. */
        for (k = 0; k < 32; k++) {
            /* Load the kth element into all threads. */
            workV[tid + (group * 32)] = aelem;
            /* Store this as alpha. */
            alpha = workV[k + (group * 32)];
            /* Load the components of v'b_j to sum. */
            workV[tid + (group * 32)] = belem * blockCache[tid + (k * 32)];
            if (tid < 16)
                workV[tid + (group * 32)] += workV[16 + tid + (group * 32)];
            if (tid < 8)
                workV[tid + (group * 32)] += workV[8 + tid + (group * 32)];
            /* Compute v'b_j. */
            for (i = 0; i < 8; i++)
                alpha += workV[i + (group * 32)];
            /* Multiply by the kth tau. */
            alpha *= tau[k];
            /* Compute b_j -= alpha*v.
               If this is the kth thread in the group, v is 1 at aelem. */
            if (tid == k)
                aelem -= alpha;
            /* Compute b_j -= alpha * v for the lower half. */
            belem -= alpha * blockCache[tid + (k * 32)];
            __syncthreads();
        }
        /* Put the elements back. */
        blockA[tid + (j * 32)] = aelem;
        blockB[tid + (j * 32)] = belem;
        //__syncthreads();
    }
    __syncthreads();
}
__device__ void runner(int type, void *data) {
    __shared__ volatile float blockCache[(32 * 32) + 128];
    volatile float *workVector;
    workVector = blockCache + (32 * 32);

    /* Decode the task data. */
    int *idata = (int *)data;
    int i = idata[0], j = idata[1], k = idata[2];

    /* Decode and execute the task. */
    switch (type) {
        case task_SGEQRF:
            if (threadIdx.x < 32)
                runner_cuda_SGEQRF(i, j, k, workVector, blockCache);
            if (threadIdx.x == 0) { /* debug output */
                printf("SGEQRF: %i %i %i + %i\n", i, j, k, clock());
                printf("tau = ");
                for (k = 0; k < cuda_m * cuda_n * 32; k++) {
                    printf("%.3f ", GPU_tau[k]);
                }
                printf("\n");
            }
            break;
        case task_SLARFT:
            if (threadIdx.x == 0)
                printf("SLARFT: %i %i %i + %i\n", i, j, k, clock());
            //runner_cuda_SLARFT(i, j, k, workVector, blockCache);
            break;
        case task_STSQRF:
            if (threadIdx.x == 0)
                printf("STSQRF: %i %i %i + %i\n", i, j, k, clock());
            //runner_cuda_STSQRF(i, j, k, workVector, blockCache);
            break;
        case task_SSSRFT:
            if (threadIdx.x == 0)
                printf("SSSRFT: %i %i %i + %i\n", i, j, k, clock());
            //runner_cuda_SSSRFT(i, j, k, workVector, blockCache);
            break;
        default:
            asm("trap;");
    }
}
__device__ qsched_funtype function = runner;

__global__ void Setup() {
    /* Print the device-side function pointer (%p rather than %i, since the
       value is a pointer). */
    printf("%p\n", (void *)function);
}
float *generateMatrix(int m, int n) {
    float *Matrix;
    cudaMallocHost(&Matrix, sizeof(float) * m * n * 32 * 32);
    if (Matrix == NULL)
        error("Failed to allocate Matrix");
    int i, j;
    memset(Matrix, 0, sizeof(float) * m * n * 32 * 32);
    for (i = 0; i < n * 32; i++) {
        for (j = 0; j < m * 32; j++) {
            float value = i / 32 + j / 32;
            if (j / 32 > 0) {
                value += 2.f;
            }
            Matrix[i * m * 32 + j] = value;
        }
    }
    return Matrix;
}
float *columnToTile(float *columnMatrix, int size, int m, int n) {
    float *TileMatrix;
    cudaMallocHost(&TileMatrix, sizeof(float) * size);
    if (TileMatrix == NULL)
        error("failed to allocate TileMatrix");
    int i, j, k, l;
    for (i = 0; i < n; i++) {
        for (j = 0; j < m; j++) {
            float *tileStart;
            float *tilePos = &TileMatrix[i * m * 32 * 32 + j * 32 * 32];
            for (k = 0; k < 32; k++) {
                tileStart = &columnMatrix[i * m * 32 * 32 + k * m * 32 + j * 32];
                for (l = 0; l < 32; l++) {
                    tilePos[k * 32 + l] = tileStart[l];
                }
            }
        }
    }
    return TileMatrix;
}
float *tileToColumn(float *tileMatrix, int size, int m, int n) {
    float *ColumnMatrix;
    ColumnMatrix = (float *)malloc(sizeof(float) * size);
    if (ColumnMatrix == NULL)
        error("failed to allocate ColumnMatrix");
    int i, j, k, l;
    for (i = 0; i < n; i++) {
        for (j = 0; j < m; j++) {
            /* The tile on the ith tile-column is at i*m*32*32,
               and on the jth tile-row at j*32*32. */
            float *tile = &tileMatrix[i * m * 32 * 32 + j * 32 * 32];
            /* The column starts at the same position as the tile, row j*32. */
            float *tilePos = &ColumnMatrix[i * m * 32 * 32 + j * 32];
            for (k = 0; k < 32; k++) {
                for (l = 0; l < 32; l++) {
                    tilePos[l] = tile[l];
                }
                /* The next 32 elements are the position of the tile in the next column. */
                tile = &tile[32];
                /* Move to the (j*32)th position in the next column. */
                tilePos = &tilePos[32 * m];
            }
        }
    }
    return ColumnMatrix;
}
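/* For illustration (hypothetical helper, not in the original): the offset of
   the global element (row, col) of an (m*32) x (n*32) column-major matrix
   inside the tiled layout produced by columnToTile, i.e. the inverse view of
   tileToColumn. */
static int tileOffset(int row, int col, int m) {
    int ti = row / 32, tj = col / 32; /* tile coordinates */
    int r = row % 32, c = col % 32;   /* position inside the 32x32 tile */
    return tj * m * 32 * 32 + ti * 32 * 32 + c * 32 + r;
}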
float *createIdentity(int m, int n) {
    float *Matrix;
    cudaMallocHost(&Matrix, sizeof(float) * m * n * 32 * 32);
    if (Matrix == NULL)
        error("Failed to allocate Matrix");
    int i, j;
    memset(Matrix, 0, sizeof(float) * m * n * 32 * 32);
    for (i = 0; i < n * 32; i++) {
        for (j = 0; j < m * 32; j++) {
            if (i == j) {
                Matrix[i * m * 32 + j] = 1.f;
            } else {
                Matrix[i * m * 32 + j] = 0.f;
            }
        }
    }
    return Matrix;
}
void printMatrix(float *Matrix, int m, int n) {
    int i, j;
    for (i = 0; i < m * 32; i++) {
        for (j = 0; j < n * 32; j++) {
            printf(" %.1f ", Matrix[j * m * 32 + i]);
        }
        printf("\n");
    }
}
/*void printTileMatrix(float* Matrix, int m, int n)
{
int i,j;
float *tempMatrix, *tempMatrix2;
for(i = 0; i < m*32; i++)
{
int tiled = i/32;
tempMatrix = &Matrix[tiled*32*32];
for(j = 0; j < n*32; j++)
{
int tile = j/32;
tempMatrix2 = &tempMatrix[tile*32*32*m + i%32];
printf(" %.1f ", tempMatrix2[j*32]);
//printf(" %.1f,%i ", tempMatrix2[j*32], &tempMatrix2[j*32]-Matrix);
}
printf("\n");
}
}*/
qsched_funtype func;
void test_qr(int m, int n, int K, int nr_threads, int runs) {
    int k, j, i;
    float *A, *A_orig, *tau, *tau_device, *device_array, *temp_device_array;
    struct qsched s;
    qsched_task_t *tid, tid_new;
    qsched_res_t *rid;
    int data[3];
    ticks tic, toc_run, tot_setup, tot_run = 0;

    /* Initialize the scheduler. */
    qsched_init(&s, 1, qsched_flag_none);
    cudaDeviceReset();
    cudaSetDevice(0);
    Setup<<<1, 1>>>();
    if (cudaDeviceSynchronize() != cudaSuccess)
        error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError()));

    /* Allocate and fill the original matrix. */
    if (cudaMallocHost(&A, sizeof(float) * m * n * K * K) != cudaSuccess ||
        cudaMallocHost(&tau, sizeof(float) * m * n * K) != cudaSuccess ||
        cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K) != cudaSuccess)
        error("Failed to allocate matrices.");
    /* A_orig is re-allocated by generateMatrix below. */
    cudaFreeHost(A_orig);
    // for (k = 0; k < m * n * K * K; k++)
    //     A_orig[k] = 2.0f * ((float)rand()) / RAND_MAX - 1.0f;
    A_orig = generateMatrix(m, n);
    float *temp = columnToTile(A_orig, m * n * K * K, m, n);
    cudaFreeHost(A_orig);
    A_orig = temp;
    temp = tileToColumn(A_orig, m * n * K * K, m, n);
    // printMatrix(temp, m, n);
    // printTileMatrix(A_orig, m, n);
    memcpy(A, A_orig, sizeof(float) * m * n * K * K);
    bzero(tau, sizeof(float) * m * n * K);
    if (cudaMalloc(&tau_device, sizeof(float) * m * n * K) != cudaSuccess)
        error("Failed to allocate tau on the device");
    if (cudaMemcpy(tau_device, tau, sizeof(float) * m * n * K,
                   cudaMemcpyHostToDevice) != cudaSuccess)
        error("Failed to copy the tau data to the device.");
    if (cudaMemcpyToSymbol(GPU_tau, &tau_device, sizeof(float *), 0,
                           cudaMemcpyHostToDevice) != cudaSuccess)
        error("Failed to copy the tau pointer to the device.");
    if (cudaMalloc(&device_array, sizeof(float) * m * n * K * K) != cudaSuccess)
        error("Failed to allocate the matrix on the device");
    if (cudaMemcpyToSymbol(GPU_matrix, &device_array, sizeof(float *), 0,
                           cudaMemcpyHostToDevice) != cudaSuccess)
        error("Failed to copy the matrix pointer to the device");

    /* Allocate and init the task ID and resource ID matrix. */
    tic = getticks();
    if (cudaMallocHost(&tid, sizeof(qsched_task_t) * m * n) != cudaSuccess)
        error("Failed to allocate tid");
    if (cudaMallocHost(&rid, sizeof(qsched_task_t) * m * n) != cudaSuccess)
        error("Failed to allocate rid");
    for (k = 0; k < m * n; k++) {
        tid[k] = qsched_task_none;
        if (cudaHostGetDevicePointer(&temp_device_array, &A[k * 32 * 32], 0) != cudaSuccess)
            error("Failed to get device pointer to matrix");
        rid[k] = qsched_addres(&s, qsched_owner_none, qsched_res_none,
                               &A[k * 32 * 32], sizeof(float) * 32 * 32,
                               device_array + k * 32 * 32);
    }

    /* Build the tasks. */
    for (k = 0; k < m && k < n; k++) {
        /* Add the kth corner task. */
        data[0] = k;
        data[1] = k;
        data[2] = k;
        tid_new = qsched_addtask(&s, task_SGEQRF, task_flag_none, data,
                                 sizeof(int) * 3, 2);
        qsched_addlock(&s, tid_new, rid[k * m + k]);
        if (tid[k * m + k] != -1)
            qsched_addunlock(&s, tid[k * m + k], tid_new);
        tid[k * m + k] = tid_new;
        /* Add the column tasks on the kth row. */
        for (j = k + 1; j < n; j++) {
            data[0] = k;
            data[1] = j;
            data[2] = k;
            tid_new = qsched_addtask(&s, task_SLARFT, task_flag_none, data,
                                     sizeof(int) * 3, 3);
            qsched_addlock(&s, tid_new, rid[j * m + k]);
            qsched_adduse(&s, tid_new, rid[k * m + k]);
            qsched_addunlock(&s, tid[k * m + k], tid_new);
            if (tid[j * m + k] != -1)
                qsched_addunlock(&s, tid[j * m + k], tid_new);
            tid[j * m + k] = tid_new;
        }
        /* For each following row... */
        for (i = k + 1; i < m; i++) {
            /* Add the row tasks for the kth column. */
            data[0] = i;
            data[1] = k;
            data[2] = k;
            tid_new = qsched_addtask(&s, task_STSQRF, task_flag_none, data,
                                     sizeof(int) * 3, 3);
            qsched_addlock(&s, tid_new, rid[k * m + i]);
            qsched_adduse(&s, tid_new, rid[k * m + k]);
            qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
            if (tid[k * m + i] != -1)
                qsched_addunlock(&s, tid[k * m + i], tid_new);
            tid[k * m + i] = tid_new;
            /* Add the inner tasks. */
            for (j = k + 1; j < n; j++) {
                data[0] = i;
                data[1] = j;
                data[2] = k;
                tid_new = qsched_addtask(&s, task_SSSRFT, task_flag_none, data,
                                         sizeof(int) * 3, 5);
                qsched_addlock(&s, tid_new, rid[j * m + i]);
                qsched_adduse(&s, tid_new, rid[k * m + i]);
                qsched_adduse(&s, tid_new, rid[j * m + k]);
                qsched_addunlock(&s, tid[k * m + i], tid_new);
                qsched_addunlock(&s, tid[j * m + k], tid_new);
                if (tid[j * m + i] != -1)
                    qsched_addunlock(&s, tid[j * m + i], tid_new);
                tid[j * m + i] = tid_new;
            }
        }
    }
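    /* For a 2x2 grid of tiles the loops above build the classic tiled-QR DAG
       (an illustration, derived from the locks/unlocks added above):
           SGEQRF(0,0) -> SLARFT(0,1), STSQRF(1,0)
           STSQRF(1,0), SLARFT(0,1) -> SSSRFT(1,1)
           SSSRFT(1,1) -> SGEQRF(1,1)
       where each task locks the tile it overwrites and "uses" the tiles it
       only reads. */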
    /* Fetch the function pointer and run the tasks. */
    if (cudaMemcpyFromSymbol(&func, function, sizeof(qsched_funtype)) != cudaSuccess)
        error("Failed to copy the function pointer from the device");
    if (cudaMemcpyToSymbol(cuda_m, &m, sizeof(int), 0, cudaMemcpyHostToDevice) != cudaSuccess)
        error("Failed to copy m to the device");
    if (cudaMemcpyToSymbol(cuda_n, &n, sizeof(int), 0, cudaMemcpyHostToDevice) != cudaSuccess)
        error("Failed to copy n to the device");
    qsched_run_CUDA(&s, func);
    /* Copy the result back from the device (note the direction:
       cudaMemcpyDeviceToHost). */
    cudaMemcpy(A, device_array, sizeof(float) * m * n * K * K,
               cudaMemcpyDeviceToHost);
    A = tileToColumn(A, m * n * K * K, m, n);
    printMatrix(A, m, n);
    // printTileMatrix(A, m, n);
}
void test() {
    int m = 2, n = 2;
    float *Matrix = createIdentity(m, n);
    //printMatrix(Matrix, m, n);
    float *MatrixTile = columnToTile(Matrix, m * n * 32 * 32, m, n);
    printMatrix(&MatrixTile[0], 1, 1);
    printf("\n\n\n");
    printMatrix(&MatrixTile[32 * 32], 1, 1);
    printf("\n\n\n");
    printMatrix(&MatrixTile[64 * 32], 1, 1);
    printf("\n\n\n");
    printMatrix(&MatrixTile[3 * 32 * 32], 1, 1);
    printf("\n\n\n");
    //printTileMatrix(MatrixTile, m, n);
    /* Matrix and MatrixTile were allocated with cudaMallocHost, so release
       them with cudaFreeHost; only the tileToColumn result uses malloc. */
    cudaFreeHost(Matrix);
    Matrix = tileToColumn(MatrixTile, m * n * 32 * 32, m, n);
    printMatrix(Matrix, m, n);
    cudaFreeHost(MatrixTile);
    free(Matrix);
}
/**
 * @brief Main function.
 */
int main(int argc, char *argv[]) {
    int c, nr_threads = 128;
    int M = 4, N = 4, runs = 1, K = 32;

    /* Get the number of threads. */
    //#pragma omp parallel shared(nr_threads)
    //{
    //    if (omp_get_thread_num() == 0)
    //        nr_threads = omp_get_num_threads();
    //}

    /* Parse the options. */
    while ((c = getopt(argc, argv, "m:n:k:r:t:D")) != -1)
        switch (c) {
            case 'm':
                if (sscanf(optarg, "%d", &M) != 1)
                    error("Error parsing dimension M.");
                break;
            case 'n':
                if (sscanf(optarg, "%d", &N) != 1)
                    error("Error parsing dimension N.");
                break;
            case 'k':
                if (sscanf(optarg, "%d", &K) != 1)
                    error("Error parsing tile size.");
                break;
            case 'r':
                if (sscanf(optarg, "%d", &runs) != 1)
                    error("Error parsing number of runs.");
                break;
            case 't':
                if (sscanf(optarg, "%d", &nr_threads) != 1)
                    error("Error parsing number of threads.");
                //omp_set_num_threads(nr_threads);
                break;
            case 'D':
                test();
                exit(EXIT_SUCCESS);
            case '?':
                fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
                        argv[0]);
                fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n"
                                "matrix using nr_threads threads.\n");
                exit(EXIT_FAILURE);
        }

    /* Dump arguments. */
    message("Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs).",
            32 * M, 32 * N, nr_threads, runs);
    test_qr(M, N, K, nr_threads, runs);
}
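/* Example invocation (binary name hypothetical; the flags match the getopt
   string above): compute the tiled QR of a 128x128 matrix (4x4 tiles of 32x32)
   with 128 threads, or run the tiling self-test with -D:
       ./test_qr -m 4 -n 4 -t 128
       ./test_qr -D
*/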