diff --git a/examples/Makefile.am b/examples/Makefile.am
index 53fbf930f2911da0182bfe846bf48b452ce86dbf..152221294d79d6d628c5dc5c5776bc9f2e309f28 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -28,7 +28,7 @@ AM_CFLAGS = -g -O3 -Wall -Werror -I../src -ffast-math -fstrict-aliasing \
 AM_LDFLAGS = -lm # -fsanitize=address
 
 # Set-up the library
-bin_PROGRAMS = test test_bh test_bh_2 test_bh_3 test_bh_sorted
+bin_PROGRAMS = test test_bh test_bh_2 test_bh_3 test_bh_sorted test_qr
 #test_qr
 # Sources for test
 test_SOURCES = test.c
@@ -36,9 +36,10 @@ test_CFLAGS = $(AM_CFLAGS)
 test_LDADD =  ../src/.libs/libquicksched.a
 
 # Sources for test_qr
-test_qr_SOURCES = test_qr.c
-test_qr_CFLAGS = $(AM_CFLAGS)
-test_qr_LDADD =  ../src/.libs/libquicksched.a -llapacke -llapacke -lblas
+#-L/usr/bin64/atlas/ /home/aidan/lapack-3.5.0/liblapacke.a /home/aidan/lapack-3.5.0/liblapack.a
+test_qr_SOURCES = test_qr.c /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a
+test_qr_CFLAGS = $(AM_CFLAGS) -I/home/aidan/ATLAS/ATLAS_linux/include #-I/home/aidan/lapack-3.5.0/lapacke/include
+test_qr_LDADD =  ../src/.libs/libquicksched.a  -lf77blas -lcblas -latlas -lm  -L/home/aidan/ATLAS/ATLAS_linux/lib/ 
 
 # Sources for test_bh
 test_bh_SOURCES = test_bh.c
diff --git a/examples/test_qr.c b/examples/test_qr.c
index 2a754c04d4976e138170aaaf261bfcd8afe77ba5..ada33a33acf4833ed478e10bcd0f0cadf466a5e5 100644
--- a/examples/test_qr.c
+++ b/examples/test_qr.c
@@ -28,202 +28,472 @@
 #include <unistd.h>
 #include <math.h>
 #include <omp.h>
+#include <pthread.h>
+
+
+
 
-/* LAPACKE header. */
-#include <lapacke.h>
 #include <cblas.h>
 
 
 /* Local includes. */
 #include "quicksched.h"
 
-
-/*
- * Sam's routines for the tiled QR decomposition.
+/**
+ * Takes a column major matrix, NOT tile major. size is length of a side of the matrix. Only works for square matrices.
+ * This function is simply for validation and is implemented naively as we know of no implementation to retrieve Q from the tiled QR.
  */
+double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum)
+{
+    double* Q = malloc(sizeof(double)*size*size);
+    double* Qtemp = malloc(sizeof(double)*size*size);
+    double* w = malloc(sizeof(double)*size);
+    double* ww = malloc(sizeof(double)*size*size);
+    double* temp = malloc(sizeof(double)*size*size);
+    int i, k, l, j, n;
+    bzero(Q, sizeof(double)*size*size);
+    bzero(Qtemp, sizeof(double)*size*size);
+    bzero(ww, sizeof(double)*size*size);
+    for(i = 0; i < size; i++)
+    {
+        Q[i*size + i] = 1.0;
+    }
+    int numcoltile = size / tilesize;
+    int numrowtile = size / tilesize;
+    for(k = 0; k < numrowtile; k++)
+    {
+        for(l = 0; l < tilesize; l++)
+        {
+            bzero(Qtemp, sizeof(double)*size*size);
+            for(i = 0; i < size; i++)
+            {
+                Qtemp[i*size + i] = 1.0;
+            }
 
-/*
-  \brief Computes 2-norm of a vector \f$x\f$
-  
-  Computes the 2-norm by computing the following: \f[\textrm{2-norm}=\sqrt_0^lx(i)^2\f]
- */
-double do2norm(double* x, int l)
+
+            for(i = k; i < numcoltile; i++)
+            {
+                bzero(w, sizeof(double)*size);
+                
+                for(j = 0 ; j < tilesize; j++)
+                {
+                    w[i*tilesize + j] = HR[(k*tilesize+l)*size + i*tilesize+j];
+                }
+                    w[k*tilesize+l] = 1.0;
+                if(k*tilesize+l > i*tilesize)
+                {
+                    for(j = 0; j < k*tilesize+l; j++)
+                        w[j] = 0.0;
+                }
+
+
+                /* Compute (I - tau*w*w')' */
+                for(j = 0; j < size; j++)
+                {
+                    for(n = 0; n < size; n++)
+                    {
+                        if(j != n)
+                            ww[n*size + j] = -tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n];
+                        else
+                            ww[n*size + j] = 1.0 - tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n];
+                    }
+                }
+
+                /* Qtemp = Qtemp * (I-tau*w*w')' */
+                cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size,  size, 1.0, Qtemp, size, ww, size, 0.0, temp, size);
+                double *b = Qtemp;
+                Qtemp = temp;
+                temp = b;
+            }   
+            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Q, size, Qtemp, size, 0.0, temp, size);
+            double *b = Q;
+            Q = temp;
+            temp = b;         
+        }
+    }
+    
+    free(Qtemp);
+    free(w);
+    free(ww);
+    free(temp);
+    return Q;
+}
+
+double* getR(double*HR, int size)
 {
-	double sum = 0, norm;
-	int i;
+    double* R = malloc(sizeof(double) * size * size);
+    int i, j;
+    bzero(R, sizeof(double) * size * size);
+    for(i = 0; i< size; i++)
+    {
+        for(j = 0; j <= i; j++)
+        {
+            R[i*size + j] = HR[i*size + j];
+        }
+    }
+    return R;
+}
 
-	for(i = 0; i < l; i++)
-		sum += x[i] * x[i];
+void printMatrix(double* Matrix, int m, int n, int tilesize)
+{
+    int i, j;
+    
+    for(i=0; i < m*tilesize; i++)
+    {
+        for(j = 0; j < n*tilesize; j++)
+        {
+            printf(" %.3f ", Matrix[j*m*tilesize +i]);
 
-	norm = sqrt(sum);
+        }
+        printf("\n");
+    }
 
-	return norm;
 }
 
+double* columnToTile( double* columnMatrix, int size , int m , int n , int tilesize)
+{
+    double* TileMatrix;
+    TileMatrix = malloc(sizeof(double) * size );
+    if(TileMatrix == NULL)
+        error("failed to allocate TileMatrix");
+    int i,j,k,l;
+
+    for( i = 0; i < n ; i++ )
+    {
+        for( j = 0; j < m; j++ )
+        {
+            double *tileStart = &columnMatrix[i*m*tilesize*tilesize + j*tilesize];
+            double *tilePos = &TileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize];
+            for(k = 0; k < tilesize; k++)
+            {
+                tileStart = &columnMatrix[i*m*tilesize*tilesize+k*m*tilesize + j*tilesize];
+                for( l = 0; l < tilesize; l++ )
+                {
+                    tilePos[k*tilesize + l] = tileStart[l];
+                }
+            }
+        }
+    }
+
+    return TileMatrix;
+
+}
+
+double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize)
+{
+    double* ColumnMatrix;
+    ColumnMatrix = (double*) malloc(sizeof(double) * size );
+    if(ColumnMatrix == NULL)
+        error("failed to allocate ColumnMatrix");
+    int i,j,k,l;
+    for( i = 0; i < n ; i++ )
+    {
+        for(j = 0; j < m ; j++ )
+        {
+            /* Tile on ith column is at i*m*32*32.*/
+            /* Tile on jth is at j*32*32 */
+            double *tile = &tileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize];
+            /* Column starts at same position as tile. */
+            /* Row j*32.*/
+            double *tilePos = &ColumnMatrix[i*m*tilesize*tilesize + j*tilesize];
+            for( k = 0; k < tilesize; k++ )
+            {
+                for(l=0; l < tilesize; l++)
+                {
+                    tilePos[l] = tile[l];
+                }
+                /* Next 32 elements are the position of the tile in the next column.*/
+                tile = &tile[tilesize];
+                /* Move to the j*32th position in the next column. */                
+                tilePos = &tilePos[tilesize*m];
+
+            }
+        }   
+    }
+    return ColumnMatrix;
+}
+
+/* Routines for the tiled QR decomposition.*/
+
 /**
- * \brief Computes a Householder reflector from a pair of vectors from coupled blocks
  *
- * Calculates the Householder vector of the vector formed by a column in a pair of coupled blocks.
- * There is a single non-zero element, in the first row, of the top vector. This is passed as topDiag
+ * @brief Computes the QR decomposition of a tile.
+ *
+ * @param cornerTile A pointer to the tile for which the decomposition is computed.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param k The value of k for the QR decomposition
+ * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
  *
- * \param topDiag The only non-zero element of the incoming vector in the top block
- * \param ma The number of elements in the top vector
- * \param xb Pointer to the lower vector
- * \param l The number of elements in the whole vector
- * \param vk A pointer to a pre-allocated array to store the householder vector of size l
  *
- * \returns void
  */
-void calcvkDouble	(double topDiag,
-			int ma,
-			double* xb,
-			int l,
-			double* vk)
+void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatrix, int k, int tauNum, double* w)
 {
-	int sign, i;
-	double norm, div;
-	//same non-standard normalisation as for single blocks above, but organised without a temporary beta veriable
-
-	sign = topDiag >= 0.0 ? 1 : -1;
-	vk[0] = topDiag;
-	//use vk[0] as beta
-	for(i = 1; i < ma; i++)
-		vk[i] = 0;
-
-	for(; i < l; i++)
-		vk[i] = xb[i - ma];
-
-	norm = do2norm(vk, l);
-	vk[0] += norm * sign;
-
-	if(norm != 0.0)
-	{
-		div = 1/vk[0];
-	
-		for(i = 1; i < l; i++)
-			vk[i] *= div;
-	}
-}
+    int i, j, n;
+    double norm=0.0, sign, u1, tau, z;
 
 
-void updateDoubleQ_WY	(double* blockA,
-			double* blockB,
-			double* blockTau,
-			int k, int ma, int mb, int n,
-			int ldm,
-			double* hhVector)//bottom, essential part.
-{
-	int i, j;
-
-	double tau = 1.0, beta;
+    /*Find the householder vector for each row. */
+    for(i = 0; i < tileSize; i++)
+    {
+        norm = 0.0;
+        /*Fill w with the vector.*/
+        for(j=i; j < tileSize; j++)
+        {
+            /* ith row is i*tileSize, only want elements on diagonal or below.*/
+            w[ j ] = cornerTile[ i*tileSize +j ];
+            /*Find the norm as well */
+            norm = norm + w[j]*w[j];
+        }
+        if(w[i] >= 0.0)
+            sign = -1;
+        else
+            sign = 1;
 
-	/* Compute tau = 2/v'v */
-	for(i = 0; i < mb; i ++)
-		tau += hhVector[i] * hhVector[i];
+        norm = sqrt(norm);
 
-	tau = 2/tau;
+        u1 = w[i] - sign*norm;
 
-	for(j = k; j < n; j ++)
-	{
-		/* Compute v'*b_j */
-		beta = blockA[(j*ldm) + k];
+        if(u1 != 0.0)
+        {
+            for(j=i+1; j < tileSize; j++)
+                w[j] = w[j] / u1;
+        }
+        else
+        {
+            for(j=i+1; j < tileSize; j++)
+                w[j] = 0.0;
+        }
 
-		/* Then for lower half */
-		for(i = 0; i < mb; i ++)
-			beta += blockB[(j*ldm) + i] * hhVector[i];
+        if(norm != 0.0)
+            tau = -sign*u1/norm;
+        else
+            tau = 0.0;
 
-		beta *= tau;
+        /*Store the below diagonal vector */
+        
+        for(j = i+1; j < tileSize; j++)
+        {
+            cornerTile[ i*tileSize +j ] = w[j];
+        }
+        cornerTile[ i*tileSize + i ] = sign*norm;
+        w[i] = 1.0;
+        /* Apply the householder transformation to the rest of the tile, for everything to the right of the diagonal. */
+        for(j = i+1; j < tileSize; j++)
+        {
+            /*Compute w'*A_j*/
+            z = cornerTile[ j*tileSize+i];
+            for(n = i+1; n < tileSize; n++)
+            {
+                z = z + cornerTile[j*tileSize+n] * w[n];
+            }
+            /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
+            for(n=i; n < tileSize; n++)
+            {
+                cornerTile[j*tileSize+n] = cornerTile[j*tileSize+n] - tau*w[n]*z;
+            }
 
-		/* Compute b_j = b_j - beta*v_k */
-		blockA[(j*ldm) + k] -= beta;
-		
-		for(i = 0; i < mb; i ++)
-			blockB[(j*ldm) + i] -= beta * hhVector[i];
-	}
+        }
+        /* Store tau. We're on k*tileSize+ith row. kth column.*/
+        tauMatrix[(k*tileSize+i)*tauNum+k] = tau;
 
-	/* Insert vector below diagonal. */
-	for(i = 0; i < mb; i ++)
-		blockB[(k*ldm) + i] = hhVector[i];
+    }
 
-	blockTau[k] = tau;
 }
 
-void DTSQRF	(double* blockA,
-		double* blockB,
-		double* blockTau,
-		int ma,
-		int mb,
-		int n,
-		int ldm,
-		double* hhVector)
+
+/**
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile A pointer to the tile for which the householder is stored.
+ * @param rowTiles The tile to which the householder is applied.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param jj The value of j for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, int jj, int kk, double* restrict tauMatrix, int tauNum, double* w)
 {
-	int k;
-	double* xVectA, *xVectB;
-	
-	xVectA = blockA;
-	xVectB = blockB;
-
-	for(k = 0; k < n; k++)
-	{
-		//vk = sign(x[1])||x||_2e1 + x
-		//vk = vk/vk[0]
-		calcvkDouble(xVectA[0], ma - k, xVectB, (ma + mb) - k, hhVector);//returns essential
-
-		//matA(k:ma,k:na) = matA(k:ma,k:na) - (2/(vk.T*vk))*vk*(vk.T*matA(k:ma,k:na)
-		//update both blocks, preserving the vectors already stored below the diagonal in the top block and treating them as if they were zeros.
-		updateDoubleQ_WY	(blockA, blockB,
-					blockTau,
-					k, ma, mb, n,
-					ldm,
-					hhVector + ma - k);
-
-		xVectA += ldm + 1;
-		xVectB += ldm;
-	}
+    int i, j, n;
+    double z=0.0;
+
+
+    /* For each row in the corner Tile*/
+    for(i = 0; i < tileSize; i++)
+    {
+        /*Get w for row i */
+        for(j = i; j < tileSize; j++)
+        {
+            w[j] = cornerTile[i*tileSize + j];
+        }
+        w[i] = 1.0;
+        
+        /* Apply to the row Tile */
+        for(j = 0; j < tileSize; j++)
+        {
+            z=0.0;
+            /* Below Diagonal!*/
+            /*Compute w'*A_j*/
+            for(n = i; n < tileSize; n++)
+            {
+                z = z + w[n] * rowTile[j*tileSize+n];
+            }
+            for(n = i; n < tileSize; n++)
+            {
+                rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z;
+            }
+        }
+
+        
+    }
+
+
 }
 
-void DSSRFT	(double* blockV,
-		double* blockA, double* blockB,
-		double* blockTau,
-		int b, int n, int ldm)
+/**
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile The corner tile for this value of k.
+ * @param columnTile The tile for which householders are computed.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param ii The value of i for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tilesize, int ii, int kk, double* restrict tauMatrix, int tauNum, double* w )
 {
-	int i, j, k;
+    int i, j, n;
+    double norm=0.0, sign, u1, tau, z;
 
-	double tau, beta;
+    /* For each column compute the householder vector. */
+    for(i = 0; i < tilesize; i++)
+    {
+        norm = 0.0;
+        w[i] = cornerTile[ i*tilesize+i ];
+        norm = norm + w[i]*w[i];
+        for(j = i+1; j < tilesize; j++)
+        {   
+            w[j] = 0.0;
+        }
+        for(j = 0; j < tilesize; j++)
+        {
+            w[tilesize+j] = columnTile[ i*tilesize+j ];
+            norm = norm + w[tilesize+j]*w[tilesize+j];
+        }
+        
+        norm = sqrt(norm);
+        if(w[i] >= 0.0)
+            sign = -1;
+        else
+            sign = 1;
+        
 
-	/* Compute b_j = b_j - tau*v*v'*b_j for each column j of blocks A & B,
-	   and for each householder vector v of blockV */
+        u1 = w[i] - sign*norm;
+        if(u1 != 0.0)
+        {
+            for(j = i+1; j < 2*tilesize; j++){
+                w[j] = w[j]/u1;
+            }
+        }else
+        {
+            for(j = i+1; j < 2*tilesize; j++)
+                w[j] = 0.0;
+        }
 
-	/* For each column of B */
-	for(j = 0; j < n; j ++)
-	{
-		/* For each householder vector. */
-		for(k = 0; k < n; k ++)
-		{
-			/* tau = 2/v'v, computed earlier, stored in T(k,k). */
-			tau = blockTau[k];
+        if(norm != 0)
+            tau = -sign*u1/norm;
+        else
+            tau = 0.0;
+
+        /* Apply to each row to the right.*/
+        for(j = i; j < tilesize; j++)
+        {
+            /* Find w'*A_j, w is 0s except for first value with upper tile.*/
+            z = 1.0 * cornerTile[ j*tilesize+i ];            
+            for(n = 0; n < tilesize; n++)
+            {
+                z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ];
+            }
+            /* Apply to upper tile.*/
+            cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z;
+            for(n = i+1; n < tilesize; n++)
+            {
+                cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z;
+            }
+            /* Apply to lower tile.*/
+            for(n = 0; n < tilesize; n++)
+            {
+                columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z;
+            }
+            
+        }
+        /* Store w*/
+        for(j = 0; j < tilesize; j++){
+            columnTile[ i*tilesize+j ] = w[tilesize+j];
+        }
+        tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau;
+    }
+}
 
-			/* Compute beta = v_k'b_j. */
-			/* v_k is >0 (=1) only at position k in top half. */
-			beta = blockA[(j*ldm) + k];
+/**
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile A pointer to the tile to have the householder applied.
+ * @param columnTile The tile in which the householders are stored.
+ * @param rowTile The upper tile to have the householders applied.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param ii The value of i for the QR decomposition.
+ * @param jj The value of j for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* restrict rowTile, int tilesize, int ii, int jj, int kk, double* restrict tauMatrix, int tauNum , double* w)
+{
+    int i, j, n;
+    double z;
 
-			/* For lower portion of v_k, aligning with the lower block */
-			for(i = 0; i < b; i ++)
-				beta += blockB[(j*ldm) + i] * blockV[(k*ldm) + i];
+    
+    for(i = 0; i < tilesize; i++)
+    {
+        for(j = 0; j < i; j++)
+            w[j] = 0.0;
+        w[i] = 1.0;
+        for(j = i+1; j < tilesize; j++)
+            w[j] = 0.0;
+        for(j = 0; j < tilesize; j++)
+            w[j+tilesize] = columnTile[i*tilesize +j];
+        
+        /* Apply householder vector (w) to the tiles.*/
+        for(j = 0; j < tilesize; j++)
+        {
+            z = 0.0;
+            /* Compute w' * A_j */
+            for(n = 0; n < tilesize; n++)
+            {
+                z += w[n] * rowTile[j*tilesize+n];
+                z += w[n + tilesize] * cornerTile[j*tilesize+n];
+            }
+            for(n = 0; n < tilesize; n++)
+            {
+                rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z;
+                cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z;
+            }
+        }
+    }
 
-			beta *= tau;
-			
-			/* Compute b_j = b_j - beta * v */
-			/* v_k = 1 at (k) in top half again */
-			blockA[(j*ldm) + k] -= beta;
-
-			/* Apply to bottom block. */
-			for(i = 0; i < b; i ++)
-				blockB[(j*ldm) + i] -= beta * blockV[(k*ldm) + i];
-		}
-	}
-}
-			
+}		
 			
 /**
  * @brief Computed a tiled QR factorization using QuickSched.
@@ -233,7 +503,7 @@ void DSSRFT	(double* blockV,
  * @param nr_threads Number of threads to use.
  */
  
-void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
+void test_qr ( int m , int n , int K , int nr_threads , int runs, double *matrix ) {
 
     int k, j, i;
     double *A, *A_orig, *tau;
@@ -252,26 +522,21 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
         /* Decode the task data. */
         int *idata = (int *)data;
         int i = idata[0], j = idata[1], k = idata[2];
-        double buff[ 2*K*K ];
+        double ww[ 2*K ];
         
         /* Decode and execute the task. */
         switch ( type ) {
             case task_DGEQRF:
-                LAPACKE_dgeqrf_work( LAPACK_COL_MAJOR , K, K ,
-                                &A[ j*m*K*K + i*K ] , m*K , &tau[ j*m*K + i*K ] ,
-                                buff , 2*K*K );
+                DGEQRF( &A[ (k*m+k)*K*K], K, tau, k, m, ww);
                 break;
             case task_DLARFT:
-                LAPACKE_dlarft_work( LAPACK_COL_MAJOR , 'F' , 'C' ,
-                                K , K , &A[ i*m*K*K + i*K ] ,
-                                m*K , &tau[ i*m*K + i*K ] , &A[ j*m*K*K + i*K ] ,
-                                m*K );
+                DLARFT( &A[ (k*m+k)*K*K ],&A[(j*m+k)*K*K], K, j, k, tau, m, ww);
                 break;
             case task_DTSQRF:
-                DTSQRF( &A[ j*m*K*K + j*K ] , &A[ j*m*K*K + i*K ] , &tau[ j*m*K + i*K ] , K , K , K , K*m , buff );
+                DTSQRF( &A[(k*m+k)*K*K], &A[(k*m+i)*K*K], K, i, k, tau, m , ww);
                 break;
             case task_DSSRFT:
-                DSSRFT(	&A[ k*m*K + i*K ] , &A[ j*m*K*K + k*K ] , &A[ j*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K*m );
+                DSSRFT(&A[(j*m+i)*K*K], &A[(k*m+i)*K*K], &A[(j*m+k)*K*K], K, i, j, k, tau, m , ww);
                 break;
             default:
                 error( "Unknown task type." );
@@ -286,7 +551,12 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
          ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL )
         error( "Failed to allocate matrices." );
     for ( k = 0 ; k < m * n * K * K ; k++ )
-        A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0;
+    {
+        if(matrix == NULL)
+            A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0;
+        else
+            A_orig[k] = matrix[k];
+    }
     memcpy( A , A_orig , sizeof(double) * m * n * K * K );
     bzero( tau , sizeof(double) * m * n * K );
     
@@ -356,9 +626,10 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
                 qsched_adduse( &s , tid_new , rid[ k*m + i ] );
                 qsched_adduse( &s , tid_new , rid[ j*m + k ] );
                 qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
-                qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
+                qsched_addunlock( &s, tid[j*m+i-1], tid_new);
                 if ( tid[ j*m + i ] != -1 )
                     qsched_addunlock( &s , tid[ j*m + i ] , tid_new );
+
                 tid[ j*m + i ] = tid_new;
                 }
         
@@ -420,12 +691,60 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
     for ( k = 0 ; k < qsched_timer_count ; k++ )
         message( "timer %s is %lli ticks." , qsched_timer_names[k] , s.timers[k]/runs );
     
+    if(matrix != NULL)
+    {
+        for(k = 0; k < m*n*K*K; k++)
+            matrix[k] = A[k];
+    }
+
+ 
+    /* Test if the decomposition was correct.*/
+   /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+    double *Q = computeQ(tempMatrix, m*K, K, tau, m);
+    double *R = getR(tempMatrix, m*K);
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, m*K, R, m*K, 0.0, tempMatrix, m*K);
+        free(Q);
+        Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
+    for(i = 0; i < m * n * K * K; i++)
+    {
+        if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] < 0.995))
+            printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i], tempMatrix[i]);
+    }
+    free(tempMatrix);
+    free(Q);
+    free(R);*/
+
     /* Clean up. */
+    free(A);
+    free(A_orig);
+    free(tau);
+    free(tid);
+    free(rid);
     qsched_free( &s );
         
     }
 
 
+/* Generates a random matrix. */
+double* generateColumnMatrix(int size)
+{
+    double* matrix = malloc(sizeof(double)*size*size);
+    if(matrix == NULL)
+        error("Failed to allocate matrix");
+
+    unsigned long int m_z = 35532;
+    int i;
+    for(i = 0 ; i < size*size; i++)
+    {
+        m_z = (1664525*m_z + 1013904223) % 4294967296;
+        matrix[i] = m_z % 100;
+        if(matrix[i] < 0)
+            matrix[i] += 100;
+    }
+    return matrix;
+}
+
+
 /**
  * @brief Main function.
  */
@@ -477,7 +796,7 @@ int main ( int argc , char *argv[] ) {
     message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
         32*M , 32*N , nr_threads , runs );
         
-    test_qr( M , N , K , nr_threads , runs );
+    test_qr( M , N , K , nr_threads , runs , NULL);
     
     }