diff --git a/examples/Makefile.am b/examples/Makefile.am
index 2d20cbb1630044ced0bfd7b6d87d05e38696623b..f01bdec4fbb88639d39b19f63a12ea59dcad5c09 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -34,10 +34,9 @@ test_CFLAGS = $(AM_CFLAGS)
 test_LDADD =  ../src/.libs/libquicksched.a
 
 # Sources for test_qr
-test_qr_SOURCES = test_qr.c
-test_qr_CFLAGS = $(AM_CFLAGS) -I/home/aidan/lapack-3.5.0/lapacke/include/
-test_qr_LDFLAGS = -I/home/aidan/lapack-3.5.0/lapacke/include/
-test_qr_LDADD =  ../src/.libs/libquicksched.a /home/aidan/lapack-3.5.0/liblapacke.a /home/aidan/lapack-3.5.0/liblapack.a -lblas 
+test_qr_SOURCES = test_qr.c /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a
+test_qr_CFLAGS = $(AM_CFLAGS) -I/home/aidan/ATLAS/ATLAS_linux/include #-I/home/aidan/lapack-3.5.0/lapacke/include
+test_qr_LDADD =  ../src/.libs/libquicksched.a  -lf77blas -lcblas -latlas -lm  -L/home/aidan/ATLAS/ATLAS_linux/lib/ 
 
 # Sources for test_bh
 #test_bh_SOURCES = test_bh.c
diff --git a/examples/test_qr.c b/examples/test_qr.c
index ada33a33acf4833ed478e10bcd0f0cadf466a5e5..97860917eb98eb1928a1ab37ce0568bc3f76ec38 100644
--- a/examples/test_qr.c
+++ b/examples/test_qr.c
@@ -1,22 +1,21 @@
 /*******************************************************************************
  * This file is part of QuickSched.
  * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
- ******************************************************************************/
-
+ *
+* *****************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
@@ -30,187 +29,161 @@
 #include <omp.h>
 #include <pthread.h>
 
-
-
-
 #include <cblas.h>
 
-
 /* Local includes. */
 #include "quicksched.h"
 
 /**
- * Takes a column major matrix, NOT tile major. size is length of a side of the matrix. Only works for square matrices.
- * This function is simply for validation and is implemented naively as we know of no implementation to retrieve Q from the tiled QR.
+ * Takes a column major matrix, NOT tile major. size is length of a side of the
+ * matrix. Only works for square matrices.
+ * This function is simply for validation and is implemented naively as we know
+ * of no implementation to retrieve Q from the tiled QR.
  */
-double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum)
-{
-    double* Q = malloc(sizeof(double)*size*size);
-    double* Qtemp = malloc(sizeof(double)*size*size);
-    double* w = malloc(sizeof(double)*size);
-    double* ww = malloc(sizeof(double)*size*size);
-    double* temp = malloc(sizeof(double)*size*size);
-    int i, k, l, j, n;
-    bzero(Q, sizeof(double)*size*size);
-    bzero(Qtemp, sizeof(double)*size*size);
-    bzero(ww, sizeof(double)*size*size);
-    for(i = 0; i < size; i++)
-    {
-        Q[i*size + i] = 1.0;
-    }
-    int numcoltile = size / tilesize;
-    int numrowtile = size / tilesize;
-    for(k = 0; k < numrowtile; k++)
-    {
-        for(l = 0; l < tilesize; l++)
-        {
-            bzero(Qtemp, sizeof(double)*size*size);
-            for(i = 0; i < size; i++)
-            {
-                Qtemp[i*size + i] = 1.0;
-            }
-
-
-            for(i = k; i < numcoltile; i++)
-            {
-                bzero(w, sizeof(double)*size);
-                
-                for(j = 0 ; j < tilesize; j++)
-                {
-                    w[i*tilesize + j] = HR[(k*tilesize+l)*size + i*tilesize+j];
-                }
-                    w[k*tilesize+l] = 1.0;
-                if(k*tilesize+l > i*tilesize)
-                {
-                    for(j = 0; j < k*tilesize+l; j++)
-                        w[j] = 0.0;
-                }
-
-
-                /* Compute (I - tau*w*w')' */
-                for(j = 0; j < size; j++)
-                {
-                    for(n = 0; n < size; n++)
-                    {
-                        if(j != n)
-                            ww[n*size + j] = -tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n];
-                        else
-                            ww[n*size + j] = 1.0 - tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n];
-                    }
-                }
-
-                /* Qtemp = Qtemp * (I-tau*w*w')' */
-                cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size,  size, 1.0, Qtemp, size, ww, size, 0.0, temp, size);
-                double *b = Qtemp;
-                Qtemp = temp;
-                temp = b;
-            }   
-            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Q, size, Qtemp, size, 0.0, temp, size);
-            double *b = Q;
-            Q = temp;
-            temp = b;         
+double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) {
+  double* Q = malloc(sizeof(double) * size * size);
+  double* Qtemp = malloc(sizeof(double) * size * size);
+  double* w = malloc(sizeof(double) * size);
+  double* ww = malloc(sizeof(double) * size * size);
+  double* temp = malloc(sizeof(double) * size * size);
+  int i, k, l, j, n;
+  bzero(Q, sizeof(double) * size * size);
+  bzero(Qtemp, sizeof(double) * size * size);
+  bzero(ww, sizeof(double) * size * size);
+  for (i = 0; i < size; i++) {
+    Q[i * size + i] = 1.0;
+  }
+  int numcoltile = size / tilesize;
+  int numrowtile = size / tilesize;
+  for (k = 0; k < numrowtile; k++) {
+    for (l = 0; l < tilesize; l++) {
+      bzero(Qtemp, sizeof(double) * size * size);
+      for (i = 0; i < size; i++) {
+        Qtemp[i * size + i] = 1.0;
+      }
+
+      for (i = k; i < numcoltile; i++) {
+        bzero(w, sizeof(double) * size);
+
+        for (j = 0; j < tilesize; j++) {
+          w[i * tilesize + j] =
+              HR[(k * tilesize + l) * size + i * tilesize + j];
+        }
+        w[k * tilesize + l] = 1.0;
+        if (k * tilesize + l > i * tilesize) {
+          for (j = 0; j < k * tilesize + l; j++) w[j] = 0.0;
         }
+
+        /* Compute (I - tau*w*w')' */
+        for (j = 0; j < size; j++) {
+          for (n = 0; n < size; n++) {
+            if (j != n)
+              ww[n * size + j] =
+                  -tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
+            else
+              ww[n * size + j] =
+                  1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
+          }
+        }
+
+        /* Qtemp = Qtemp * (I-tau*w*w')' */
+        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
+                    1.0, Qtemp, size, ww, size, 0.0, temp, size);
+        double* b = Qtemp;
+        Qtemp = temp;
+        temp = b;
+      }
+      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
+                  1.0, Q, size, Qtemp, size, 0.0, temp, size);
+      double* b = Q;
+      Q = temp;
+      temp = b;
     }
-    
-    free(Qtemp);
-    free(w);
-    free(ww);
-    free(temp);
-    return Q;
+  }
+
+  free(Qtemp);
+  free(w);
+  free(ww);
+  free(temp);
+  return Q;
 }
 
-double* getR(double*HR, int size)
-{
-    double* R = malloc(sizeof(double) * size * size);
-    int i, j;
-    bzero(R, sizeof(double) * size * size);
-    for(i = 0; i< size; i++)
-    {
-        for(j = 0; j <= i; j++)
-        {
-            R[i*size + j] = HR[i*size + j];
-        }
+double* getR(double* HR, int size) {
+  double* R = malloc(sizeof(double) * size * size);
+  int i, j;
+  bzero(R, sizeof(double) * size * size);
+  for (i = 0; i < size; i++) {
+    for (j = 0; j <= i; j++) {
+      R[i * size + j] = HR[i * size + j];
     }
-    return R;
+  }
+  return R;
 }
 
-void printMatrix(double* Matrix, int m, int n, int tilesize)
-{
-    int i, j;
-    
-    for(i=0; i < m*tilesize; i++)
-    {
-        for(j = 0; j < n*tilesize; j++)
-        {
-            printf(" %.3f ", Matrix[j*m*tilesize +i]);
+void printMatrix(double* Matrix, int m, int n, int tilesize) {
+  int i, j;
 
-        }
-        printf("\n");
+  for (i = 0; i < m * tilesize; i++) {
+    for (j = 0; j < n * tilesize; j++) {
+      printf(" %.3f ", Matrix[j * m * tilesize + i]);
     }
-
+    printf("\n");
+  }
 }
 
-double* columnToTile( double* columnMatrix, int size , int m , int n , int tilesize)
-{
-    double* TileMatrix;
-    TileMatrix = malloc(sizeof(double) * size );
-    if(TileMatrix == NULL)
-        error("failed to allocate TileMatrix");
-    int i,j,k,l;
-
-    for( i = 0; i < n ; i++ )
-    {
-        for( j = 0; j < m; j++ )
-        {
-            double *tileStart = &columnMatrix[i*m*tilesize*tilesize + j*tilesize];
-            double *tilePos = &TileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize];
-            for(k = 0; k < tilesize; k++)
-            {
-                tileStart = &columnMatrix[i*m*tilesize*tilesize+k*m*tilesize + j*tilesize];
-                for( l = 0; l < tilesize; l++ )
-                {
-                    tilePos[k*tilesize + l] = tileStart[l];
-                }
-            }
+double* columnToTile(double* columnMatrix, int size, int m, int n,
+                     int tilesize) {
+  double* TileMatrix;
+  TileMatrix = malloc(sizeof(double) * size);
+  if (TileMatrix == NULL) error("failed to allocate TileMatrix");
+  int i, j, k, l;
+
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < m; j++) {
+      double* tileStart =
+          &columnMatrix[i * m * tilesize * tilesize + j * tilesize];
+      double* tilePos =
+          &TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+      for (k = 0; k < tilesize; k++) {
+        tileStart = &columnMatrix[i * m * tilesize * tilesize +
+                                  k * m * tilesize + j * tilesize];
+        for (l = 0; l < tilesize; l++) {
+          tilePos[k * tilesize + l] = tileStart[l];
         }
+      }
     }
+  }
 
-    return TileMatrix;
-
+  return TileMatrix;
 }
 
-double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize)
-{
-    double* ColumnMatrix;
-    ColumnMatrix = (double*) malloc(sizeof(double) * size );
-    if(ColumnMatrix == NULL)
-        error("failed to allocate ColumnMatrix");
-    int i,j,k,l;
-    for( i = 0; i < n ; i++ )
-    {
-        for(j = 0; j < m ; j++ )
-        {
-            /* Tile on ith column is at i*m*32*32.*/
-            /* Tile on jth is at j*32*32 */
-            double *tile = &tileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize];
-            /* Column starts at same position as tile. */
-            /* Row j*32.*/
-            double *tilePos = &ColumnMatrix[i*m*tilesize*tilesize + j*tilesize];
-            for( k = 0; k < tilesize; k++ )
-            {
-                for(l=0; l < tilesize; l++)
-                {
-                    tilePos[l] = tile[l];
-                }
-                /* Next 32 elements are the position of the tile in the next column.*/
-                tile = &tile[tilesize];
-                /* Move to the j*32th position in the next column. */                
-                tilePos = &tilePos[tilesize*m];
-
-            }
-        }   
+double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) {
+  double* ColumnMatrix;
+  ColumnMatrix = (double*)malloc(sizeof(double) * size);
+  if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix");
+  int i, j, k, l;
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < m; j++) {
+      /* Tile on ith column is at i*m*32*32.*/
+      /* Tile on jth is at j*32*32 */
+      double* tile =
+          &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+      /* Column starts at same position as tile. */
+      /* Row j*32.*/
+      double* tilePos =
+          &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize];
+      for (k = 0; k < tilesize; k++) {
+        for (l = 0; l < tilesize; l++) {
+          tilePos[l] = tile[l];
+        }
+        /* Next 32 elements are the position of the tile in the next column.*/
+        tile = &tile[tilesize];
+        /* Move to the j*32th position in the next column. */
+        tilePos = &tilePos[tilesize * m];
+      }
     }
-    return ColumnMatrix;
+  }
+  return ColumnMatrix;
 }
 
 /* Routines for the tiled QR decomposition.*/
@@ -219,89 +192,80 @@ double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize
  *
  * @brief Computes the QR decomposition of a tile.
  *
- * @param cornerTile A pointer to the tile for which the decomposition is computed.
+ * @param cornerTile A pointer to the tile for which the decomposition is
+* computed.
  * @param tilesize The number of elements on a row/column of the tile.
  * @param tauMatrix A pointer to the tau Matrix.
  * @param k The value of k for the QR decomposition
- * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ * @param tauNum The number of tau values stored for each row of the matrix
+* (this is equal to the number of tiles on each column).
  *
  *
  */
-void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatrix, int k, int tauNum, double* w)
-{
-    int i, j, n;
-    double norm=0.0, sign, u1, tau, z;
-
-
-    /*Find the householder vector for each row. */
-    for(i = 0; i < tileSize; i++)
-    {
-        norm = 0.0;
-        /*Fill w with the vector.*/
-        for(j=i; j < tileSize; j++)
-        {
-            /* ith row is i*tileSize, only want elements on diagonal or below.*/
-            w[ j ] = cornerTile[ i*tileSize +j ];
-            /*Find the norm as well */
-            norm = norm + w[j]*w[j];
-        }
-        if(w[i] >= 0.0)
-            sign = -1;
-        else
-            sign = 1;
-
-        norm = sqrt(norm);
-
-        u1 = w[i] - sign*norm;
-
-        if(u1 != 0.0)
-        {
-            for(j=i+1; j < tileSize; j++)
-                w[j] = w[j] / u1;
-        }
-        else
-        {
-            for(j=i+1; j < tileSize; j++)
-                w[j] = 0.0;
-        }
-
-        if(norm != 0.0)
-            tau = -sign*u1/norm;
-        else
-            tau = 0.0;
+void DGEQRF(double* restrict cornerTile, int tileSize,
+            double* restrict tauMatrix, int k, int tauNum) {
+  int i, j, n;
+  double norm = 0.0, sign, u1, tau, z;
+  double w[tileSize];
+
+  /*Find the householder vector for each row. */
+  for (i = 0; i < tileSize; i++) {
+    norm = 0.0;
+    /*Fill w with the vector.*/
+    for (j = i; j < tileSize; j++) {
+      /* ith row is i*tileSize, only want elements on diagonal or below.*/
+      w[j] = cornerTile[i * tileSize + j];
+      /*Find the norm as well */
+      norm = norm + w[j] * w[j];
+    }
+    if (w[i] >= 0.0)
+      sign = -1;
+    else
+      sign = 1;
+
+    norm = sqrt(norm);
+    u1 = w[i] - sign * norm;
+//        if(i==1)
+  //          printf("%.3f\n", u1);
+    if (u1 != 0.0) {
+      for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1;
+    } else {
+      for (j = i + 1; j < tileSize; j++) w[j] = 0.0;
+    }
 
-        /*Store the below diagonal vector */
-        
-        for(j = i+1; j < tileSize; j++)
-        {
-            cornerTile[ i*tileSize +j ] = w[j];
-        }
-        cornerTile[ i*tileSize + i ] = sign*norm;
-        w[i] = 1.0;
-        /* Apply the householder transformation to the rest of the tile, for everything to the right of the diagonal. */
-        for(j = i+1; j < tileSize; j++)
-        {
-            /*Compute w'*A_j*/
-            z = cornerTile[ j*tileSize+i];
-            for(n = i+1; n < tileSize; n++)
-            {
-                z = z + cornerTile[j*tileSize+n] * w[n];
-            }
-            /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
-            for(n=i; n < tileSize; n++)
-            {
-                cornerTile[j*tileSize+n] = cornerTile[j*tileSize+n] - tau*w[n]*z;
-            }
+    if (norm != 0.0)
+      tau = -sign * u1 / norm;
+    else
+      tau = 0.0;
+    
+   // printf("%.3f\n", sign);
 
-        }
-        /* Store tau. We're on k*tileSize+ith row. kth column.*/
-        tauMatrix[(k*tileSize+i)*tauNum+k] = tau;
+    /*Store the below diagonal vector */
 
+    for (j = i + 1; j < tileSize; j++) {
+      cornerTile[i * tileSize + j] = w[j];
     }
-
+    cornerTile[i * tileSize + i] = sign * norm;
+    w[i] = 1.0;
+    /* Apply the householder transformation to the rest of the tile, for
+     * everything to the right of the diagonal. */
+    for (j = i + 1; j < tileSize; j++) {
+      /*Compute w'*A_j*/
+      z = cornerTile[j * tileSize + i];
+      for (n = i + 1; n < tileSize; n++) {
+        z = z + cornerTile[j * tileSize + n] * w[n];
+      }
+      /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
+      for (n = i; n < tileSize; n++) {
+        cornerTile[j * tileSize + n] =
+            cornerTile[j * tileSize + n] - tau * w[n] * z;
+      }
+    }
+    /* Store tau. We're on k*tileSize+ith row. kth column.*/
+    tauMatrix[(k * tileSize + i) * tauNum + k] = tau;
+  }
 }
 
-
 /**
  *
  * @brief Applies the householder factorisation of the corner to the row tile.
@@ -312,46 +276,39 @@ void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatri
  * @param tauMatrix A pointer to the tau Matrix.
  * @param jj The value of j for the QR decomposition.
  * @param kk The value of k for the QR decomposition.
- * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ * @param tauNum The number of tau values stored for each row of the matrix
+* (this is equal to the number of tiles on each column).
  *
  *
  */
-void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, int jj, int kk, double* restrict tauMatrix, int tauNum, double* w)
-{
-    int i, j, n;
-    double z=0.0;
-
-
-    /* For each row in the corner Tile*/
-    for(i = 0; i < tileSize; i++)
-    {
-        /*Get w for row i */
-        for(j = i; j < tileSize; j++)
-        {
-            w[j] = cornerTile[i*tileSize + j];
-        }
-        w[i] = 1.0;
-        
-        /* Apply to the row Tile */
-        for(j = 0; j < tileSize; j++)
-        {
-            z=0.0;
-            /* Below Diagonal!*/
-            /*Compute w'*A_j*/
-            for(n = i; n < tileSize; n++)
-            {
-                z = z + w[n] * rowTile[j*tileSize+n];
-            }
-            for(n = i; n < tileSize; n++)
-            {
-                rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z;
-            }
-        }
-
-        
+void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
+            int jj, int kk, double* restrict tauMatrix, int tauNum) {
+  int i, j, n;
+  double z = 0.0;
+  double w[tileSize];
+
+  /* For each row in the corner Tile*/
+  for (i = 0; i < tileSize; i++) {
+    /*Get w for row i */
+    for (j = i; j < tileSize; j++) {
+      w[j] = cornerTile[i * tileSize + j];
     }
-
-
+    w[i] = 1.0;
+    /* Apply to the row Tile */
+    for (j = 0; j < tileSize; j++) {
+      z = 0.0;
+      /* Below Diagonal!*/
+      /*Compute w'*A_j*/
+      for (n = i; n < tileSize; n++) {
+        z = z + w[n] * rowTile[j * tileSize + n];
+      }
+      for (n = i; n < tileSize; n++) {
+        rowTile[j * tileSize + n] =
+            rowTile[j * tileSize + n] -
+            tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z;
+      }
+    }
+  }
 }
 
 /**
@@ -364,83 +321,78 @@ void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
  * @param tauMatrix A pointer to the tau Matrix.
  * @param ii The value of i for the QR decomposition.
  * @param kk The value of k for the QR decomposition.
- * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ * @param tauNum The number of tau values stored for each row of the matrix
+* (this is equal to the number of tiles on each column).
  *
  *
  */
-void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tilesize, int ii, int kk, double* restrict tauMatrix, int tauNum, double* w )
-{
-    int i, j, n;
-    double norm=0.0, sign, u1, tau, z;
-
-    /* For each column compute the householder vector. */
-    for(i = 0; i < tilesize; i++)
-    {
-        norm = 0.0;
-        w[i] = cornerTile[ i*tilesize+i ];
-        norm = norm + w[i]*w[i];
-        for(j = i+1; j < tilesize; j++)
-        {   
-            w[j] = 0.0;
-        }
-        for(j = 0; j < tilesize; j++)
-        {
-            w[tilesize+j] = columnTile[ i*tilesize+j ];
-            norm = norm + w[tilesize+j]*w[tilesize+j];
-        }
-        
-        norm = sqrt(norm);
-        if(w[i] >= 0.0)
-            sign = -1;
-        else
-            sign = 1;
-        
-
-        u1 = w[i] - sign*norm;
-        if(u1 != 0.0)
-        {
-            for(j = i+1; j < 2*tilesize; j++){
-                w[j] = w[j]/u1;
-            }
-        }else
-        {
-            for(j = i+1; j < 2*tilesize; j++)
-                w[j] = 0.0;
-        }
+void DTSQRF(double* restrict cornerTile, double* restrict columnTile,
+            int tilesize, int ii, int kk, double* restrict tauMatrix,
+            int tauNum) {
+  int i, j, n;
+  double norm = 0.0, sign, u1, tau, z;
+  double w[2*tilesize];
+
+  /* For each column compute the householder vector. */
+  for (i = 0; i < tilesize; i++) {
+    norm = 0.0;
+    w[i] = cornerTile[i * tilesize + i];
+    norm = norm + w[i] * w[i];
+    for (j = i + 1; j < tilesize; j++) {
+      w[j] = 0.0;
+    }
+    for (j = 0; j < tilesize; j++) {
+      w[tilesize + j] = columnTile[i * tilesize + j];
+      norm = norm + w[tilesize + j] * w[tilesize + j];
+    }
 
-        if(norm != 0)
-            tau = -sign*u1/norm;
-        else
-            tau = 0.0;
-
-        /* Apply to each row to the right.*/
-        for(j = i; j < tilesize; j++)
-        {
-            /* Find w'*A_j, w is 0s except for first value with upper tile.*/
-            z = 1.0 * cornerTile[ j*tilesize+i ];            
-            for(n = 0; n < tilesize; n++)
-            {
-                z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ];
-            }
-            /* Apply to upper tile.*/
-            cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z;
-            for(n = i+1; n < tilesize; n++)
-            {
-                cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z;
-            }
-            /* Apply to lower tile.*/
-            for(n = 0; n < tilesize; n++)
-            {
-                columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z;
-            }
-            
-        }
-        /* Store w*/
-        for(j = 0; j < tilesize; j++){
-            columnTile[ i*tilesize+j ] = w[tilesize+j];
-        }
-        tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau;
+    norm = sqrt(norm);
+//            printf("%.3f\n", norm);
+
+    if (w[i] >= 0.0)
+      sign = -1;
+    else
+      sign = 1;
+    u1 = w[i] - sign * norm;
+    if (u1 != 0.0) {
+      for (j = i + 1; j < 2 * tilesize; j++) {
+        w[j] = w[j] / u1;
+      }
+    } else {
+      for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0;
+    }
+
+    if (norm != 0)
+      tau = -sign * u1 / norm;
+    else
+      tau = 0.0;
+
+    /* Apply to each row to the right.*/
+    for (j = i; j < tilesize; j++) {
+      /* Find w'*A_j, w is 0s except for first value with upper tile.*/
+      z = 1.0 * cornerTile[j * tilesize + i];
+      for (n = 0; n < tilesize; n++) {
+        z = z + w[tilesize + n] * columnTile[j * tilesize + n];
+      }
+      /* Apply to upper tile.*/
+      cornerTile[j * tilesize + i] =
+          cornerTile[j * tilesize + i] - tau * 1.0 * z;
+      for (n = i + 1; n < tilesize; n++) {
+        cornerTile[j * tilesize + n] =
+            cornerTile[j * tilesize + n] - tau * w[n] * z;
+      }
+      /* Apply to lower tile.*/
+      for (n = 0; n < tilesize; n++) {
+        columnTile[j * tilesize + n] =
+            columnTile[j * tilesize + n] - tau * w[tilesize + n] * z;
+      }
     }
+    /* Store w*/
+    for (j = 0; j < tilesize; j++) {
+      columnTile[i * tilesize + j] = w[tilesize + j];
+    }
+    tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau;
+  }
 }
 
 /**
@@ -455,46 +407,45 @@ void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tiles
  * @param ii The value of i for the QR decomposition.
  * @param jj The value of j for the QR decomposition.
  * @param kk The value of k for the QR decomposition.
- * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ * @param tauNum The number of tau values stored for each row of the matrix
+* (this is equal to the number of tiles on each column).
  *
  *
  */
-void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* restrict rowTile, int tilesize, int ii, int jj, int kk, double* restrict tauMatrix, int tauNum , double* w)
-{
-    int i, j, n;
-    double z;
-
-    
-    for(i = 0; i < tilesize; i++)
-    {
-        for(j = 0; j < i; j++)
-            w[j] = 0.0;
-        w[i] = 1.0;
-        for(j = i+1; j < tilesize; j++)
-            w[j] = 0.0;
-        for(j = 0; j < tilesize; j++)
-            w[j+tilesize] = columnTile[i*tilesize +j];
-        
-        /* Apply householder vector (w) to the tiles.*/
-        for(j = 0; j < tilesize; j++)
-        {
-            z = 0.0;
-            /* Compute w' * A_j */
-            for(n = 0; n < tilesize; n++)
-            {
-                z += w[n] * rowTile[j*tilesize+n];
-                z += w[n + tilesize] * cornerTile[j*tilesize+n];
-            }
-            for(n = 0; n < tilesize; n++)
-            {
-                rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z;
-                cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z;
-            }
-        }
+void DSSRFT(double* restrict cornerTile, double* restrict columnTile,
+            double* restrict rowTile, int tilesize, int ii, int jj, int kk,
+            double* restrict tauMatrix, int tauNum) {
+  int i, j, n;
+  double z;
+  double w[2*tilesize];
+
+  for (i = 0; i < tilesize; i++) {
+    for (j = 0; j < i; j++) w[j] = 0.0;
+    w[i] = 1.0;
+    for (j = i + 1; j < tilesize; j++) w[j] = 0.0;
+    for (j = 0; j < tilesize; j++)
+      w[j + tilesize] = columnTile[i * tilesize + j];
+
+    /* Apply householder vector (w) to the tiles.*/
+    for (j = 0; j < tilesize; j++) {
+      z = 0.0;
+      /* Compute w' * A_j */
+      for (n = 0; n < tilesize; n++) {
+        z += w[n] * rowTile[j * tilesize + n];
+        z += w[n + tilesize] * cornerTile[j * tilesize + n];
+      }
+      for (n = 0; n < tilesize; n++) {
+        rowTile[j * tilesize + n] =
+            rowTile[j * tilesize + n] -
+            tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z;
+        cornerTile[j * tilesize + n] =
+            cornerTile[j * tilesize + n] -
+            tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z;
+      }
     }
+  }
+}
 
-}		
-			
 /**
  * @brief Computed a tiled QR factorization using QuickSched.
  *
@@ -502,302 +453,385 @@ void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* r
  * @param n Number of tile columns.
  * @param nr_threads Number of threads to use.
  */
- 
-void test_qr ( int m , int n , int K , int nr_threads , int runs, double *matrix ) {
-
-    int k, j, i;
-    double *A, *A_orig, *tau;
-    struct qsched s;
-    qsched_task_t *tid, tid_new;
-    qsched_res_t *rid;
-    int data[3];
-    ticks tic, toc_run, tot_setup, tot_run = 0;
-    
-    enum task_types { task_DGEQRF , task_DLARFT , task_DTSQRF , task_DSSRFT };
-
 
-    /* Runner function to pass to the scheduler. */
-    void runner ( int type , void *data ) {
-    
-        /* Decode the task data. */
-        int *idata = (int *)data;
-        int i = idata[0], j = idata[1], k = idata[2];
-        double ww[ 2*K ];
-        
-        /* Decode and execute the task. */
-        switch ( type ) {
-            case task_DGEQRF:
-                DGEQRF( &A[ (k*m+k)*K*K], K, tau, k, m, ww);
-                break;
-            case task_DLARFT:
-                DLARFT( &A[ (k*m+k)*K*K ],&A[(j*m+k)*K*K], K, j, k, tau, m, ww);
-                break;
-            case task_DTSQRF:
-                DTSQRF( &A[(k*m+k)*K*K], &A[(k*m+i)*K*K], K, i, k, tau, m , ww);
-                break;
-            case task_DSSRFT:
-                DSSRFT(&A[(j*m+i)*K*K], &A[(k*m+i)*K*K], &A[(j*m+k)*K*K], K, i, j, k, tau, m , ww);
-                break;
-            default:
-                error( "Unknown task type." );
-            }
-
-        }
-        
-    
-    /* Allocate and fill the original matrix. */
-    if ( ( A = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL ||
-         ( tau = (double *)malloc( sizeof(double) * m * n * K ) ) == NULL ||
-         ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL )
-        error( "Failed to allocate matrices." );
-    for ( k = 0 ; k < m * n * K * K ; k++ )
-    {
-        if(matrix == NULL)
-            A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0;
-        else
-            A_orig[k] = matrix[k];
+void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
+
+  int k, j, i;
+  double* A, *A_orig, *tau;
+  struct qsched s;
+  qsched_task_t* tid, tid_new;
+  qsched_res_t* rid;
+  int data[3];
+  ticks tic, toc_run, tot_setup, tot_run = 0;
+
+  enum task_types {
+    task_DGEQRF,
+    task_DLARFT,
+    task_DTSQRF,
+    task_DSSRFT
+  };
+
+  /* Runner function to pass to the scheduler. */
+  void runner(int type, void * data) {
+
+    /* Decode the task data. */
+    int* idata = (int*)data;
+    int i = idata[0], j = idata[1], k = idata[2];
+
+    /* Decode and execute the task. */
+    switch (type) {
+      case task_DGEQRF:
+        DGEQRF(&A[(k * m + k) * K * K], K, tau, k, m);
+        break;
+      case task_DLARFT:
+        DLARFT(&A[(k * m + k) * K * K], &A[(j * m + k) * K * K], K, j, k, tau,
+               m);
+        break;
+      case task_DTSQRF:
+        DTSQRF(&A[(k * m + k) * K * K], &A[(k * m + i) * K * K], K, i, k, tau,
+               m);
+        break;
+      case task_DSSRFT:
+        DSSRFT(&A[(j * m + i) * K * K], &A[(k * m + i) * K * K],
+               &A[(j * m + k) * K * K], K, i, j, k, tau, m);
+        break;
+      default:
+        error("Unknown task type.");
     }
-    memcpy( A , A_orig , sizeof(double) * m * n * K * K );
-    bzero( tau , sizeof(double) * m * n * K );
-    
-    /* Dump A_orig. */
-    /* message( "A_orig = [" );
-    for ( k = 0 ; k < m*K ; k++ ) {
-        for ( j = 0 ; j < n*K ; j++ )
-            printf( "%.3f " , A_orig[ j*m*K + k ] );
-        printf( "\n" );
-        }
-    printf( "];\n" ); */
-    
-    /* Initialize the scheduler. */
-    qsched_init( &s , nr_threads , qsched_flag_none );
-    
-    /* Allocate and init the task ID and resource ID matrix. */
-    tic = getticks();
-    if ( ( tid = (qsched_task_t *)malloc( sizeof(qsched_task_t) * m * n ) ) == NULL ||
-         ( rid = (qsched_res_t *)malloc( sizeof(qsched_res_t) * m * n ) ) == NULL )
-        error( "Failed to allocate tid/rid matrix." );
-    for ( k = 0 ; k < m * n ; k++ ) {
-        tid[k] = qsched_task_none;
-        rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none );
-        }
-    
-    /* Build the tasks. */
-    for ( k = 0 ; k < m && k < n ; k++ ) {
-    
-        /* Add kth corner task. */
-        data[0] = k; data[1] = k; data[2] = k;
-        tid_new = qsched_addtask( &s , task_DGEQRF , task_flag_none , data , sizeof(int)*3 , 2 );
-        qsched_addlock( &s , tid_new , rid[ k*m + k ] );
-        if ( tid[ k*m + k ] != -1 )
-            qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
-        tid[ k*m + k ] = tid_new;
-            
-        /* Add column tasks on kth row. */
-        for ( j = k+1 ; j < n ; j++ ) {
-            data[0] = k; data[1] = j; data[2] = k;
-            tid_new = qsched_addtask( &s , task_DLARFT , task_flag_none , data , sizeof(int)*3 , 3 );
-            qsched_addlock( &s , tid_new , rid[ j*m + k ] );
-            qsched_adduse( &s , tid_new , rid[ k*m + k ] );
-            qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
-            if ( tid[ j*m + k ] != -1 )
-                qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
-            tid[ j*m + k ] = tid_new;
-            }
-            
-        /* For each following row... */
-        for ( i = k+1 ; i < m ; i++ ) {
-        
-            /* Add the row taks for the kth column. */
-            data[0] = i; data[1] = k; data[2] = k;
-            tid_new = qsched_addtask( &s , task_DTSQRF , task_flag_none , data , sizeof(int)*3 , 3 );
-            qsched_addlock( &s , tid_new , rid[ k*m + i ] );
-            qsched_adduse( &s , tid_new , rid[ k*m + k ] );
-            qsched_addunlock( &s , tid[ k*m + (i-1) ] , tid_new );
-            if ( tid[ k*m + i ] != -1 )
-                qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
-            tid[ k*m + i ] = tid_new;
-            
-            /* Add the inner tasks. */
-            for ( j = k+1 ; j < n ; j++ ) {
-                data[0] = i; data[1] = j; data[2] = k;
-                tid_new = qsched_addtask( &s , task_DSSRFT , task_flag_none , data , sizeof(int)*3 , 5 );
-                qsched_addlock( &s , tid_new , rid[ j*m + i ] );
-                qsched_adduse( &s , tid_new , rid[ k*m + i ] );
-                qsched_adduse( &s , tid_new , rid[ j*m + k ] );
-                qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
-                qsched_addunlock( &s, tid[j*m+i-1], tid_new);
-                if ( tid[ j*m + i ] != -1 )
-                    qsched_addunlock( &s , tid[ j*m + i ] , tid_new );
-
-                tid[ j*m + i ] = tid_new;
-                }
-        
-            }
-    
-        } /* build the tasks. */
-    tot_setup = getticks() - tic;
-    
-    /* Dump the number of tasks. */
-    message( "total nr of tasks: %i." , s.count );
-    message( "total nr of deps: %i." , s.count_deps );
-    message( "total nr of res: %i." , s.count_res );
-    message( "total nr of locks: %i." , s.count_locks );
-    message( "total nr of uses: %i." , s.count_uses );    
-        
-        
-    /* Loop over the number of runs. */
-    for ( k = 0 ; k < runs ; k++ ) {
-    
-        /* Execute the the tasks. */
-        tic = getticks();
-        qsched_run( &s , nr_threads , runner );
-        toc_run = getticks(); 
-	    message( "%ith run took %lli ticks..." , k , toc_run - tic );
-        tot_run += toc_run - tic;
-        
-        }
-    
-        
-    /* Dump A. */
-    /* message( "A = [" );
-    for ( k = 0 ; k < m*K ; k++ ) {
-        for ( j = 0 ; j < n*K ; j++ )
-            printf( "%.3f " , A[ j*m*K + k ] );
-        printf( "\n" );
-        }
-    printf( "];\n" ); */
-    
-    /* Dump tau. */
-    /* message( "tau = [" );
-    for ( k = 0 ; k < m*K ; k++ ) {
-        for ( j = 0 ; j < n ; j++ )
-            printf( "%.3f " , tau[ j*m*K + k ] );
-        printf( "\n" );
-        }
-    printf( "];\n" ); */
-    
-    /* Dump the tasks. */
-    /* for ( k = 0 ; k < s.count ; k++ ) {
-        int *d = (int *)&s.data[ s.tasks[k].data ];
-        printf( " %i %i %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , d[0] , d[1] , s.tasks[k].tic , s.tasks[k].toc );
-        } */
-        
-    /* Dump the costs. */
-    message( "costs: setup=%lli ticks, run=%lli ticks." ,
-        tot_setup , tot_run/runs );
-    
-    /* Dump the timers. */
-    for ( k = 0 ; k < qsched_timer_count ; k++ )
-        message( "timer %s is %lli ticks." , qsched_timer_names[k] , s.timers[k]/runs );
-    
-    if(matrix != NULL)
-    {
-        for(k = 0; k < m*n*K*K; k++)
-            matrix[k] = A[k];
+  }
+
+  /* Allocate and fill the original matrix. */
+  if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
+      (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
+      (A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL)
+    error("Failed to allocate matrices.");
+  for (k = 0; k < m * n * K * K; k++) {
+    if (matrix == NULL)
+      A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0;
+    else
+      A_orig[k] = matrix[k];
+  }
+  memcpy(A, A_orig, sizeof(double) * m * n * K * K);
+  bzero(tau, sizeof(double) * m * n * K);
+
+  /* Dump A_orig. */
+  /* message( "A_orig = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+      for ( j = 0 ; j < n*K ; j++ )
+          printf( "%.3f " , A_orig[ j*m*K + k ] );
+      printf( "\n" );
+      }
+  printf( "];\n" ); */
+
+  /* Initialize the scheduler. */
+  qsched_init(&s, nr_threads, qsched_flag_none);
+
+  /* Allocate and init the task ID and resource ID matrix. */
+  tic = getticks();
+  if ((tid = (qsched_task_t*)malloc(sizeof(qsched_task_t)* m* n)) == NULL ||
+      (rid = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m * n)) == NULL)
+    error("Failed to allocate tid/rid matrix.");
+  for (k = 0; k < m * n; k++) {
+    tid[k] = qsched_task_none;
+    rid[k] = qsched_addres(&s, qsched_owner_none, qsched_res_none, NULL, 0);
+  }
+
+  /* Build the tasks. */
+  for (k = 0; k < m && k < n; k++) {
+
+    /* Add kth corner task. */
+    data[0] = k;
+    data[1] = k;
+    data[2] = k;
+    tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, data,
+                             sizeof(int) * 3, 2);
+    qsched_addlock(&s, tid_new, rid[k * m + k]);
+    if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new);
+    tid[k * m + k] = tid_new;
+
+    /* Add column tasks on kth row. */
+    for (j = k + 1; j < n; j++) {
+      data[0] = k;
+      data[1] = j;
+      data[2] = k;
+      tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, data,
+                               sizeof(int) * 3, 3);
+      qsched_addlock(&s, tid_new, rid[j * m + k]);
+      qsched_adduse(&s, tid_new, rid[k * m + k]);
+      qsched_addunlock(&s, tid[k * m + k], tid_new);
+      if (tid[j * m + k] != -1) qsched_addunlock(&s, tid[j * m + k], tid_new);
+      tid[j * m + k] = tid_new;
     }
 
- 
-    /* Test if the decomposition was correct.*/
-   /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
-    double *Q = computeQ(tempMatrix, m*K, K, tau, m);
-    double *R = getR(tempMatrix, m*K);
-    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, m*K, R, m*K, 0.0, tempMatrix, m*K);
-        free(Q);
-        Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
-    for(i = 0; i < m * n * K * K; i++)
-    {
-        if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] < 0.995))
-            printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i], tempMatrix[i]);
-    }
-    free(tempMatrix);
-    free(Q);
-    free(R);*/
-
-    /* Clean up. */
-    free(A);
-    free(A_orig);
-    free(tau);
-    free(tid);
-    free(rid);
-    qsched_free( &s );
-        
+    /* For each following row... */
+    for (i = k + 1; i < m; i++) {
+
+      /* Add the row taks for the kth column. */
+      data[0] = i;
+      data[1] = k;
+      data[2] = k;
+      tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, data,
+                               sizeof(int) * 3, 3);
+      qsched_addlock(&s, tid_new, rid[k * m + i]);
+      qsched_adduse(&s, tid_new, rid[k * m + k]);
+      qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
+      if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
+      tid[k * m + i] = tid_new;
+
+      /* Add the inner tasks. */
+      for (j = k + 1; j < n; j++) {
+        data[0] = i;
+        data[1] = j;
+        data[2] = k;
+        tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, data,
+                                 sizeof(int) * 3, 5);
+        qsched_addlock(&s, tid_new, rid[j * m + i]);
+        qsched_adduse(&s, tid_new, rid[k * m + i]);
+        qsched_adduse(&s, tid_new, rid[j * m + k]);
+        // qsched_addunlock(&s, tid[k * m + i], tid_new);
+        qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
+        if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
+
+        tid[j * m + i] = tid_new;
+      }
     }
 
+  } /* build the tasks. */
+  tot_setup = getticks() - tic;
+
+  /* Dump the number of tasks. */
+  message("total nr of tasks: %i.", s.count);
+  message("total nr of deps: %i.", s.count_deps);
+  message("total nr of res: %i.", s.count_res);
+  message("total nr of locks: %i.", s.count_locks);
+  message("total nr of uses: %i.", s.count_uses);
+
+  /* Loop over the number of runs. */
+  for (k = 0; k < runs; k++) {
+
+    /* Execute the the tasks. */
+    tic = getticks();
+    qsched_run(&s, nr_threads, runner);
+    toc_run = getticks();
+    message("%ith run took %lli ticks...", k, toc_run - tic);
+    tot_run += toc_run - tic;
+  }
+
+  /* Dump A. */
+  /* message( "A = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+      for ( j = 0 ; j < n*K ; j++ )
+          printf( "%.3f " , A[ j*m*K + k ] );
+      printf( "\n" );
+      }
+  printf( "];\n" ); */
+
+  /* Dump tau. */
+  /* message( "tau = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+      for ( j = 0 ; j < n ; j++ )
+          printf( "%.3f " , tau[ j*m*K + k ] );
+      printf( "\n" );
+      }
+  printf( "];\n" ); */
+
+  /* Dump the tasks. */
+  /* for ( k = 0 ; k < s.count ; k++ ) {
+      int *d = (int *)&s.data[ s.tasks[k].data ];
+      printf( " %i %i %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid ,
+     d[0] , d[1] , s.tasks[k].tic , s.tasks[k].toc );
+      } */
+
+  /* Dump the costs. */
+  message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup,
+          tot_run / runs);
+
+  /* Dump the timers. */
+  for (k = 0; k < qsched_timer_count; k++)
+    message("timer %s is %lli ticks.", qsched_timer_names[k],
+            s.timers[k] / runs);
+
+  if (matrix != NULL) {
+    for (k = 0; k < m * n * K * K; k++) matrix[k] = A[k];
+  }
+
+  /* Test if the decomposition was correct.*/
+  /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+   double *Q = computeQ(tempMatrix, m*K, K, tau, m);
+   double *R = getR(tempMatrix, m*K);
+   cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
+   m*K, R, m*K, 0.0, tempMatrix, m*K);
+       free(Q);
+       Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
+   for(i = 0; i < m * n * K * K; i++)
+   {
+       if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] <
+   0.995))
+           printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i],
+   tempMatrix[i]);
+   }
+   free(tempMatrix);
+   free(Q);
+   free(R);*/
+
+  /* Clean up. */
+  free(A);
+  free(A_orig);
+  free(tau);
+  free(tid);
+  free(rid);
+  qsched_free(&s);
+}
 
 /* Generates a random matrix. */
-double* generateColumnMatrix(int size)
+double* generateColumnMatrix(int size, unsigned long int m_z) {
+  double* matrix = malloc(sizeof(double) * size * size);
+  if (matrix == NULL) error("Failed to allocate matrix");
+
+  int i;
+  for (i = 0; i < size * size; i++) {
+    m_z = (1664525 * m_z + 1013904223) % 4294967296;
+    matrix[i] = m_z % 100;
+    if (matrix[i] < 0) matrix[i] += 100;
+  }
+  return matrix;
+}
+
+
+/* Unit tests for the tiled QR decomposition.*/
+void test_SSSRFT()
 {
-    double* matrix = malloc(sizeof(double)*size*size);
-    if(matrix == NULL)
-        error("Failed to allocate matrix");
-
-    unsigned long int m_z = 35532;
-    int i;
-    for(i = 0 ; i < size*size; i++)
-    {
-        m_z = (1664525*m_z + 1013904223) % 4294967296;
-        matrix[i] = m_z % 100;
-        if(matrix[i] < 0)
-            matrix[i] += 100;
-    }
-    return matrix;
+    double* cornerTile, *rowTile, *columnTile, *otherTile;
+    double tau[1024];
+    cornerTile = generateColumnMatrix(32, 35136);
+    rowTile = generateColumnMatrix(32, 35239);
+    columnTile = generateColumnMatrix(32, 35339);
+    otherTile = generateColumnMatrix(32, 35739);
+DGEQRF(cornerTile, 32, tau, 0, 2);
+    DTSQRF(cornerTile, columnTile,
+            32, 1, 0, tau,
+            2);
+    DLARFT(cornerTile,rowTile, 32,
+            1, 0, tau, 2);
+ DSSRFT(otherTile, columnTile,
+            rowTile, 32, 1, 1, 0,
+            tau, 2);
+    printMatrix(cornerTile, 1, 1, 32);
+    printf("\n\n\n\n\n Column Tile \n\n\n\n\n");
+    printMatrix(columnTile, 1, 1, 32);
+    printf("\n\n\n\n\n Row Tile \n\n\n\n\n");
+    printMatrix(rowTile, 1, 1, 32);
+    printf("\n\n\n\n\n Other Tile \n\n\n\n\n");
+    printMatrix(otherTile, 1, 1, 32);
+    free(cornerTile);
+    free(rowTile);
+    free(columnTile);
+    free(otherTile);
+    
+}
+void test_STSQRF()
+{
+    double *cornerTile, *columnTile;
+    double tau[1024];
+    cornerTile = generateColumnMatrix(32, 35531);
+    columnTile = generateColumnMatrix(32, 35585);
+    DGEQRF(cornerTile, 32, tau, 0, 2);
+    DTSQRF(cornerTile, columnTile,
+            32, 1, 0, tau,
+            2);
+    printMatrix(cornerTile, 1, 1, 32);
+    printf("\n\n\n\n\n Column Tile \n\n\n\n\n");
+    printMatrix(columnTile, 1, 1, 32);
+    free(cornerTile);
+    free(columnTile);
+}
+void test_SLARFT()
+{
+    double* cornerTile, *rowTile;
+    double tau[1024];
+    cornerTile = generateColumnMatrix(32, 35536);
+    rowTile = generateColumnMatrix(32, 35539);
+    DGEQRF(cornerTile, 32,tau, 0, 2);
+    DLARFT(cornerTile,rowTile, 32,
+            1, 0, tau, 2); 
+    printMatrix(cornerTile, 1, 1, 32);
+
+    printf("\n\n\n\n\n Row Tile \n\n\n\n\n");
+    printMatrix(rowTile, 1, 1, 32);
+    free(cornerTile);
+    free(rowTile);
 }
 
+void test_SGEQRF()
+{
+    double* cornerTile;
+    double tau[1024];
+    cornerTile = generateColumnMatrix(32, 35532);
+    DGEQRF( cornerTile, 32,
+            tau, 0, 2);
+    printMatrix(cornerTile, 1, 1, 32);
+    free(cornerTile);
+}
+void runTests()
+{
+ //   test_SGEQRF();
+//    test_SLARFT();
+//    test_STSQRF();
+    test_SSSRFT();
+}
+
+
 
 /**
  * @brief Main function.
  */
- 
-int main ( int argc , char *argv[] ) {
 
-    int c, nr_threads;
-    int M = 4, N = 4, runs = 1, K = 32;
-    
-    /* Get the number of threads. */
-    #pragma omp parallel shared(nr_threads)
-    {
-        if ( omp_get_thread_num() == 0 )
-            nr_threads = omp_get_num_threads();
+int main(int argc, char* argv[]) {
+
+  int c, nr_threads;
+  int M = 4, N = 4, runs = 1, K = 32;
+
+/* Get the number of threads. */
+#pragma omp parallel shared(nr_threads)
+  {
+    if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
+  }
+
+  /* Parse the options */
+  while ((c = getopt(argc, argv, "m:n:k:r:t:D:")) != -1) switch (c) {
+      case 'm':
+        if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M.");
+        break;
+      case 'n':
+        if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension M.");
+        break;
+      case 'k':
+        if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size.");
+        break;
+      case 'r':
+        if (sscanf(optarg, "%d", &runs) != 1)
+          error("Error parsing number of runs.");
+        break;
+      case 't':
+        if (sscanf(optarg, "%d", &nr_threads) != 1)
+          error("Error parsing number of threads.");
+        omp_set_num_threads(nr_threads);
+        break;
+      case 'D':
+        runTests();
+        exit(EXIT_SUCCESS);
+      case '?':
+        fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
+                argv[0]);
+        fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n"
+                        "matrix using nr_threads threads.\n");
+        exit(EXIT_FAILURE);
     }
-    
-    /* Parse the options */
-    while ( ( c = getopt( argc , argv  , "m:n:k:r:t:" ) ) != -1 )
-        switch( c ) {
-	        case 'm':
-	            if ( sscanf( optarg , "%d" , &M ) != 1 )
-	                error( "Error parsing dimension M." );
-	            break;
-	        case 'n':
-	            if ( sscanf( optarg , "%d" , &N ) != 1 )
-	                error( "Error parsing dimension M." );
-	            break;
-	        case 'k':
-	            if ( sscanf( optarg , "%d" , &K ) != 1 )
-	                error( "Error parsing tile size." );
-	            break;
-            case 'r':
-                if ( sscanf( optarg , "%d" , &runs ) != 1 )
-                    error( "Error parsing number of runs." );
-                break;
-	        case 't':
-	            if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
-	                error( "Error parsing number of threads." );
-	            omp_set_num_threads( nr_threads );
-	            break;
-	        case '?':
-                fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] );
-                fprintf( stderr , "Computes the tiled QR decomposition of an MxN tiled\n"
-                                  "matrix using nr_threads threads.\n" );
-	            exit( EXIT_FAILURE );
-	        }
-            
-    /* Dump arguments. */
-    message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
-        32*M , 32*N , nr_threads , runs );
-        
-    test_qr( M , N , K , nr_threads , runs , NULL);
-    
-    }
-    
-    
+
+  /* Dump arguments. */
+  message("Computing the tiled QR decomposition of a %ix%i matrix using %i "
+          "threads (%i runs).",
+          32 * M, 32 * N, nr_threads, runs);
+
+  test_qr(M, N, K, nr_threads, runs, NULL);
+}
diff --git a/examples/test_qr.cu b/examples/test_qr.cu
index 270c4ca35cb12113f0fed475842f8abef68d04b2..95b6ad51ea76ebe51707e5ad37bd33046bf6338b 100644
--- a/examples/test_qr.cu
+++ b/examples/test_qr.cu
@@ -16,11 +16,11 @@ extern "C"{
 
 enum task_types { task_SGEQRF , task_SLARFT , task_STSQRF , task_SSSRFT} ;
 
-#define TID threadIdx.x
+//#define TID threadIdx.x
 #define numthreads 128
-#define tilesize 32
+//#define tilesize 32
 #define cuda_maxtasks 1000000
-#define CO(x,y,ldm) (((y)*ldm) + (x))
+//#define CO(x,y,ldm) (((y)*ldm) + (x))
 #define WARPS 4
 
 __device__ float *GPU_matrix;
@@ -28,6 +28,41 @@ __device__ float *GPU_tau;
 __device__ int cuda_m;
 __device__ int cuda_n;
 
+void printMatrix(float *matrix, int m, int n, int tilesize)
+{
+    int i, j;
+    
+    for(i=0; i < m*tilesize; i++)
+    {
+        for(j = 0; j < n*tilesize; j++)
+        {
+            printf(" %.3f ", matrix[j*m*tilesize +i]);
+
+        }
+        printf("\n");
+    }
+
+}
+
+/* Generates a random matrix. */
+float* generateColumnMatrix(int size, unsigned long int m_z)
+{
+    float* matrix;
+    cudaMallocHost(&matrix, sizeof(float)*size*size);
+    if(matrix == NULL)
+        error("Failed to allocate matrix");
+
+    int i;
+    for(i = 0 ; i < size*size; i++)
+    {
+        m_z = (1664525*m_z + 1013904223) % 4294967296;
+        matrix[i] = m_z % 100;
+        if(matrix[i] < 0)
+            matrix[i] += 100;
+    }
+    return matrix;
+}
+
 
 /*
  * @brief Performs a reduction sum on a vector, storing the result in v[0].
@@ -47,7 +82,7 @@ __device__ void reduceSum	(volatile float* sumVector,
 	while(n > 0)
 	{
 		/* Use first n threads to add up to 2n elements. */
-		if(TID < n)	sumVector[TID] = sumVector[TID] + sumVector[TID + n];
+		if(threadIdx.x < n)	sumVector[threadIdx.x] = sumVector[threadIdx.x] + sumVector[threadIdx.x + n];
 		
 		/* Divide n by 2 for next iteration. */
 		n = n >> 1;
@@ -65,8 +100,8 @@ __device__ inline void reduceSumMultiWarp( float* value )
 
     #else
     __shared__ float stuff[32*WARPS];
-    int tid = TID%32;
-    int group = TID / 32;
+    int tid = threadIdx.x%32;
+    int group = threadIdx.x / 32;
     char n = 32 >> 1;
     while(n > 0 )
     {
@@ -81,7 +116,6 @@ __device__ inline void reduceSumMultiWarp( float* value )
 
 }
 
-
 /* Routines for the tiled QR decomposition.*/
 
 /**
@@ -96,42 +130,53 @@ __device__ inline void reduceSumMultiWarp( float* value )
  *
  *   Note: 32 threads max.
  */
-__device__ void SGEQRF(float* restrict cornerTile, int tileSize, float* restrict tauMatrix, int k, int tauNum, float* w)
+__device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float* tauMatrix, int k, int tauNum)
 {
     int i, j;
     float norm=0.0, sign, u1, tau, z;
-    int TID = threadID % 32;
-    int set = threadID / 32;
-
+    int TID = threadIdx.x % 32;
+    //int set = threadIdx.x / 32;
+    float w;
     /*Find the householder vector for each row. */
-    for(i = 0; i < tileSize; i++)
+    for(i = 0; i < tilesize; i++)
     {
         norm = 0.0;
-        if(TID > = i )
-            w[ TID ] = cornerTile[ i*tilesize + TID ] * cornerTile[i * tilesize + TID];
+        if(TID >= i )
+            norm = cornerTile[ i*tilesize + TID ] * cornerTile[i * tilesize + TID];
         else
-            w[ TID ] = 0.0;
-        reduceSumMultiWarp( &w[TID] );
-        
+            norm = 0.0;
+        reduceSumMultiWarp( &norm );
+
         if(cornerTile[i * tilesize + i] >= 0.0)
             sign = -1;
         else
             sign = 1;
 
-        norm = sqrt(w[0]);
-        if(TID == 0)
-            w[0] = 0.0;
+     //   sign = __shfl(sign, i, 32);
+        norm = sqrt(norm);
+        if(TID >= i )
+            w = cornerTile[ i*tilesize + TID ];
+        else
+            w = 0.0;
+
+
+
+//        if(TID==i)
+            u1 = cornerTile[i*tilesize + i] - sign*norm;
 
-        u1 = cornerTile[i*tilesize + i] - sign*norm;
+  //      __syncthreads();
+//        u1 = __shfl(u1, i, 32);     
+  //      if(i==1)
+//            printf("%.3f\n", u1);
         if(u1 != 0.0)
         {
             if(TID > i)
-                w[ TID ] = w[TID] / u1;
+                w = w / u1;
         }
         else
         {
             if(TID > i)
-                w[ TID ] = 0.0;
+                w = 0.0;
         }
 
         if(norm != 0.0)
@@ -139,29 +184,33 @@ __device__ void SGEQRF(float* restrict cornerTile, int tileSize, float* restrict
         else
             tau = 0.0;
 
+        //if(TID == 0)
+          //  printf("%.3f\n", sign);
+
         /*Store the below diagonal vector */
         if(TID > i)
-            cornerTile[ i*tileSize + TID ] = w[TID];
-        if(TID == 0)
-            cornerTile[ i*tileSize + i ] = sign*norm;
+            cornerTile[ i*tilesize + TID ] = w;
+        if(TID == i)
+            cornerTile[ i*tilesize + i ] = sign*norm;
         if(TID == i)
-            w[i] = 1.0;
+            w = 1.0;
         /* Apply the householder transformation to the rest of the tile, for everything to the right of the diagonal. */
-        for(j = i+1; j < tileSize; j++)
+        for(j = i+1; j < tilesize; j++)
         {
             /*Compute w'*A_j*/
             if(TID >= i)
-                z = cornerTile[j*tileSize + TID] * w[TID];
+                z = cornerTile[j*tilesize + TID] * w;
             else
                 z = 0;
             reduceSumMultiWarp(&z);
             /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
             if(TID >= i)
-                cornerTile[j*tileSize + TID] = cornerTile[j*tileSize + TID] - tau*w[TID]*z;
+                cornerTile[j*tilesize + TID] = cornerTile[j*tilesize + TID] - tau*w*z;
         }
         /* Store tau. We're on k*tileSize+ith row. kth column.*/
-        if(TID == 0)
-            tauMatrix[(k*tileSize+i)*tauNum+k] = tau;
+        if(TID == 0){
+            tauMatrix[(k*tilesize+i)*tauNum+k] = tau;
+        }
 
     }
 
@@ -182,38 +231,43 @@ __device__ void SGEQRF(float* restrict cornerTile, int tileSize, float* restrict
  *
  *
  */
-void SLARFT(float* cornerTile, float* rowTile, int tileSize, int jj, int kk, float* tauMatrix, int tauNum, float* w)
+void __device__ SLARFT(volatile float* cornerTile, volatile float* rowTile, int tilesize, int jj, int kk, volatile float* tauMatrix, int tauNum)
 {
-    int i, j, n;
+    int i, j;
     float z;
     float w;
-    int TID = threadID % 32;
-    int set = threadID / 32;
+    int TID = threadIdx.x % 32;
+    int set = threadIdx.x / 32;
 
     /* For each row in the corner Tile*/
-    for(i = 0; i < tileSize; i++)
+    for(i = 0; i < tilesize; i++)
     {
         /*Get w for row i */
-        for(j = i; j < tileSize; j++)
-        {
-            w[j] = cornerTile[i*tileSize + j];
-        }
-        w[i] = 1.0;
+		if(TID > i)
+            w = cornerTile[i*tilesize + TID];
+		if(TID == i)
+		    w = 1.0;
+		if(TID < i)
+            w = 0.0;
         
         /* Apply to the row Tile */
-        for(j = 0; j < tileSize; j++)
+        for(j = set; j < tilesize; j+=(blockDim.x/32))
         {
             z=0.0;
             /* Below Diagonal!*/
             /*Compute w'*A_j*/
-            for(n = i; n < tileSize; n++)
-            {
-                z = z + w[n] * rowTile[j*tileSize+n];
-            }
-            for(n = i; n < tileSize; n++)
-            {
-                rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z;
-            }
+			z = w * rowTile[j*tilesize+TID];
+            //for(n = i; n < tileSize; n++)
+            //{
+            //    z = z + w[n] * rowTile[j*tileSize+n];
+            //}
+            reduceSumMultiWarp(&z);
+            if(TID >= i)
+                rowTile[j*tilesize + TID] = rowTile[j*tilesize + TID] - tauMatrix[(kk*tilesize+i)*tauNum + kk] * w * z;
+            //for(n = i; n < tileSize; n++)
+            //{
+              //  rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z;
+            //}
         }
 
         
@@ -222,526 +276,174 @@ void SLARFT(float* cornerTile, float* rowTile, int tileSize, int jj, int kk, flo
 
 }
 
-
-/*
- * @brief Computes a householder vector for a single tile QR decomposition, storing the result in a shared array.
- * @param matelem The TIDth element of the column to be used for calculation.
- * @param hhVector SMEM The array used to calculate and store the vector.
- * @param k The timestep at which this is called. i.e where the diagonal falls.
- * 
- * Calculates a householder vector for the vector passed as elements into a shared array.
- * Used only by SGEQRF_CUDA.
- */
-__device__ void calcHH	(float matelem,
-			volatile float hhVector[],
-			int k)
-{
-	/* To store the value of 1/(v[k] + sign(v[k])*||v||). */
-	float localdiv;
-	/* Stores the sign of v[k] */
-	int sign;
-
-	/* Read vectors into shared memory from below the diagonal. */
-	if(TID >= k)
-		hhVector[TID] = matelem;
-
-	/* Above the diagonal, set to 0. */
-	if(TID < k)
-		hhVector[TID] = 0;
-
-	/* Square each element to find ||v|| = sqrt(sum(v[i]^2)) */
-	hhVector[TID] *= hhVector[TID];
-	
-	/* Reduce to find sum of squares. */
-	reduceSum(hhVector, 32);
-
-	/* Set localdiv to equal ||v|| */
-	localdiv = sqrt(hhVector[0]);
-
-	/* According to Householder algorithm in Golub & Van Loan. */
-	if(localdiv != 0.0)
-	{
-		/* Load shared to communicate v[k] to all threads. */
-		hhVector[TID] = matelem;
-
-		/* Calculate the sign of v[k] locally. */
-		sign = hhVector[k] >= 0 ? 1 : -1;
-		
-		/* Compute and store localdiv = (||v||)*sign(v[k]) */
-		localdiv *= sign;
-
-		/* Compute and store localdiv = v[k] + (||v||*sign(v[k])) */
-		localdiv += hhVector[k];
-
-		/* Finally compute and store localdiv = 1/(v[k] + ||v||*sign(v[k])) */
-		localdiv = 1.0f/localdiv;
-	}
-	/* If norm is 0, do not change v. */
-	else
-		localdiv = 1.0f;
-
-	/* Compute and store in shared memory v = v/(v[k] + ||v||*sign(v[k])) */
-	if(TID < k)
-		hhVector[TID] = 0.0f;
-	/* v[k]/v[k] = 1 */
-	if(TID == k)
-		hhVector[TID] = 1.0f;
-	if(TID > k)
-		hhVector[TID] = matelem * localdiv;
-}
-
-/*
- * @brief Applies a householder vector to the submatrix starting at (k,k), also inserts the kth tau value into tau[k].
- * @param blockCache SMEM The 1024-element array storing the full tile to which the vector will be applied.
- * @param matTau The tau vector.
- * @param workingVector SMEM The array used in the computation as a scratchpad.
- * @param k Defines the submatrix.
- * @param hhVectorelem Element of the cumulatively-stored householder vector to apply.
+/**
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile The corner tile for this value of k.
+ * @param columnTile The tile for which householders are computed.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param ii The value of i for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ *
  *
- * Works by first calculating the kth tau,
- * then proceeding to apply the vector to successive columns of the matrix A_j
- * with the formula A_j = A_j - tau[k]*v*(v'A_j).
- * The kth tau is then stored into memory.
  */
-__device__ void applyHH(volatile float blockCache[],
-			float* matTau,
-			volatile float workingVector[],
-			int k,
-			float hhVectorelem)
+__device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, int tilesize, int ii, int kk, volatile float* tauMatrix, int tauNum )
 {
-	float 	ktau,
-		/* Stores the value of ktau*v*v'*A_j */
-		z;
-
-	int 	j;
-    float reg;
-	/* Read householder vector into shared memory to calculate ktau = 2/v'v */
-	reg/*workingVector[TID]*/ = hhVectorelem;
-	/* Square the elements to start finding v'v */
-	reg/*workingVector[TID]*/ *= reg/*workingVector[TID]*/;
-
-	/* Perform a reduction to compute v'v and store the result in v[0]. */
-	//reduceSum(workingVector, 32);
-    reduceSumMultiWarp(&reg);
-
-	/* Finally compute ktau = 2/v'v */
-	ktau = 2.0 / /*workingVector[0]*/reg;
-	
-	/* For the columns of the submatrix starting at (k,k). */
-	for(j = k; j < 32; j ++)
-	{
-		/* Fill the shared memory to calculate v'*A_j */
-		reg/*workingVector[TID]*/ = blockCache[TID+(j*32)] * hhVectorelem;
-
-		/* Perform reduction to compute v'*A_j, storing the result in v[0] */
-		//reduceSum(workingVector, 32);
-        reduceSumMultiWarp(&reg);
-		
-		/* Compute and store z = ktau*v[tid] */
-		z = hhVectorelem * ktau;
-
-		/* Compute and store z = (v'A_j)*(ktau*v[tid]) */
-		z *= /*workingVector[0]*/reg;
-		
-		/* Apply with A_j[tid] = A_j[tid] - v'A_j*ktau*v[tid] */
-		blockCache[TID + (j*32)] -= z;
-	}
-	
-	/* Store the essential (below diagonal) portion of householder vector below the diagonal in the block at column k. */
-	if(TID > k)
-		blockCache[TID + k*32] = hhVectorelem;
-	
-	/* Store the kth tau. */
-	if(TID == 0)
-		matTau[k] = ktau;
-}
-
-
-
-
-
-
-__device__ void runner_cuda_SGEQRF( int i, int j, int k, volatile float workingVector[], volatile float blockCache[] )
-{
-    int kk, jj, refCache;
-	refCache = TID;
-    float* matrix = &GPU_matrix[j*cuda_m*32*32 + i*32*32];
-    float* matTau = &GPU_tau[j*cuda_m*32 + i*32 ];
-	for(jj = 0; jj < 32; jj ++)//load block into shared memory
-	{
-		blockCache[refCache] = matrix[refCache];
-		refCache += 32;
-	}
-
-	for(kk = 0; kk < 32; kk ++)
-	{
-		//calculate the kth hh vector from the kth column of the TIDth row of the matrix
-		calcHH	(blockCache[TID + (kk*32)],//(tid, k)
-			workingVector,
-			kk);
-
-		//calculate the application of the hhvector along row TID
-		applyHH	(blockCache,
-			matTau,
-			workingVector,
-			kk,
-			workingVector[TID]);
-	}
+    int i, j, n;
+    float norm=0.0, sign, u1, tau, z;
+    float w, wupper;
+    int TID = threadIdx.x % 32;
+    //int set = threadIdx.x / 32;
 
-	//copy row back
-	refCache = TID;
+    /* For each column compute the householder vector. */
+    for(i = 0; i < tilesize; i++)
+    {
+//        norm = 0.0;
+        
+        wupper = cornerTile[i*tilesize + i];
 
-	for(jj = 0; jj < 32; jj ++)//unload block from shared memory
-	{
-		matrix[refCache] = blockCache[refCache];
-		refCache += 32;
-	}
-	__threadfence();
+        w = columnTile[i*tilesize + TID];
+        norm = (w*w);
+        
+        reduceSumMultiWarp(&norm);
+        norm += wupper*wupper;
+        norm = sqrt(norm);
 
+        if(wupper >= 0.0)
+            sign = -1;
+        else
+            sign = 1;
+        u1 = wupper - sign*norm;
+        if(u1 != 0.0)
+        {
+            w = w / u1;
+        }
+        else
+        {
+            w = 0.0;
+        }
 
-}
+        if(norm != 0.0)
+            tau = -sign*u1/norm;
+        else
+            tau = 0.0;
 
-__device__ void runner_cuda_SLARFT (int i, int j, int k, volatile float tau[], volatile float blockCache[] )
-{
-	int 	jj, kk, ii,
-		tid = TID%32,
-		group = TID/32;
-
-	__shared__ volatile float groupCache[32*4];
-	float* blockV = &GPU_matrix[k*cuda_m*32*32 + k*32*32];
-    float* blockA = &GPU_matrix[j*cuda_m*32*32 + i*32*32];
-    float* blockTau = &GPU_tau[ i*cuda_m*32 + i*32 ];
-	// cacheCol is a column in the groupCache
-	volatile float *cacheCol = groupCache + (group*32);
-	
-	float 	alpha,
-		belem;
-	
-	__syncthreads();
-
-	/* Load tau Vector */
-	if(TID < 32)
-		tau[TID] = blockTau[TID];
-	
-	/* Load Vectors */
-	for(jj = group; jj < 32; jj += WARPS)
-	{
-		if(tid < jj)
-			blockCache[tid + (jj*32)] = 0.0f;
-		else if(tid == jj)
-			blockCache[tid + (jj*32)] = 1.0f;
-		else if(tid > jj)
-			blockCache[tid + (jj*32)] = blockV[tid + (jj*32)];
-		//__syncthreads();
-	}
-    __syncthreads();
+        for(j = i; j < tilesize; j++)
+        {
+            z = w * columnTile[j*tilesize+TID];
+            reduceSumMultiWarp(&z);
+            z += cornerTile[j*tilesize+i];
+            if(threadIdx.x == 0)
+                cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i] - tau*z;
 
-	/* Compute b_j -= tau*v*v'b_j, for all vectors in blockCached V */
-	for(jj = group; jj < 32; jj += WARPS)
-	{
-		/* Read the column into registers. */
-		//if(group == 0)
-		//{
-		belem = blockA[tid + (jj*32)];
-		//__syncthreads();
-        if(jj == 0)
-            printf("%.3f %i\n", belem, TID);
-		/* For each vector in block of vectors. */
-
-		for(kk = 0; kk < 32; kk++)
-		{
-			/* Compute alpha = v'*b_j */
-			cacheCol[tid] = blockCache[tid + (kk*32)] * belem;
-        if(group == 0 && jj == 0 && kk == 0)
-        printf("%.3f %i %.3f %.3f\n", belem, TID , blockCache[tid+kk*32],cacheCol[tid]);
-		//	__syncthreads();
-
-			if(tid < 16)
-				cacheCol[tid] += cacheCol[tid+16];
-		//	__syncthreads();
-			if(tid < 8)
-				cacheCol[tid] += cacheCol[tid+8];
-		//	__syncthreads();
-
-			alpha = cacheCol[0];
-			for(ii = 1; ii < 8; ii ++)
-				alpha += cacheCol[ii];
-
-			/* Compute alpha = tau * v_tid * alpha */
-			alpha *= tau[kk];
-            if(TID==0 && kk == 0)
-            printf("tau[kk] = %.3f\n", tau[kk]);
-			alpha *= blockCache[tid + (kk*32)];
-			if(jj == 0 && kk == 0)
-        printf("%.3f\n", alpha);
-			/* Compute belem -= alpha */
-			belem -= alpha;
-			//__syncthreads();
-		}
-		blockA[tid + (jj*32)] = belem;//}
-		//__syncthreads();
-	}
-	__threadfence();
+            columnTile[j*tilesize+TID] = columnTile[j*tilesize+TID] - tau*w*z;
+        }
+  
+        columnTile[i*tilesize+TID] = w;
+        if(threadIdx.x == 0)
+            tauMatrix[(kk*tilesize+i)*tauNum+ii] = tau;
+        /* Apply to each row to the right.*/
+//        for(j = i; j < tilesize; j++)
+  //      {
+            /* Find w'*A_j, w is 0s except for first value with upper tile.*/
+    //        z = 1.0 * cornerTile[ j*tilesize+i ];            
+      //      for(n = 0; n < tilesize; n++)
+        //    {
+          //      z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ];
+//            }
+            /* Apply to upper tile.*/
+  //         cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z;
+    //        for(n = i+1; n < tilesize; n++)
+      //      {
+        //        cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z;
+          //  }
+            /* Apply to lower tile.*/
+//            for(n = 0; n < tilesize; n++)
+  //          {
+    //            columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z;
+      //      }
+            
+       // }
+        /* Store w*/
+//        for(j = 0; j < tilesize; j++){
+  //          columnTile[ i*tilesize+j ] = w[tilesize+j];
+    //    }
+      //  tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau;
+    }
 }
 
-__device__ void applyOneHHVectD	(float* topElem,
-				float* lowElem,
-				int k,
-				volatile float tau[],
-				volatile float blockCache[])
-{
-	float alpha;
-	__shared__ volatile float workV[32];
-    float reg;
-    int tid = TID % 32;
-
-	/* Compute alpha = sum */
-	if(tid == k)
-		reg/*workV[TID]*/ = *topElem;
-	if(tid != k)
-		reg/*workV[TID]*/ = 0.0f;
-
-	reg/*workV[TID]*/ += *lowElem * blockCache[tid + (k*32)];
-
-	//reduceSum(workV, 32);
-    reduceSumMultiWarp(&reg);
-	
-	alpha = reg/*workV[0]*/;
-
-	/* Multiply by tau */
-	alpha *= tau[k];
-
-	/* Compute alpha *= a_tid,j*v_tid,k */
-	if(tid == k)
-		*topElem -= alpha;
-			
-	/* For lower element. */
-	alpha *= blockCache[tid + (k*32)];
-	*lowElem -= alpha;
-}
 
-__device__ void calcDoubleHHWY	(float topElem,
-				float lowElem,
-				int k,
-				volatile float blockCache[])
+/**
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile A pointer to the tile to have the householder applied.
+ * @param columnTile The tile in which the householders are stored.
+ * @param rowTile The upper tile to have the householders applied.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param ii The value of i for the QR decomposition.
+ * @param jj The value of j for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+__device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,volatile float* rowTile, int tilesize, int ii, int jj, int kk,volatile float* tauMatrix, int tauNum )
 {
-/* Calculate v_k. */
-	int sign, i;
-
-	float 	topSum = 0,
-		lowSum = 0,
-		alpha;
-
-	__shared__ volatile float first;
-	
-	if(TID == k)
-		first = topElem;
-	
-	topSum = first * first;
-	/* Sum squares of V to compute norm, using column of BlockCache as working space. */
-	blockCache[TID + (k*32)] = lowElem * lowElem;
-	for(i = 0; i < 32; i ++)
-		lowSum += blockCache[i + (k*32)];
-	
-	alpha = sqrt(topSum + lowSum);
-	sign = first >= 0.f ? 1.f : -1.f;
-
-	if(alpha != 0.0f)
-	{
-		/* Zeroth element is = first + sign*norm */
-		alpha = first + sign * alpha;
-		alpha = 1.0f/alpha;
-	}
-	else
-		alpha = 1.0f;
+    int i, j, n;
+    float z;
+    float w;
+    int TID = threadIdx.x % 32;
+    int set = threadIdx.x / 32;
+    float tau;
 
-	//topElem *= alpha;
-	lowElem *= alpha;
-	
-	blockCache[TID + (k*32)] = lowElem;
-}
+    
+    for(i = 0; i < tilesize; i++)
+    {
 
+    
+        w = columnTile[i*tilesize+TID];
+        tau = tauMatrix[(kk*tilesize+i)*tauNum+ii];
 
-__device__ void runner_cuda_STSQRF ( int i, int j, int k, volatile float tauVect[], volatile float blockCache[])
-{
-	/* Idea: for each column j of (AB)^T,
-		apply HH vectors 0...j-1,
-		compute new HH vector for column j,
-		store essential part in blockCache 
-
-	Uses one block cache, one vector storage. */
-    float* blockA = &GPU_matrix[k*cuda_m*32*32 + k*32*32];
-    float* blockB = &GPU_matrix[j*cuda_m*32*32 + i*32*32];
-    float* blockTau = &GPU_tau[j*cuda_m*32 + i*32];
-//	int j, k, i;
-    int tid = TID % 32;
-    int group = TID/32;
-	float 	topElem,
-		lowElem,
-		tau;
-
-	for(j = 0; j < 32; j ++)
-	{
-        if(TID < 32 )
+        for(j = set; j < tilesize; j+=blockDim.x/32)
         {
-		if(TID <= j)
-			topElem = blockA[TID + (j*32)];
-		if(TID > j)
-			topElem = 0.f;
-
-		lowElem = blockB[TID + (j*32)];
-
-		//for(k = 0; k < j; k ++)
-		//{
-			/* Compute b_tid,j = b_tid,j - tau*vv'*b_tid,j */
-		//	applyOneHHVectD	(&topElem, &lowElem,
-		//			k,
-		//			tauVect,
-		//			blockCache);
-		//}
-
-		calcDoubleHHWY	(topElem,
-				lowElem,
-				j,
-				blockCache);
-
-		/* Compute new tau = 2/v'v */
-		tau = 1.0f;
-		for(i = 0; i < 32; i ++)
-			tau += blockCache[i + (j*32)] * blockCache[i + (j*32)];
-		tau = 2.0f/tau;
-		
-		if(TID == j)
-			tauVect[j] = tau;
+            z = w * cornerTile[j*tilesize+TID];
+            reduceSumMultiWarp(&z);
+            z += rowTile[j*tilesize+i];
+            if(TID == 0)
+                rowTile[j*tilesize+i] = rowTile[j*tilesize+i] - tau*z;
+            cornerTile[j*tilesize+TID] = cornerTile[j*tilesize+TID] - tau*w*z;
         }
-        
         __syncthreads();
-		/* Apply new vector to column. */
-        for(k = j+group ; k < 32; k +=WARPS )
+        /*for(j = 0; j < i; j++)
+            w[j] = 0.0;
+        w[i] = 1.0;
+        for(j = i+1; j < tilesize; j++)
+            w[j] = 0.0;
+        for(j = 0; j < tilesize; j++)
+            w[j+tilesize] = columnTile[i*tilesize +j];*/
+        
+        /* Apply householder vector (w) to the tiles.*/
+/*        for(j = 0; j < tilesize; j++)
         {
-            if(k != j)
+            z = 0.0;*/
+            /* Compute w' * A_j */
+/*            for(n = 0; n < tilesize; n++)
             {
-                if(tid <= k )
-                    topElem = blockA[tid + k*32];
-                if( tid > k )
-                    topElem = 0.f;
-
-                lowElem = blockB[tid + k*32];
+                z += w[n] * rowTile[j*tilesize+n];
+                z += w[n + tilesize] * cornerTile[j*tilesize+n];
             }
-		    applyOneHHVectD	(&topElem, &lowElem,
-				j,
-				tauVect,
-				blockCache);
-
-            if(tid <= k)
-                blockA[tid + k*32] = topElem;
-
-            blockB[tid + k*32] = lowElem;
-
-           // __syncthreads();
-        }
-
-        __syncthreads();
-		/* Write back */
-		//if(TID <= j)
-		//	blockA[TID + (j*ldm)] = topElem;
-	}
-
-	/* Write back lower block, containing householder Vectors. */
-    if(TID < 32)
-    {
-	for(j = 0; j < 32; j ++)
-		blockB[TID + (j*32)] = blockCache[TID + (j*32)];
-	
-	blockTau[TID] = tauVect[TID];
+            for(n = 0; n < tilesize; n++)
+            {
+                rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z;
+                cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z;
+            }
+        }*/
     }
-	__threadfence();
-}
-
-__device__ void runner_cuda_SSSRFT( int i, int j, int k, volatile float tau[], volatile float blockCache[] )
-{
-	__shared__ volatile float workV[WARPS*32];
-	__syncthreads();
-
-	float 	aelem, belem,
-		alpha;
-    float* blockV = &GPU_matrix[k*cuda_m*32*32 + i*32*32];
-    float* blockA = &GPU_matrix[j*cuda_m*32*32 + k*32*32];
-    float* blockB = &GPU_matrix[j*cuda_m*32*32 + i*32*32];
-    float* blockTau = &GPU_tau[k*cuda_m*32 + i*32];
-
-	int tid, group,
-		refMat, refCache;
-	
-	tid = TID%32;
-	group = TID/32;
-
-	refMat = tid + group*32;
-	refCache = tid + group*32;	
-
-	if(TID < 32)
-		tau[TID] = blockTau[TID];
-	__syncthreads();
-
-
-	/* Load the essential HH vector block into shared cache. */
-	for(j = group; j < 32; j += WARPS)
-	{
-		blockCache[refCache] = blockV[refMat];
-
-		refMat += WARPS*32;
-		refCache += WARPS*32;
-		__syncthreads();
-	}
-
-	__syncthreads();
-
-	/* For each column of the result. */
-	for(j = group; j < 32; j += WARPS)
-	{
-		//if(tid == 0)	printf("%d: column %d.\n", group, j);
-		/* Load the elements of the column to process. */
-		aelem = blockA[tid + (j*32)];
-		belem = blockB[tid + (j*32)];
-		
-		/* Compute and apply b_j = b_j - tau*vv'b_j
-			for each Householder vector 1..32. */
-		for(k = 0; k < 32; k ++)
-		{
-			/* Load the kth element into all threads. */
-			workV[tid + (group*32)] = aelem;
-
-			/* Store this as alpha. */
-			alpha = workV[k + (group*32)];
-	
-			/* Load components of v'b_j to sum. */
-			workV[tid + (group*32)] = belem * blockCache[tid + (k*32)];
-
-			if(tid < 16)
-				workV[tid + (group*32)] += workV[16 + tid + (group*32)];
-			if(tid < 8)
-				workV[tid + (group*32)] += workV[8 + tid + (group*32)];
-			/* Compute v'b_j */
-			for(i = 0; i < 8; i ++)
-				alpha += workV[i + (group*32)];
-
-			/* Multiply by kth tau. */
-			alpha *= tau[k];
-			
-			/* Compute b_j -= alpha*v
-				If kth thread in group, v is 1 at aelem. */
-			if(tid == k)	aelem -= alpha;
-
-			/* Compute b_j -= alpha * v for lower half. */
-			belem -= alpha * blockCache[tid + (k*32)];
-			__syncthreads();
-		}
-		/* put the elements back. */
-		blockA[tid + (j*32)] = aelem;
-		blockB[tid + (j*32)] = belem;
-		//__syncthreads();
-	}
-	__syncthreads();
 
 }
 
@@ -759,7 +461,7 @@ __device__ void runner ( int type , void *data ) {
         switch ( type ) {
             case task_SGEQRF:
                 if(threadIdx.x < 32)
-                    runner_cuda_SGEQRF(i, j, k, workVector, blockCache);
+//                    runner_cuda_SGEQRF(i, j, k, workVector, blockCache);
 
                 if(threadIdx.x == 0){
                     printf("SGEQRF: %i %i %i + %i\n",i, j , k, clock());
@@ -1116,6 +818,188 @@ void test()
     free(Matrix);
 }
 
+/* Unit tests for the tiled QR decomposition.*/
+
+__device__  volatile float tile[32*32];
+__device__  volatile float tile2[32*32];
+__device__  volatile float tile3[32*32];
+__device__  volatile float tile4[32*32];
+__device__  volatile float tau[1024];
+
+__global__ void SGEQRF_test(float* cornerTile)
+{
+    int i;
+    
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+    {
+        printf("Copying value %i\n", i);
+        tile[i] = cornerTile[i];
+    }
+
+    printf("blockDim.x = %i\n", blockDim.x);
+    __syncthreads();
+    if(threadIdx.x < 32)
+        SGEQRF(tile, 32, tau, 0, 1);
+    __syncthreads();
+    __threadfence();
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        cornerTile[i] = tile[i];    
+}
+
+void test_SGEQRF()
+{
+    float* cornerTile;
+    cornerTile = generateColumnMatrix(32, 35532);
+    SGEQRF_test<<<1, 128>>>(cornerTile);
+    cudaDeviceSynchronize();
+    printMatrix(cornerTile, 1, 1, 32);
+    cudaFreeHost(cornerTile);
+}
+
+
+__global__ void SLARFT_test(float* cornerTile, float* rowTile)
+{
+    int i;
+    
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        tile[i] = cornerTile[i];
+
+    __syncthreads();
+
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        tile2[i] = rowTile[i];
+    __syncthreads();
+    if(threadIdx.x < 32)
+        SGEQRF(tile, 32, tau, 0, 2);
+
+    __syncthreads();
+//    if(threadIdx.x < 32)
+        SLARFT(tile, tile2, 32, 1, 0, tau, 2);
+    __syncthreads();
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        cornerTile[i] = tile[i];    
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        rowTile[i] = tile2[i];
+    __syncthreads();
+}
+
+void test_SLARFT()
+{
+    float* cornerTile, *rowTile;
+    cornerTile = generateColumnMatrix(32, 35536);
+    rowTile = generateColumnMatrix(32, 35539);
+    SLARFT_test<<<1, 128>>>(cornerTile, rowTile);
+    cudaDeviceSynchronize();
+    printMatrix(cornerTile, 1, 1, 32);
+
+    printf("\n\n\n\n\n Row Tile \n\n\n\n\n");
+    printMatrix(rowTile, 1, 1, 32);
+    cudaFreeHost(cornerTile);
+    cudaFreeHost(rowTile);
+}
+
+
+__global__ void STSQRF_test(float* cornerTile, float* columnTile)
+{
+    int i;
+    
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        tile[i] = cornerTile[i];
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        tile2[i] = columnTile[i];
+    __syncthreads();
+    if(threadIdx.x < 32)
+        SGEQRF(tile, 32, tau, 0, 2);
+    __syncthreads();
+    if(threadIdx.x < 32)
+        STSQRF( tile, tile2, 32, 1, 0, tau, 2);
+    __syncthreads();
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        cornerTile[i] = tile[i];
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        columnTile[i] = tile2[i];
+}
+
+void test_STSQRF()
+{
+    float *cornerTile, *columnTile;
+    cornerTile = generateColumnMatrix(32, 35531);
+    columnTile = generateColumnMatrix(32, 35585);
+    STSQRF_test<<<1,32>>>(cornerTile, columnTile);
+    cudaDeviceSynchronize();
+    printMatrix(cornerTile, 1, 1, 32);
+    printf("\n\n\n\n\n Column Tile \n\n\n\n\n");
+    printMatrix(columnTile, 1, 1, 32);
+    cudaFreeHost(cornerTile);
+    cudaFreeHost(columnTile);
+}
+
+
+
+__global__ void SSSRFT_test(float* cornerTile, float* columnTile, float* rowTile, float* otherTile)
+{
+    int i;
+    
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        tile[i] = cornerTile[i];
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        tile2[i] = columnTile[i];
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        tile3[i] = rowTile[i];
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        tile4[i] = otherTile[i];
+    __syncthreads();
+    if(threadIdx.x < 32)
+        SGEQRF(tile, 32, tau, 0, 2);
+    __syncthreads();
+    if(threadIdx.x < 32)
+        STSQRF( tile, tile2, 32, 1, 0, tau, 2);
+    __syncthreads();
+    SLARFT(tile, tile3, 32, 1, 0, tau, 2);
+    __syncthreads();
+    SSSRFT( tile4, tile2, tile3, 32, 1, 1 , 0, tau, 2 );
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        cornerTile[i] = tile[i];
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        columnTile[i] = tile2[i];
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        rowTile[i] = tile3[i];
+    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
+        otherTile[i] = tile4[i];
+}
+
+void test_SSSRFT()
+{
+    float* cornerTile, *rowTile, *columnTile, *otherTile;
+    cornerTile = generateColumnMatrix(32, 35136);
+    rowTile = generateColumnMatrix(32, 35239);
+    columnTile = generateColumnMatrix(32, 35339);
+    otherTile = generateColumnMatrix(32, 35739);
+    SSSRFT_test<<<1, 128>>>(cornerTile, columnTile, rowTile, otherTile);
+    cudaDeviceSynchronize();
+    printMatrix(cornerTile, 1, 1, 32);
+    printf("\n\n\n\n\n Column Tile \n\n\n\n\n");
+    printMatrix(columnTile, 1, 1, 32);
+    printf("\n\n\n\n\n Row Tile \n\n\n\n\n");
+    printMatrix(rowTile, 1, 1, 32);
+    printf("\n\n\n\n\n Other Tile \n\n\n\n\n");
+    printMatrix(otherTile, 1, 1, 32);
+    cudaFreeHost(cornerTile);
+    cudaFreeHost(rowTile);
+    cudaFreeHost(columnTile);
+    cudaFreeHost(otherTile);
+    
+}
+
+
+
+void runTests()
+{
+   // test_SGEQRF();
+  //  test_SLARFT();
+ //   test_STSQRF();
+    test_SSSRFT();
+}
 /**
  * @brief Main function.
  */
@@ -1157,7 +1041,7 @@ int main ( int argc , char *argv[] ) {
 	            //omp_set_num_threads( nr_threads );
                 break;
             case 'D':
-                test();
+                runTests();
                 exit( EXIT_SUCCESS );
 	        case '?':
                 fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] );