diff --git a/examples/Makefile.am b/examples/Makefile.am
index 152221294d79d6d628c5dc5c5776bc9f2e309f28..9708f27ed737f34bc9bb425d4b6bb0529b8feb8a 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -23,23 +23,22 @@ AUTOMAKE_OPTIONS=gnu
 AM_CFLAGS = -g -O3 -Wall -Werror -I../src -ffast-math -fstrict-aliasing \
     -ftree-vectorize -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
     -DCPU_TPS=2.67e9 -DTIMERS \
-    #-fsanitize=address -fno-omit-frame-pointer
+    # -fsanitize=address -fno-omit-frame-pointer
 
 AM_LDFLAGS = -lm # -fsanitize=address
 
 # Set-up the library
-bin_PROGRAMS = test test_bh test_bh_2 test_bh_3 test_bh_sorted test_qr
-#test_qr
+bin_PROGRAMS = test test_qr test_bh test_bh_2 test_bh_3 test_bh_sorted
+
 # Sources for test
 test_SOURCES = test.c
 test_CFLAGS = $(AM_CFLAGS)
 test_LDADD =  ../src/.libs/libquicksched.a
 
 # Sources for test_qr
-#-L/usr/bin64/atlas/ /home/aidan/lapack-3.5.0/liblapacke.a /home/aidan/lapack-3.5.0/liblapack.a
-test_qr_SOURCES = test_qr.c /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a
-test_qr_CFLAGS = $(AM_CFLAGS) -I/home/aidan/ATLAS/ATLAS_linux/include #-I/home/aidan/lapack-3.5.0/lapacke/include
-test_qr_LDADD =  ../src/.libs/libquicksched.a  -lf77blas -lcblas -latlas -lm  -L/home/aidan/ATLAS/ATLAS_linux/lib/ 
+test_qr_SOURCES = test_qr.c
+test_qr_CFLAGS = $(AM_CFLAGS)
+test_qr_LDADD =  ../src/.libs/libquicksched.a -llapacke -llapacke -lblas
 
 # Sources for test_bh
 test_bh_SOURCES = test_bh.c
diff --git a/examples/test_qr.c b/examples/test_qr.c
index ada33a33acf4833ed478e10bcd0f0cadf466a5e5..b9cdbecde7b6a2d430f0afbbe24015c88ba8a028 100644
--- a/examples/test_qr.c
+++ b/examples/test_qr.c
@@ -1,22 +1,21 @@
 /*******************************************************************************
  * This file is part of QuickSched.
  * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
- ******************************************************************************/
-
+ *
+ ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
@@ -30,187 +29,161 @@
 #include <omp.h>
 #include <pthread.h>
 
-
-
-
 #include <cblas.h>
 
-
 /* Local includes. */
 #include "quicksched.h"
 
 /**
- * Takes a column major matrix, NOT tile major. size is length of a side of the matrix. Only works for square matrices.
- * This function is simply for validation and is implemented naively as we know of no implementation to retrieve Q from the tiled QR.
+ * Takes a column major matrix, NOT tile major. size is length of a side of the
+ * matrix. Only works for square matrices.
+ * This function is simply for validation and is implemented naively as we know
+ * of no implementation to retrieve Q from the tiled QR.
  */
-double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum)
-{
-    double* Q = malloc(sizeof(double)*size*size);
-    double* Qtemp = malloc(sizeof(double)*size*size);
-    double* w = malloc(sizeof(double)*size);
-    double* ww = malloc(sizeof(double)*size*size);
-    double* temp = malloc(sizeof(double)*size*size);
-    int i, k, l, j, n;
-    bzero(Q, sizeof(double)*size*size);
-    bzero(Qtemp, sizeof(double)*size*size);
-    bzero(ww, sizeof(double)*size*size);
-    for(i = 0; i < size; i++)
-    {
-        Q[i*size + i] = 1.0;
-    }
-    int numcoltile = size / tilesize;
-    int numrowtile = size / tilesize;
-    for(k = 0; k < numrowtile; k++)
-    {
-        for(l = 0; l < tilesize; l++)
-        {
-            bzero(Qtemp, sizeof(double)*size*size);
-            for(i = 0; i < size; i++)
-            {
-                Qtemp[i*size + i] = 1.0;
-            }
-
-
-            for(i = k; i < numcoltile; i++)
-            {
-                bzero(w, sizeof(double)*size);
-                
-                for(j = 0 ; j < tilesize; j++)
-                {
-                    w[i*tilesize + j] = HR[(k*tilesize+l)*size + i*tilesize+j];
-                }
-                    w[k*tilesize+l] = 1.0;
-                if(k*tilesize+l > i*tilesize)
-                {
-                    for(j = 0; j < k*tilesize+l; j++)
-                        w[j] = 0.0;
-                }
-
-
-                /* Compute (I - tau*w*w')' */
-                for(j = 0; j < size; j++)
-                {
-                    for(n = 0; n < size; n++)
-                    {
-                        if(j != n)
-                            ww[n*size + j] = -tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n];
-                        else
-                            ww[n*size + j] = 1.0 - tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n];
-                    }
-                }
-
-                /* Qtemp = Qtemp * (I-tau*w*w')' */
-                cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size,  size, 1.0, Qtemp, size, ww, size, 0.0, temp, size);
-                double *b = Qtemp;
-                Qtemp = temp;
-                temp = b;
-            }   
-            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Q, size, Qtemp, size, 0.0, temp, size);
-            double *b = Q;
-            Q = temp;
-            temp = b;         
+double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) {
+  double* Q = malloc(sizeof(double) * size * size);
+  double* Qtemp = malloc(sizeof(double) * size * size);
+  double* w = malloc(sizeof(double) * size);
+  double* ww = malloc(sizeof(double) * size * size);
+  double* temp = malloc(sizeof(double) * size * size);
+  int i, k, l, j, n;
+  bzero(Q, sizeof(double) * size * size);
+  bzero(Qtemp, sizeof(double) * size * size);
+  bzero(ww, sizeof(double) * size * size);
+  for (i = 0; i < size; i++) {
+    Q[i * size + i] = 1.0;
+  }
+  int numcoltile = size / tilesize;
+  int numrowtile = size / tilesize;
+  for (k = 0; k < numrowtile; k++) {
+    for (l = 0; l < tilesize; l++) {
+      bzero(Qtemp, sizeof(double) * size * size);
+      for (i = 0; i < size; i++) {
+        Qtemp[i * size + i] = 1.0;
+      }
+
+      for (i = k; i < numcoltile; i++) {
+        bzero(w, sizeof(double) * size);
+
+        for (j = 0; j < tilesize; j++) {
+          w[i * tilesize + j] =
+              HR[(k * tilesize + l) * size + i * tilesize + j];
+        }
+        w[k * tilesize + l] = 1.0;
+        if (k * tilesize + l > i * tilesize) {
+          for (j = 0; j < k * tilesize + l; j++) w[j] = 0.0;
         }
+
+        /* Compute (I - tau*w*w')' */
+        for (j = 0; j < size; j++) {
+          for (n = 0; n < size; n++) {
+            if (j != n)
+              ww[n * size + j] =
+                  -tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
+            else
+              ww[n * size + j] =
+                  1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
+          }
+        }
+
+        /* Qtemp = Qtemp * (I-tau*w*w')' */
+        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
+                    1.0, Qtemp, size, ww, size, 0.0, temp, size);
+        double* b = Qtemp;
+        Qtemp = temp;
+        temp = b;
+      }
+      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
+                  1.0, Q, size, Qtemp, size, 0.0, temp, size);
+      double* b = Q;
+      Q = temp;
+      temp = b;
     }
-    
-    free(Qtemp);
-    free(w);
-    free(ww);
-    free(temp);
-    return Q;
+  }
+
+  free(Qtemp);
+  free(w);
+  free(ww);
+  free(temp);
+  return Q;
 }
 
-double* getR(double*HR, int size)
-{
-    double* R = malloc(sizeof(double) * size * size);
-    int i, j;
-    bzero(R, sizeof(double) * size * size);
-    for(i = 0; i< size; i++)
-    {
-        for(j = 0; j <= i; j++)
-        {
-            R[i*size + j] = HR[i*size + j];
-        }
+double* getR(double* HR, int size) {
+  double* R = malloc(sizeof(double) * size * size);
+  int i, j;
+  bzero(R, sizeof(double) * size * size);
+  for (i = 0; i < size; i++) {
+    for (j = 0; j <= i; j++) {
+      R[i * size + j] = HR[i * size + j];
     }
-    return R;
+  }
+  return R;
 }
 
-void printMatrix(double* Matrix, int m, int n, int tilesize)
-{
-    int i, j;
-    
-    for(i=0; i < m*tilesize; i++)
-    {
-        for(j = 0; j < n*tilesize; j++)
-        {
-            printf(" %.3f ", Matrix[j*m*tilesize +i]);
+void printMatrix(double* Matrix, int m, int n, int tilesize) {
+  int i, j;
 
-        }
-        printf("\n");
+  for (i = 0; i < m * tilesize; i++) {
+    for (j = 0; j < n * tilesize; j++) {
+      printf(" %.3f ", Matrix[j * m * tilesize + i]);
     }
-
+    printf("\n");
+  }
 }
 
-double* columnToTile( double* columnMatrix, int size , int m , int n , int tilesize)
-{
-    double* TileMatrix;
-    TileMatrix = malloc(sizeof(double) * size );
-    if(TileMatrix == NULL)
-        error("failed to allocate TileMatrix");
-    int i,j,k,l;
-
-    for( i = 0; i < n ; i++ )
-    {
-        for( j = 0; j < m; j++ )
-        {
-            double *tileStart = &columnMatrix[i*m*tilesize*tilesize + j*tilesize];
-            double *tilePos = &TileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize];
-            for(k = 0; k < tilesize; k++)
-            {
-                tileStart = &columnMatrix[i*m*tilesize*tilesize+k*m*tilesize + j*tilesize];
-                for( l = 0; l < tilesize; l++ )
-                {
-                    tilePos[k*tilesize + l] = tileStart[l];
-                }
-            }
+double* columnToTile(double* columnMatrix, int size, int m, int n,
+                     int tilesize) {
+  double* TileMatrix;
+  TileMatrix = malloc(sizeof(double) * size);
+  if (TileMatrix == NULL) error("failed to allocate TileMatrix");
+  int i, j, k, l;
+
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < m; j++) {
+      double* tileStart =
+          &columnMatrix[i * m * tilesize * tilesize + j * tilesize];
+      double* tilePos =
+          &TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+      for (k = 0; k < tilesize; k++) {
+        tileStart = &columnMatrix[i * m * tilesize * tilesize +
+                                  k * m * tilesize + j * tilesize];
+        for (l = 0; l < tilesize; l++) {
+          tilePos[k * tilesize + l] = tileStart[l];
         }
+      }
     }
+  }
 
-    return TileMatrix;
-
+  return TileMatrix;
 }
 
-double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize)
-{
-    double* ColumnMatrix;
-    ColumnMatrix = (double*) malloc(sizeof(double) * size );
-    if(ColumnMatrix == NULL)
-        error("failed to allocate ColumnMatrix");
-    int i,j,k,l;
-    for( i = 0; i < n ; i++ )
-    {
-        for(j = 0; j < m ; j++ )
-        {
-            /* Tile on ith column is at i*m*32*32.*/
-            /* Tile on jth is at j*32*32 */
-            double *tile = &tileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize];
-            /* Column starts at same position as tile. */
-            /* Row j*32.*/
-            double *tilePos = &ColumnMatrix[i*m*tilesize*tilesize + j*tilesize];
-            for( k = 0; k < tilesize; k++ )
-            {
-                for(l=0; l < tilesize; l++)
-                {
-                    tilePos[l] = tile[l];
-                }
-                /* Next 32 elements are the position of the tile in the next column.*/
-                tile = &tile[tilesize];
-                /* Move to the j*32th position in the next column. */                
-                tilePos = &tilePos[tilesize*m];
-
-            }
-        }   
+double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) {
+  double* ColumnMatrix;
+  ColumnMatrix = (double*)malloc(sizeof(double) * size);
+  if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix");
+  int i, j, k, l;
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < m; j++) {
+      /* Tile column i starts at offset i*m*tilesize*tilesize. */
+      /* Tile j within that column is j*tilesize*tilesize further on. */
+      double* tile =
+          &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+      /* The column-major destination starts at the same column offset. */
+      /* Its first row is j*tilesize. */
+      double* tilePos =
+          &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize];
+      for (k = 0; k < tilesize; k++) {
+        for (l = 0; l < tilesize; l++) {
+          tilePos[l] = tile[l];
+        }
+        /* The next tilesize elements belong to the tile's next column. */
+        tile = &tile[tilesize];
+        /* Move to row j*tilesize of the next column. */
+        tilePos = &tilePos[tilesize * m];
+      }
     }
-    return ColumnMatrix;
+  }
+  return ColumnMatrix;
 }
 
 /* Routines for the tiled QR decomposition.*/
@@ -219,89 +192,78 @@ double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize
  *
  * @brief Computes the QR decomposition of a tile.
  *
- * @param cornerTile A pointer to the tile for which the decomposition is computed.
+ * @param cornerTile A pointer to the tile for which the decomposition is
+ * computed.
  * @param tilesize The number of elements on a row/column of the tile.
  * @param tauMatrix A pointer to the tau Matrix.
  * @param k The value of k for the QR decomposition
- * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ * @param tauNum The number of tau values stored for each row of the matrix
+ * (this is equal to the number of tiles on each column).
  *
  *
  */
-void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatrix, int k, int tauNum, double* w)
-{
-    int i, j, n;
-    double norm=0.0, sign, u1, tau, z;
-
-
-    /*Find the householder vector for each row. */
-    for(i = 0; i < tileSize; i++)
-    {
-        norm = 0.0;
-        /*Fill w with the vector.*/
-        for(j=i; j < tileSize; j++)
-        {
-            /* ith row is i*tileSize, only want elements on diagonal or below.*/
-            w[ j ] = cornerTile[ i*tileSize +j ];
-            /*Find the norm as well */
-            norm = norm + w[j]*w[j];
-        }
-        if(w[i] >= 0.0)
-            sign = -1;
-        else
-            sign = 1;
+void DGEQRF(double* restrict cornerTile, int tileSize,
+            double* restrict tauMatrix, int k, int tauNum) {
+  int i, j, n;
+  double norm = 0.0, sign, u1, tau, z;
+  double w[tileSize];
+
+  /*Find the householder vector for each row. */
+  for (i = 0; i < tileSize; i++) {
+    norm = 0.0;
+    /*Fill w with the vector.*/
+    for (j = i; j < tileSize; j++) {
+      /* ith row is i*tileSize, only want elements on diagonal or below.*/
+      w[j] = cornerTile[i * tileSize + j];
+      /*Find the norm as well */
+      norm = norm + w[j] * w[j];
+    }
+    if (w[i] >= 0.0)
+      sign = -1;
+    else
+      sign = 1;
 
-        norm = sqrt(norm);
+    norm = sqrt(norm);
 
-        u1 = w[i] - sign*norm;
+    u1 = w[i] - sign * norm;
 
-        if(u1 != 0.0)
-        {
-            for(j=i+1; j < tileSize; j++)
-                w[j] = w[j] / u1;
-        }
-        else
-        {
-            for(j=i+1; j < tileSize; j++)
-                w[j] = 0.0;
-        }
+    if (u1 != 0.0) {
+      for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1;
+    } else {
+      for (j = i + 1; j < tileSize; j++) w[j] = 0.0;
+    }
 
-        if(norm != 0.0)
-            tau = -sign*u1/norm;
-        else
-            tau = 0.0;
+    if (norm != 0.0)
+      tau = -sign * u1 / norm;
+    else
+      tau = 0.0;
 
-        /*Store the below diagonal vector */
-        
-        for(j = i+1; j < tileSize; j++)
-        {
-            cornerTile[ i*tileSize +j ] = w[j];
-        }
-        cornerTile[ i*tileSize + i ] = sign*norm;
-        w[i] = 1.0;
-        /* Apply the householder transformation to the rest of the tile, for everything to the right of the diagonal. */
-        for(j = i+1; j < tileSize; j++)
-        {
-            /*Compute w'*A_j*/
-            z = cornerTile[ j*tileSize+i];
-            for(n = i+1; n < tileSize; n++)
-            {
-                z = z + cornerTile[j*tileSize+n] * w[n];
-            }
-            /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
-            for(n=i; n < tileSize; n++)
-            {
-                cornerTile[j*tileSize+n] = cornerTile[j*tileSize+n] - tau*w[n]*z;
-            }
-
-        }
-        /* Store tau. We're on k*tileSize+ith row. kth column.*/
-        tauMatrix[(k*tileSize+i)*tauNum+k] = tau;
+    /*Store the below diagonal vector */
 
+    for (j = i + 1; j < tileSize; j++) {
+      cornerTile[i * tileSize + j] = w[j];
     }
-
+    cornerTile[i * tileSize + i] = sign * norm;
+    w[i] = 1.0;
+    /* Apply the householder transformation to the rest of the tile, for
+     * everything to the right of the diagonal. */
+    for (j = i + 1; j < tileSize; j++) {
+      /*Compute w'*A_j*/
+      z = cornerTile[j * tileSize + i];
+      for (n = i + 1; n < tileSize; n++) {
+        z = z + cornerTile[j * tileSize + n] * w[n];
+      }
+      /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
+      for (n = i; n < tileSize; n++) {
+        cornerTile[j * tileSize + n] =
+            cornerTile[j * tileSize + n] - tau * w[n] * z;
+      }
+    }
+    /* Store tau. We're on k*tileSize+ith row. kth column.*/
+    tauMatrix[(k * tileSize + i) * tauNum + k] = tau;
+  }
 }
 
-
 /**
  *
  * @brief Applies the householder factorisation of the corner to the row tile.
@@ -312,46 +274,40 @@ void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatri
  * @param tauMatrix A pointer to the tau Matrix.
  * @param jj The value of j for the QR decomposition.
  * @param kk The value of k for the QR decomposition.
- * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ * @param tauNum The number of tau values stored for each row of the matrix
+ * (this is equal to the number of tiles on each column).
  *
  *
  */
-void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, int jj, int kk, double* restrict tauMatrix, int tauNum, double* w)
-{
-    int i, j, n;
-    double z=0.0;
-
-
-    /* For each row in the corner Tile*/
-    for(i = 0; i < tileSize; i++)
-    {
-        /*Get w for row i */
-        for(j = i; j < tileSize; j++)
-        {
-            w[j] = cornerTile[i*tileSize + j];
-        }
-        w[i] = 1.0;
-        
-        /* Apply to the row Tile */
-        for(j = 0; j < tileSize; j++)
-        {
-            z=0.0;
-            /* Below Diagonal!*/
-            /*Compute w'*A_j*/
-            for(n = i; n < tileSize; n++)
-            {
-                z = z + w[n] * rowTile[j*tileSize+n];
-            }
-            for(n = i; n < tileSize; n++)
-            {
-                rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z;
-            }
-        }
-
-        
+void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
+            int jj, int kk, double* restrict tauMatrix, int tauNum) {
+  int i, j, n;
+  double z = 0.0;
+  double w[tileSize];
+
+  /* For each row in the corner Tile*/
+  for (i = 0; i < tileSize; i++) {
+    /*Get w for row i */
+    for (j = i; j < tileSize; j++) {
+      w[j] = cornerTile[i * tileSize + j];
     }
-
-
+    w[i] = 1.0;
+
+    /* Apply to the row Tile */
+    for (j = 0; j < tileSize; j++) {
+      z = 0.0;
+      /* Below Diagonal!*/
+      /*Compute w'*A_j*/
+      for (n = i; n < tileSize; n++) {
+        z = z + w[n] * rowTile[j * tileSize + n];
+      }
+      for (n = i; n < tileSize; n++) {
+        rowTile[j * tileSize + n] =
+            rowTile[j * tileSize + n] -
+            tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z;
+      }
+    }
+  }
 }
 
 /**
@@ -364,83 +320,77 @@ void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
  * @param tauMatrix A pointer to the tau Matrix.
  * @param ii The value of i for the QR decomposition.
  * @param kk The value of k for the QR decomposition.
- * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ * @param tauNum The number of tau values stored for each row of the matrix
+ * (this is equal to the number of tiles on each column).
  *
  *
  */
-void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tilesize, int ii, int kk, double* restrict tauMatrix, int tauNum, double* w )
-{
-    int i, j, n;
-    double norm=0.0, sign, u1, tau, z;
-
-    /* For each column compute the householder vector. */
-    for(i = 0; i < tilesize; i++)
-    {
-        norm = 0.0;
-        w[i] = cornerTile[ i*tilesize+i ];
-        norm = norm + w[i]*w[i];
-        for(j = i+1; j < tilesize; j++)
-        {   
-            w[j] = 0.0;
-        }
-        for(j = 0; j < tilesize; j++)
-        {
-            w[tilesize+j] = columnTile[ i*tilesize+j ];
-            norm = norm + w[tilesize+j]*w[tilesize+j];
-        }
-        
-        norm = sqrt(norm);
-        if(w[i] >= 0.0)
-            sign = -1;
-        else
-            sign = 1;
-        
-
-        u1 = w[i] - sign*norm;
-        if(u1 != 0.0)
-        {
-            for(j = i+1; j < 2*tilesize; j++){
-                w[j] = w[j]/u1;
-            }
-        }else
-        {
-            for(j = i+1; j < 2*tilesize; j++)
-                w[j] = 0.0;
-        }
+void DTSQRF(double* restrict cornerTile, double* restrict columnTile,
+            int tilesize, int ii, int kk, double* restrict tauMatrix,
+            int tauNum) {
+  int i, j, n;
+  double norm = 0.0, sign, u1, tau, z;
+  double w[2*tilesize];
+
+  /* For each column compute the householder vector. */
+  for (i = 0; i < tilesize; i++) {
+    norm = 0.0;
+    w[i] = cornerTile[i * tilesize + i];
+    norm = norm + w[i] * w[i];
+    for (j = i + 1; j < tilesize; j++) {
+      w[j] = 0.0;
+    }
+    for (j = 0; j < tilesize; j++) {
+      w[tilesize + j] = columnTile[i * tilesize + j];
+      norm = norm + w[tilesize + j] * w[tilesize + j];
+    }
 
-        if(norm != 0)
-            tau = -sign*u1/norm;
-        else
-            tau = 0.0;
-
-        /* Apply to each row to the right.*/
-        for(j = i; j < tilesize; j++)
-        {
-            /* Find w'*A_j, w is 0s except for first value with upper tile.*/
-            z = 1.0 * cornerTile[ j*tilesize+i ];            
-            for(n = 0; n < tilesize; n++)
-            {
-                z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ];
-            }
-            /* Apply to upper tile.*/
-            cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z;
-            for(n = i+1; n < tilesize; n++)
-            {
-                cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z;
-            }
-            /* Apply to lower tile.*/
-            for(n = 0; n < tilesize; n++)
-            {
-                columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z;
-            }
-            
-        }
-        /* Store w*/
-        for(j = 0; j < tilesize; j++){
-            columnTile[ i*tilesize+j ] = w[tilesize+j];
-        }
-        tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau;
+    norm = sqrt(norm);
+    if (w[i] >= 0.0)
+      sign = -1;
+    else
+      sign = 1;
+
+    u1 = w[i] - sign * norm;
+    if (u1 != 0.0) {
+      for (j = i + 1; j < 2 * tilesize; j++) {
+        w[j] = w[j] / u1;
+      }
+    } else {
+      for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0;
     }
+
+    if (norm != 0)
+      tau = -sign * u1 / norm;
+    else
+      tau = 0.0;
+
+    /* Apply to each row to the right.*/
+    for (j = i; j < tilesize; j++) {
+      /* Find w'*A_j, w is 0s except for first value with upper tile.*/
+      z = 1.0 * cornerTile[j * tilesize + i];
+      for (n = 0; n < tilesize; n++) {
+        z = z + w[tilesize + n] * columnTile[j * tilesize + n];
+      }
+      /* Apply to upper tile.*/
+      cornerTile[j * tilesize + i] =
+          cornerTile[j * tilesize + i] - tau * 1.0 * z;
+      for (n = i + 1; n < tilesize; n++) {
+        cornerTile[j * tilesize + n] =
+            cornerTile[j * tilesize + n] - tau * w[n] * z;
+      }
+      /* Apply to lower tile.*/
+      for (n = 0; n < tilesize; n++) {
+        columnTile[j * tilesize + n] =
+            columnTile[j * tilesize + n] - tau * w[tilesize + n] * z;
+      }
+    }
+    /* Store w*/
+    for (j = 0; j < tilesize; j++) {
+      columnTile[i * tilesize + j] = w[tilesize + j];
+    }
+    tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau;
+  }
 }
 
 /**
@@ -455,46 +405,45 @@ void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tiles
  * @param ii The value of i for the QR decomposition.
  * @param jj The value of j for the QR decomposition.
  * @param kk The value of k for the QR decomposition.
- * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ * @param tauNum The number of tau values stored for each row of the matrix
+ * (this is equal to the number of tiles on each column).
  *
  *
  */
-void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* restrict rowTile, int tilesize, int ii, int jj, int kk, double* restrict tauMatrix, int tauNum , double* w)
-{
-    int i, j, n;
-    double z;
-
-    
-    for(i = 0; i < tilesize; i++)
-    {
-        for(j = 0; j < i; j++)
-            w[j] = 0.0;
-        w[i] = 1.0;
-        for(j = i+1; j < tilesize; j++)
-            w[j] = 0.0;
-        for(j = 0; j < tilesize; j++)
-            w[j+tilesize] = columnTile[i*tilesize +j];
-        
-        /* Apply householder vector (w) to the tiles.*/
-        for(j = 0; j < tilesize; j++)
-        {
-            z = 0.0;
-            /* Compute w' * A_j */
-            for(n = 0; n < tilesize; n++)
-            {
-                z += w[n] * rowTile[j*tilesize+n];
-                z += w[n + tilesize] * cornerTile[j*tilesize+n];
-            }
-            for(n = 0; n < tilesize; n++)
-            {
-                rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z;
-                cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z;
-            }
-        }
+void DSSRFT(double* restrict cornerTile, double* restrict columnTile,
+            double* restrict rowTile, int tilesize, int ii, int jj, int kk,
+            double* restrict tauMatrix, int tauNum) {
+  int i, j, n;
+  double z;
+  double w[2*tilesize];
+
+  for (i = 0; i < tilesize; i++) {
+    for (j = 0; j < i; j++) w[j] = 0.0;
+    w[i] = 1.0;
+    for (j = i + 1; j < tilesize; j++) w[j] = 0.0;
+    for (j = 0; j < tilesize; j++)
+      w[j + tilesize] = columnTile[i * tilesize + j];
+
+    /* Apply householder vector (w) to the tiles.*/
+    for (j = 0; j < tilesize; j++) {
+      z = 0.0;
+      /* Compute w' * A_j */
+      for (n = 0; n < tilesize; n++) {
+        z += w[n] * rowTile[j * tilesize + n];
+        z += w[n + tilesize] * cornerTile[j * tilesize + n];
+      }
+      for (n = 0; n < tilesize; n++) {
+        rowTile[j * tilesize + n] =
+            rowTile[j * tilesize + n] -
+            tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z;
+        cornerTile[j * tilesize + n] =
+            cornerTile[j * tilesize + n] -
+            tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z;
+      }
     }
+  }
+}
 
-}		
-			
 /**
  * @brief Computed a tiled QR factorization using QuickSched.
  *
@@ -502,302 +451,298 @@ void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* r
  * @param n Number of tile columns.
  * @param nr_threads Number of threads to use.
  */
- 
-void test_qr ( int m , int n , int K , int nr_threads , int runs, double *matrix ) {
-
-    int k, j, i;
-    double *A, *A_orig, *tau;
-    struct qsched s;
-    qsched_task_t *tid, tid_new;
-    qsched_res_t *rid;
-    int data[3];
-    ticks tic, toc_run, tot_setup, tot_run = 0;
-    
-    enum task_types { task_DGEQRF , task_DLARFT , task_DTSQRF , task_DSSRFT };
-
-
-    /* Runner function to pass to the scheduler. */
-    void runner ( int type , void *data ) {
-    
-        /* Decode the task data. */
-        int *idata = (int *)data;
-        int i = idata[0], j = idata[1], k = idata[2];
-        double ww[ 2*K ];
-        
-        /* Decode and execute the task. */
-        switch ( type ) {
-            case task_DGEQRF:
-                DGEQRF( &A[ (k*m+k)*K*K], K, tau, k, m, ww);
-                break;
-            case task_DLARFT:
-                DLARFT( &A[ (k*m+k)*K*K ],&A[(j*m+k)*K*K], K, j, k, tau, m, ww);
-                break;
-            case task_DTSQRF:
-                DTSQRF( &A[(k*m+k)*K*K], &A[(k*m+i)*K*K], K, i, k, tau, m , ww);
-                break;
-            case task_DSSRFT:
-                DSSRFT(&A[(j*m+i)*K*K], &A[(k*m+i)*K*K], &A[(j*m+k)*K*K], K, i, j, k, tau, m , ww);
-                break;
-            default:
-                error( "Unknown task type." );
-            }
 
-        }
-        
-    
-    /* Allocate and fill the original matrix. */
-    if ( ( A = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL ||
-         ( tau = (double *)malloc( sizeof(double) * m * n * K ) ) == NULL ||
-         ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL )
-        error( "Failed to allocate matrices." );
-    for ( k = 0 ; k < m * n * K * K ; k++ )
-    {
-        if(matrix == NULL)
-            A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0;
-        else
-            A_orig[k] = matrix[k];
+void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
+
+  int k, j, i;
+  double* A, *A_orig, *tau;
+  struct qsched s;
+  qsched_task_t* tid, tid_new;
+  qsched_res_t* rid;
+  int data[3];
+  ticks tic, toc_run, tot_setup, tot_run = 0;
+
+  enum task_types {
+    task_DGEQRF,
+    task_DLARFT,
+    task_DTSQRF,
+    task_DSSRFT
+  };
+
+  /* Runner function to pass to the scheduler. */
+  void runner(int type, void * data) {
+
+    /* Decode the task data. */
+    int* idata = (int*)data;
+    int i = idata[0], j = idata[1], k = idata[2];
+
+    /* Decode and execute the task. */
+    switch (type) {
+      case task_DGEQRF:
+        DGEQRF(&A[(k * m + k) * K * K], K, tau, k, m);
+        break;
+      case task_DLARFT:
+        DLARFT(&A[(k * m + k) * K * K], &A[(j * m + k) * K * K], K, j, k, tau,
+               m);
+        break;
+      case task_DTSQRF:
+        DTSQRF(&A[(k * m + k) * K * K], &A[(k * m + i) * K * K], K, i, k, tau,
+               m);
+        break;
+      case task_DSSRFT:
+        DSSRFT(&A[(j * m + i) * K * K], &A[(k * m + i) * K * K],
+               &A[(j * m + k) * K * K], K, i, j, k, tau, m);
+        break;
+      default:
+        error("Unknown task type.");
     }
-    memcpy( A , A_orig , sizeof(double) * m * n * K * K );
-    bzero( tau , sizeof(double) * m * n * K );
-    
-    /* Dump A_orig. */
-    /* message( "A_orig = [" );
-    for ( k = 0 ; k < m*K ; k++ ) {
-        for ( j = 0 ; j < n*K ; j++ )
-            printf( "%.3f " , A_orig[ j*m*K + k ] );
-        printf( "\n" );
-        }
-    printf( "];\n" ); */
-    
-    /* Initialize the scheduler. */
-    qsched_init( &s , nr_threads , qsched_flag_none );
-    
-    /* Allocate and init the task ID and resource ID matrix. */
-    tic = getticks();
-    if ( ( tid = (qsched_task_t *)malloc( sizeof(qsched_task_t) * m * n ) ) == NULL ||
-         ( rid = (qsched_res_t *)malloc( sizeof(qsched_res_t) * m * n ) ) == NULL )
-        error( "Failed to allocate tid/rid matrix." );
-    for ( k = 0 ; k < m * n ; k++ ) {
-        tid[k] = qsched_task_none;
-        rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none );
-        }
-    
-    /* Build the tasks. */
-    for ( k = 0 ; k < m && k < n ; k++ ) {
-    
-        /* Add kth corner task. */
-        data[0] = k; data[1] = k; data[2] = k;
-        tid_new = qsched_addtask( &s , task_DGEQRF , task_flag_none , data , sizeof(int)*3 , 2 );
-        qsched_addlock( &s , tid_new , rid[ k*m + k ] );
-        if ( tid[ k*m + k ] != -1 )
-            qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
-        tid[ k*m + k ] = tid_new;
-            
-        /* Add column tasks on kth row. */
-        for ( j = k+1 ; j < n ; j++ ) {
-            data[0] = k; data[1] = j; data[2] = k;
-            tid_new = qsched_addtask( &s , task_DLARFT , task_flag_none , data , sizeof(int)*3 , 3 );
-            qsched_addlock( &s , tid_new , rid[ j*m + k ] );
-            qsched_adduse( &s , tid_new , rid[ k*m + k ] );
-            qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
-            if ( tid[ j*m + k ] != -1 )
-                qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
-            tid[ j*m + k ] = tid_new;
-            }
-            
-        /* For each following row... */
-        for ( i = k+1 ; i < m ; i++ ) {
-        
-            /* Add the row taks for the kth column. */
-            data[0] = i; data[1] = k; data[2] = k;
-            tid_new = qsched_addtask( &s , task_DTSQRF , task_flag_none , data , sizeof(int)*3 , 3 );
-            qsched_addlock( &s , tid_new , rid[ k*m + i ] );
-            qsched_adduse( &s , tid_new , rid[ k*m + k ] );
-            qsched_addunlock( &s , tid[ k*m + (i-1) ] , tid_new );
-            if ( tid[ k*m + i ] != -1 )
-                qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
-            tid[ k*m + i ] = tid_new;
-            
-            /* Add the inner tasks. */
-            for ( j = k+1 ; j < n ; j++ ) {
-                data[0] = i; data[1] = j; data[2] = k;
-                tid_new = qsched_addtask( &s , task_DSSRFT , task_flag_none , data , sizeof(int)*3 , 5 );
-                qsched_addlock( &s , tid_new , rid[ j*m + i ] );
-                qsched_adduse( &s , tid_new , rid[ k*m + i ] );
-                qsched_adduse( &s , tid_new , rid[ j*m + k ] );
-                qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
-                qsched_addunlock( &s, tid[j*m+i-1], tid_new);
-                if ( tid[ j*m + i ] != -1 )
-                    qsched_addunlock( &s , tid[ j*m + i ] , tid_new );
-
-                tid[ j*m + i ] = tid_new;
-                }
-        
-            }
-    
-        } /* build the tasks. */
-    tot_setup = getticks() - tic;
-    
-    /* Dump the number of tasks. */
-    message( "total nr of tasks: %i." , s.count );
-    message( "total nr of deps: %i." , s.count_deps );
-    message( "total nr of res: %i." , s.count_res );
-    message( "total nr of locks: %i." , s.count_locks );
-    message( "total nr of uses: %i." , s.count_uses );    
-        
-        
-    /* Loop over the number of runs. */
-    for ( k = 0 ; k < runs ; k++ ) {
-    
-        /* Execute the the tasks. */
-        tic = getticks();
-        qsched_run( &s , nr_threads , runner );
-        toc_run = getticks(); 
-	    message( "%ith run took %lli ticks..." , k , toc_run - tic );
-        tot_run += toc_run - tic;
-        
-        }
-    
-        
-    /* Dump A. */
-    /* message( "A = [" );
-    for ( k = 0 ; k < m*K ; k++ ) {
-        for ( j = 0 ; j < n*K ; j++ )
-            printf( "%.3f " , A[ j*m*K + k ] );
-        printf( "\n" );
-        }
-    printf( "];\n" ); */
-    
-    /* Dump tau. */
-    /* message( "tau = [" );
-    for ( k = 0 ; k < m*K ; k++ ) {
-        for ( j = 0 ; j < n ; j++ )
-            printf( "%.3f " , tau[ j*m*K + k ] );
-        printf( "\n" );
-        }
-    printf( "];\n" ); */
-    
-    /* Dump the tasks. */
-    /* for ( k = 0 ; k < s.count ; k++ ) {
-        int *d = (int *)&s.data[ s.tasks[k].data ];
-        printf( " %i %i %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , d[0] , d[1] , s.tasks[k].tic , s.tasks[k].toc );
-        } */
-        
-    /* Dump the costs. */
-    message( "costs: setup=%lli ticks, run=%lli ticks." ,
-        tot_setup , tot_run/runs );
-    
-    /* Dump the timers. */
-    for ( k = 0 ; k < qsched_timer_count ; k++ )
-        message( "timer %s is %lli ticks." , qsched_timer_names[k] , s.timers[k]/runs );
-    
-    if(matrix != NULL)
-    {
-        for(k = 0; k < m*n*K*K; k++)
-            matrix[k] = A[k];
+  }
+
+  /* Allocate and fill the original matrix. */
+  if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
+      (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
+      (A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL)
+    error("Failed to allocate matrices.");
+  for (k = 0; k < m * n * K * K; k++) {
+    if (matrix == NULL)
+      A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0;
+    else
+      A_orig[k] = matrix[k];
+  }
+  memcpy(A, A_orig, sizeof(double) * m * n * K * K);
+  bzero(tau, sizeof(double) * m * n * K);
+
+  /* Dump A_orig. */
+  /* message( "A_orig = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+      for ( j = 0 ; j < n*K ; j++ )
+          printf( "%.3f " , A_orig[ j*m*K + k ] );
+      printf( "\n" );
+      }
+  printf( "];\n" ); */
+
+  /* Initialize the scheduler. */
+  qsched_init(&s, nr_threads, qsched_flag_none);
+
+  /* Allocate and init the task ID and resource ID matrix. */
+  tic = getticks();
+  if ((tid = (qsched_task_t*)malloc(sizeof(qsched_task_t)* m* n)) == NULL ||
+      (rid = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m * n)) == NULL)
+    error("Failed to allocate tid/rid matrix.");
+  for (k = 0; k < m * n; k++) {
+    tid[k] = qsched_task_none;
+    rid[k] = qsched_addres(&s, qsched_owner_none, qsched_res_none);
+  }
+
+  /* Build the tasks. */
+  for (k = 0; k < m && k < n; k++) {
+
+    /* Add kth corner task. */
+    data[0] = k;
+    data[1] = k;
+    data[2] = k;
+    tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, data,
+                             sizeof(int) * 3, 2);
+    qsched_addlock(&s, tid_new, rid[k * m + k]);
+    if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new);
+    tid[k * m + k] = tid_new;
+
+    /* Add column tasks on kth row. */
+    for (j = k + 1; j < n; j++) {
+      data[0] = k;
+      data[1] = j;
+      data[2] = k;
+      tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, data,
+                               sizeof(int) * 3, 3);
+      qsched_addlock(&s, tid_new, rid[j * m + k]);
+      qsched_adduse(&s, tid_new, rid[k * m + k]);
+      qsched_addunlock(&s, tid[k * m + k], tid_new);
+      if (tid[j * m + k] != -1) qsched_addunlock(&s, tid[j * m + k], tid_new);
+      tid[j * m + k] = tid_new;
     }
 
- 
-    /* Test if the decomposition was correct.*/
-   /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
-    double *Q = computeQ(tempMatrix, m*K, K, tau, m);
-    double *R = getR(tempMatrix, m*K);
-    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, m*K, R, m*K, 0.0, tempMatrix, m*K);
-        free(Q);
-        Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
-    for(i = 0; i < m * n * K * K; i++)
-    {
-        if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] < 0.995))
-            printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i], tempMatrix[i]);
-    }
-    free(tempMatrix);
-    free(Q);
-    free(R);*/
-
-    /* Clean up. */
-    free(A);
-    free(A_orig);
-    free(tau);
-    free(tid);
-    free(rid);
-    qsched_free( &s );
-        
+    /* For each following row... */
+    for (i = k + 1; i < m; i++) {
+
+      /* Add the row tasks for the kth column. */
+      data[0] = i;
+      data[1] = k;
+      data[2] = k;
+      tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, data,
+                               sizeof(int) * 3, 3);
+      qsched_addlock(&s, tid_new, rid[k * m + i]);
+      qsched_adduse(&s, tid_new, rid[k * m + k]);
+      qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
+      if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
+      tid[k * m + i] = tid_new;
+
+      /* Add the inner tasks. */
+      for (j = k + 1; j < n; j++) {
+        data[0] = i;
+        data[1] = j;
+        data[2] = k;
+        tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, data,
+                                 sizeof(int) * 3, 5);
+        qsched_addlock(&s, tid_new, rid[j * m + i]);
+        qsched_adduse(&s, tid_new, rid[k * m + i]);
+        qsched_adduse(&s, tid_new, rid[j * m + k]);
+        /* DSSRFT reads tile (i,k), which the DTSQRF task above writes. */
+        qsched_addunlock(&s, tid[k * m + i], tid_new);
+        qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
+        if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
+        tid[j * m + i] = tid_new;
+      }
     }
 
+  } /* build the tasks. */
+  tot_setup = getticks() - tic;
 
-/* Generates a random matrix. */
-double* generateColumnMatrix(int size)
-{
-    double* matrix = malloc(sizeof(double)*size*size);
-    if(matrix == NULL)
-        error("Failed to allocate matrix");
-
-    unsigned long int m_z = 35532;
-    int i;
-    for(i = 0 ; i < size*size; i++)
-    {
-        m_z = (1664525*m_z + 1013904223) % 4294967296;
-        matrix[i] = m_z % 100;
-        if(matrix[i] < 0)
-            matrix[i] += 100;
-    }
-    return matrix;
+  /* Dump the number of tasks. */
+  message("total nr of tasks: %i.", s.count);
+  message("total nr of deps: %i.", s.count_deps);
+  message("total nr of res: %i.", s.count_res);
+  message("total nr of locks: %i.", s.count_locks);
+  message("total nr of uses: %i.", s.count_uses);
+
+  /* Loop over the number of runs. */
+  for (k = 0; k < runs; k++) {
+
+    /* Execute the tasks. */
+    tic = getticks();
+    qsched_run(&s, nr_threads, runner);
+    toc_run = getticks();
+    message("%ith run took %lli ticks...", k, toc_run - tic);
+    tot_run += toc_run - tic;
+  }
+
+  /* Dump A. */
+  /* message( "A = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+      for ( j = 0 ; j < n*K ; j++ )
+          printf( "%.3f " , A[ j*m*K + k ] );
+      printf( "\n" );
+      }
+  printf( "];\n" ); */
+
+  /* Dump tau. */
+  /* message( "tau = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+      for ( j = 0 ; j < n ; j++ )
+          printf( "%.3f " , tau[ j*m*K + k ] );
+      printf( "\n" );
+      }
+  printf( "];\n" ); */
+
+  /* Dump the tasks. */
+  /* for ( k = 0 ; k < s.count ; k++ ) {
+      int *d = (int *)&s.data[ s.tasks[k].data ];
+      printf( " %i %i %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid ,
+     d[0] , d[1] , s.tasks[k].tic , s.tasks[k].toc );
+      } */
+
+  /* Dump the costs. */
+  message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup,
+          tot_run / runs);
+
+  /* Dump the timers. */
+  for (k = 0; k < qsched_timer_count; k++)
+    message("timer %s is %lli ticks.", qsched_timer_names[k],
+            s.timers[k] / runs);
+
+  if (matrix != NULL) {
+    for (k = 0; k < m * n * K * K; k++) matrix[k] = A[k];
+  }
+
+  /* Test if the decomposition was correct.*/
+  /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+   double *Q = computeQ(tempMatrix, m*K, K, tau, m);
+   double *R = getR(tempMatrix, m*K);
+   cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
+   m*K, R, m*K, 0.0, tempMatrix, m*K);
+       free(Q);
+       Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
+   for(i = 0; i < m * n * K * K; i++)
+   {
+       if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] <
+   0.995))
+           printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i],
+   tempMatrix[i]);
+   }
+   free(tempMatrix);
+   free(Q);
+   free(R);*/
+
+  /* Clean up. */
+  free(A);
+  free(A_orig);
+  free(tau);
+  free(tid);
+  free(rid);
+  qsched_free(&s);
 }
 
+/* Generates a size x size matrix of reproducible values in [0, 100). */
+double* generateColumnMatrix(int size) {
+  const size_t count = (size_t)size * (size_t)size;
+  double* matrix = malloc(sizeof(double) * count);
+  if (matrix == NULL) error("Failed to allocate matrix");
+
+  /* Fixed-seed linear congruential generator, reduced modulo 2^32. */
+  unsigned long long m_z = 35532;
+  for (size_t i = 0; i < count; i++) {
+    m_z = (1664525ULL * m_z + 1013904223ULL) % 4294967296ULL;
+    matrix[i] = (double)(m_z % 100);
+  }
+  return matrix;
+}
 
 /**
  * @brief Main function.
  */
- 
-int main ( int argc , char *argv[] ) {
-
-    int c, nr_threads;
-    int M = 4, N = 4, runs = 1, K = 32;
-    
-    /* Get the number of threads. */
-    #pragma omp parallel shared(nr_threads)
-    {
-        if ( omp_get_thread_num() == 0 )
-            nr_threads = omp_get_num_threads();
-    }
-    
-    /* Parse the options */
-    while ( ( c = getopt( argc , argv  , "m:n:k:r:t:" ) ) != -1 )
-        switch( c ) {
-	        case 'm':
-	            if ( sscanf( optarg , "%d" , &M ) != 1 )
-	                error( "Error parsing dimension M." );
-	            break;
-	        case 'n':
-	            if ( sscanf( optarg , "%d" , &N ) != 1 )
-	                error( "Error parsing dimension M." );
-	            break;
-	        case 'k':
-	            if ( sscanf( optarg , "%d" , &K ) != 1 )
-	                error( "Error parsing tile size." );
-	            break;
-            case 'r':
-                if ( sscanf( optarg , "%d" , &runs ) != 1 )
-                    error( "Error parsing number of runs." );
-                break;
-	        case 't':
-	            if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
-	                error( "Error parsing number of threads." );
-	            omp_set_num_threads( nr_threads );
-	            break;
-	        case '?':
-                fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] );
-                fprintf( stderr , "Computes the tiled QR decomposition of an MxN tiled\n"
-                                  "matrix using nr_threads threads.\n" );
-	            exit( EXIT_FAILURE );
-	        }
-            
-    /* Dump arguments. */
-    message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
-        32*M , 32*N , nr_threads , runs );
-        
-    test_qr( M , N , K , nr_threads , runs , NULL);
-    
+
+int main(int argc, char* argv[]) {
+  /* Defaults: a 4x4 grid of 32x32 tiles, factorised once. */
+  int c, nr_threads;
+  int M = 4, N = 4, runs = 1, K = 32;
+
+/* Default nr_threads to the OpenMP team size; -t overrides it below. */
+#pragma omp parallel shared(nr_threads)
+  {
+    if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
+  }
+
+  /* Parse the options */
+  while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) {
+      case 'm':
+        if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M.");
+        break;
+      case 'n':
+        if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension N.");
+        break;
+      case 'k':
+        if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size.");
+        break;
+      case 'r': /* test_qr() divides by runs, so reject values below 1. */
+        if (sscanf(optarg, "%d", &runs) != 1 || runs < 1)
+          error("Error parsing number of runs.");
+        break;
+      case 't':
+        if (sscanf(optarg, "%d", &nr_threads) != 1)
+          error("Error parsing number of threads.");
+        omp_set_num_threads(nr_threads);
+        break;
+      case '?':
+        fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K] "
+                        "[-r runs]\n", argv[0]);
+        fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n"
+                        "matrix using nr_threads threads.\n");
+        exit(EXIT_FAILURE);
     }
-    
-    
+
+  /* Dump arguments; the matrix holds K*M x K*N entries for tile size K. */
+  message("Computing the tiled QR decomposition of a %ix%i matrix using %i "
+          "threads (%i runs).",
+          K * M, K * N, nr_threads, runs);
+
+  test_qr(M, N, K, nr_threads, runs, NULL);
+}
diff --git a/examples/test_qr_ompss.c b/examples/test_qr_ompss.c
index 014d1e0d2b0227379c69917690c3e270c4179979..8d9286775f01d4ac3b8514f6a657c20f2eede68f 100644
--- a/examples/test_qr_ompss.c
+++ b/examples/test_qr_ompss.c
@@ -1,22 +1,21 @@
 /*******************************************************************************
  * This file is part of QuickSched.
  * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
- ******************************************************************************/
-
+ *
+* *****************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
@@ -33,262 +32,378 @@
 #include <lapacke.h>
 #include <cblas.h>
 
-
 /* Local includes. */
 #include "cycle.h"
 
-
 /* Error macro. */
-#define error(s, ...) { fprintf( stderr , "%s:%s():%i: " s "\n" , __FILE__ , __FUNCTION__ , __LINE__ , ##__VA_ARGS__ ); abort(); }
+#define error(s, ...)                                                        \
+  do { /* do/while (0) makes the macro behave as a single statement */      \
+    fprintf(stderr, "%s:%s():%i: " s "\n", __FILE__, __FUNCTION__, __LINE__, \
+            ##__VA_ARGS__);                                                  \
+    abort();                                                                 \
+  } while (0)
 
 /* Message macro. */
-#define message(s, ...) { printf( "%s: " s "\n" , __FUNCTION__ , ##__VA_ARGS__ ); fflush(stdout); }
-
+#define message(s, ...)                                 \
+  do { /* single statement, safe inside if/else */      \
+    printf("%s: " s "\n", __FUNCTION__, ##__VA_ARGS__); \
+    fflush(stdout);                                     \
+  } while (0)
 
 /* Stuff to collect task data. */
 struct timer {
-    int threadID, type;
-    ticks tic, toc;
-    };
-struct timer *timers;
+  int threadID, type;
+  ticks tic, toc;
+};
+struct timer* timers;
 int nr_timers = 0;
 
 
-/*
- * Sam's routines for the tiled QR decomposition.
- */
-
-/*
-  \brief Computes 2-norm of a vector \f$x\f$
-  
-  Computes the 2-norm by computing the following: \f[\textrm{2-norm}=\sqrt_0^lx(i)^2\f]
- */
-double do2norm(double* x, int l)
-{
-	double sum = 0, norm;
-	int i;
-
-	for(i = 0; i < l; i++)
-		sum += x[i] * x[i];
+/* Repacks a column-major matrix with m*tilesize rows into malloc'ed storage
+ * in which every tilesize x tilesize tile is contiguous (size elements). */
+double* columnToTile(double* columnMatrix, int size, int m, int n,
+                     int tilesize) {
+  double* TileMatrix = malloc(sizeof(double) * size);
+  if (TileMatrix == NULL) error("failed to allocate TileMatrix");
+  int i, j, k, l;
+
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < m; j++) {
+      /* Destination: tile j within tile-column i of the tiled layout. */
+      double* tilePos =
+          &TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+      for (k = 0; k < tilesize; k++) {
+        double* tileStart = &columnMatrix[i * m * tilesize * tilesize +
+                                          k * m * tilesize + j * tilesize];
+        for (l = 0; l < tilesize; l++) {
+          tilePos[k * tilesize + l] = tileStart[l];
+        }
+      }
+    }
+  }
 
-	norm = sqrt(sum);
+  return TileMatrix;
+}
 
-	return norm;
+/* Inverse of columnToTile: copies tiled storage into a newly allocated
+ * column-major array with m*tilesize rows. */
+double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) {
+  double* ColumnMatrix;
+  ColumnMatrix = (double*)malloc(sizeof(double) * size);
+  if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix");
+  int i, j, k, l;
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < m; j++) {
+      /* Source: tile j of tile-column i, stored contiguously. */
+      double* tile =
+          &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+      /* Destination: first column of tile-column i, row j*tilesize. */
+      double* tilePos =
+          &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize];
+      for (k = 0; k < tilesize; k++) {
+        for (l = 0; l < tilesize; l++) {
+          tilePos[l] = tile[l];
+        }
+        /* The next tilesize entries hold the tile's next column. */
+        tile = &tile[tilesize];
+        /* Advance one full column (m*tilesize rows) in the destination. */
+        tilePos = &tilePos[tilesize * m];
+      }
+    }
+  }
+  return ColumnMatrix;
 }
 
+/* Routines for the tiled QR decomposition.*/
+
 /**
- * \brief Computes a Householder reflector from a pair of vectors from coupled blocks
  *
- * Calculates the Householder vector of the vector formed by a column in a pair of coupled blocks.
- * There is a single non-zero element, in the first row, of the top vector. This is passed as topDiag
+ * @brief Computes the QR decomposition of a tile.
+ *
+ * @param cornerTile A pointer to the tile for which the decomposition is
+ * computed.
+ * @param tileSize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau matrix.
+ * @param k The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix
+ * (this is equal to the number of tiles on each column).
  *
- * \param topDiag The only non-zero element of the incoming vector in the top block
- * \param ma The number of elements in the top vector
- * \param xb Pointer to the lower vector
- * \param l The number of elements in the whole vector
- * \param vk A pointer to a pre-allocated array to store the householder vector of size l
  *
- * \returns void
  */
-void calcvkDouble	(double topDiag,
-			int ma,
-			double* xb,
-			int l,
-			double* vk)
-{
-	int sign, i;
-	double norm, div;
-	//same non-standard normalisation as for single blocks above, but organised without a temporary beta veriable
-
-	sign = topDiag >= 0.0 ? 1 : -1;
-	vk[0] = topDiag;
-	//use vk[0] as beta
-	for(i = 1; i < ma; i++)
-		vk[i] = 0;
-
-	for(; i < l; i++)
-		vk[i] = xb[i - ma];
-
-	norm = do2norm(vk, l);
-	vk[0] += norm * sign;
-
-	if(norm != 0.0)
-	{
-		div = 1/vk[0];
-	
-		for(i = 1; i < l; i++)
-			vk[i] *= div;
-	}
-}
-
-
-void updateDoubleQ_WY	(double* blockA,
-			double* blockB,
-			double* blockTau,
-			int k, int ma, int mb, int n,
-			int ldm,
-			double* hhVector)//bottom, essential part.
-{
-	int i, j;
-
-	double tau = 1.0, beta;
-
-	/* Compute tau = 2/v'v */
-	for(i = 0; i < mb; i ++)
-		tau += hhVector[i] * hhVector[i];
-
-	tau = 2/tau;
-
-	for(j = k; j < n; j ++)
-	{
-		/* Compute v'*b_j */
-		beta = blockA[(j*ldm) + k];
+#pragma omp task inout(cornerTile[0]) out(tauMatrix[0])
+void DGEQRF(double* restrict cornerTile, int tileSize,
+            double* restrict tauMatrix, int k, int tauNum) {
+  int i, j, n;
+  double norm = 0.0, sign, u1, tau, z;
+  double w[tileSize];
+
+  /* Timer stuff. */
+  int ind = __sync_fetch_and_add( &nr_timers , 1 );
+  timers[ind].threadID = omp_get_thread_num();
+  timers[ind].type = 0;
+  timers[ind].tic = getticks();
+    
+  /*Find the householder vector for each row. */
+  for (i = 0; i < tileSize; i++) {
+    norm = 0.0;
+    /*Fill w with the vector.*/
+    for (j = i; j < tileSize; j++) {
+      /* ith row is i*tileSize, only want elements on diagonal or below.*/
+      w[j] = cornerTile[i * tileSize + j];
+      /*Find the norm as well */
+      norm = norm + w[j] * w[j];
+    }
+    if (w[i] >= 0.0)
+      sign = -1;
+    else
+      sign = 1;
 
-		/* Then for lower half */
-		for(i = 0; i < mb; i ++)
-			beta += blockB[(j*ldm) + i] * hhVector[i];
+    norm = sqrt(norm);
 
-		beta *= tau;
+    u1 = w[i] - sign * norm;
 
-		/* Compute b_j = b_j - beta*v_k */
-		blockA[(j*ldm) + k] -= beta;
-		
-		for(i = 0; i < mb; i ++)
-			blockB[(j*ldm) + i] -= beta * hhVector[i];
-	}
+    if (u1 != 0.0) {
+      for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1;
+    } else {
+      for (j = i + 1; j < tileSize; j++) w[j] = 0.0;
+    }
 
-	/* Insert vector below diagonal. */
-	for(i = 0; i < mb; i ++)
-		blockB[(k*ldm) + i] = hhVector[i];
+    if (norm != 0.0)
+      tau = -sign * u1 / norm;
+    else
+      tau = 0.0;
 
-	blockTau[k] = tau;
-}
+    /*Store the below diagonal vector */
 
-// #pragma omp task in( blockA[0] ) inout( blockB[0] ) in( blockTau[0] )
-void DTSQRF	(double* blockA,
-		double* blockB,
-		double* blockTau,
-		int ma,
-		int mb,
-		int n,
-		int ldm )
-{
-	int k;
-	double* xVectA, *xVectB;
-    double hhVector[ 2*ma ];
-	
-    int ind = __sync_fetch_and_add( &nr_timers , 1 );
-    timers[ind].threadID = omp_get_thread_num();
-    timers[ind].type = 2;
-    timers[ind].tic = getticks();
-    
-	xVectA = blockA;
-	xVectB = blockB;
-
-	for(k = 0; k < n; k++)
-	{
-		//vk = sign(x[1])||x||_2e1 + x
-		//vk = vk/vk[0]
-		calcvkDouble(xVectA[0], ma - k, xVectB, (ma + mb) - k, hhVector);//returns essential
-
-		//matA(k:ma,k:na) = matA(k:ma,k:na) - (2/(vk.T*vk))*vk*(vk.T*matA(k:ma,k:na)
-		//update both blocks, preserving the vectors already stored below the diagonal in the top block and treating them as if they were zeros.
-		updateDoubleQ_WY	(blockA, blockB,
-					blockTau,
-					k, ma, mb, n,
-					ldm,
-					hhVector + ma - k);
-
-		xVectA += ldm + 1;
-		xVectB += ldm;
-	}
-    timers[ind].toc = getticks();
+    for (j = i + 1; j < tileSize; j++) {
+      cornerTile[i * tileSize + j] = w[j];
+    }
+    cornerTile[i * tileSize + i] = sign * norm;
+    w[i] = 1.0;
+    /* Apply the householder transformation to the rest of the tile, for
+     * everything to the right of the diagonal. */
+    for (j = i + 1; j < tileSize; j++) {
+      /*Compute w'*A_j*/
+      z = cornerTile[j * tileSize + i];
+      for (n = i + 1; n < tileSize; n++) {
+        z = z + cornerTile[j * tileSize + n] * w[n];
+      }
+      /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
+      for (n = i; n < tileSize; n++) {
+        cornerTile[j * tileSize + n] =
+            cornerTile[j * tileSize + n] - tau * w[n] * z;
+      }
+    }
+    /* Store tau. We're on k*tileSize+ith row. kth column.*/
+    tauMatrix[(k * tileSize + i) * tauNum + k] = tau;
+  }
+  timers[ind].toc = getticks();
 }
 
-// #pragma omp task in( blockV[0] ) in( blockB[0] ) inout( blockA[0] ) in( blockTau[0] )
-void DSSRFT	(double* blockV,
-		double* blockA, double* blockB,
-		double* blockTau,
-		int b, int n, int ldm)
-{
-	int i, j, k;
-
-	double tau, beta;
-
-    int ind = __sync_fetch_and_add( &nr_timers , 1 );
-    timers[ind].threadID = omp_get_thread_num();
-    timers[ind].type = 3;
-    timers[ind].tic = getticks();
+/**
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile A pointer to the tile in which the householder is stored.
+ * @param rowTile The tile to which the householder is applied.
+ * @param tileSize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau matrix.
+ * @param jj The value of j for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix
+ * (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+#pragma omp task in(cornerTile[0]) inout(rowTile[0]) in(tauMatrix[0])
+void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
+            int jj, int kk, double* restrict tauMatrix, int tauNum) {
+  int i, j, n;
+  double z = 0.0;
+  double w[tileSize];
+
+  /* Timer stuff: claim a slot atomically, then log thread, type and start. */
+  int ind = __sync_fetch_and_add( &nr_timers , 1 );
+  timers[ind].threadID = omp_get_thread_num();
+  timers[ind].type = 1;
+  timers[ind].tic = getticks();
     
-	/* Compute b_j = b_j - tau*v*v'*b_j for each column j of blocks A & B,
-	   and for each householder vector v of blockV */
-
-	/* For each column of B */
-	for(j = 0; j < n; j ++)
-	{
-		/* For each householder vector. */
-		for(k = 0; k < n; k ++)
-		{
-			/* tau = 2/v'v, computed earlier, stored in T(k,k). */
-			tau = blockTau[k];
-
-			/* Compute beta = v_k'b_j. */
-			/* v_k is >0 (=1) only at position k in top half. */
-			beta = blockA[(j*ldm) + k];
-
-			/* For lower portion of v_k, aligning with the lower block */
-			for(i = 0; i < b; i ++)
-				beta += blockB[(j*ldm) + i] * blockV[(k*ldm) + i];
-
-			beta *= tau;
-			
-			/* Compute b_j = b_j - beta * v */
-			/* v_k = 1 at (k) in top half again */
-			blockA[(j*ldm) + k] -= beta;
-
-			/* Apply to bottom block. */
-			for(i = 0; i < b; i ++)
-				blockB[(j*ldm) + i] -= beta * blockV[(k*ldm) + i];
-		}
-	}
-    timers[ind].toc = getticks();
+  /* For each row in the corner tile (one stored householder vector each). */
+  for (i = 0; i < tileSize; i++) {
+    /* Get w for row i; its leading entry is overwritten with 1 below. */
+    for (j = i; j < tileSize; j++) {
+      w[j] = cornerTile[i * tileSize + j];
+    }
+    w[i] = 1.0;
+
+    /* Apply to the row tile, one slice j of tileSize entries at a time. */
+    for (j = 0; j < tileSize; j++) {
+      z = 0.0;
+      /* Below diagonal: only entries n >= i are read and updated. */
+      /* Compute z = w' * A_j. */
+      for (n = i; n < tileSize; n++) {
+        z = z + w[n] * rowTile[j * tileSize + n];
+      }
+      for (n = i; n < tileSize; n++) { /* A_j = A_j - tau * w * z */
+        rowTile[j * tileSize + n] =
+            rowTile[j * tileSize + n] -
+            tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z;
+      }
+    }
+  }
+  timers[ind].toc = getticks();
 }
 
-
 /**
- * @breif Wrapper to get the dependencies right.
+ *
+ * @brief Computes the householder factorisation of a corner/column tile pair.
+ *
+ * @param cornerTile The corner tile for this value of k.
+ * @param columnTile The tile for which householders are computed.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param ii The value of i for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix
+ * (this is equal to the number of tiles on each column).
+ *
+ *
  */
- 
-// #pragma omp task inout( a[0] ) inout( tau[0] )
-void DGEQRF ( int matrix_order, lapack_int m, lapack_int n,
-                                double* a, lapack_int lda, double* tau ) {
-    int ind = __sync_fetch_and_add( &nr_timers , 1 );
-    timers[ind].threadID = omp_get_thread_num();
-    timers[ind].type = 0;
-    timers[ind].tic = getticks();
-    LAPACKE_dgeqrf( matrix_order, m, n, a, lda, tau );
-    timers[ind].toc = getticks();
+#pragma omp task inout(cornerTile[0]) inout(columnTile[0]) out(tauMatrix[0])
+void DTSQRF(double* restrict cornerTile, double* restrict columnTile,
+            int tilesize, int ii, int kk, double* restrict tauMatrix,
+            int tauNum) {
+  int i, j, n;
+  double norm = 0.0, sign, u1, tau, z;
+  double w[2*tilesize]; /* w[0..tilesize): corner part, rest: column part. */
+
+  /* Timer stuff: claim a slot atomically, then log thread, type and start. */
+  int ind = __sync_fetch_and_add( &nr_timers , 1 );
+  timers[ind].threadID = omp_get_thread_num();
+  timers[ind].type = 2;
+  timers[ind].tic = getticks();
+    
+  /* For each column compute the householder vector. */
+  for (i = 0; i < tilesize; i++) {
+    norm = 0.0;
+    w[i] = cornerTile[i * tilesize + i];
+    norm = norm + w[i] * w[i];
+    for (j = i + 1; j < tilesize; j++) {
+      w[j] = 0.0;
+    }
+    for (j = 0; j < tilesize; j++) {
+      w[tilesize + j] = columnTile[i * tilesize + j];
+      norm = norm + w[tilesize + j] * w[tilesize + j];
     }
 
+    norm = sqrt(norm); /* 2-norm of the diagonal entry plus the column slice. */
+    if (w[i] >= 0.0)
+      sign = -1;
+    else
+      sign = 1;
+
+    u1 = w[i] - sign * norm;
+    if (u1 != 0.0) {
+      for (j = i + 1; j < 2 * tilesize; j++) {
+        w[j] = w[j] / u1;
+      }
+    } else {
+      for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0;
+    }
+
+    if (norm != 0)
+      tau = -sign * u1 / norm;
+    else
+      tau = 0.0;
+
+    /* Apply to slice i itself and to each slice to the right of it. */
+    for (j = i; j < tilesize; j++) {
+      /* Find w'*A_j; in the upper tile w is an implicit 1 at i, 0 elsewhere. */
+      z = 1.0 * cornerTile[j * tilesize + i];
+      for (n = 0; n < tilesize; n++) {
+        z = z + w[tilesize + n] * columnTile[j * tilesize + n];
+      }
+      /* Apply to upper tile (w[n] is zero for n > i, so only entry i moves). */
+      cornerTile[j * tilesize + i] =
+          cornerTile[j * tilesize + i] - tau * 1.0 * z;
+      for (n = i + 1; n < tilesize; n++) {
+        cornerTile[j * tilesize + n] =
+            cornerTile[j * tilesize + n] - tau * w[n] * z;
+      }
+      /* Apply to lower tile. */
+      for (n = 0; n < tilesize; n++) {
+        columnTile[j * tilesize + n] =
+            columnTile[j * tilesize + n] - tau * w[tilesize + n] * z;
+      }
+    }
+    /* Store the lower part of w in the column tile; DSSRFT reads it back. */
+    for (j = 0; j < tilesize; j++) {
+      columnTile[i * tilesize + j] = w[tilesize + j];
+    }
+    tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau;
+  }
+  timers[ind].toc = getticks();
+}
 
 /**
- * @breif Wrapper to get the dependencies right.
+ *
+ * @brief Applies the householders stored in the column tile to two tiles.
+ *
+ * @param cornerTile A pointer to the lower tile to have the householders applied.
+ * @param columnTile The tile in which the householders are stored.
+ * @param rowTile The upper tile to have the householders applied.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param ii The value of i for the QR decomposition.
+ * @param jj The value of j for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix
+ * (this is equal to the number of tiles on each column).
+ *
+ *
  */
- 
-// #pragma omp task inout( v[0] ) in( tau[0] ) inout( t[0] )
-void DLARFT ( int matrix_order, char direct, char storev,
-                                lapack_int n, lapack_int k, const double* v,
-                                lapack_int ldv, const double* tau, double* t,
-                                lapack_int ldt ) {
-    int ind = __sync_fetch_and_add( &nr_timers , 1 );
-    timers[ind].threadID = omp_get_thread_num();
-    timers[ind].type = 1;
-    timers[ind].tic = getticks();
-    LAPACKE_dlarft_work( matrix_order, direct, storev, n, k, v, ldv, tau, t, ldt );
-    timers[ind].toc = getticks();
+#pragma omp task inout(cornerTile[0]) in(columnTile[0]) inout(rowTile[0]) in(tauMatrix[0])
+void DSSRFT(double* restrict cornerTile, double* restrict columnTile,
+            double* restrict rowTile, int tilesize, int ii, int jj, int kk,
+            double* restrict tauMatrix, int tauNum) {
+  int i, j, n;
+  double z;
+  double w[2*tilesize]; /* w[0..tilesize): upper part, rest: lower part. */
+
+  /* Timer stuff: claim a slot atomically, then log thread, type and start. */
+  int ind = __sync_fetch_and_add( &nr_timers , 1 );
+  timers[ind].threadID = omp_get_thread_num();
+  timers[ind].type = 3;
+  timers[ind].tic = getticks();
+    
+  for (i = 0; i < tilesize; i++) { /* One stored householder vector per i. */
+    for (j = 0; j < i; j++) w[j] = 0.0;
+    w[i] = 1.0; /* Upper part of w is the i-th unit vector. */
+    for (j = i + 1; j < tilesize; j++) w[j] = 0.0;
+    for (j = 0; j < tilesize; j++)
+      w[j + tilesize] = columnTile[i * tilesize + j];
+
+    /* Apply householder vector (w) to the row tile and the corner tile. */
+    for (j = 0; j < tilesize; j++) {
+      z = 0.0;
+      /* Compute z = w' * A_j over the row (upper) and corner (lower) slices. */
+      for (n = 0; n < tilesize; n++) {
+        z += w[n] * rowTile[j * tilesize + n];
+        z += w[n + tilesize] * cornerTile[j * tilesize + n];
+      }
+      for (n = 0; n < tilesize; n++) { /* A_j = A_j - tau * w * z */
+        rowTile[j * tilesize + n] =
+            rowTile[j * tilesize + n] -
+            tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z;
+        cornerTile[j * tilesize + n] =
+            cornerTile[j * tilesize + n] -
+            tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z;
+      }
     }
-			
-			
+  }
+  timers[ind].toc = getticks();
+}
+
 /**
  * @brief Computed a tiled QR factorization using QuickSched.
  *
@@ -296,151 +411,140 @@ void DLARFT ( int matrix_order, char direct, char storev,
  * @param n Number of tile columns.
  * @param nr_threads Number of threads to use.
  */
- 
-void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
-
-    int k, j, i, r;
-    double *A, *A_orig, *tau;
-    ticks tic, toc_run, tot_setup, tot_run = 0;
-    int tid[ m*n ];
-
-
-    /* Allocate and fill the original matrix. */
-    if ( ( A = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL ||
-         ( tau = (double *)malloc( sizeof(double) * m * n * K ) ) == NULL ||
-         ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL )
-        error( "Failed to allocate matrices." );
-    for ( k = 0 ; k < m * n * K * K ; k++ )
-        A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0;
-    memcpy( A , A_orig , sizeof(double) * m * n * K * K );
-    bzero( tau , sizeof(double) * m * n * K );
-    
-    /* Dump A_orig. */
-    /* message( "A_orig = [" );
-    for ( k = 0 ; k < m*K ; k++ ) {
-        for ( j = 0 ; j < n*K ; j++ )
-            printf( "%.3f " , A_orig[ j*m*K + k ] );
-        printf( "\n" );
-        }
-    printf( "];\n" ); */
-    
-    /* Loop over the number of runs. */
-    for ( r = 0 ; r < runs ; r++ ) {
-    
-        /* Start the clock. */
-        tic = getticks();
-        nr_timers = 0;
-        
-        /* Launch the tasks. */
-        for ( k = 0 ; k < m && k < n ; k++ ) {
-
-            /* Add kth corner task. */
-            #pragma omp task inout( tid[ k*m + k ] )
-            DGEQRF( LAPACK_COL_MAJOR , K, K ,
-                            &A[ k*m*K*K + k*K ] , m*K , &tau[ k*m*K + k*K ] );
-
-            /* Add column tasks on kth row. */
-            for ( j = k+1 ; j < n ; j++ ) {
-                #pragma omp task inout( tid[ j*m + k ] ) in( tid[ k*m + k ] )
-                DLARFT( LAPACK_COL_MAJOR , 'F' , 'C' ,
-                                K , K , &A[ k*m*K*K + k*K ] ,
-                                m*K , &tau[ k*m*K + k*K ] , &A[ j*m*K*K + k*K ] ,
-                                m*K );
-                }
-
-            /* For each following row... */
-            for ( i = k+1 ; i < m ; i++ ) {
-
-                /* Add the row taks for the kth column. */
-                #pragma omp task inout( tid[ k*m + i ] ) in( tid[ k*m + k ] )
-                DTSQRF( &A[ k*m*K*K + k*K ] , &A[ k*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K , K*m );
-
-                /* Add the inner tasks. */
-                for ( j = k+1 ; j < n ; j++ ) {
-                    #pragma omp task inout( tid[ j*m + i ] ) in( tid[ k*m + i ] , tid[ j*m + k ] )
-                    DSSRFT(	&A[ k*m*K + i*K ] , &A[ j*m*K*K + k*K ] , &A[ j*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K*m );
-                    }
-
-                }
-
-            } /* build the tasks. */
-    
-        /* Collect timers. */
-        #pragma omp taskwait
-        toc_run = getticks(); 
-	    message( "%ith run took %lli ticks..." , r , toc_run - tic );
-        tot_run += toc_run - tic;
-        
+
+void test_qr(int m, int n, int K, int nr_threads, int runs) {
+
+  int k, j, i, r;
+  double* A, *A_orig, *tau;
+  ticks tic, toc_run, tot_run = 0;
+
+  /* Allocate the matrices (m*n tiles of K*K doubles) and the tau storage. */
+  if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
+      (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
+      (A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL)
+    error("Failed to allocate matrices.");
+  for (k = 0; k < m * n * K * K; k++)
+    A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0;
+  memcpy(A, A_orig, sizeof(double) * m * n * K * K);
+  bzero(tau, sizeof(double) * m * n * K);
+
+  /* Dump A_orig. */
+  /* message( "A_orig = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+      for ( j = 0 ; j < n*K ; j++ )
+          printf( "%.3f " , A_orig[ j*m*K + k ] );
+      printf( "\n" );
+      }
+  printf( "];\n" ); */
+
+  /* Loop over the number of runs. */
+  for (r = 0; r < runs; r++) {
+
+    /* Start the clock and reset the timer count for this run. */
+    tic = getticks();
+    nr_timers = 0;
+
+    /* Launch the tasks. */
+    for (k = 0; k < m && k < n; k++) {
+
+      /* Add kth corner task. Tile (row i, col j) starts at A[(j*m+i)*K*K]. */
+      // #pragma omp task inout( tid[ k*m + k ] )
+      DGEQRF(&A[(k * m + k) * K * K], K, tau, k, m);
+
+      /* Add column tasks on kth row. */
+      for (j = k + 1; j < n; j++) {
+        // #pragma omp task inout( tid[ j*m + k ] ) in( tid[ k*m + k ] )
+        DLARFT(&A[(k * m + k) * K * K], &A[(j * m + k) * K * K], K, j, k, tau,
+               m);
+      }
+
+      /* For each following row... */
+      for (i = k + 1; i < m; i++) {
+
+        /* Add the row task for the kth column. */
+        // #pragma omp task inout( tid[ k*m + i ] ) in( tid[ k*m + k ] )
+        DTSQRF(&A[(k * m + k) * K * K], &A[(k * m + i) * K * K], K, i, k, tau,
+               m);
+
+        /* Add the inner tasks. */
+        for (j = k + 1; j < n; j++) {
+          // #pragma omp task inout( tid[ j*m + i ] ) in( tid[ k*m + i ] , tid[
+          // j*m + k ] )
+          DSSRFT(&A[(j * m + i) * K * K], &A[(k * m + i) * K * K],
+                 &A[(j * m + k) * K * K], K, i, j, k, tau, m);
         }
-    
-    /* Dump the costs. */
-    message( "costs: setup=%lli ticks, run=%lli ticks." ,
-        tot_setup , tot_run/runs );
-    
-    /* Dump the tasks. */
-    /* for ( k = 0 ; k < nr_timers ; k++ )
-        printf( "%i %i %lli %lli\n" , timers[k].threadID , timers[k].type , timers[k].tic , timers[k].toc ); */
-        
-    }
+      }
+
+    } /* build the tasks. */
+
+/* Wait for every task of this run before stopping the clock. */
+#pragma omp taskwait
+    toc_run = getticks();
+    message("%ith run took %lli ticks...", r, toc_run - tic);
+    tot_run += toc_run - tic;
+  }
 
+  /* Dump the average cost per run; guard against a run count of zero. */
+  message("costs: run=%lli ticks.", runs > 0 ? tot_run / runs : 0);
+  /* Release the matrices; all tasks finished at the last taskwait. To dump
+     the tasks, print threadID/type/tic/toc of timers[0 .. nr_timers). */
+  free(A);
+  free(A_orig);
+  free(tau);
+}
 
 /**
  * @brief Main function.
  */
- 
-int main ( int argc , char *argv[] ) {
 
-    int c, nr_threads;
-    int M = 4, N = 4, runs = 1, K = 32;
-    
-    /* Get the number of threads. */
-    #pragma omp parallel shared(nr_threads)
-    {
-        if ( omp_get_thread_num() == 0 )
-            nr_threads = omp_get_num_threads();
+int main(int argc, char* argv[]) {
+
+  int c, nr_threads;
+  int M = 4, N = 4, runs = 1, K = 32;
+
+/* Get the default number of threads: thread 0 of a parallel region reports it. */
+#pragma omp parallel shared(nr_threads)
+  {
+    if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
+  }
+
+  /* Parse the options: -m/-n tile counts, -k tile size, -r runs, -t threads. */
+  while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) {
+      case 'm':
+        if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M.");
+        break;
+      case 'n':
+        if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension N.");
+        break;
+      case 'k':
+        if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size.");
+        break;
+      case 'r':
+        if (sscanf(optarg, "%d", &runs) != 1)
+          error("Error parsing number of runs.");
+        break;
+      case 't':
+        if (sscanf(optarg, "%d", &nr_threads) != 1)
+          error("Error parsing number of threads.");
+        omp_set_num_threads(nr_threads);
+        break;
+      case '?':
+        fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
+                argv[0]);
+        fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n"
+                        "matrix using nr_threads threads.\n");
+        exit(EXIT_FAILURE);
     }
-    
-    /* Parse the options */
-    while ( ( c = getopt( argc , argv  , "m:n:k:r:t:" ) ) != -1 )
-        switch( c ) {
-	        case 'm':
-	            if ( sscanf( optarg , "%d" , &M ) != 1 )
-	                error( "Error parsing dimension M." );
-	            break;
-	        case 'n':
-	            if ( sscanf( optarg , "%d" , &N ) != 1 )
-	                error( "Error parsing dimension M." );
-	            break;
-	        case 'k':
-	            if ( sscanf( optarg , "%d" , &K ) != 1 )
-	                error( "Error parsing tile size." );
-	            break;
-            case 'r':
-                if ( sscanf( optarg , "%d" , &runs ) != 1 )
-                    error( "Error parsing number of runs." );
-                break;
-	        case 't':
-	            if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
-	                error( "Error parsing number of threads." );
-	            omp_set_num_threads( nr_threads );
-	            break;
-	        case '?':
-                fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] );
-                fprintf( stderr , "Computes the tiled QR decomposition of an MxN tiled\n"
-                                  "matrix using nr_threads threads.\n" );
-	            exit( EXIT_FAILURE );
-	        }
-            
-    /* Dump arguments. */
-    message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
-        32*M , 32*N , nr_threads , runs );
-        
-    /* Initialize the timers. */
-    if ( ( timers = (struct timer *)malloc( sizeof(struct timer) * M*M*N ) ) == NULL )
-        error( "Failed to allocate timers." );
-        
-    test_qr( M , N , K , nr_threads , runs );
-    
-    }
-    
-    
+
+  /* Dump arguments; the matrix has K*M by K*N elements (K is the tile size). */
+  message("Computing the tiled QR decomposition of a %ix%i matrix using %i "
+          "threads (%i runs).",
+          K * M, K * N, nr_threads, runs);
+
+  /* Allocate the timer slots (M*M*N of them) that the tasks write into. */
+  if ((timers = (struct timer*)malloc(sizeof(struct timer) * M * M * N)) ==
+      NULL)
+    error("Failed to allocate timers.");
+
+  test_qr(M, N, K, nr_threads, runs);
+}