diff --git a/examples/Makefile.am b/examples/Makefile.am
index d05fbd7c6714be89285f5715bfa407b477204e77..df37d2cd20cc62ce9afb52dad7675d950e5d62ef 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -71,5 +71,5 @@ test_qr_mpi_LDFLAGS = $(MPI_THREAD_LIBS)
 
 test_qr_mpi_cblas_SOURCES = test_qr_mpi.c
 test_qr_mpi_cblas_CFLAGS = $(AM_CFLAGS) -DWITH_MPI -DWITH_CBLAS_LIB
-test_qr_mpi_cblas_LDADD =  ../src/.libs/libquickschedMPI.a $(METIS_LIBS) -llapacke -llapacke -lblas -lcblas
+test_qr_mpi_cblas_LDADD =  ../src/.libs/libquickschedMPI.a -llapacke -llapacke -lblas -lcblas $(METIS_LIBS)
 test_qr_mpi_cblas_LDFLAGS = $(MPI_THREAD_LIBS)
diff --git a/examples/test_qr.c b/examples/test_qr.c
index 005e83e9c69beb48ad51f6728d37e5e1981635c3..c52db67a5a7ccb17a2f01375cddd7f1d7dbb565e 100644
--- a/examples/test_qr.c
+++ b/examples/test_qr.c
@@ -30,6 +30,7 @@
 #include <pthread.h>
 
 #include <cblas.h>
+#include <lapacke.h>
 
 /* Local includes. */
 #include "quicksched.h"
@@ -497,7 +498,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
         error("Unknown task type.");
     }
   }
-
+	srand(6);
   /* Allocate and fill the original matrix. */
   if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
       (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
@@ -513,13 +514,13 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
   bzero(tau, sizeof(double) * m * n * K);
 
   /* Dump A_orig. */
-  /* message( "A_orig = [" );
+  message( "A_orig = [" );
   for ( k = 0 ; k < m*K ; k++ ) {
       for ( j = 0 ; j < n*K ; j++ )
           printf( "%.3f " , A_orig[ j*m*K + k ] );
       printf( "\n" );
       }
-  printf( "];\n" ); */
+  printf( "];\n" );
 
   /* Initialize the scheduler. */
   qsched_init(&s, nr_threads, qsched_flag_none);
@@ -541,7 +542,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
     data[0] = k;
     data[1] = k;
     data[2] = k;
-    tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, data,
+    tid_new = qsched_addtask_local(&s, task_DGEQRF, task_flag_none, data,
                              sizeof(int) * 3, 2);
     qsched_addlock(&s, tid_new, rid[k * m + k]);
     if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new);
@@ -552,7 +553,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
       data[0] = k;
       data[1] = j;
       data[2] = k;
-      tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, data,
+      tid_new = qsched_addtask_local(&s, task_DLARFT, task_flag_none, data,
                                sizeof(int) * 3, 3);
       qsched_addlock(&s, tid_new, rid[j * m + k]);
       qsched_adduse(&s, tid_new, rid[k * m + k]);
@@ -568,7 +569,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
       data[0] = i;
       data[1] = k;
       data[2] = k;
-      tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, data,
+      tid_new = qsched_addtask_local(&s, task_DTSQRF, task_flag_none, data,
                                sizeof(int) * 3, 3);
       qsched_addlock(&s, tid_new, rid[k * m + i]);
       qsched_adduse(&s, tid_new, rid[k * m + k]);
@@ -581,12 +582,12 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
         data[0] = i;
         data[1] = j;
         data[2] = k;
-        tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, data,
+        tid_new = qsched_addtask_local(&s, task_DSSRFT, task_flag_none, data,
                                  sizeof(int) * 3, 5);
         qsched_addlock(&s, tid_new, rid[j * m + i]);
         qsched_adduse(&s, tid_new, rid[k * m + i]);
-        qsched_adduse(&s, tid_new, rid[j * m + k]);
-        // qsched_addunlock(&s, tid[k * m + i], tid_new);
+        qsched_addlock(&s, tid_new, rid[j * m + k]);
+        qsched_addunlock(&s, tid[k * m + i], tid_new);
         qsched_addunlock(&s, tid[j * m + (i - 1)], tid_new);
         if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
 
@@ -616,22 +617,22 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
   }
 
   /* Dump A. */
-  /* message( "A = [" );
+  message( "A = [" );
   for ( k = 0 ; k < m*K ; k++ ) {
       for ( j = 0 ; j < n*K ; j++ )
           printf( "%.3f " , A[ j*m*K + k ] );
       printf( "\n" );
       }
-  printf( "];\n" ); */
+  printf( "];\n" ); 
 
   /* Dump tau. */
-  /* message( "tau = [" );
+   message( "tau = [" );
   for ( k = 0 ; k < m*K ; k++ ) {
       for ( j = 0 ; j < n ; j++ )
           printf( "%.3f " , tau[ j*m*K + k ] );
       printf( "\n" );
       }
-  printf( "];\n" ); */
+  printf( "];\n" ); 
 
   /* Dump the tasks. */
   /* for ( k = 0 ; k < s.count ; k++ ) {
@@ -654,7 +655,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
   }
 
   /* Test if the decomposition was correct.*/
-  /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+  double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
    double *Q = computeQ(tempMatrix, m*K, K, tau, m);
    double *R = getR(tempMatrix, m*K);
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
@@ -670,7 +671,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
    }
    free(tempMatrix);
    free(Q);
-   free(R);*/
+   free(R);
 
   /* Clean up. */
   free(A);
diff --git a/examples/test_qr_mpi.c b/examples/test_qr_mpi.c
index 640bb4594dcc9381a725d39ac436f098bad0ce84..6bb7c1d47cdacdefde3699ff2ee98c57edb74f85 100644
--- a/examples/test_qr_mpi.c
+++ b/examples/test_qr_mpi.c
@@ -33,9 +33,8 @@
 #include <mpi.h>
 
 
-#ifdef WITH_CLBAS
 #include <cblas.h>
-#endif
+
 
 /* Local includes. */
 #include "quicksched.h"
@@ -639,7 +638,7 @@ task_j = (int*)calloc(sizeof(int),  m*n*m*n);
 task_k = (int*)calloc(sizeof(int),  m*n*m*n);
 #endif
 
-
+    srand(6);
   /* Allocate and fill the original matrix. */
   if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
       (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
@@ -681,6 +680,17 @@ if(s.rank == 0) {
   for (k = 0; k < m * n; k++) {
     tid[k] = qsched_task_none;
   }
+
+#ifdef WITH_CBLAS_LIB
+  /* Dump A_orig. */
+  message( "A_orig = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+      for ( j = 0 ; j < n*K ; j++ )
+          printf( "%.3f " , A_orig[ j*m*K + k ] );
+      printf( "\n" );
+      }
+  printf( "];\n" ); 
+#endif
 /* Build the tasks. */
   for (k = 0; k < m && k < n; k++) {
 
@@ -907,20 +917,59 @@ for(i = 0; i < n; i++)
 {
     for(j = 0; j < m; j++)
     {
-        long long int id = tau_id[j*m+i];
+        long long int id = tau_id[i*m+j];
         struct res *r = &s.res[s.res_ranks[(id>>48)] + (id & 0xFFFFFFFFFFFFFF)];
         if(r->node == s.rank)
         {
-            memcpy(&tau[(j*m+i)*K], qsched_getresdata(&s, r->ID), sizeof(double)*K );
+            memcpy(&tau[(i*m+j)*K], qsched_getresdata(&s, r->ID), sizeof(double)*K );
         }
     }
 }
 
 MPI_Allreduce(MPI_IN_PLACE, tau, m*n*K, MPI_DOUBLE, MPI_SUM, s.comm);
 
+double *tau_new = (double*)calloc(sizeof(double), m*n*K);
+
+
+for(i = 0; i < n; i++)
+{
+	double *column_old = &tau[i*m*K];
+	double *column_new = &tau_new[i*m*K];
+
+	for(j = 0; j < m; j++)
+	{
+		for(k = 0; k < K; k++)
+		{
+			column_new[k*m+j] = column_old[j*K+k];
+		}
+	}
+}
+
+free(tau);
+tau = tau_new;
 
 #ifdef WITH_CBLAS_LIB
     //This should check correctness.
+
+if(s.rank == 0){
+ message( "A = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+      for ( j = 0 ; j < n*K ; j++ )
+          printf( "%.3f " , A[ j*m*K + k ] );
+      printf( "\n" );
+      }
+  printf( "];\n" ); 
+
+  /* Dump tau. */
+   message( "tau = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+      for ( j = 0 ; j < n ; j++ )
+          printf( "%.3f " , tau[ j*m*K + k ] );
+      printf( "\n" );
+      }
+  printf( "];\n" ); 
+
+
 double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
    double *Q = computeQ(tempMatrix, m*K, K, tau, m);
    double *R = getR(tempMatrix, m*K);
@@ -938,6 +987,7 @@ double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
    free(tempMatrix);
    free(Q);
    free(R);
+}
 #endif
 
  //TODO Clean up the resource data.