diff --git a/examples/plot.py b/examples/plot.py
index ce4c28bc95438f6817e6e6c67d70864c9cf0b8e7..2b26ddb804ad0188bdf8d4759ef220f288bb04be 100644
--- a/examples/plot.py
+++ b/examples/plot.py
@@ -39,17 +39,17 @@ print "Plotting..."
 # Read Quickshed accelerations
 data=loadtxt("particle_dump.out")
 id = data[:,0]
-accx_e=data[:,4]
-accy_e=data[:,5]
-accz_e=data[:,6]
+accx_e=data[:,5]
+accy_e=data[:,6]
+accz_e=data[:,7]
 
-accx_bh=data[:,7]
-accy_bh=data[:,8]
-accz_bh=data[:,9]
+accx_bh=data[:,8]
+accy_bh=data[:,9]
+accz_bh=data[:,10]
 
-accx_new=data[:,10]
-accy_new=data[:,11]
-accz_new=data[:,12]
+accx_new=data[:,11]
+accy_new=data[:,12]
+accz_new=data[:,13]
 
 # Sort accelerations
 rank = argsort(id)
diff --git a/examples/test_bh_mpi.c b/examples/test_bh_mpi.c
index 79ee40a04a0271b2640994986acd8ebcaebee83e..deef47386e0d7b284885d14ac928843694769935 100644
--- a/examples/test_bh_mpi.c
+++ b/examples/test_bh_mpi.c
@@ -49,7 +49,7 @@
 #define SANITY_CHECKS
 #define NO_COM_AS_TASK
 #define NO_COUNTERS
-#define EXACT
+#define NO_EXACT
 
 
 
@@ -1287,7 +1287,7 @@ for(i = 0; i < s.count_ranks; i++)
 }
 
 //Need to clean up everything.
-    free(parts);
+//    free(parts);
     //TODO Clean up the cell-resource data.
     qsched_free(&s);
 
@@ -1330,6 +1330,8 @@ int main(int argc, char *argv[]) {
         if (sscanf(optarg, "%d", &nr_threads) != 1)
           error("Error parsing number of threads.");
         omp_set_num_threads(nr_threads);
+        message("omp_get_max_threads() = %i\n", omp_get_max_threads());
+        message("omp_get_num_procs() = %i\n", omp_get_num_procs());
         break;
       case 'f':
         if (sscanf(optarg, "%s", &fileName[0]) != 1)
diff --git a/examples/test_qr.c b/examples/test_qr.c
index 556de9e01fb02fa1effd93bc731447bc0800160b..005e83e9c69beb48ad51f6728d37e5e1981635c3 100644
--- a/examples/test_qr.c
+++ b/examples/test_qr.c
@@ -587,7 +587,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
         qsched_adduse(&s, tid_new, rid[k * m + i]);
         qsched_adduse(&s, tid_new, rid[j * m + k]);
         // qsched_addunlock(&s, tid[k * m + i], tid_new);
-        qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
+        qsched_addunlock(&s, tid[j * m + (i - 1)], tid_new);
         if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
 
         tid[j * m + i] = tid_new;
diff --git a/examples/test_qr_mpi.c b/examples/test_qr_mpi.c
index 673f1ee5ce0512d0c160264011999d25c750b0b2..7fed164ea6e7d9d3b9b933a532e2d25925f6e319 100644
--- a/examples/test_qr_mpi.c
+++ b/examples/test_qr_mpi.c
@@ -40,6 +40,7 @@
 #include "quicksched.h"
 #include "res.h"
 
+#define TASK_TIMERS
 
 #ifdef WITH_CBLAS
 /**
@@ -466,14 +467,27 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
   int k, j, i;
   double* A, *A_orig= NULL, *tau;
   struct qsched s;
-  qsched_task_t* tid = NULL, tid_new;
+  qsched_task_t* tid = NULL, tid_new = -1;
   qsched_res_t *rid = NULL, *tau_id = NULL;
   double **rids = NULL, **taus = NULL;
   int data[3];
+#ifdef TASK_TIMERS
+  long long int MPI_data[7];
+#else
   long long int MPI_data[6];
+#endif
 //  ticks tic, toc_run, tot_setup, tot_run = 0;
+#ifdef TASK_TIMERS
+  ticks tic, toc_run;
+#endif
 char processor_name[MPI_MAX_PROCESSOR_NAME];
     int name_len;
+  #ifdef TASK_TIMERS
+  long long int *task_start;
+  long long int *task_finish;
+  int *task_tids;
+  int *task_types;
+  #endif
 
     // Initialize the MPI environment
 int MpiThreadLevel;
@@ -486,19 +500,23 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
     MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN );
 
   enum task_types {
-    task_DGEQRF,
+    task_DGEQRF=1,
     task_DLARFT,
     task_DTSQRF,
     task_DSSRFT
   };
 
+    printf("Task_DGEQRF = %i\n", task_DGEQRF);
   /* Runner function to pass to the scheduler. */
   void runner(struct qsched *s, int type, void * data) {
-
+    #ifdef TASK_TIMERS
+    const ticks tic = getticks(); /* runner-local start time; the enclosing tic is shared by all threads */
+    #endif
     /* Decode the task data. */
     long long int* idata = (long long int*)data;
 //    int i = idata[0], j = idata[1], k = idata[2];
     long long int i, j , a , t;
+    long long int tid = -1;
     /* Need to pull the resources.*/
 
     /* Decode and execute the task. */
@@ -509,6 +527,9 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
         double *cornerTile = (double*)qsched_getresdata(s, i);
         double *tau = (double*)qsched_getresdata(s, j);
         DGEQRF(cornerTile, K, tau, k, m);
+        #ifdef TASK_TIMERS
+        tid = (idata[2] * m * m);
+        #endif
         break;
       case task_DLARFT:
         t = idata[2];
@@ -519,6 +540,9 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
         tau = (double*)qsched_getresdata(s, t);
         DLARFT(cornerTile, lockedTile, K, j, k, tau,
                m);
+        #ifdef TASK_TIMERS
+        tid = (idata[3] * m * m) + (idata[4] * m);
+        #endif
         break;
       case task_DTSQRF:
         i = idata[0];
@@ -529,6 +553,9 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
         tau = (double*)qsched_getresdata(s, t);
         DTSQRF(cornerTile, lockedTile, K, i, k, tau,
                m);
+        #ifdef TASK_TIMERS
+        tid = (idata[3] * m * m) + (idata[3] * m) + idata[4];
+        #endif
         break;
       case task_DSSRFT:
         a = idata[2];
@@ -541,13 +568,33 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
         tau = (double*)qsched_getresdata(s, t);
         DSSRFT(lockedTile1, usedTile,
                lockedTile2, K, i, j, k, tau, m);
+        #ifdef TASK_TIMERS
+        tid = (idata[4] * m * m) + (idata[6] * m) + (idata[5]);
+        #endif
         break;
 //      default:
   //      error("Unknown task type.");
     }
+    #ifdef TASK_TIMERS
+    /* Only log tasks that set a slot index inside the m*n*K*K timer arrays. */
+    if(tid >= 0 && tid < (long long int)m * n * K * K){
+        task_start[tid] = tic;
+        task_finish[tid] = getticks(); /* read directly: no shared temporary */
+        task_tids[tid] = omp_get_thread_num();
+        task_types[tid] = type;
+    }
+    #endif
   }
 
-    qsched_init(&s, nr_threads, qsched_flag_noreown, MPI_COMM_WORLD);
+    qsched_init(&s, 2, qsched_flag_none, MPI_COMM_WORLD);
+
+#ifdef TASK_TIMERS
+task_start = (long long int*)calloc(sizeof(long long int), m*n*K*K);
+printf("Created task_start of size %i", m*n*K*K);
+task_finish = (long long int*)calloc(sizeof(long long int), m*n*K*K);
+task_tids = (int*)calloc(sizeof(int), m*n*K*K);
+task_types = (int*)calloc(sizeof(int), m*n*K*K);
+#endif
 
 if(s.rank == 0){
   /* Allocate and fill the original matrix. */
@@ -599,8 +646,14 @@ if(s.rank == 0) {
     data[2] = k;
     MPI_data[0] = rid[k*m+k];
     MPI_data[1] = tau_id[k*m+k];
+    #ifdef TASK_TIMERS
+    MPI_data[2] = k;
+    tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, MPI_data,
+                               sizeof(long long int) * 3, 300);
+    #else
     tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, MPI_data,
                              sizeof(long long int) * 2, 200);
+    #endif
     qsched_addlock(&s, tid_new, rid[k * m + k]);
     qsched_addlock(&s, tid_new, tau_id[k*m+k]);
     if(k == 0)
@@ -618,8 +671,15 @@ if(s.rank == 0) {
       MPI_data[0] = rid[j*m+k];
       MPI_data[1] = rid[k*m+k];
       MPI_data[2] = tau_id[k*m+k];
+      #ifdef TASK_TIMERS
+      MPI_data[3] = k;
+      MPI_data[4] = j;
+      tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, MPI_data,
+                               sizeof(long long int) * 5, 300);
+      #else
       tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, MPI_data,
                                sizeof(long long int) * 3, 300);
+      #endif
         if(k == 0)
         {
            memcpy(rids[j*m+k], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
@@ -642,8 +702,15 @@ if(s.rank == 0) {
       MPI_data[0] = rid[k*m+i];
       MPI_data[1] = rid[k*m+k];
       MPI_data[2] = tau_id[k*m+i];
+      #ifdef TASK_TIMERS
+      MPI_data[3] = k;
+      MPI_data[4] = i;
+      tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, MPI_data,
+                               sizeof(long long int) * 5, 300);
+      #else
       tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, MPI_data,
                                sizeof(long long int) * 3, 300);
+      #endif
         if(k == 0)
         {
             memcpy(rids[k*m+i], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
@@ -654,7 +721,6 @@ if(s.rank == 0) {
       qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
       if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
       tid[k * m + i] = tid_new;
-
       /* Add the inner tasks. */
       for (j = k + 1; j < n; j++) {
         data[0] = i;
@@ -664,8 +730,16 @@ if(s.rank == 0) {
         MPI_data[1] = rid[k*m+i];
         MPI_data[2] = rid[j*m+k];
         MPI_data[3] = tau_id[k*m+i];
+        #ifdef TASK_TIMERS
+        MPI_data[4] = k;
+        MPI_data[5] = i;
+        MPI_data[6] = j;
+        tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, MPI_data,
+                                 sizeof(long long int) * 7, 300);
+        #else
         tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, MPI_data,
                                  sizeof(long long int) * 4, 500);
+        #endif
         if(k == 0)
         {
             memcpy(rids[j*m+i], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
@@ -692,6 +766,30 @@ qsched_run_MPI(&s, nr_threads, runner);
  //TODO Clean up the resource data.
     qsched_free(&s);
 
+#ifdef TASK_TIMERS
+    /* Dump per-task timings to one file per rank. The name is built from the
+     * rank so that runs on more than four ranks never use a NULL stream. */
+    char timer_name[64];
+    snprintf(timer_name, sizeof(timer_name), "Task_timers%i.out", s.rank);
+    FILE *file = fopen(timer_name, "w");
+    if (file == NULL) {
+        fprintf(stderr, "Rank[%i]: cannot open %s for writing.\n", s.rank, timer_name);
+    } else {
+        /* One line per recorded task: type, thread id, start tick, end tick. */
+        for (i = 0; i < m*n*K*K; i++)
+        {
+            if (task_types[i] > 0)
+                fprintf(file, "%i %i %lli %lli\n", task_types[i], task_tids[i],task_start[i], task_finish[i]  );
+        }
+        fclose(file);
+    }
+
+    /* Release the timing buffers whether or not the dump succeeded. */
+    free(task_types);
+    free(task_tids);
+    free(task_start);
+    free(task_finish);
+#endif
     // Finalize the MPI environment.
     MPI_Finalize();
 }
@@ -705,6 +803,12 @@ int main(int argc, char* argv[]) {
   int c, nr_threads=1;
   int M = 4, N = 4, runs = 1, K = 32;
 
+/* Get the number of threads. */
+//#pragma omp parallel shared(nr_threads)
+  //{
+   // if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
+  //}
+
   /* Parse the options */
   while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) {
       case 'm':
@@ -724,6 +828,8 @@ int main(int argc, char* argv[]) {
         if (sscanf(optarg, "%d", &nr_threads) != 1)
           error("Error parsing number of threads.");
         omp_set_num_threads(nr_threads);
+        message("omp_get_max_threads() = %i\n", omp_get_max_threads());
+        message("omp_get_num_procs() = %i\n", omp_get_num_procs());
         break;
       case '?':
         fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
diff --git a/src/qsched.c b/src/qsched.c
index a9ba6fb0ccc90f938291435580c1f12199dde38f..a30ae00ecae6d489d27e9332721491466f6ec38d 100644
--- a/src/qsched.c
+++ b/src/qsched.c
@@ -3428,6 +3428,7 @@ printf("Rank[%i]: qsched_prepare_mpi took %lli (= %e) ticks\n", s->rank,
     {
         /* Local variable. */
         struct task *t;
+        printf("Hello from thread %i on CPU %i\n", omp_get_thread_num(), sched_getcpu());
         /* Get the ID of the current thread. */
         int qid = omp_get_thread_num() % s->nr_queues;