diff --git a/examples/test_bh.cu b/examples/test_bh.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f269cd47bf8de54327dbac53a1acef6fbef05821
--- /dev/null
+++ b/examples/test_bh.cu
@@ -0,0 +1,1294 @@
+/*******************************************************************************
+ * This file is part of QuickSched.
+ * Copyright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Standard includes. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <omp.h>
+#include <fenv.h>
+
+
+
+/* Local includes. */
+extern "C"{
+#include "quicksched.h"
+#include "res.h"
+#include <cblas.h>
+}
+#include "cuda_queue.h"
+
+/* Some local constants. */
+#define cell_pool_grow 1000
+#define cell_maxparts 128
+#define task_limit 1e8
+#define const_G 1    // 6.6738e-8
+#define dist_min 0.5 /* Used for legacy walk only */
+#define dist_cutoff_ratio 1.5
+
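+/* ID of a particle to trace through the interactions for debugging;
+   set to -1 to disable the check output. */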
+#define ICHECK -1
+#define NO_SANITY_CHECKS
+#define NO_COM_AS_TASK
+#define NO_COUNTERS
+
+/** Data structure for the particles. */
+struct part {
+  double x[3];
+  // union {
+  float a[3];
+  float a_legacy[3];
+  float a_exact[3];
+  // };
+  float mass;
+  int id;
+};  // __attribute__((aligned(64)));
+
+struct multipole {
+  double com[3];
+  float mass;
+};
+
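+/** Device-side view of a tree cell: the host pointers are replaced by
+ *  indices into the flat particle and cell arrays. */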
+struct cell_device {
+  double loc[3];
+  double h;
+  int count;
+  unsigned short int split, sorted;
+  int part_loc;    /* Index of this cell's particles in the part array. */
+  int first_child; /* Index of the first child in the cell array. */
+  int sibling;     /* Index of the next node. */
+  struct multipole mpole;
+  int res, com_tid;
+};
+
+/** Data structure for the BH tree cell. */
+struct cell {
+  double loc[3];
+  double h;
+
+  int count;
+  unsigned short int split, sorted;
+  struct part *parts;
+
+  struct cell *firstchild; /* Next node if opening */
+  struct cell *sibling;    /* Next node */
+
+
+  /* We keep both CoMs and masses to make sure the comp_com calculation is
+   * correct (use an anonymous union to keep variable names compact).  */
+//  union {
+
+    /* Information for the legacy walk */
+//    struct multipole legacy;
+
+    /* Information for the QuickSched walk */
+  struct multipole mpole;
+//  };
+
+  int res, com_tid;
+  struct index *indices;
+
+} __attribute__((aligned(128)));
+
+/** Global variable for the pool of allocated cells. */
+struct cell *cell_pool = NULL;
+
+/** Task types. */
+enum task_type {
+  task_type_self = 0,
+  task_type_pair,
+  task_type_self_pc,
+  task_type_com,
+  task_type_count
+};
+
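+/* Device-side pointer to the particle array. */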
+__device__ struct part *parts;
+
+
+#ifdef TODO
+__device__ double *part_x1;
+__device__ double *part_x2;
+__device__ double *part_x3;
+__device__ float  *part_a1;
+__device__ float  *part_a2;
+__device__ float  *part_a3;
+__device__ float *part_mass;
+#endif
+
+
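+/* Report an error from device code: thread 0 of the block prints the message,
+   then the whole block traps to abort the kernel. */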
+#define cuda_error(s, ...) {if(threadIdx.x == 0){printf( "%s:%s():%i: " s "\n" , __FILE__ , __FUNCTION__ , __LINE__ , ##__VA_ARGS__ );} \
+                            __syncthreads(); asm("trap\n"); }
+
+
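+/* Double-precision atomicAdd implemented with atomicCAS; needed on devices
+   of compute capability < 6.0, which have no native double atomicAdd. */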
+__device__ double atomicAdd(double* address, double val)
+{
+    unsigned long long int* address_as_ull =
+                                          (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed, 
+                        __double_as_longlong(val + 
+                        __longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+
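+/* __shfl_down() only operates on 32-bit values, so shuffle a double by
+   moving its two 32-bit halves separately. */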
+__device__ inline double __shfl_down_dbl(double var, unsigned int srcLane, int width)
+{
+  int2 a = *reinterpret_cast<int2*>(&var);
+  a.x = __shfl_down(a.x, srcLane, width);
+  a.y = __shfl_down(a.y, srcLane, width);
+  return *reinterpret_cast<double*>(&a);
+}
+
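+/* Warp-level reductions: after the shuffle loop, lane 0 of each warp holds
+   the sum of the 32 lane-local values. */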
+__device__ inline void reduceSumDouble( double* value)
+{
+    for(int mask = 16; mask > 0; mask >>= 1)
+        *value += __shfl_down_dbl( *value, mask, 32);
+}
+
+__device__ inline void reduceSum( float* value )
+{
+    #pragma unroll
+    for(int mask = 16 ; mask > 0 ; mask >>= 1)
+        *value += __shfl_down((float)*value, mask);
+
+   // *value = __shfl((float)*value, 0);
+}
+
+/**
+ * @brief Checks whether the cells are direct neighbours or not.
+ */
+__device__ __forceinline__ int are_neighbours_different_size_CUDA(struct cell *ci,
+                                                struct cell *cj) {
+
+  int k;
+  float dx[3];
+  double cih = ci->h, cjh = cj->h;
+
+  /* Maximum allowed distance */
+  float min_dist = 0.5 * (cih + cjh);
+
+  /* (Manhattan) Distance between the cells */
+  for (k = 0; k < 3; k++) {
+    float center_i = ci->loc[k] + 0.5 * cih;
+    float center_j = cj->loc[k] + 0.5 * cjh;
+    dx[k] = fabsf(center_i - center_j);
+  }
+
+  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+}
+/**
+ * @brief Compute the center of mass of a given cell.
+ *
+ * @param c The #cell.
+ */
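+/* For split cells, thread 0 alone gathers the progenitors' multipoles; for
+   leaf cells the particles are strided over the block and reduced per warp. */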
+__device__ void comp_com_CUDA(struct cell *c) {
+
+  int k, count = c->count;
+  struct cell *cp;
+  struct part *parts = c->parts;
+  double com[3] = {0.0, 0.0, 0.0};
+  float mass = 0.0;
+
+  if (c->split) {
+
+    /* Loop over the progenitors and collect the multipole information. */
+    if(threadIdx.x == 0)
+    {
+        for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
+          float cp_mass = cp->mpole.mass;
+          com[0] += cp->mpole.com[0] * cp_mass;
+          com[1] += cp->mpole.com[1] * cp_mass;
+          com[2] += cp->mpole.com[2] * cp_mass;
+          mass += cp_mass;
+        }
+    /* Store the COM data, if any was collected. */
+      if (mass > 0.0) {
+        float imass = 1.0f / mass;
+        c->mpole.com[0] = com[0] * imass;
+        c->mpole.com[1] = com[1] * imass;
+        c->mpole.com[2] = com[2] * imass;
+        c->mpole.mass = mass;
+      } else {
+        c->mpole.com[0] = 0.0;
+        c->mpole.com[1] = 0.0;
+        c->mpole.com[2] = 0.0;
+        c->mpole.mass = 0.0;
+      }
+    }
+    /* Otherwise, collect the multipole from the particles. */
+  } else {
+
+    for (k = threadIdx.x; k < count; k+= blockDim.x) {
+      float p_mass = parts[k].mass;
+      com[0] += parts[k].x[0] * p_mass;
+      com[1] += parts[k].x[1] * p_mass;
+      com[2] += parts[k].x[2] * p_mass;
+      mass += p_mass;
+    }
+    reduceSumDouble(com);
+    reduceSumDouble(&com[1]);
+    reduceSumDouble(&com[2]);
+    reduceSum(&mass);
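+    /* Each warp leader adds its warp's mass-weighted mean to the cell
+       multipole; this is exact only when the block holds a single warp. */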
+    if(threadIdx.x % 32 == 0)
+    {   
+        if (mass > 0.0) {
+        float imass = 1.0f / mass;
+        atomicAdd( &c->mpole.com[0] , com[0] * imass);
+        atomicAdd( &c->mpole.com[1] , com[1] * imass);
+        atomicAdd( &c->mpole.com[2] , com[2] * imass);
+        atomicAdd( &c->mpole.mass , mass);
+      } else if(threadIdx.x == 0) {
+        c->mpole.com[0] = 0.0;
+        c->mpole.com[1] = 0.0;
+        c->mpole.com[2] = 0.0;
+        c->mpole.mass = 0.0;
+      }
+    }
+  }
+
+}
+
+
+/**
+ * @brief Checks whether the cells are direct neighbours or not. Both cells
+ * have to be of the same size.
+ */
+__device__ __forceinline__ int are_neighbours_CUDA(struct cell *ci, struct cell *cj) {
+
+  int k;
+  float dx[3];
+
+#ifdef SANITY_CHECKS
+  if (ci->h != cj->h)
+    error(" Cells of different size in distance calculation.");
+#endif
+
+  /* Maximum allowed distance */
+  float min_dist = ci->h;
+
+  /* (Manhattan) Distance between the cells */
+  for (k = 0; k < 3; k++) {
+    float center_i = ci->loc[k];
+    float center_j = cj->loc[k];
+    dx[k] = fabsf(center_i - center_j);
+  }
+
+  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+}
+
+/**
+ * @brief Compute the interactions between all particles in a cell
+ *        and all particles in the other cell (N^2 algorithm).
+ * Only applies the forces to cell ci.
+ *
+ * @param ci The #cell containing the particles.
+ * @param cj The #cell containing the other particles
+ */
+__device__ __forceinline__ void iact_pair_direct_CUDA(struct cell *ci, struct cell *cj)
+{
+    int i, j, k;
+    int count_i = ci->count, count_j = cj->count;
+    struct part *parts_i = ci->parts, *parts_j = cj->parts;
+    double xi[3];
+    float dx[3], ai[3], r2, w, ir;
+
+#ifdef SANITY_CHECKS
+
+  /* Bad stuff will happen if cell sizes are different */
+    if (ci->h != cj->h)
+        error("Non matching cell sizes !! h_i=%f h_j=%f\n", ci->h, cj->h);
+
+#endif
+
+    /* Loop over particles in cell i, strided over the threads of the block. */
+    for(i = threadIdx.x; i < count_i; i += blockDim.x)
+    {
+        /* Init the ith particle's data. */
+        for (k = 0; k < 3; k++) {
+          xi[k] = parts_i[i].x[k];
+          ai[k] = 0.0f;
+        }
+     //   mi = parts_i[i].mass;
+        /* Loop over particles in cell j */
+        for(j = 0; j < count_j; j++)
+        {
+            /* Compute the pairwise distance. */
+            for (r2 = 0.0, k = 0; k < 3; k++) {
+            dx[k] = xi[k] - parts_j[j].x[k];
+            r2 += dx[k] * dx[k];
+            }
+            /* Apply the gravitational acceleration. */
+            ir = 1.0f / sqrtf(r2);
+            w = const_G * ir * ir * ir;
+            for(k = 0; k < 3; k++) {
+                ai[k] -= w * dx[k] * parts_j[j].mass;
+            }
+        }
+
+        for (k = 0; k < 3; k++) atomicAdd( &parts_i[i].a[k] , ai[k]);
+    }/* Loop over particles in cell i.*/
+}
+
+/**
+ * @brief Decides whether two cells use the direct summation interaction or the
+ * multipole interactions
+ *
+ * @param ci The #cell.
+ * @param cj The other #cell.
+ */
+__device__ void iact_pair_CUDA(struct cell *ci, struct cell *cj) {
+
+  struct cell *cp, *cps;
+
+#ifdef SANITY_CHECKS
+
+  /* Early abort? */
+  if (ci->count == 0 || cj->count == 0) error("Empty cell !");
+
+  /* Bad stuff will happen if cell sizes are different */
+  if (ci->h != cj->h)
+    error("Non matching cell sizes !! h_i=%f h_j=%f\n", ci->h, cj->h);
+
+  /* Sanity check */
+  if (ci == cj)
+    error("The impossible has happened: pair interaction between a cell and "
+          "itself.");
+
+#endif
+
+  /* Are the cells direct neighbours? */
+  if (are_neighbours_CUDA(ci, cj)) {
+
+    /* Are both cells split ? */
+    if (ci->split && cj->split) {
+
+      /* Let's split both cells and build all possible pairs */
+      for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
+        for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
+
+          /* If the cells are neighbours, recurse. */
+          if (are_neighbours_CUDA(cp, cps)) {
+            iact_pair_CUDA(cp, cps);
+          }
+        }
+      }
+    } else {/* Otherwise, compute the interactions at this level directly. */
+      iact_pair_direct_CUDA(ci, cj);
+      iact_pair_direct_CUDA(cj, ci);
+    }
+  }
+}
+
+__device__ void iact_self_direct_CUDA(struct cell *c)
+{
+    int i, j, k, count = c->count;
+    double xi[3] = {0.0, 0.0, 0.0};
+    float ai[3] = {0.0, 0.0, 0.0}, mj, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
+    struct part *parts = c->parts;
+    struct cell *cp, *cps;
+    
+#ifdef SANITY_CHECKS
+
+  /* Early abort? */
+    if (count == 0) 
+    {
+        cuda_error("Empty cell");
+    }
+
+#endif
+    
+    /* If the cell is split, interact each progeny with itself, and with
+     each of its siblings. */    
+    if(c->split)
+    {
+        for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
+            iact_self_direct_CUDA(cp);
+            for (cps = cp->sibling; cps != c->sibling; cps = cps->sibling)
+                iact_pair_CUDA(cp, cps);
+        }
+    } else {
+
+        /* Loop over all particles. */
+        for(i = threadIdx.x; i < count; i+= blockDim.x)
+        {
+            /* Init the ith particle's data. */
+            for (k = 0; k < 3; k++) {
+                xi[k] = parts[i].x[k];
+                ai[k] = 0.0;
+            }
+//            mi = parts[i].mass;
+
+           /* Loop over all particles - the full N^2 loop was found to be an
+              improvement in QuickSched, so we use it here too. */
+            for(j = 0; j < count; j++)
+            {
+                if (j == i) continue; /* Skip self-interaction (zero distance). */
+                /* Compute the pairwise distance. */
+                for (r2 = 0.0, k = 0; k < 3; k++) {
+                    dx[k] = xi[k] - parts[j].x[k];
+                    r2 += dx[k] * dx[k];
+                }
+                /* Apply the gravitational acceleration. */
+                ir = 1.0f / sqrtf(r2);
+                w = const_G * ir * ir * ir;
+                mj = parts[j].mass;
+                for(k = 0; k < 3; k++)
+                {
+                    ai[k] -= w * dx[k] * mj;
+                }
+            } /* Inner particle loop. */
+        /* Store the accumulated acceleration on the ith part. */
+      for (k = 0; k < 3; k++) atomicAdd( &parts[i].a[k] , ai[k]);
+        }/* Outer particle loop */
+    }/* All done!*/
+
+}
+
+/**
+ * @brief Checks whether the cell leaf is a subvolume of the cell c
+ */
+__device__ __forceinline__ int is_inside_CUDA(struct cell *leaf, struct cell *c) {
+  return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count);
+}
+
+
+/**
+ * @brief Interacts all particles in ci with the monopole in cj
+ */
+__device__ __forceinline__ void make_interact_pc_CUDA(struct cell *leaf, struct cell *cj) {
+
+  int j, k, count = leaf->count;
+  double com[3] = {0.0, 0.0, 0.0};
+  float mcom, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
+  struct part *parts = leaf->parts;
+
+#ifdef SANITY_CHECKS
+
+  /* Sanity checks */
+  if (leaf->count == 0) error("Empty cell!");
+
+  /* Sanity check. */
+  if (cj->mpole.mass == 0.0) {
+    message("%e %e %e %d %p %d %p", cj->mpole.com[0], cj->mpole.com[1],
+            cj->mpole.com[2], cj->count, cj, cj->split, cj->sibling);
+
+    for (j = 0; j < cj->count; ++j)
+      message("part %d mass=%e id=%d", j, cj->parts[j].mass, cj->parts[j].id);
+
+    error("com does not seem to have been set.");
+  }
+
+#endif
+
+  /* Init the com's data. */
+  for (k = 0; k < 3; k++) com[k] = cj->mpole.com[k];
+  mcom = cj->mpole.mass;
+
+  /* Loop over every particle in leaf. */
+  for (j = threadIdx.x; j < count; j+=blockDim.x) {
+
+    /* Compute the pairwise distance. */
+    for (r2 = 0.0, k = 0; k < 3; k++) {
+      dx[k] = com[k] - parts[j].x[k];
+      r2 += dx[k] * dx[k];
+    }
+
+    /* Apply the gravitational acceleration. */
+    ir = 1.0f / sqrtf(r2);
+    w = mcom * const_G * ir * ir * ir;
+    
+    for (k = 0; k < 3; k++) atomicAdd( &parts[j].a[k] , w * dx[k]);
+
+#if ICHECK >= 0
+    if (leaf->parts[j].id == ICHECK)
+      printf("[DEBUG] cell [%f,%f,%f] interacting with cj->loc=[%f,%f,%f] "
+             "m=%f h=%f\n",
+             leaf->loc[0], leaf->loc[1], leaf->loc[2], cj->loc[0], cj->loc[1],
+             cj->loc[2], mcom, cj->h);
+
+    if (leaf->parts[j].id == ICHECK)
+      printf("[NEW] Interaction with monopole a=( %e %e %e ) h= %f Nj= %d m_j= "
+             "%f\n",
+             w * dx[0], w * dx[1], w * dx[2], cj->h, cj->count, mcom);
+#endif
+
+  } /* loop over every particle. */
+}
+
+/**
+ * @brief Compute the interactions between all particles in a cell leaf
+ *        and the center of mass of all the cells in a part of the tree
+ * described by ci and cj
+ *
+ * @param ci The #cell containing the particle
+ * @param cj The #cell containing the center of mass.
+ */
+__device__ __forceinline__ void iact_pair_pc_CUDA(struct cell *ci, struct cell *cj,
+                                struct cell *leaf) {
+
+  struct cell *cp, *cps;
+
+#ifdef SANITY_CHECKS
+
+  /* Early abort? */
+  if (ci->count == 0 || cj->count == 0) error("Empty cell !");
+
+  /* Sanity check */
+  if (ci == cj)
+    error("The impossible has happened: pair interaction between a cell and "
+          "itself.");
+
+  /* Sanity check */
+  if (!is_inside(leaf, ci))
+    error("The impossible has happened: The leaf is not within ci");
+
+  /* Are the cells direct neighbours? */
+  if (!are_neighbours(ci, cj)) error("Cells are not neighbours");
+
+  /* Are both cells split ? */
+  if (!ci->split || !cj->split) error("One of the cells is not split !");
+#endif
+
+  /* Let's find in which subcell of ci the leaf is */
+  for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
+
+    if (is_inside_CUDA(leaf, cp)) break;
+  }
+
+  if (are_neighbours_different_size_CUDA(cp, cj)) {
+
+    /* Now interact this subcell with all subcells of cj */
+    for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
+
+      /* Check whether we have to recurse or can directly jump to the multipole
+       * calculation */
+      if (are_neighbours_CUDA(cp, cps)) {
+
+        /* We only recurse if the children are split */
+        if (cp->split && cps->split) {
+          iact_pair_pc_CUDA(cp, cps, leaf);
+        }
+
+      } else {
+        make_interact_pc_CUDA(leaf, cps);
+      }
+    }
+  } else {
+
+    /* If cp is not a neighbour of cj, we can directly interact with the
+     * multipoles */
+    for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
+
+      make_interact_pc_CUDA(leaf, cps);
+    }
+  }
+}
+
+
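+/**
+ * @brief Compute the interactions between all particles in the leaf cell
+ *        and the monopoles of the cells in c that are not direct neighbours.
+ *
+ * @param c The #cell containing the monopoles.
+ * @param leaf The #cell containing the particles.
+ */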
+__device__ void iact_self_pc_CUDA(struct cell *c, struct cell *leaf) {
+
+  struct cell *cp, *cps;
+
+#ifdef SANITY_CHECKS
+
+  /* Early abort? */
+  if (c->count == 0) error("Empty cell !");
+
+  if (!c->split) error("Cell is not split !");
+
+#endif
+
+  /* Find in which subcell of c the leaf is */
+  for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
+
+    /* Only recurse if the leaf is in this part of the tree */
+    if (is_inside_CUDA(leaf, cp)) break;
+  }
+
+  if (cp->split) {
+
+    /* Recurse if the cell can be split */
+    iact_self_pc_CUDA(cp, leaf);
+
+    /* Now, interact with every other subcell */
+    for (cps = c->firstchild; cps != c->sibling; cps = cps->sibling) {
+
+      /* Since cp and cps will be direct neighbours it is only worth
+         recursing if the cells can both be split */
+      if (cp != cps && cps->split) iact_pair_pc_CUDA(cp, cps, leaf);
+    }
+  }
+}
+
+
+
+
+/**
+ * @brief Get a #cell from the pool.
+ */
+struct cell *cell_get() {
+
+  struct cell *res;
+  int k;
+
+  /* Allocate a new batch? */
+  if (cell_pool == NULL) {
+
+    /* Allocate the cell array. */
+    if ((cell_pool =
+             (struct cell *)calloc(cell_pool_grow, sizeof(struct cell))) ==
+        NULL)
+      error("Failed to allocate fresh batch of cells.");
+
+    /* Link them up via their progeny pointers. */
+    for (k = 1; k < cell_pool_grow; k++)
+      cell_pool[k - 1].firstchild = &cell_pool[k];
+  }
+
+  /* Pick a cell off the pool. */
+  res = cell_pool;
+  cell_pool = cell_pool->firstchild;
+
+  /* Clean up a few things. */
+  res->res = qsched_res_none;
+  res->firstchild = 0;
+
+  /* Return the cell. */
+  return res;
+}
+
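+/* Host-side centre-of-mass calculation, called from cell_split() when
+   COM_AS_TASK is not defined. This is a minimal sketch mirroring the logic
+   of comp_com_CUDA() above. */
+static void comp_com(struct cell *c) {
+
+  int k, count = c->count;
+  struct cell *cp;
+  struct part *p = c->parts;
+  double com[3] = {0.0, 0.0, 0.0};
+  float mass = 0.0f;
+
+  if (c->split) {
+
+    /* Loop over the progenitors and collect the multipole information. */
+    for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
+      float cp_mass = cp->mpole.mass;
+      com[0] += cp->mpole.com[0] * cp_mass;
+      com[1] += cp->mpole.com[1] * cp_mass;
+      com[2] += cp->mpole.com[2] * cp_mass;
+      mass += cp_mass;
+    }
+
+  } else {
+
+    /* Otherwise, collect the multipole from the particles. */
+    for (k = 0; k < count; k++) {
+      float p_mass = p[k].mass;
+      com[0] += p[k].x[0] * p_mass;
+      com[1] += p[k].x[1] * p_mass;
+      com[2] += p[k].x[2] * p_mass;
+      mass += p_mass;
+    }
+  }
+
+  /* Store the COM data, if any was collected. */
+  if (mass > 0.0f) {
+    float imass = 1.0f / mass;
+    c->mpole.com[0] = com[0] * imass;
+    c->mpole.com[1] = com[1] * imass;
+    c->mpole.com[2] = com[2] * imass;
+    c->mpole.mass = mass;
+  } else {
+    c->mpole.com[0] = 0.0;
+    c->mpole.com[1] = 0.0;
+    c->mpole.com[2] = 0.0;
+    c->mpole.mass = 0.0;
+  }
+}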
+
+/**
+ * @brief Sort the parts into eight bins along the given pivots and
+ *        fill the multipoles. Also adds the hierarchical resources
+ *        to the sched.
+ *
+ * @param c The #cell to be split.
+ * @param N The total number of parts.
+ * @param s The #sched to store the resources.
+ */
+void cell_split(struct cell *c, struct qsched *s) {
+
+  int i, j, k, kk, count = c->count;
+  struct part temp, *parts = c->parts;
+  struct cell *cp;
+  int left[8], right[8];
+  double pivot[3];
+  static struct cell *root = NULL;
+  struct cell *progenitors[8];
+
+  /* Set the root cell. */
+  if (root == NULL) {
+    root = c;
+    c->sibling = 0;
+  }
+
+  /* Add a resource for this cell if it doesn't have one yet. */
+  if (c->res == qsched_res_none)
+    c->res = qsched_addres(s, qsched_owner_none, qsched_res_none, NULL, 0, NULL);
+
+/* Attach a center-of-mass task to the cell. */
+#ifdef COM_AS_TASK
+  if (count > 0)
+    c->com_tid = qsched_addtask(s, task_type_com, task_flag_none, &c,
+                                sizeof(struct cell *), 1);
+#endif
+
+  /* Does this cell need to be split? */
+  if (count > cell_maxparts) {
+
+    /* Mark this cell as split. */
+    c->split = 1;
+
+    /* Create the progeny. */
+    for (k = 0; k < 8; k++) {
+      progenitors[k] = cp = cell_get();
+      cp->loc[0] = c->loc[0];
+      cp->loc[1] = c->loc[1];
+      cp->loc[2] = c->loc[2];
+      cp->h = c->h * 0.5;
+      cp->res = qsched_addres(s, qsched_owner_none, c->res, NULL, 0, NULL);
+      if (k & 4) cp->loc[0] += cp->h;
+      if (k & 2) cp->loc[1] += cp->h;
+      if (k & 1) cp->loc[2] += cp->h;
+    }
+
+    /* Init the pivots. */
+    for (k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->h * 0.5;
+
+    /* Split along the x-axis. */
+    i = 0;
+    j = count - 1;
+    while (i <= j) {
+      while (i <= count - 1 && parts[i].x[0] < pivot[0]) i += 1;
+      while (j >= 0 && parts[j].x[0] >= pivot[0]) j -= 1;
+      if (i < j) {
+        temp = parts[i];
+        parts[i] = parts[j];
+        parts[j] = temp;
+      }
+    }
+    left[1] = i;
+    right[1] = count - 1;
+    left[0] = 0;
+    right[0] = j;
+
+    /* Split along the y axis, twice. */
+    for (k = 1; k >= 0; k--) {
+      i = left[k];
+      j = right[k];
+      while (i <= j) {
+        while (i <= right[k] && parts[i].x[1] < pivot[1]) i += 1;
+        while (j >= left[k] && parts[j].x[1] >= pivot[1]) j -= 1;
+        if (i < j) {
+          temp = parts[i];
+          parts[i] = parts[j];
+          parts[j] = temp;
+        }
+      }
+      left[2 * k + 1] = i;
+      right[2 * k + 1] = right[k];
+      left[2 * k] = left[k];
+      right[2 * k] = j;
+    }
+
+    /* Split along the z axis, four times. */
+    for (k = 3; k >= 0; k--) {
+      i = left[k];
+      j = right[k];
+      while (i <= j) {
+        while (i <= right[k] && parts[i].x[2] < pivot[2]) i += 1;
+        while (j >= left[k] && parts[j].x[2] >= pivot[2]) j -= 1;
+        if (i < j) {
+          temp = parts[i];
+          parts[i] = parts[j];
+          parts[j] = temp;
+        }
+      }
+      left[2 * k + 1] = i;
+      right[2 * k + 1] = right[k];
+      left[2 * k] = left[k];
+      right[2 * k] = j;
+    }
+
+    /* Store the counts and offsets. */
+    for (k = 0; k < 8; k++) {
+      progenitors[k]->count = right[k] - left[k] + 1;
+      progenitors[k]->parts = &c->parts[left[k]];
+    }
+
+    /* Find the first non-empty progenitor */
+    for (k = 0; k < 8; k++)
+      if (progenitors[k]->count > 0) {
+        c->firstchild = progenitors[k];
+        break;
+      }
+
+#ifdef SANITY_CHECKS
+    if (c->firstchild == NULL)
+      error("Cell has been split but all progenitors have 0 particles");
+#endif
+
+    /* Prepare the pointers. */
+    for (k = 0; k < 8; k++) {
+
+      /* Find the next non-empty sibling */
+      for (kk = k + 1; kk < 8; ++kk) {
+        if (progenitors[kk]->count > 0) {
+          progenitors[k]->sibling = progenitors[kk];
+          break;
+        }
+      }
+
+      /* No non-empty sibling ? Go back a level */
+      if (kk == 8) progenitors[k]->sibling = c->sibling;
+    }
+
+    /* Recurse. */
+    for (k = 0; k < 8; k++)
+      if (progenitors[k]->count > 0) cell_split(progenitors[k], s);
+
+/* Link the COM tasks. */
+#ifdef COM_AS_TASK
+    for (k = 0; k < 8; k++)
+      if (progenitors[k]->count > 0)
+        qsched_addunlock(s, progenitors[k]->com_tid, c->com_tid);
+#endif
+
+    /* Otherwise, we're at a leaf, so create the cell's particle-cell task. */
+  } else {
+    struct cell *data[2] = {root, c};
+    int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
+                             2 * sizeof(struct cell *), 1);
+    qsched_addlock(s, tid, c->res);
+#ifdef COM_AS_TASK
+    qsched_addunlock(s, root->com_tid, tid);
+#endif
+  } /* does the cell need to be split? */
+
+/* Compute the cell's center of mass. */
+#ifndef COM_AS_TASK
+  comp_com(c);
+#endif
+
+  /* Set this cell's resources ownership. */
+  qsched_res_own(s, c->res,
+                 s->nr_queues * (c->parts - root->parts) / root->count);
+}
+
+/**
+ * @brief Checks whether the cells are direct neighbours or not. Both cells
+ * have to be of the same size.
+ */
+static inline int are_neighbours(struct cell *ci, struct cell *cj) {
+
+  int k;
+  float dx[3];
+
+#ifdef SANITY_CHECKS
+  if (ci->h != cj->h)
+    error(" Cells of different size in distance calculation.");
+#endif
+
+  /* Maximum allowed distance */
+  float min_dist = ci->h;
+
+  /* (Manhattan) Distance between the cells */
+  for (k = 0; k < 3; k++) {
+    float center_i = ci->loc[k];
+    float center_j = cj->loc[k];
+    dx[k] = fabs(center_i - center_j);
+  }
+
+  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+}
+
+/**
+ * @brief Create the tasks for the cell pair/self.
+ *
+ * @param s The #sched in which to create the tasks.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
+void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
+
+  qsched_task_t tid;
+  struct cell *data[2], *cp, *cps;
+
+#ifdef SANITY_CHECKS
+
+  /* If either cell is empty, stop. */
+  if (ci->count == 0 || (cj != NULL && cj->count == 0)) error("Empty cell !");
+
+#endif
+
+  /* Single cell? */
+  if (cj == NULL) {
+
+    /* Is this cell split and above the task limit ? */
+    if (ci->split && ci->count > task_limit / ci->count) {
+
+      /* Loop over each of this cell's progeny. */
+      for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
+
+        /* Make self-interaction task. */
+        create_tasks(s, cp, NULL);
+
+        /* Make all pair-interaction tasks. */
+        for (cps = cp->sibling; cps != ci->sibling; cps = cps->sibling)
+          create_tasks(s, cp, cps);
+      }
+
+      /* Otherwise, add a self-interaction task. */
+    } else {
+
+      /* Set the data. */
+      data[0] = ci;
+      data[1] = NULL;
+
+      /* Create the task. */
+      tid =
+          qsched_addtask(s, task_type_self, task_flag_none, data,
+                         sizeof(struct cell *) * 2, ci->count * ci->count / 2);
+
+      /* Add the resource (i.e. the cell) to the mpole task. */
+      qsched_addlock(s, tid, ci->res);
+
+/* If this call might recurse, add a dependency on the cell's COM
+   task. */
+#ifdef COM_AS_TASK
+      if (ci->split) qsched_addunlock(s, ci->com_tid, tid);
+#endif
+    }
+
+    /* Otherwise, it's a pair. */
+  } else {
+
+    /* Are the cells NOT neighbours ? */
+    if (!are_neighbours(ci, cj)) {
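+      /* Distant cell pairs are handled via the particle-multipole (self_pc)
+         tasks created at the leaves, so there is nothing to do here. */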
+
+    } else {/* Cells are direct neighbours */
+
+      /* Are both cells split ? */
+      if (ci->split && cj->split && ci->count > task_limit / cj->count) {
+
+        /* Let's split both cells and build all possible pairs */
+        for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
+          for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
+            /* Recurse */
+            create_tasks(s, cp, cps);
+          }
+        }
+        /* Otherwise, at least one of the cells is not split, build a direct
+         * interaction. */
+      } else {
+
+        /* Set the data. */
+        data[0] = ci;
+        data[1] = cj;
+
+        /* Create the task. */
+        tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
+                             sizeof(struct cell *) * 2, ci->count * cj->count);
+
+        /* Add the resources. */
+        qsched_addlock(s, tid, ci->res);
+        qsched_addlock(s, tid, cj->res);
+      }
+    } /* Cells are direct neighbours */
+  }   /* Otherwise it's a pair */
+}
+
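+/* Brute-force O(N^2) reference force calculation used by the ICHECK debugging
+   path in test_bh() below. This is a minimal sketch: it accumulates into the
+   a_exact fields (assumed to be zeroed beforehand); the monitor argument
+   mirrors the existing call site but is not used here. */
+#if ICHECK > 0
+static void interact_exact(int N, struct part *parts, int monitor) {
+
+  int i, j, k;
+  float ir, w, r2, dx[3];
+
+  /* Loop over all unique particle pairs. */
+  for (i = 0; i < N; i++)
+    for (j = i + 1; j < N; j++) {
+
+      /* Compute the pairwise distance. */
+      for (r2 = 0.0, k = 0; k < 3; k++) {
+        dx[k] = parts[i].x[k] - parts[j].x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Apply the mutual gravitational acceleration. */
+      ir = 1.0f / sqrtf(r2);
+      w = const_G * ir * ir * ir;
+      for (k = 0; k < 3; k++) {
+        parts[i].a_exact[k] -= w * dx[k] * parts[j].mass;
+        parts[j].a_exact[k] += w * dx[k] * parts[i].mass;
+      }
+    }
+
+  (void)monitor; /* Only used for per-particle tracing in the full version. */
+}
+#endif
+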
+/**
+ * @brief Set up and run a task-based Barnes-Hut N-body solver.
+ *
+ * @param N The number of random particles to use.
+ * @param nr_threads Number of threads to use.
+ * @param runs Number of force evaluations to use as a benchmark.
+ * @param fileName Input file name. If @c NULL or an empty string, random
+ *        particle positions will be used.
+ */
+void test_bh(int N, int nr_threads, int runs, char *fileName) {
+
+  int i, k;
+  struct cell *root;
+  struct part *parts;
+  FILE *file;
+  struct qsched s;
+  ticks tic, toc_run, tot_setup = 0, tot_run = 0;
+  int countMultipoles = 0, countPairs = 0, countCoMs = 0;
+
+  /* Initialize the per-task type timers. */
+//  for (k = 0; k < task_type_count; k++) task_timers[k] = 0;
+
+  /* Initialize the scheduler. */
+  qsched_init(&s, 1, qsched_flag_none);
+
+  /* Init and fill the particle array. */
+  if ((parts = (struct part *)malloc(sizeof(struct part) * N)) == NULL)
+    error("Failed to allocate particle buffer.");
+
+    double *partsx0, *partsx1, *partsx2;
+    float *partsa0, *partsa1, *partsa2, *partsmass;
+
+    if ((partsx0 = (double*)malloc(sizeof(double)*N)) == NULL)
+        error("Failed to allocate particle buffer.");
+    if ((partsx1 = (double*)malloc(sizeof(double)*N)) == NULL)
+        error("Failed to allocate particle buffer.");
+    if ((partsx2 = (double*)malloc(sizeof(double)*N)) == NULL)
+        error("Failed to allocate particle buffer.");
+
+    if ((partsa0 = (float*)malloc(sizeof(float)*N)) == NULL)
+        error("Failed to allocate particle buffer.");
+    if ((partsa1 = (float*)malloc(sizeof(float)*N)) == NULL)
+        error("Failed to allocate particle buffer.");
+    if ((partsa2 = (float*)malloc(sizeof(float)*N)) == NULL)
+        error("Failed to allocate particle buffer.");
+    if ((partsmass = (float*)malloc(sizeof(float)*N)) == NULL)
+        error("Failed to allocate particle buffer.");
+
+
+
+
+  /* If no input file was specified, generate random particle positions. */
+  if (fileName == NULL || fileName[0] == 0) {
+    for (k = 0; k < N; k++) {
+      parts[k].id = k;
+      parts[k].x[0] = ((double)rand()) / RAND_MAX;
+      parts[k].x[1] = ((double)rand()) / RAND_MAX;
+      parts[k].x[2] = ((double)rand()) / RAND_MAX;
+      parts[k].mass = ((double)rand()) / RAND_MAX;
+      parts[k].a_legacy[0] = 0.0;
+      parts[k].a_legacy[1] = 0.0;
+      parts[k].a_legacy[2] = 0.0;
+      partsx0[k] = parts[k].x[0];
+      partsx1[k] = parts[k].x[1];
+      partsx2[k] = parts[k].x[2];
+      partsmass[k] = parts[k].mass;
+      partsa0[k] = 0.0f;
+      partsa1[k] = 0.0f;
+      partsa2[k] = 0.0f;
+    }
+
+    /* Otherwise, read them from a file. */
+  } else {
+    file = fopen(fileName, "r");
+    if (file) {
+      for (k = 0; k < N; k++) {
+        if (fscanf(file, "%d", &parts[k].id) != 1)
+          error("Failed to read ID of part %i.", k);
+        if (fscanf(file, "%lf%lf%lf", &parts[k].x[0], &parts[k].x[1],
+                   &parts[k].x[2]) !=
+            3)
+          error("Failed to read position of part %i.", k);
+        if (fscanf(file, "%f", &parts[k].mass) != 1)
+          error("Failed to read mass of part %i.", k);
+
+          partsx0[k] = parts[k].x[0];
+          partsx1[k] = parts[k].x[1];
+          partsx2[k] = parts[k].x[2];
+          partsmass[k] = parts[k].mass;
+          partsa0[k] = 0.0f;
+          partsa1[k] = 0.0f;
+          partsa2[k] = 0.0f;
+      }
+      fclose(file);
+    }
+  }
+
+  /* Init the cells. */
+  root = cell_get();
+  root->loc[0] = 0.0;
+  root->loc[1] = 0.0;
+  root->loc[2] = 0.0;
+  root->h = 1.0;
+  root->count = N;
+  root->parts = parts;
+  cell_split(root, &s);
+
+  /* Iterate over the cells and get the average number of particles
+     per leaf. */
+  struct cell *c = root;
+  int nr_leaves = 0;
+  while (c != NULL) {
+    if (!c->split) {
+      nr_leaves++;
+      c = c->sibling;
+    } else {
+      c = c->firstchild;
+    }
+  }
+  message("Average number of parts per leaf is %f.", ((double)N) / nr_leaves);
+
+#if ICHECK > 0
+  printf("----------------------------------------------------------\n");
+
+  /* Do a N^2 interactions calculation */
+
+  ticks tic_exact = getticks();
+  interact_exact(N, parts, ICHECK);
+  ticks toc_exact = getticks();
+
+  printf("Exact calculation (1 thread) took %lli (= %e) ticks\n",
+         toc_exact - tic_exact, (float)(toc_exact - tic_exact));
+
+  printf("----------------------------------------------------------\n");
+#endif
+
+  /* Create the tasks. */
+  tic = getticks();
+  create_tasks(&s, root, NULL);
+  tot_setup += getticks() - tic;
+
+  /* Dump the number of tasks. */
+  message("total nr of tasks: %i.", s.count);
+  message("total nr of deps: %i.", s.count_deps);
+  message("total nr of res: %i.", s.count_res);
+  message("total nr of locks: %i.", s.count_locks);
+  message("total nr of uses: %i.", s.count_uses);
+  int counts[task_type_count];
+  for (k = 0; k < task_type_count; k++) counts[k] = 0;
+  for (k = 0; k < s.count; k++) counts[s.tasks[k].type] += 1;
+
+  // char buffer[200];
+  // sprintf(buffer, "timings_legacy_%d_%d.dat", cell_maxparts, nr_threads);
+  // FILE *fileTime = fopen(buffer, "w");
+
+// fclose(fileTime);
+
+#if ICHECK >= 0
+  message("[check] accel of part %i is [%.3e,%.3e,%.3e]", ICHECK,
+          root->parts[ICHECK].a[0], root->parts[ICHECK].a[1],
+          root->parts[ICHECK].a[2]);
+#endif
+  printf("task counts: [ %8s %8s %8s %8s %8s ]\n", "self", "pair", "m-poles",
+         "direct", "CoMs");
+  printf("task counts: [ %8i %8i %8i %8i %8i ] (legacy).\n", 0, 0,
+         countMultipoles, countPairs, countCoMs);
+  printf("task counts: [ ");
+  for (k = 0; k < task_type_count; k++) printf("%8i ", counts[k]);
+  printf("] (mpole).\n");
+
+  /* Loop over the number of runs. */
+  for (k = 0; k < runs; k++) {
+
+    for (i = 0; i < N; ++i) {
+      parts[i].a[0] = 0.0;
+      parts[i].a[1] = 0.0;
+      parts[i].a[2] = 0.0;
+    }
+    
+    /* Execute the tasks. */
+    tic = getticks();
+//    qsched_run(&s, nr_threads, runner);
+    toc_run = getticks();
+    message("%ith run took %lli (= %e) ticks...", k, toc_run - tic,
+            (float)(toc_run - tic));
+    tot_run += toc_run - tic;
+  }
+
+//  message("[check] root mass= %f %f", root->legacy.mass, root->mpole.mass);
+//  message("[check] root CoMx= %f %f", root->legacy.com[0], root->mpole.com[0]);
+//  message("[check] root CoMy= %f %f", root->legacy.com[1], root->mpole.com[1]);
+//  message("[check] root CoMz= %f %f", root->legacy.com[2], root->mpole.com[2]);
+#if ICHECK >= 0
+  for (i = 0; i < N; ++i)
+    if (root->parts[i].id == ICHECK)
+      message("[check] accel of part %i is [%.3e,%.3e,%.3e]", ICHECK,
+              root->parts[i].a[0], root->parts[i].a[1], root->parts[i].a[2]);
+#endif
+
+  /* Dump the tasks. */
+  /* for ( k = 0 ; k < s.count ; k++ )
+      printf( " %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid ,
+     s.tasks[k].tic , s.tasks[k].toc ); */
+
+  /* Dump the costs. */
+  message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup,
+          tot_run / runs);
+
+  /* Dump the timers. */
+  for (k = 0; k < qsched_timer_count; k++)
+    message("timer %s is %lli ticks.", qsched_timer_names[k],
+            s.timers[k] / runs);
+
+  /* Dump the per-task type timers. */
+//  printf("task timers: [ ");
+//  for (k = 0; k < task_type_count; k++) printf("%lli ", task_timers[k] / runs);
+  //printf("] ticks.\n");
+
+  /* Dump the particles to a file */
+  file = fopen("particle_dump.dat", "w");
+  fprintf(file,
+          "# ID m x y z a_exact.x   a_exact.y    a_exact.z    a_legacy.x    "
+          "a_legacy.y    a_legacy.z    a_mpole.x     a_mpole.y    a_mpole.z\n");
+  for (k = 0; k < N; ++k)
+    fprintf(file, "%d %e %e %e %e %e %e %e %e %e %e %e %e %e\n", parts[k].id,
+            parts[k].mass, parts[k].x[0], parts[k].x[1], parts[k].x[2],
+            parts[k].a_exact[0], parts[k].a_exact[1], parts[k].a_exact[2],
+            parts[k].a_legacy[0], parts[k].a_legacy[1], parts[k].a_legacy[2],
+            parts[k].a[0], parts[k].a[1], parts[k].a[2]);
+  fclose(file);
+
+  /* Clean up. */
+  qsched_free(&s);
+  free(parts);
+}
+
+/**
+ * @brief Main function.
+ */
+
+int main(int argc, char *argv[]) {
+
+  int c, nr_threads;
+  int N = 1000, runs = 1;
+  char fileName[100] = {0};
+
+  /* Die on FP-exceptions. */
+  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+/* Get the number of threads. */
+#pragma omp parallel shared(nr_threads)
+  {
+    if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
+  }
+
+  /* Parse the options */
+  while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) {
+      case 'n':
+        if (sscanf(optarg, "%d", &N) != 1)
+          error("Error parsing number of particles.");
+        break;
+      case 'r':
+        if (sscanf(optarg, "%d", &runs) != 1)
+          error("Error parsing number of runs.");
+        break;
+      case 't':
+        if (sscanf(optarg, "%d", &nr_threads) != 1)
+          error("Error parsing number of threads.");
+        omp_set_num_threads(nr_threads);
+        break;
+      case 'f':
+        if (sscanf(optarg, "%s", &fileName[0]) != 1)
+          error("Error parsing file name.");
+        break;
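+      /* Note: the -c and -i options advertised in the usage message are
+         accepted by getopt but not handled in this example. */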
+      case '?':
+        fprintf(stderr, "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file] "
+                        "[-c Nparts] [-i Niterations] \n",
+                argv[0]);
+        fprintf(stderr, "Solves the N-body problem using a Barnes-Hut\n"
+                        "tree code with N random particles read from a file in "
+                        "[0,1]^3 using "
+                        "nr_threads threads.\n"
+                        "A test of the neighbouring cells interaction with "
+                        "Nparts per cell is also run Niterations times.\n");
+        exit(EXIT_FAILURE);
+    }
+
+  /* Tree node information */
+  printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
+
+  /* Part information */
+  printf("Size of part: %zu bytes.\n", sizeof(struct part));
+
+  /* Dump arguments. */
+  if (fileName[0] == 0) {
+    message("Computing the N-body problem over %i random particles using %i "
+            "threads (%i runs).",
+            N, nr_threads, runs);
+  } else {
+    message("Computing the N-body problem over %i particles read from '%s' "
+            "using %i threads (%i runs).",
+            N, fileName, nr_threads, runs);
+  }
+
+  /* Run the BH test. */
+  test_bh(N, nr_threads, runs, fileName);
+
+  return 0;
+}
diff --git a/examples/test_qr.cu b/examples/test_qr.cu
index 6f075d4b58b3b2c03c6d2912747102c543f3d333..b93b68da5c38ab5958648aa7be0d7f2fe35f2712 100644
--- a/examples/test_qr.cu
+++ b/examples/test_qr.cu
@@ -892,7 +892,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 
 
         qsched_run_CUDA( &s , func );
-
+        qsched_print_cuda_timers(&s);
     #ifdef NO_LOADS
     tic = getticks();
     if( cudaMemcpy( device_array, A, sizeof(float) * m * n * K *K, cudaMemcpyDeviceToHost ) != cudaSuccess)
@@ -906,7 +906,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 
     
    // printMatrix(tileToColumn(A, m*n*K*K, m, n, K), m, n);
-   /*float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+/*   float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
    float *Q = computeQ(tempMatrix, m*K, K, tau, m);
    float *R = getR(tempMatrix, m*K);
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
@@ -931,7 +931,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
    // printMatrix(A, m, n);
 //    printTileMatrix(A, m , n);
     struct task* tasks =  qsched_get_timers( &s , s.count );
-/*    for(i = 0; i < s.count; i++)
+    for(i = 0; i < s.count; i++)
     {
       printf("%i %lli %lli %i ", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
         if(tasks[i].type < 0)
@@ -946,7 +946,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
         }else
             printf("0 0");
         printf("\n");
-    }*/
+    }
     free(tasks);
 cudaDeviceReset();
 }
diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu
index 5a684959016050b6e3d39bb77ce76c051fb0023d..971255fb4f2e72f9475d9ba5fc107a4f2fd43067 100644
--- a/src/cuda_queue.cu
+++ b/src/cuda_queue.cu
@@ -37,6 +37,30 @@ extern "C"{
 #define type_ghost -102
 
 
+enum {timers_load_q = 0, timers_unload_q, timers_queue, timers_doload,
+      timers_dounload, timers_doother, timers_count};
+#ifdef TIMINGS
+#ifdef TIMER_TIC
+#undef TIMER_TIC
+#endif
+#ifdef TIMER_TOC
+#undef TIMER_TOC
+#endif
+#define TIMER_TIC int timer = clock();
+#define TIMER_TOC(N) if(threadIdx.x == 0){atomicAdd( &cuda_timers[N], clock()-timer);}
+#define TIMER_TIC_ND timer = clock();
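+/* Per-bin cycle counters, accumulated on the device and copied back by
+   qsched_print_cuda_timers(). */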
+__device__ unsigned long long int cuda_timers[timers_count];
+#else
+#ifdef TIMER_TIC
+#undef TIMER_TIC
+#endif
+#ifdef TIMER_TOC
+#undef TIMER_TOC
+#endif
+#define TIMER_TIC
+#define TIMER_TOC(N)
+#define TIMER_TIC_ND
+#endif
+
 /*Define if conflicts are enabled or not.*/
 //#define GPU_locks
 
@@ -356,7 +380,15 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
     return tid;
 
     }
+#ifdef PRIQ
+
+
+__device__ int runner_cuda_gettask_priority ( struct queue_cuda *q ) {
 
+    return runner_cuda_gettask(q);
+
+    }
+#endif
 
 
 #else
@@ -372,7 +404,6 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
  * This routine blocks until a valid task is picked up, or the
  * specified queue is empty.
  */
-#ifndef PRIQ
 __device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
 
     int tid = -1;
@@ -397,7 +428,7 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
     return tid;
 
     }
-#endif
+
 #ifdef PRIQ
 
 __device__ int get_best_task(struct queue_cuda *q )
@@ -436,7 +467,6 @@ __device__ int get_best_task(struct queue_cuda *q )
     ind2 %= cuda_queue_size; 
     /* Loop until there is a valid task at that index. */
     while ( tot_num_tasks > 0 && ( tid1 = q->data[ind1] ) < 0);
-         //   atomicAdd((int*) &tot_num_tasks, -1);
     while ( tot_num_tasks > 0 && ( tid2 = q->data[ind2] ) < 0);
 
     if(tid1 < 0 && tid2 < 0)
@@ -446,55 +476,6 @@ __device__ int get_best_task(struct queue_cuda *q )
         pri1 = tasks_cuda[tid1].weight;
         pri2 = tasks_cuda[tid2].weight;
 
-
-    int res1, res2;
-    if( (tid1 >= 0 && tasks_cuda[tid1].type == type_load) )
-    {
-        res1 = cuda_trymultilock(&res_cuda[tasks_cuda[tid1].locks[0]].lock);
-        //printf("%i %i\n", res1, res_cuda[tasks_cuda[tid1].locks[0]].lock);
-    }
-    else
-        res1 = 1;
-
-    if( tid2 >= 0 && tasks_cuda[tid2].type == type_load && (tid1 >= 0 && tasks_cuda[tid1].type != type_load ))
-        res2 = cuda_trymultilock(&res_cuda[tasks_cuda[tid2].locks[0]].lock);
-    else
-        res2 = 1;
-
-    if(res1 == 0 && res2 == 0)
-    {
-        cuda_multiunlock(&res_cuda[tasks_cuda[tid1].locks[0]].lock);
-        q->data[ind1] = -1;
-        q->data[ind2] = -1;
-        cuda_queue_puttask( q, tid1);
-        cuda_queue_puttask( q, tid2);
-        return -1;
-    }
-
-    if(res1 == 0)
-    {
-           cuda_multiunlock(&res_cuda[tasks_cuda[tid1].locks[0]].lock);
-            q->data[ind1] = -1;
-            q->data[ind2] = -1;
-            atomicAdd((int*) &tot_num_tasks, -1);
-            cuda_queue_puttask( q, tid1);
-            return tid2;
-    }
-
-    if(res2 == 0)
-    {
-           cuda_multiunlock(&res_cuda[tasks_cuda[tid2].locks[0]].lock);
-            q->data[ind1] = -1;
-            q->data[ind2] = -1;
-            atomicAdd((int*) &tot_num_tasks, -1);
-            cuda_queue_puttask( q, tid2);
-            return tid1;
-    }
-    
-
-
-
-
     if(pri1 >= pri2)
     {
 
@@ -518,7 +499,7 @@ __device__ int get_best_task(struct queue_cuda *q )
 }
 
 
-__device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
+__device__ int runner_cuda_gettask_priority ( struct queue_cuda *q ) {
 
     int tid = -1;
     if( atomicAdd((int*)&q->nr_avail_tasks, -1) <= 0)
@@ -563,28 +544,39 @@ __global__ void qsched_device_kernel ( )
         if ( threadIdx.x == 0 ) {
              tid = -1;
                 /* Highest priority queue, holds the unload tasks. */
-#ifndef PRIQ
 #ifndef NO_LOADS
                 if(cuda_queues[2].nr_avail_tasks > 0 )
-                tid = runner_cuda_gettask( &cuda_queues[2] );
+                {
+                    TIMER_TIC
+                    tid = runner_cuda_gettask( &cuda_queues[2] );
+                    TIMER_TOC(timers_unload_q)
+                }
 #endif
 
-
              /* Middle priority queue, contains user-specifed tasks. */
+#ifdef PRIQ
+            if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 )
+            {
+                TIMER_TIC
+                tid = runner_cuda_gettask_priority ( &cuda_queues[0]);
+                TIMER_TOC(timers_queue)
+            }   
+#else
              if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 )
+            {
+                TIMER_TIC
                 tid = runner_cuda_gettask ( &cuda_queues[0]);
+                TIMER_TOC(timers_queue)
+            }
+#endif
 #ifndef NO_LOADS
              /* Low priority queue, contains the load tasks. */
-             if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 /*&& blockIdx.x < 1*/ )
-{
+             if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 /*&& blockIdx.x < 12*/ )
+             {
+                TIMER_TIC
                 tid = runner_cuda_gettask ( &cuda_queues[1]);
-
-}
-#endif
-
-#else
-            if(tot_num_tasks > 0 && cuda_queues[0].nr_avail_tasks > 0)
-                tid = runner_cuda_gettask ( &cuda_queues[0] );
+                TIMER_TOC(timers_load_q)
+             }
 #endif
             }
             
@@ -614,23 +606,25 @@ __global__ void qsched_device_kernel ( )
         /* Pick task type to do, if its not load, unload or ghost use the user supplied function. */
         if( tasks_cuda[tid].type == type_load )
         {
+        TIMER_TIC
         int *d = (int*)&data_cuda[tasks_cuda[tid].data];
             src = (int*)res_cuda[d[0]].data;
             dest = (int*)res_cuda[d[0]].gpu_data;
             cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, tid);
-            #ifdef PRIQ
-            __syncthreads();
-            if(threadIdx.x == 0)
-                cuda_multiunlock(&res_cuda[tasks_cuda[tid].locks[0]].lock);
-            #endif
+            TIMER_TOC(timers_doload)
         }else if( tasks_cuda[tid].type == type_unload )
         {
+            TIMER_TIC
         int *d = (int*)&data_cuda[tasks_cuda[tid].data];
             src = (int*)res_cuda[d[0]].gpu_data;
             dest = (int*)res_cuda[d[0]].data;
             cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, d[0]);
+            TIMER_TOC(timers_dounload)
         }else if (tasks_cuda[tid].type != type_ghost ){
+            TIMER_TIC
            fun(tasks_cuda[tid].type , &data_cuda[tasks_cuda[tid].data]);
+            __syncthreads();
+            TIMER_TOC(timers_doother)
         }
         __syncthreads();
         /*Stop the task clock*/
@@ -648,16 +642,12 @@ __global__ void qsched_device_kernel ( )
             if( atomicSub( &tasks_cuda[tasks_cuda[tid].unlocks[i]].wait , 1 ) == 1 && !( tasks_cuda[tasks_cuda[tid].unlocks[i]].flags & task_flag_skip ))
             {
                 /* Place unloads into highest priority queue, any other task goes to normal priority queue. Load tasks are never unlocked.*/
-                #ifdef PRIQ
-                    cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] );
-                #else
                 if(tasks_cuda[tasks_cuda[tid].unlocks[i]].type != type_unload)
                 {
                     cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] );
                 }
                 else
                     cuda_queue_puttask( &cuda_queues[2] , tasks_cuda[tid].unlocks[i] );
-                #endif
             }
         }
     }
@@ -712,10 +702,10 @@ int maxVal( int *array, int size )
    int i, maxi=-3200000;
    for (i=0; i<size; i++)
    {
-	 if (array[i]>maxi)
-	 {
-	    maxi=array[i];
-	 }
+     if (array[i]>maxi)
+     {
+        maxi=array[i];
+     }
    }
    return maxi ;
 }
@@ -725,10 +715,10 @@ int minVal( int *array, int size )
    int i, maxi=3200000;
    for (i=0; i<size; i++)
    {
-	 if (array[i]<maxi)
-	 {
-	    maxi=array[i];
-	 }
+     if (array[i]<maxi)
+     {
+        maxi=array[i];
+     }
    }
    return maxi ;
 }
@@ -1202,7 +1192,7 @@ for(i = sorted[s->count_res]; i >= 0; i-- )
 }
 
 toc_run = getticks();
-//	message( "Sorting took %.3f ms" , ((double)(toc_run - tic)) * itpms );
+//    message( "Sorting took %.3f ms" , ((double)(toc_run - tic)) * itpms );
 
 tic = getticks();
 /* If nothing overlaps create tasks.*/
@@ -1220,7 +1210,7 @@ for( i = sorted[s->count_res]; i < s->count_res; i++ )
 
 toc_run = getticks();
 tic = getticks();
-//	message( "Creating load tasks took %.3f ms" , ((double)(toc_run - tic)) * itpms );
+//    message( "Creating load tasks took %.3f ms" , ((double)(toc_run - tic)) * itpms );
 
 
 /* Check all resources have load tasks - if not give parents (recursively)*/
@@ -1603,7 +1593,7 @@ toc2 += getticks() - tic;
 }*/
 //qsched_prepare_deps( s );
 //printf("Number dependencies = %i\n", s->count_deps);
-#ifdef PRIQ
+/*#ifdef PRIQ
 int PCI_res;
 PCI_res = qsched_addres(s , qsched_owner_none , qsched_res_none , NULL, 0 , NULL);
 s->res[PCI_res].lock = PCIEX;
@@ -1614,9 +1604,9 @@ for(i = 0; i < s->count; i++)
         qsched_addlock(s, i, PCI_res);
     }
 }
-#endif
+#endif*/
 toc_run = getticks();
-	//message( "Setting up dependencies took %.3f ms" , toc2 * itpms );
+    //message( "Setting up dependencies took %.3f ms" , toc2 * itpms );
 //error("Got to here");
 }
 
@@ -1757,7 +1747,7 @@ ticks tic, toc_run ;
 qsched_prepare_loads(s);
 #endif
 toc_run = getticks();
-//	message( "prepare_loads took %.3f ms" , ((double)(toc_run - tic)) * itpms );
+//    message( "prepare_loads took %.3f ms" , ((double)(toc_run - tic)) * itpms );
     
 
     
@@ -2013,21 +2003,16 @@ if( cudaMemcpyToSymbol ( tasks_cuda, &cuda_t ,  sizeof(struct task *) , 0 ,  cud
     error("Failed to copy task pointer to the device.");
 
 /* Initialize the queues. */
-#ifdef PRIQ
-int nr_queues= 1, qsize;
-#else
 int nr_queues= 3,qsize;
-#endif
 int *data2;
 struct queue_cuda queues[ cuda_numqueues ];
 
 
-#ifdef PRIQ
+/*#ifdef PRIQ
 qsize = max(2*s->count, 512);
     if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
             error("Failed to copy queue size to the device.");
 
-    /* Allocate a temporary buffer for the queue data. */
         if ( ( data = (int *)malloc( sizeof(int) * qsize ) ) == NULL )
             error("Failed to allocate data buffer.");
         if( ( data2 = (int *) malloc( sizeof(int) * qsize ) ) == NULL )
@@ -2075,7 +2060,6 @@ qsize = max(2*s->count, 512);
      for ( i = queues[0].count ; i < qsize ; i++ )
         data[i] = -1; 
 
-    /* Allocate and copy the data. */
     if ( cudaMalloc( &queues[0].data , sizeof(int) * qsize ) != cudaSuccess )
         error("Failed to allocate queue data on the device.");
     if ( cudaMemcpy( (void *)queues[0].data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
@@ -2084,24 +2068,21 @@ qsize = max(2*s->count, 512);
     for ( k = 0; k < qsize; k++ )
             data[k] = -1;
             
-    /* Allocate and copy the recyling data. */
     if ( cudaMalloc( &queues[0].rec_data , sizeof(int) * qsize ) != cudaSuccess )
         error("Failed to allocate queue data on the device.");
     if ( cudaMemcpy( (void *)queues[0].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
         error("Failed to copy queue data pointer to the device");
 
         
-    /* Set some other values. */
     queues[0].first = 0;
     queues[0].last = queues[0].count;
     queues[0].nr_avail_tasks = queues[0].last;
     queues[0].rec_count = 0;  
     queues[0].count = s->count;
 
-       /* Copy the queue structures to the device. */
         if ( cudaMemcpyToSymbol( cuda_queues , &queues , sizeof(struct queue_cuda) * nr_queues , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
             error("Failed to copy the queues to the device");   
-#else
+#else*/
  qsize = max(2*s->count / nr_queues, 256);
     if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
             error("Failed to copy queue size to the device.");
@@ -2212,7 +2193,6 @@ qsize = max(2*s->count, 512);
        /* Copy the queue structures to the device. */
         if ( cudaMemcpyToSymbol( cuda_queues , &queues , sizeof(struct queue_cuda) * nr_queues , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
             error("Failed to copy the queues to the device");
-#endif
     /* Clean up. */
     free( tid );
         
@@ -2227,6 +2207,23 @@ qsize = max(2*s->count, 512);
     
 }
 
+void qsched_print_cuda_timers(struct qsched *s)
+{
+#ifdef TIMINGS
+unsigned long long int timers[timers_count];
+int i;
+cudaMemcpyFromSymbol(timers,cuda_timers, sizeof(unsigned long long int)*timers_count, 0 ,cudaMemcpyDeviceToHost );
+printf("timers: ");
+for(i = 0; i < timers_count; i++)
+{
+    printf("%llu ", timers[i]);
+}
+printf("\n");
+#endif
+}
+
+
+
 struct task* qsched_get_timers( struct qsched *s, int numtasks )
 {
     struct task *gpu_tasks = NULL, *cuda_tasks;
@@ -2259,6 +2256,15 @@ struct task* qsched_get_timers( struct qsched *s, int numtasks )
 void qsched_run_CUDA ( struct qsched *s, qsched_funtype func) {
 
 #ifdef WITH_CUDA
+    #ifdef TIMINGS
+    unsigned long long int timers[timers_count];
+    int k;
+    for ( k = 0 ; k < timers_count ; k++ )
+            timers[k] = 0;
+    cudaMemcpyToSymbol(cuda_timers , timers , sizeof(unsigned long long int) * timers_count , 0 , cudaMemcpyHostToDevice);
+    #endif
+    cudaEvent_t startEvent, stopEvent;
+    float time; 
     //ProfilerStart("/home/aidan/quicksched-code/examples/profiler.out");
     double itpms = 1000.0 / CPU_TPS;
     ticks tic, toc_run ;
@@ -2266,19 +2272,24 @@ void qsched_run_CUDA ( struct qsched *s, qsched_funtype func) {
     qsched_prepare_cuda( s );
   toc_run = getticks(); 
     printf("%.3f ", ((double)(toc_run - tic)) * itpms );
-//	message( "prepare_cuda took %.3f ms" , ((double)(toc_run - tic)) * itpms );
+    cudaEventCreate(&startEvent);
+    cudaEventCreate(&stopEvent);
+//    message( "prepare_cuda took %.3f ms" , ((double)(toc_run - tic)) * itpms );
     cudaMemcpyToSymbol( fun , &func , sizeof(qsched_funtype));
     tic = getticks();
   //  ProfilerStop();
     cudaMemcpyToSymbol( tot_num_tasks, &s->count, sizeof(int) );
+    cudaEventRecord(startEvent,0);
     qsched_device_kernel<<<128, 128 >>> (  );
+    cudaEventRecord(stopEvent,0);
     if( cudaDeviceSynchronize() != cudaSuccess )
         error("Failed to execute kernel:%s", cudaGetErrorString(cudaPeekAtLastError()));
   toc_run = getticks(); 
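+    /* cudaEventElapsedTime() returns the elapsed time in milliseconds. */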
+    cudaEventElapsedTime(&time, startEvent, stopEvent);
     #ifdef NO_LOADS
-    printf("%.3f ", ((double)(toc_run - tic)) * itpms ); 
+    printf("%.3f ", time ); 
     #else
-        printf("%.3f\n", ((double)(toc_run - tic)) * itpms );
+        printf("%.3f\n", time );
     #endif
    // message( "run_CUDA took %.3f ms" , ((double)(toc_run - tic)) * itpms );
 
@@ -2289,3 +2300,23 @@ void qsched_run_CUDA ( struct qsched *s, qsched_funtype func) {
 
 }
 
+
+#ifdef TIMER_TIC
+#undef TIMER_TIC
+#endif
+#ifdef TIMER_TOC
+#undef TIMER_TOC
+#endif
+#ifdef TIMER_TIC2
+#undef TIMER_TIC2
+#endif
+#ifdef TIMERS
+    #define TIMER_TIC ticks __tic = getticks();
+    #define TIMER_TIC2 __tic = getticks();
+    #define TIMER_TOC(s,tid) atomic_add( &s->timers[tid] , getticks() - __tic );
+#else
+    #define TIMER_TIC
+    #define TIMER_TIC2
+    #define TIMER_TOC
+#endif
+
diff --git a/src/cuda_queue.h b/src/cuda_queue.h
index b548f8ca512c7844b35609e85cc933f60a27acc7..f2aed4836688b0c11738c3fd51d35e7e0a1335fc 100644
--- a/src/cuda_queue.h
+++ b/src/cuda_queue.h
@@ -47,3 +47,4 @@ struct queue_cuda {
 void qsched_run_CUDA ( struct qsched *s, qsched_funtype fun );
 extern "C" void qsched_prepare_cuda ( struct qsched *s );
 struct task* qsched_get_timers( struct qsched *s, int numtasks );
+void qsched_print_cuda_timers(struct qsched *s);