diff --git a/examples/test_bh_2.cu b/examples/test_bh_2.cu index be5dcd4ba73a655bd6d43cb27befa0cdc06aaaf8..259a31f7d1a5fd8aa732f1d274f86aca523bec45 100644 --- a/examples/test_bh_2.cu +++ b/examples/test_bh_2.cu @@ -34,8 +34,20 @@ #include <fenv.h> /* Local includes. */ +extern "C"{ #include "quicksched.h" +#include "res.h" +} +#include "cuda_queue.h" +/** Task types. */ +enum task_type { + task_type_self = 0, + task_type_pair, + task_type_self_pc, + task_type_com, + task_type_count +}; struct cell{ @@ -45,16 +57,19 @@ double h; int count; unsigned short int split, sorted; int parts, firstchild, sibling; -int res, com_tid; +int res, resz, resm, com_tid; }__attribute__((aligned(64))); + +#define const_G 1 /* Requred variables to obtain cells. */ +#define cell_maxparts 128 #define CELL_STRETCH 2 #define INITIAL_CELLS 256 -struct *cell cell_pool=NULL; +struct cell *cell_pool = NULL; int used_cells=0; -int num_cells = INITIAL_CELLS; +int num_cells = 0; int cell_size = INITIAL_CELLS*sizeof(struct cell); /* Device locations for the particle values. */ @@ -63,7 +78,9 @@ __device__ double *parts_pos_z; __device__ float4 *parts_a_m; __device__ double2 *com_xy; __device__ double *com_z; -__device__ float com_mass; +__device__ float *com_mass; + +__device__ struct cell *cells; /* Host locations for the particle values. */ @@ -72,14 +89,339 @@ double *parts_pos_z_host; float4 *parts_a_m_host; double2 *com_xy_host; double *com_z_host; -float com_mass_host; +float *com_mass_host; + + + + +__device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) { + int i, j, k; + int count_i = ci->count, count_j = cj->count; + int parts_i = ci->parts, parts_j = cj->parts; + double xi[3]; + float dx[3], ai[3], mi, mj, r2, w, ir; + __shared__ double2 parts_xy[cell_maxparts]; + __shared__ double parts_z[cell_maxparts]; + __shared__ float4 parts_am[cell_maxparts]; + + /* Load particles of cell j into shared memory */ + for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) { + parts_xy[j] = parts_pos_xy[k]; + parts_z[j] = parts_pos_z[k]; + parts_am[j] = parts_a_m[k]; + } + + /* Loop over cell i.*/ + for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) { + xi[0] = parts_xy[i].x; + xi[1] = parts_xy[i].y; + xi[2] = parts_z[i]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0f; + } + mi = parts_a_m[i].w; + + for(j = 0; j < count_j; j++) { + r2 = 0.0f; + dx[0] = xi[0] - parts_xy[j].x; + dx[1] = xi[1] - parts_xy[j].y; + dx[2] = xi[2] - parts_z[j]; + r2 += dx[0] * dx[0]; + r2 += dx[1] * dx[1]; + r2 += dx[2] * dx[2]; + + +// ir = 1.0f / sqrtf(r2); + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_am[j].w; + for(k = 0; k < 3; k++) { + ai[k] -= dx[k] * mj * w; + } + } + + atomicAdd(&parts_a_m[i].x, ai[0]); + atomicAdd(&parts_a_m[i].y, ai[1]); + atomicAdd(&parts_a_m[i].z, ai[2]); + + } + + /* Load particles of cell i into shared memory */ + for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) { + parts_xy[j] = parts_pos_xy[k]; + parts_z[j] = parts_pos_z[k]; + parts_am[j] = parts_a_m[k]; + } + /*Loop over cell j. 
*/ + for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) { + xi[0] = parts_xy[i].x; + xi[1] = parts_xy[i].y; + xi[2] = parts_z[i]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0f; + } + mi = parts_a_m[i].w; + + for(j = 0; j < count_j; j++) { + r2 = 0.0f; + dx[0] = xi[0] - parts_xy[j].x; + dx[1] = xi[1] - parts_xy[j].y; + dx[2] = xi[2] - parts_z[j]; + r2 += dx[0] * dx[0]; + r2 += dx[1] * dx[1]; + r2 += dx[2] * dx[2]; + + + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_am[j].w; + for(k = 0; k < 3; k++) { + ai[k] -= dx[k] * mj * w; + } + } + + atomicAdd(&parts_a_m[i].x, ai[0]); + atomicAdd(&parts_a_m[i].y, ai[1]); + atomicAdd(&parts_a_m[i].z, ai[2]); + + } + +} + +/*__device__ void iact_pair(int celli, int cellj) { + + struct cell *ci, *cj; + ci = &cells[celli]; + cj = &cells[cellj]; + + if(Check if neighbours0) + { + if(ci->split && cj->split) { + //Split both cells and do all possible pairs. + + }else { + iact_pair_direct(ci, cj); + } + + } + +}*/ + +__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) { + + int i, k; + double2 j_com_xy; + double j_com_z; + float j_com_mass; + int count = leaf->count; + int parts = leaf->parts; + int cell_j = cj - cells; + float r2, dx[3], ir, w; + + + /* Init the com's data.*/ + j_com_xy = com_xy[cell_j]; + j_com_z = com_z[cell_j]; + j_com_mass = com_mass[cell_j]; + + for(i = parts; i < parts+count; i++) { + + r2 = 0.0; + dx[0] = j_com_xy.x - parts_pos_xy[i].x; + r2 += dx[0] * dx[0]; + dx[1] = j_com_xy.y - parts_pos_xy[i].y; + r2 += dx[1] * dx[1]; + dx[2] = j_com_z - parts_pos_z[i]; + r2 += dx[2] * dx[2]; + + ir = rsqrtf(r2); + w = j_com_mass * const_G * ir * ir * ir; + + parts_a_m[i].x += w * dx[0]; + parts_a_m[i].y += w * dx[1]; + parts_a_m[i].z += w * dx[2]; + } +} + +/** + * @brief Checks whether the cells are direct neighbours ot not + */ +__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) { + + int k; + float dx[3]; + double cih = ci->h, cjh = cj->h; + + float min_dist = 0.5*(cih+cjh); + + float center_i = ci->loc_xy.x + 0.5*cih; + float center_j = cj->loc_xy.x + 0.5*cjh; + dx[0] = fabsf(center_i - center_j); + center_i = ci->loc_xy.y + 0.5*cih; + center_j = cj->loc_xy.y + 0.5*cjh; + dx[1] = fabsf(center_i - center_j); + center_i = ci->loc_z + 0.5*cih; + center_j = cj->loc_z + 0.5*cjh; + dx[2] = fabsf(center_i - center_j); + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} + +__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) { + int k; + float dx[3]; + float min_dist = ci->h; + float center_i = ci->loc_xy.x; + float center_j = cj->loc_xy.x; + dx[0] = fabsf(center_i - center_j); + center_i = ci->loc_xy.y; + center_j = cj->loc_xy.y; + dx[1] = fabsf(center_i - center_j); + center_i = ci->loc_z; + center_j = cj->loc_z; + dx[2] = fabsf(center_i - center_j); + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} + +__device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) { + return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count); +} + +__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) { + + struct cell *cp ,*cps; + + for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) { + if(is_inside(leaf, cp)) break; + } + + if(are_neighbours_different_size(cp, cj)) { + for(cps = &cells[cj->firstchild]; cps != &cells[ci->sibling]; cps = &cells[cps->sibling]) { + if(are_neighbours(cp, cps)) 
{ + if(cp->split && cps->split) { + iact_pair_pc(cp, cps, leaf); + } + } else { + make_interact_pc(leaf, cps); + } + } + } + + +} + +/** + * @brief Compute the interactions between all particles in a leaf and + * and all the monopoles in the cell c + * + * @param c The #cell containing the monopoles + * @param leaf The #cell containing the particles + */ +__device__ void iact_self_pc(struct cell *c, struct cell *leaf) { + + struct cell *cp, *cps; + + /* Find the subcell of c the leaf is in.*/ + for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) { + if(is_inside(leaf, cp)) break; + } + + if(cp->split) { + + iact_self_pc(cp, leaf); + + for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) { + + if(cp != cps && cps->split) iact_pair_pc(cp,cps,leaf); + } + + } +} + + + +/** + * @brief Compute the interactions between all particles in a cell. + * + * @param cellID The cell ID to compute interactions on. + */ +__device__ void iact_self_direct(int cellID) { + struct cell *c = &cells[cellID]; + double xi[3] = {0.0, 0.0, 0.0}; + float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w; + __shared__ double2 parts_xy[cell_maxparts]; + __shared__ double parts_z[cell_maxparts]; + __shared__ float4 parts_am[cell_maxparts]; + int parts; + int count; + int i,j,k; + + + //If cell is split, interact each child with itself, and with each of its siblings. + /*if(c->split) { + //TODO + + } else {*/ + parts = c->parts; + count = c->count; + int z = threadIdx.x; + /* Load particle data into shared memory*/ + for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) { + parts_xy[z] = parts_pos_xy[k]; + parts_z[z] = parts_pos_z[k]; + parts_am[z] = parts_a_m[k]; + } + __syncthreads(); + for(i = threadIdx.x; i < count; i += blockDim.x) + { + xi[0] = parts_xy[i].x; + xi[1] = parts_xy[i].y; + xi[2] = parts_z[i]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0; + } + mi = parts_a_m[i].w; + + //for(j = i+1; j!= i; j = (j+1)%count) + for(j = 0; j < count; j++) + { + if(i != j){ + + /* Compute the pairwise distance. */ + r2 = 0.0; + dx[0] = xi[0] - parts_pos_xy[j].x; + r2 += dx[0]*dx[0]; + dx[1] = xi[1] - parts_pos_xy[j].y; + r2 += dx[1]*dx[1]; + dx[2] = xi[2] - parts_pos_z[j]; + r2 += dx[2]*dx[2]; + + /* Apply the gravitational acceleration. */ + + //ir = 1.0f / sqrtf(r2); + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_am[j].w; + for(k = 0; k < 3; k++) { + ai[k] -= w * dx[k] * mj; + } + + } + } + //Update. + atomicAdd(&parts_a_m[i].x,ai[0] ); + atomicAdd(&parts_a_m[i].y,ai[1] ); + atomicAdd(&parts_a_m[i].z,ai[2] ); + } + + +} /** * @brief Checks whether the cells are direct neighbours ot not. 
Both cells have * to be of the same size */ -static inline int are_neighbours(struct cell *ci, struct cell *cj) { +static inline int are_neighbours_host(struct cell *ci, struct cell *cj) { int k; float dx[3]; @@ -101,49 +443,78 @@ static inline int are_neighbours(struct cell *ci, struct cell *cj) { center_j = loc2.y; dx[1] = fabs(center_i - center_j); center_i = ci->loc_z; - center_J = cj->loc_z; + center_j = cj->loc_z; dx[2] = fabs(center_i - center_j); return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); } -struct *cell cell_get() +struct cell *cell_get() { - struct *cell res; + struct cell *res; + if(num_cells == 0) + { + cell_pool = (struct cell*) calloc(INITIAL_CELLS, sizeof(struct cell)); + if(cell_pool == NULL) + error("Failed to allocate cell_pool"); + com_xy_host = (double2*) calloc(INITIAL_CELLS, sizeof(double2)); + if(com_xy_host == NULL) + error("Failed to allocate cell_pool"); + com_z_host = (double*) calloc(INITIAL_CELLS, sizeof(double)); + if(com_z_host == NULL) + error("Failed to allocate cell_pool"); + com_mass_host = (float*) calloc(INITIAL_CELLS, sizeof(float)); + if(com_mass_host == NULL) + error("Failed to allocate cell_pool"); + num_cells = INITIAL_CELLS; + } + if(used_cells >= num_cells) { /* Stretch */ - struct *cell new_pool; + struct cell *new_pool; cell_size *= CELL_STRETCH; - new_pool = (struct *cell) calloc(cell_size); + new_pool = (struct cell*) calloc(num_cells*CELL_STRETCH, sizeof(struct cell)); + if(new_pool == NULL) + error("Failed to allocate new_pool"); if(cell_pool != NULL) memcpy(new_pool, cell_pool, num_cells*sizeof(struct cell)); - double2 *temp = (*double2) calloc(num_cells*sizeof(double2)); - memcpy(temp, com_xy_host, sizeof(double2)*num_cells); + double2 *tempxy = (double2*) calloc(num_cells*CELL_STRETCH, sizeof(double2)); + if(tempxy == NULL) + error("Failed to allocate tempxy"); + memcpy(tempxy, com_xy_host, sizeof(double2)*num_cells); free(com_xy_host); - com_xy_host = temp; - double temp2 = (*double) calloc(num_cells*sizeof(double)); - memcpy(temp2, com_z_host, num_cells*sizeof(double)); + com_xy_host = tempxy; + double *tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double)); + if(tempz == NULL) + error("Failed to allocate tempz"); + memcpy(tempz, com_z_host, num_cells*sizeof(double)); free(com_z_host); - com_z_host = temp2; - float temp3 = (*float) calloc(num_cells*sizeof(float)); - memcpy(temp3, com_mass_host, num_cells*sizeof(float)); + com_z_host = tempz; + float *tempm = (float*) calloc(num_cells*CELL_STRETCH, sizeof(float)); + if(tempm == NULL) + error("Failed to allocate tempm"); + memcpy(tempm, com_mass_host, num_cells*sizeof(float)); free(com_mass_host); - com_mass_host = temp3; + com_mass_host = tempm; num_cells *= CELL_STRETCH; free(cell_pool); cell_pool = new_pool; + + message("Increased size of arrays"); } used_cells++; cell_pool[used_cells-1].sibling = -1; cell_pool[used_cells-1].firstchild = -1; - cell_poo[used_cells-1].res = qsched_res_none; + cell_pool[used_cells-1].res = qsched_res_none; + cell_pool[used_cells-1].resz = qsched_res_none; + cell_pool[used_cells-1].resm = qsched_res_none; return &cell_pool[used_cells-1]; } @@ -156,7 +527,7 @@ void comp_com(struct cell *c){ double com[3] = {0.0, 0.0, 0.0}, mass = 0.0; if(c->split) { - for(cp = &cell_pool[(cpi = c->firstchild)]; cp != &cell_pool[c->sibling]; &cell_pool[(cpi = cp->sibling)]) { + for(cp = &cell_pool[(cpi = c->firstchild)]; cpi != c->sibling; cp = &cell_pool[(cpi = cp->sibling)]) { float cp_mass = com_mass_host[cpi]; com[0] += 
com_xy_host[cpi].x * cp_mass; com[1] += com_xy_host[cpi].y * cp_mass; @@ -208,46 +579,73 @@ void comp_com(struct cell *c){ * @param N The total number of parts. * @param s The #sched to store the resources. */ -void cell_split(struct cell *c, struct qsched *s) { - int i, j, k, kk, count = c->count; - int parts = c->parts; - double2 temp; - double temp1; - float4 temp2; - struct cell *cp; +void cell_split(int c, struct qsched *s) { + int i, j, k, kk, count = cell_pool[c].count; + int parts = cell_pool[c].parts; + double2 tempxy; + double tempxy1; + float4 tempxy2; + struct cell *cp, *cell; int left[8], right[8]; double pivot[3]; - static struct cell *root = NULL; - struct cell *progenitors[8]; + static int root = -1; +// struct cell *progenitors[8]; + int progenitors[8]; + int c1 = c; + double2 *temp_xy; + double *temp_z; + float4 *temp_a_m; /* Set the root cell. */ - if (root == NULL) { + if (root < 0) { root = c; - c->sibling = 0; + cell_pool[c].sibling = -1; } - if(c->res == qsched_res_none) - error("Cell has no resource"); + if(cell_pool[c].res == qsched_res_none) + { + if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[c].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + printf("tempxy = %p\n", temp_xy); + cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy, + sizeof(double2) * cell_pool[c].count, parts_pos_xy + cell_pool[c].parts); + } + if(cell_pool[c].resz == qsched_res_none) + { + if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[c].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[c].resz = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_z, + sizeof(double) * cell_pool[c].count, parts_pos_z + cell_pool[c].parts); + } + if(cell_pool[c].resm == qsched_res_none) + { + if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[c].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m, + sizeof(float4) * cell_pool[c].count, parts_a_m + cell_pool[c].parts); + } + // error("Cell has no resource");*///TODO - if(c->count > cell_maxparts ) + if(count > cell_maxparts ) { - c->split = 1; + cell_pool[c].split = 1; for(k = 0; k < 8; k++) { - progenitors[k] = cp = cell_get(); - cp->loc_xy = c->loc_xy; - cp->loc_z = c->loc_z; - cp->h = c->h*0.5; + progenitors[k] = (cp = cell_get()) - cell_pool; + cell = &cell_pool[c]; + cp->loc_xy = cell->loc_xy; + cp->loc_z = cell->loc_z; + cp->h = cell->h*0.5; if(k & 4) cp->loc_xy.x += cp->h; if(k & 2) cp->loc_xy.y += cp->h; if(k & 1) cp->loc_z += cp->h; } /* Init the pivots.*/ - pivot[0] = c->loc_xy.x + c->h * 0.5; - pivot[1] = c->loc_xy.y + c->h * 0.5; - pivot[2] = c->loc_z + c->h * 0.5; + pivot[0] = cell->loc_xy.x + cell->h * 0.5; + pivot[1] = cell->loc_xy.y + cell->h * 0.5; + pivot[2] = cell->loc_z + cell->h * 0.5; /* Split along the x axis. 
*/ i = parts; @@ -257,22 +655,21 @@ void cell_split(struct cell *c, struct qsched *s) { while(i <= parts+count-1 && parts_pos_xy_host[i].x < pivot[0]) i += 1; while(j >= parts && parts_pos_xy_host[j].x >= pivot[0]) j -= 1; if(i < j){ - temp = parts_pos_xy_host[i]; - temp1 = parts_pos_z_host[i]; - temp2 = parts_a_m_host[i]; + tempxy = parts_pos_xy_host[i]; + tempxy1 = parts_pos_z_host[i]; + tempxy2 = parts_a_m_host[i]; parts_pos_xy_host[i] = parts_pos_xy_host[j]; parts_pos_z_host[i] = parts_pos_z_host[j]; parts_a_m_host[i] = parts_a_m_host[j]; - parts_pos_xy_host[j] = temp; - parts_pos_z_host[j] = temp1; - parts_a_m_host[j] = temp2; + parts_pos_xy_host[j] = tempxy; + parts_pos_z_host[j] = tempxy1; + parts_a_m_host[j] = tempxy2; } } left[1] = i; - right[1] parts+count-1; + right[1] = parts+count-1; left[0] = parts; right[0] = j; - /* Split along the y axis twice. */ for (k = 1; k >= 0; k--) { @@ -283,15 +680,15 @@ void cell_split(struct cell *c, struct qsched *s) { while(j >= left[k] && parts_pos_xy_host[j].y >= pivot[1]) j -= 1; if(i < j) { - temp = parts_pos_xy_host[i]; - temp1 = parts_pos_z_host[i]; - temp2 = parts_a_m_host[i]; + tempxy = parts_pos_xy_host[i]; + tempxy1 = parts_pos_z_host[i]; + tempxy2 = parts_a_m_host[i]; parts_pos_xy_host[i] = parts_pos_xy_host[j]; parts_pos_z_host[i] = parts_pos_z_host[j]; parts_a_m_host[i] = parts_a_m_host[j]; - parts_pos_xy_host[j] = temp; - parts_pos_z_host[j] = temp1; - parts_a_m_host[j] = temp2; + parts_pos_xy_host[j] = tempxy; + parts_pos_z_host[j] = tempxy1; + parts_a_m_host[j] = tempxy2; } } left[2*k+1] = i; @@ -299,7 +696,6 @@ void cell_split(struct cell *c, struct qsched *s) { left[2*k] = left[k]; right[2*k] = j; } - /* Split along the z axis four times.*/ for(k = 3; k >=0; k--) { @@ -307,18 +703,18 @@ void cell_split(struct cell *c, struct qsched *s) { j = right[k]; while(i <= j){ while(i <= right[k] && parts_pos_z_host[i] < pivot[2]) i += 1; - while(i >= left[k] && parts_post_z_host[i] >= pivot[2]) j -= 1; + while(j >= left[k] && parts_pos_z_host[j] >= pivot[2]) j -= 1; if(i < j) { - temp = parts_pos_xy_host[i]; - temp1 = parts_pos_z_host[i]; - temp2 = parts_a_m_host[i]; + tempxy = parts_pos_xy_host[i]; + tempxy1 = parts_pos_z_host[i]; + tempxy2 = parts_a_m_host[i]; parts_pos_xy_host[i] = parts_pos_xy_host[j]; parts_pos_z_host[i] = parts_pos_z_host[j]; parts_a_m_host[i] = parts_a_m_host[j]; - parts_pos_xy_host[j] = temp; - parts_pos_z_host[j] = temp1; - parts_a_m_host[j] = temp2; + parts_pos_xy_host[j] = tempxy; + parts_pos_z_host[j] = tempxy1; + parts_a_m_host[j] = tempxy2; } } left[2 * k + 1] = i; @@ -326,27 +722,38 @@ void cell_split(struct cell *c, struct qsched *s) { left[2 * k] = left[k]; right[2 * k] = j; } - + /* Store the counts and offsets. */ for(k = 0; k < 8; k++) { - progenitors[k]->count = right[k]-left[k]+1; - progenitors[k]->parts = left[k]; - //TODO ADD RESOURCES HERE. 
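The added lines below repeat the same registration pattern once per progenitor: look up the device alias of the pinned host sub-array with cudaHostGetDevicePointer, then hand it to the six-argument qsched_addres(s, owner, parent, host_ptr, size, device_ptr) call already used earlier in this file. As a rough sketch only (this helper is not part of the patch; it assumes the same parts_*_host globals, buffers allocated with cudaMallocHost, and device base pointers passed in explicitly instead of read from the __device__ symbols), the per-cell step could be factored like this:

    /* Hypothetical helper, not in this patch: register one cell's three
     * particle sub-arrays (positions xy, position z, accel+mass) as
     * QuickSched resources under the given parent resources. */
    static void cell_add_res(struct qsched *s, struct cell *c,
                             double2 *d_pos_xy, double *d_pos_z, float4 *d_a_m,
                             int parent_xy, int parent_z, int parent_m) {
      double2 *map_xy; double *map_z; float4 *map_am;
      /* Device aliases of the pinned (cudaMallocHost) host buffers. */
      if (cudaHostGetDevicePointer(&map_xy, &parts_pos_xy_host[c->parts], 0) != cudaSuccess ||
          cudaHostGetDevicePointer(&map_z, &parts_pos_z_host[c->parts], 0) != cudaSuccess ||
          cudaHostGetDevicePointer(&map_am, &parts_a_m_host[c->parts], 0) != cudaSuccess)
        error("Failed to get host device pointer.");
      /* One resource per array slice; the last argument is the slice's
       * location in the device-resident copy of the same array. */
      c->res  = qsched_addres(s, qsched_owner_none, parent_xy, map_xy,
                              sizeof(double2) * c->count, d_pos_xy + c->parts);
      c->resz = qsched_addres(s, qsched_owner_none, parent_z, map_z,
                              sizeof(double) * c->count, d_pos_z + c->parts);
      c->resm = qsched_addres(s, qsched_owner_none, parent_m, map_am,
                              sizeof(float4) * c->count, d_a_m + c->parts);
    }

Any task that later touches the cell then locks all three handles (res, resz, resm), which is what the qsched_addlock triples added further down do.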
+ cell_pool[progenitors[k]].count = right[k]-left[k]+1; + cell_pool[progenitors[k]].parts = left[k]; + if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy, + sizeof(double2) * cell_pool[progenitors[k]].count, parts_pos_xy + cell_pool[progenitors[k]].parts); + if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[progenitors[k]].resz = qsched_addres(s, qsched_owner_none, cell->resz, temp_z, + sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z + cell_pool[progenitors[k]].parts); + if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[progenitors[k]].resm = qsched_addres(s, qsched_owner_none, cell->resm, temp_a_m, + sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m + cell_pool[progenitors[k]].parts); } /* Find the first non-empty progenitor */ for(k = 0; k < 8; k++) { - if(progenitors[k]->count > 0) + if(cell_pool[progenitors[k]].count > 0) { - c->firstchild = &progenitors[k]-cell_pool; + cell->firstchild = progenitors[k]; break; } } #ifdef SANITY_CHECKS - if(c->firstchild == -1) + if(cell->firstchild == -1) error("Cell has been split but all children have 0 parts"); #endif @@ -355,34 +762,35 @@ void cell_split(struct cell *c, struct qsched *s) { { /* Find the next non-empty sibling */ for(kk = k+1; kk < 8; ++kk){ - if(progenitors[kk]->count > 0){ - progenitors[k]->sibling = &progenitors[kk]-cell_pool; + if(cell_pool[progenitors[kk]].count > 0){ + cell_pool[progenitors[k]].sibling = progenitors[kk]; break; } } /* No non-empty sibling, go back a level.*/ - if(kk == 8) progenitors[k]->sibling = c->sibling; + if(kk == 8) cell_pool[progenitors[k]].sibling = cell->sibling; } /* Recurse */ for(k = 0; k < 8; k++) - if(progenitors[k]->count > 0) cell_split(progenitors[k], s); + if(cell_pool[progenitors[k]].count > 0) cell_split(progenitors[k], s); /* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */ } else { - struct cell *data[2] = {root, c}; +// struct cell *data[2] = {root, c}; + int data[2] = {root, c}; int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data, - 2 * sizeof(struct cell *), 1); - qsched_addlock(s, tid, c->res); - //TODO Create task. - //TODO Deal with multiple resources. + 2 * sizeof(int), 1); + qsched_addlock(s, tid, cell_pool[c].res); + qsched_addlock(s, tid, cell_pool[c].resz); + qsched_addlock(s, tid, cell_pool[c].resm); } #ifndef COM_AS_TASK - comp_com(c); + comp_com(&cell_pool[c]); #endif } @@ -396,7 +804,7 @@ void cell_split(struct cell *c, struct qsched *s) { void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ qsched_task_t tid; - int *data[2]; + int data[2]; struct cell /**data[2],*/ *cp, *cps; int cpi; @@ -421,11 +829,13 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, ci->count*ci->count/2); qsched_addlock(s, tid, ci->res); + qsched_addlock(s, tid, ci->resz); + qsched_addlock(s, tid, ci->resm); } } /* Else its a pair!*/ else{ - if(are_neighbours(ci,cj){/* Cells are neighbours */ + if(are_neighbours_host(ci,cj)){/* Cells are neighbours */ /*Are both split? 
*/ if(ci->split && cj->split) { @@ -446,7 +856,11 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ /* Add the resources. */ qsched_addlock(s, tid, ci->res); + qsched_addlock(s, tid, ci->resz); + qsched_addlock(s, tid, ci->resm); qsched_addlock(s, tid, cj->res); + qsched_addlock(s, tid, cj->resz); + qsched_addlock(s, tid, cj->resm); } } @@ -456,6 +870,33 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ } + +__device__ void runner( int type , void *data ) { + + int *idata = (int *)data; + int i = idata[0]; + int j = idata[1]; + + switch ( type ) { + case task_type_self: + iact_self_direct(i); + break; + case task_type_pair: + iact_pair_direct(&cells[i], &cells[j]); + break; + case task_type_self_pc: + iact_self_pc( &cells[i], &cells[j] ); + break; + default: + asm("trap;"); + } + __syncthreads(); +} + +__device__ qsched_funtype function = runner; + + +qsched_funtype func; /** * @brief Set up and run a task-based Barnes-Hutt N-body solver. * @@ -472,7 +913,13 @@ void test_bh(int N, int runs, char *fileName) { struct qsched s; ticks tic, toc_run, tot_setup = 0, tot_run = 0; int countMultipoles = 0, countPairs = 0, countCoMs = 0; + double2 *parts_pos_xy_temp; + double *parts_pos_z_temp; + float4 *parts_a_m_temp; + cudaFree(0); + if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess) + error("Failed to copy function pointer from device"); /* Initialize the scheduler. */ qsched_init(&s, 1, qsched_flag_none); @@ -485,21 +932,40 @@ void test_bh(int N, int runs, char *fileName) { if( cudaMallocHost( &parts_a_m_host, sizeof(float4) * N) != cudaSuccess ) error("Failed to allocate parts array"); + if( cudaMalloc(&parts_pos_xy_temp, sizeof(double2) * N) != cudaSuccess) + error("Failed to allocate device parts array"); + printf("parts_pos_xy_temp = %p\n", parts_pos_xy_temp); + if( cudaMemcpyToSymbol(parts_pos_xy, &parts_pos_xy_temp, sizeof(double2*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to set device symbol for parts array"); + printf("parts_pos_xy = %p\n", parts_pos_xy); + if( cudaMalloc(&parts_pos_z_temp, sizeof(double) * N) != cudaSuccess) + error("Failed to allocate device parts array"); + if( cudaMemcpyToSymbol(parts_pos_z, &parts_pos_z_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to set device symbol for parts array"); + if( cudaMalloc(&parts_a_m_temp, sizeof(float4) * N) != cudaSuccess) + error("Failed to allocate device parts array"); + if( cudaMemcpyToSymbol(parts_a_m, &parts_a_m_temp, sizeof(float4*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to set device symbol for parts array"); + + if(fileName == NULL || fileName[0] == 0) { for(k = 0; k < N; k++) { - parts_pos_xy_host.x = ((double)rand())/ RAND_MAX; - parts_pos_xy_host.y = ((double)rand())/ RAND_MAX; - parts_pos_z_host = ((double)rand())/ RAND_MAX; - parts_a_m_host.w = ((double)rand()) / RAND_MAX; + parts_pos_xy_host[k].x = ((double)rand())/ RAND_MAX; + parts_pos_xy_host[k].y = ((double)rand())/ RAND_MAX; + parts_pos_z_host[k] = ((double)rand())/ RAND_MAX; + parts_a_m_host[k].w = ((double)rand()) / RAND_MAX; } } else { file = fopen(fileName, "r"); + int tempxy; if(file) { for(k = 0; k < N; k++) { - if(fscanf(file, "%lf%lf%lf", &parts_pos_xy_host.x, &parts_pos_xy_host.y, &parts_pos_z_host) != 3) + if(fscanf(file, "%d", &tempxy) != 1) + error("Failed to read ID"); + if(fscanf(file, "%lf%lf%lf", &parts_pos_xy_host[k].x, &parts_pos_xy_host[k].y, &parts_pos_z_host[k]) != 3) error("Failed to 
read position of part %i.", k); - if(fscanf(file, "%f", &parts[k].mass != 1) + if(fscanf(file, "%f", &parts_a_m_host[k].w) != 1) error("Failed to read mass of part %i.", k); } fclose(file); @@ -514,21 +980,63 @@ void test_bh(int N, int runs, char *fileName) { root->h = 1.0; root->count = N; root->parts = 0; - cell_split(root, &s); - - int c = root - cell_pool; + root->sibling = -1; + int c = root-cell_pool; + cell_split(root - cell_pool, &s); + root = &cell_pool[c]; int nr_leaves = 0; + int maxparts=0, minparts=1000000; + int number = 0; while(c >= 0) { + if(cell_pool[c].count > 0) + { + number++; + if(cell_pool[c].res == qsched_res_none) + message("cell %i has no res", c); + if(cell_pool[c].resz == qsched_res_none) + message("cell %i has no resz", c); + if(cell_pool[c].resm == qsched_res_none) + message("cell %i has no resm", c); + } if(!cell_pool[c].split) { nr_leaves++; - c = c->sibling; + if(cell_pool[c].count > maxparts) + { + maxparts = cell_pool[c].count; + } + if(cell_pool[c].count < minparts) + { + minparts = cell_pool[c].count; + } + c = cell_pool[c].sibling; } else { - c = c->firstchild; + c = cell_pool[c].firstchild; } } - + message("root.sibling = %i, root.split = %i", root->sibling, root->split); + printf("nr_leaves = %i\n", nr_leaves); message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves)); + message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts); + create_tasks(&s, root, NULL); + + message("total number of tasks: %i.", s.count); + message("total number of cells: %i.", number); + message("total number of deps: %i.", s.count_deps); + message("total number of res: %i.", s.count_res); + message("total number of locks: %i.", s.count_locks); + + for(k = 0; k < runs; k++) { + for(i = 0; i < N; ++i) { + parts_a_m_host[i].x = 0.0; + parts_a_m_host[i].y = 0.0; + parts_a_m_host[i].z = 0.0; + } + + //Run code. + printf("gpu_data = %p\n", (int*)s.res[0].gpu_data); + qsched_run_CUDA( &s , func ); + } } @@ -573,7 +1081,7 @@ int main(int argc, char *argv[]) { printf("Size of cell: %zu bytes.\n", sizeof(struct cell)); /* Part information */ - printf("Size of part: %zu bytes.\n", sizeof(struct part)); +// printf("Size of part: %zu bytes.\n", sizeof(struct part)); /* Dump arguments. */ if (fileName[0] == 0) { @@ -587,5 +1095,5 @@ int main(int argc, char *argv[]) { } /* Run the BH test. 
 */
-  test_bh(N, nr_threads, runs, fileName);
+  test_bh(N, runs, fileName);
 }
diff --git a/src/CUDACompile.sh b/src/CUDACompile.sh
index df36d95939ff236fe45ef30ec5a0afa26bb8a7c2..0a3a0bf0b49b085d80024b34763e3a3369a6d48a 100755
--- a/src/CUDACompile.sh
+++ b/src/CUDACompile.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 FLAGS2="-Xcompiler=-fsanitize=address -Xcompiler=-fno-omit-frame-pointer"
 DEBUG_FLAGS="-G -DDEBUG_GPU"
-FLAGS="-O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
+FLAGS="-O3 -g -G -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
 # -DGPU_locks -Xptxas -dlcm=cg -Xptxas="-v""
 # -DNO_LOADS
@@ -33,6 +33,6 @@ cd ../examples
 /home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_hierarchy.o ../src/.libs/libquicksched_cuda.a -o test_heirarchy -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh.cu -lprofiler
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh_2.cu -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh.o ../src/.libs/libquicksched_cuda.a -o test_heirarchy -lprofiler
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_2.o ../src/.libs/libquicksched_cuda.a -o test_bh_2 -lprofiler
diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu
index 971255fb4f2e72f9475d9ba5fc107a4f2fd43067..9beba72c9f2fe2189f12a7ccece603762d278888 100644
--- a/src/cuda_queue.cu
+++ b/src/cuda_queue.cu
@@ -739,8 +739,8 @@ void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, in
     if(parent >= 0)
     {
         /* Create dependecy to parent. */
-        /*qsched_addunlock(s, task, s->res[parent].task );
-        qsched_addunlock(s, s->res[parent].utask, utask );*/
+        // qsched_addunlock(s, task, s->res[parent].task );
+        /// qsched_addunlock(s, s->res[parent].utask, utask );
     }
     for(i = sorted[ID]; i < sorted[ID+1]; i++)
     {
@@ -759,8 +759,8 @@ void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, in
         /* If it has a parent then set the parents ghost task to be dependent on this.*/
         if(parent >= 0)
         {
-            /*qsched_addunlock(s, task, s->res[parent].task );
-            qsched_addunlock(s, s->res[parent].utask, utask);*/
+            // qsched_addunlock(s, task, s->res[parent].task );
+            // qsched_addunlock(s, s->res[parent].utask, utask);
         }
     }
 }
@@ -1306,6 +1306,10 @@ for(i = 0; i < s->count_res; i++ )
     int parent = s->res[ID].parent;
     struct res *resource = &s->res[ res[i] ];
+    printf("ID = %i, size = %i, numChild = %i, parent = %i\n", ID, size, numChildren, parent);
+    printf("task = %i, utask = %i\n", s->res[ID].task, s->res[ID].utask);
+    if(parent >= 0 && s->res[ID].task == s->res[parent].task)
+        continue;
     /* Loop through children if there are any. */
     if(numChildren > 0)
     {
@@ -1843,7 +1847,7 @@ toc_run = getticks();
     }
     /* Print all dependencies */
-    /*for(i = 0; i < count; i++ )
+    for(i = 0; i < count; i++ )
     {
         printf("Task ID: %i, type=%i, ", i, tasks[i].type);
         for(j = 0; j < tasks[i].nr_unlocks; j++)
@@ -1851,46 +1855,19 @@ toc_run = getticks();
             printf("%i ", tasks[i].unlocks[j]);
         }
         printf("\n");
-    }*/
+    }
     if ( k < count ) {
         printf("k = %i, wait = %i\n", tid[k-1], tasks[tid[k-1]].wait);
-        printf("tasks[0].nr_unlocks=%i\n", tasks[0].nr_unlocks);
-        for(i = 0; i < k-1; i++)
+        for(i = 0; i < count; i++)
         {
-            t = &tasks[tid[i]];
+            t = &tasks[i];
             for(j = 0; j < t->nr_unlocks; j++)
             {
                 if(t->unlocks[j] == tid[k-1])
-                    printf("%i ", tid[i]);
-
-            }
+                    printf("Task %i is unlocking task %i\n", i, tid[k-1]);
+            }
         }
-        printf("\n");
-        for(i = 0; i < tasks[tid[k-1]].nr_unlocks; i++)
-        {
-            printf("%i ", tasks[tid[k-1]].unlocks[i]);
-        }
-        printf("\n");
-        for(i = 0; i < tasks[tid[k-1]].nr_unlocks; i++)
-        {
-            printf("unlocks[%i] = %i wait = %i\n", i, tasks[tid[k-1]].unlocks[i], tasks[tasks[tid[k-1]].unlocks[i]].wait);
-            for(j = 0; j < tasks[tasks[tid[k-1]].unlocks[i]].nr_unlocks; j++)
-            {
-                printf("%i ", tasks[tasks[tid[k-1]].unlocks[i]].unlocks[j]);
-            }
-            printf("\n");
-        }
-        printf("\n\n");
-        for(i = 0; i < k; i++)
-        {
-            if(tasks[tid[i]].type == 0)
-            {
-                int* idata = (int*)&(s->data[tasks[tid[i]].data]);
-                printf("%i %i,", tid[i], idata[0]);
-            }
-        }
-        printf("\n");
         error( "Circular dependencies detected." );
     }
     /* Run through the topologically sorted tasks backwards and