Commit 320d4466 authored by aidan

Added a non-recursive version of pc functions. Poor performance currently

parent 60922b1a
@@ -59,7 +59,7 @@ unsigned short int split, sorted;
 int parts, firstchild, sibling;
 int res, resz, resm, com_tid;
-}__attribute__((aligned(64)));
+};//__attribute__((aligned(64)));
 #define const_G 1
@@ -105,13 +105,16 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
 __shared__ double2 parts_xy[cell_maxparts];
 __shared__ double parts_z[cell_maxparts];
 __shared__ float4 parts_am[cell_maxparts];
+/*if(threadIdx.x == 0)
+printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split,
+cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
 /* Load particles of cell j into shared memory */
-for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
+/*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
 parts_xy[j] = parts_pos_xy[k];
 parts_z[j] = parts_pos_z[k];
 parts_am[j] = parts_a_m[k];
-}
+}*/
 /* Loop over cell i.*/
 for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
@@ -123,25 +126,27 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
 }
 mi = parts_a_m[i].w;
-for(j = 0; j < count_j; j++) {
+for(j = parts_j; j < parts_j + count_j; j++) {
 r2 = 0.0f;
-dx[0] = xi[0] - parts_xy[j].x;
-dx[1] = xi[1] - parts_xy[j].y;
-dx[2] = xi[2] - parts_z[j];
+dx[0] = xi[0] - parts_pos_xy[j].x;
+dx[1] = xi[1] - parts_pos_xy[j].y;
+dx[2] = xi[2] - parts_pos_z[j];
 r2 += dx[0] * dx[0];
 r2 += dx[1] * dx[1];
 r2 += dx[2] * dx[2];
 // ir = 1.0f / sqrtf(r2);
 ir = rsqrtf(r2);
 w = const_G * ir * ir * ir;
-mj = parts_am[j].w;
+mj = parts_a_m[j].w;
 for(k = 0; k < 3; k++) {
 ai[k] -= dx[k] * mj * w;
 }
+// atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
+// atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
+// atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
 }
 atomicAdd(&parts_a_m[i].x, ai[0]);
 atomicAdd(&parts_a_m[i].y, ai[1]);
 atomicAdd(&parts_a_m[i].z, ai[2]);
@@ -149,11 +154,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
 }
 /* Load particles of cell i into shared memory */
-for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
+/*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
 parts_xy[j] = parts_pos_xy[k];
 parts_z[j] = parts_pos_z[k];
 parts_am[j] = parts_a_m[k];
-}
+}*/
 /*Loop over cell j. */
 for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
 xi[0] = parts_pos_xy[i].x;
@@ -164,11 +169,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
 }
 mi = parts_a_m[i].w;
-for(j = 0; j < count_j; j++) {
+for(j = parts_i; j < parts_i + count_i; j++) {
 r2 = 0.0f;
-dx[0] = xi[0] - parts_xy[j].x;
-dx[1] = xi[1] - parts_xy[j].y;
-dx[2] = xi[2] - parts_z[j];
+dx[0] = xi[0] - parts_pos_xy[j].x;
+dx[1] = xi[1] - parts_pos_xy[j].y;
+dx[2] = xi[2] - parts_pos_z[j];
 r2 += dx[0] * dx[0];
 r2 += dx[1] * dx[1];
 r2 += dx[2] * dx[2];
@@ -176,12 +181,11 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
 ir = rsqrtf(r2);
 w = const_G * ir * ir * ir;
-mj = parts_am[j].w;
+mj = parts_a_m[j].w;
 for(k = 0; k < 3; k++) {
 ai[k] -= dx[k] * mj * w;
 }
 }
 atomicAdd(&parts_a_m[i].x, ai[0]);
 atomicAdd(&parts_a_m[i].y, ai[1]);
 atomicAdd(&parts_a_m[i].z, ai[2]);
@@ -190,25 +194,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
 }
-/*__device__ void iact_pair(int celli, int cellj) {
-struct cell *ci, *cj;
-ci = &cells[celli];
-cj = &cells[cellj];
-if(Check if neighbours0)
-{
-if(ci->split && cj->split) {
-//Split both cells and do all possible pairs.
-}else {
-iact_pair_direct(ci, cj);
-}
-}
-}*/
 __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
 int i, k;
@@ -218,16 +203,29 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
 int count = leaf->count;
 int parts = leaf->parts;
 int cell_j = cj - cells;
+int temp;
 float r2, dx[3], ir, w;
+// if(cell_j < 0)
+// {
+// if(threadIdx.x == 0)
+// printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
+// __syncthreads();
+// asm("trap;");
+// }
+// if(threadIdx.x == 0)
+// printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
+temp = cell_j;
 /* Init the com's data.*/
 j_com_xy = com_xy[cell_j];
 j_com_z = com_z[cell_j];
 j_com_mass = com_mass[cell_j];
-for(i = parts; i < parts+count; i++) {
+for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
 r2 = 0.0;
 dx[0] = j_com_xy.x - parts_pos_xy[i].x;
 r2 += dx[0] * dx[0];
@@ -238,11 +236,18 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
 ir = rsqrtf(r2);
 w = j_com_mass * const_G * ir * ir * ir;
-parts_a_m[i].x += w * dx[0];
-parts_a_m[i].y += w * dx[1];
-parts_a_m[i].z += w * dx[2];
+/* __threadfence();
+if(!isfinite(w * dx[0])){
+printf("Error in make_interact_pc, j_com_mass = %f, cell_j = %i, temp = %i, i = %i, threadIdx.x=%i\n", j_com_mass, cell_j, temp, i, threadIdx.x); asm("trap;");}
+if(!isfinite(w * dx[1])){
+printf("Error in make_interact_pc\n"); asm("trap;");}
+if(!isfinite(w * dx[2])){
+printf("Error in make_interact_pc\n"); asm("trap;");}*/
+atomicAdd( &parts_a_m[i].x , w * dx[0]);
+atomicAdd( &parts_a_m[i].y , w * dx[1]);
+atomicAdd( &parts_a_m[i].z , w * dx[2]);
 }
+//__syncthreads();
 }
 /**
@@ -291,38 +296,41 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
 __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
 struct cell *cp ,*cps;
+int leafnum = leaf - cells;
+//if(threadIdx.x == 0 && leafnum == 23)
+// printf("cj = %i\n", cj - cells);
+// printf("%i\n", leafnum);
-if(leaf->split)
-{
-printf("Leaf split = 1, oh dear.");
-asm("trap;");
-}
-if(ci->split > 1)
-{
-printf("Cell %i had split > 1\n", ci - cells);
-asm("trap;");
-}
-if(cj->split > 1)
-{
-printf("cell %i had split > 1\n", cj - cells);
-asm("trap;");
-}
+// if(threadIdx.x == 0)
+/// printf("ci = %i, cj = %i, leaf = %i\n", ci - cells, cj - cells, leaf - cells);
 for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
 if(is_inside(leaf, cp)) break;
 }
 if(are_neighbours_different_size(cp, cj)) {
 for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
 if(are_neighbours(cp, cps)) {
 if(cp->split && cps->split) {
 iact_pair_pc(cp, cps, leaf);
 }
 } else {
 make_interact_pc(leaf, cps);
+// if(threadIdx.x == 0 && leafnum == 23)
+// printf("leafnum = %i with cps = %i here\n", leafnum, cps - cells);
 __syncthreads();
 }
 }
 }else{
 for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
+// if(threadIdx.x == 0 && leafnum == 23)
+// printf("leafnum = %i with cps = %i\n", leafnum, cps - cells);
 make_interact_pc(leaf, cps);
 }
 }
 __syncthreads();
@@ -339,7 +347,7 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
 struct cell *cp, *cps;
-if(leaf->split)
+/*if(leaf->split)
 {
 printf("Leaf split = 1, oh dear.");
 asm("trap;");
@@ -348,9 +356,26 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
 {
 printf("Cell had split > 1\n");
 asm("trap;");
-}
+}*/
 /* Find the subcell of c the leaf is in.*/
+/*cp = c;
+cps = c;
+while(c->split)
+{
+for(cp = &cells[cp->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]){
+if(is_inside(leaf, cp)) break;
+}
+if(cp->split){
+for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) {
+if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
+}
+}
+c = cp;
+}*/
 for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) {
 if(is_inside(leaf, cp)) break;
 }
@@ -364,7 +389,7 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
 if(cp != cps && cps->split) iact_pair_pc(cp,cps,leaf);
 }
-}
+}//TODO
 }
@@ -385,7 +410,8 @@ __device__ void iact_self_direct(int cellID) {
 int count;
 int i,j,k;
+//if(threadIdx.x == 0)
+// printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
 //If cell is split, interact each child with itself, and with each of its siblings.
 /*if(c->split) {
 //TODO
@@ -395,24 +421,24 @@ __device__ void iact_self_direct(int cellID) {
 count = c->count;
 int z = threadIdx.x;
 /* Load particle data into shared memory*/
-for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
+/*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
 parts_xy[z] = parts_pos_xy[k];
 parts_z[z] = parts_pos_z[k];
 parts_am[z] = parts_a_m[k];
 }
-__syncthreads();
+__syncthreads();*/
-for(i = threadIdx.x; i < count; i += blockDim.x)
+for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
 {
-xi[0] = parts_xy[i].x;
-xi[1] = parts_xy[i].y;
-xi[2] = parts_z[i];
+xi[0] = parts_pos_xy[i].x;
+xi[1] = parts_pos_xy[i].y;
+xi[2] = parts_pos_z[i];
 for(k = 0; k < 3; k++) {
 ai[k] = 0.0;
 }
 mi = parts_a_m[i].w;
 //for(j = i+1; j!= i; j = (j+1)%count)
-for(j = 0; j < count; j++)
+for(j = parts; j < parts+count; j++)
 {
 if(i != j){
@@ -430,7 +456,7 @@ __device__ void iact_self_direct(int cellID) {
 //ir = 1.0f / sqrtf(r2);
 ir = rsqrtf(r2);
 w = const_G * ir * ir * ir;
-mj = parts_am[j].w;
+mj = parts_a_m[j].w;
 for(k = 0; k < 3; k++) {
 ai[k] -= w * dx[k] * mj;
 }
@@ -813,10 +839,10 @@ void cell_split(int c, struct qsched *s) {
 // struct cell *data[2] = {root, c};
 int data[2] = {root, c};
 int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
-2 * sizeof(int), 1);
+2 * sizeof(int), 3000);
-qsched_addlock(s, tid, cell_pool[root].res);
-qsched_addlock(s, tid, cell_pool[root].resz);
-qsched_addlock(s, tid, cell_pool[root].resm);
+/*qsched_adduse(s, tid, cell_pool[root].res);
+qsched_adduse(s, tid, cell_pool[root].resz);
+qsched_adduse(s, tid, cell_pool[root].resm);*/
 qsched_addlock(s, tid, cell_pool[c].res);
 qsched_addlock(s, tid, cell_pool[c].resz);
 qsched_addlock(s, tid, cell_pool[c].resm);
@@ -859,8 +885,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
 else{
 data[0] = ci - cell_pool;
 data[1] = -1;
-tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, ci->count*ci->count/2);
+tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, 2);
 qsched_addlock(s, tid, ci->res);
 qsched_addlock(s, tid, ci->resz);
 qsched_addlock(s, tid, ci->resm);
@@ -868,7 +893,9 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
 }
 /* Else its a pair!*/
 else{
-if(are_neighbours_host(ci,cj)){/* Cells are neighbours */
+if(!are_neighbours_host(ci,cj)){/* Cells are neighbours */
+}else{
 /*Are both split? */
 if(ci->split && cj->split)
 {
@@ -885,7 +912,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
 /* Create the task. */
 tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
-sizeof(struct cell *) * 2, ci->count * cj->count);
+sizeof(struct cell *) * 2, 1);
 /* Add the resources. */
 qsched_addlock(s, tid, ci->res);
@@ -909,7 +936,6 @@ __device__ void runner( int type , void *data ) {
 int *idata = (int *)data;
 int i = idata[0];
 int j = idata[1];
-
 switch ( type ) {
 case task_type_self:
 iact_self_direct(i);
@@ -1044,18 +1070,31 @@ void test_bh(int N, int runs, char *fileName) {
 c = cell_pool[c].firstchild;
 }
 }
+message("root.sibling = %i, root.split = %i", root->sibling, root->split);
+printf("nr_leaves = %i\n", nr_leaves);
 message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
 message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
-for(k = 0; k < num_cells; k++)
-if(cell_pool[k].split > 1 )
-printf("Split > 1\n");
+/* for(k = 0; k < num_cells; k++)
+if(cell_pool[k].split > 1 )
+printf("Split > 1\n");*/
 create_tasks(&s, root, NULL);
+int self = 0, pair = 0, pc = 0;
+for(k = 0; k < s.count; k++)
+{
+if(s.tasks[k].type == task_type_self)
+self++;
+else if (s.tasks[k].type == task_type_pair)
+pair++;
+else if (s.tasks[k].type >= 0)
+pc++;
+}
 message("total number of tasks: %i.", s.count);
+message("total number of pair tasks: %i.", pair);
+message("total number of self tasks: %i.", self);
+message("total number of pc tasks: %i.", pc);
 message("total number of cells: %i.", number);
 message("total number of deps: %i.", s.count_deps);
 message("total number of res: %i.", s.count_res);
@@ -1098,7 +1137,7 @@ float *comm_temp;
 if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
 error("Failed to allocate com on the GPU");
-if( cudaMemcpy( comm_temp, com_z_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
 error("failed to copy com to the GPU");
 if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
 error("Failed to copy com pointer to the GPU");
@@ -1115,13 +1154,37 @@ void test_bh(int N, int runs, char *fileName) {
 }
 }*/
+// printf("com_mass_host[152] = %f\n", com_mass_host[152]);
 //Run code.
-printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
+// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
 qsched_run_CUDA( &s , func );
-}
+qsched_print_cuda_timers(&s);
+k = 0;
+printf("%e, %e, %e, %e, %e, %e, %e\n", parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
+parts_a_m_host[k].x, parts_a_m_host[k].y, parts_a_m_host[k].z);
+struct task* tasks = qsched_get_timers( &s , s.count );
+for(i = 0; i < s.count; i++)
+{
+printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
+// printf("\n");
+}
+}
+/* Dump the particles to a file */
+file = fopen("particle_dump.dat", "w");
+/* fprintf(file,
+"# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
+"a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
+for (k = 0; k < N; ++k)
+fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n",
+parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
+parts_a_m_host[k].x, parts_a_m_host[k].y, parts_a_m_host[k].z);
+fclose(file);
 }
/*******************************************************************************
* This file is part of QuickSched.
* Copyright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
* Aidan Chalk (aidan.chalk@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* *****************************************************************************/
/* Config parameters. */
#include "../config.h"
/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <omp.h>
#include <fenv.h>
/* Local includes. */
extern "C"{
#include "quicksched.h"
#include "res.h"
}
#include "cuda_queue.h"
/** Task types. */
enum task_type {
task_type_self = 0,
task_type_pair,
task_type_self_pc,
task_type_com,
task_type_count
};
struct cell{
double2 loc_xy;
double loc_z;
double h;
int count;
unsigned short int split, sorted;
int parts, firstchild, sibling;
int res, /*resz, resm,*/ com_tid;
};//__attribute__((aligned(64)));
struct part{
double loc[3];
float a[3];
float m;
};//__attribute__((aligned(64)));
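/* Gravitational constant; set to 1 since the code works in normalised units
(positions and masses are drawn from [0,1]). */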
#define const_G 1
/* Required variables to obtain cells. */
#define cell_maxparts 256
#define CELL_STRETCH 2
#define INITIAL_CELLS 256
struct cell *cell_pool = NULL;
int used_cells=0;
int num_cells = 0;
int cell_size = INITIAL_CELLS*sizeof(struct cell);
/* Device locations for the particle values. */
//__device__ double2 *parts_pos_xy;
//__device__ double *parts_pos_z;
//__device__ float4 *parts_a_m;
__device__ double2 *com_xy;
__device__ double *com_z;
__device__ float *com_mass;
__device__ struct part *parts_cuda;
__device__ struct cell *cells;
/* Host locations for the particle values. */
//double2 *parts_pos_xy_host;
//double *parts_pos_z_host;
//float4 *parts_a_m_host;
double2 *com_xy_host;
double *com_z_host;
float *com_mass_host;
double2 *parts_pos_xy_temp;
double *parts_pos_z_temp;
float4 *parts_a_m_temp;
struct part *parts_host;
struct part *parts_temp;
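/**
* @brief Compute the direct particle-particle interactions between the two
* neighbouring cells ci and cj. Each thread takes one particle of ci (then of
* cj) per iteration, loops over every particle of the other cell and
* accumulates a_i -= G * m_j * dx / r^3 before writing back atomically.
*/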
__device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
int i, j, k;
int count_i = ci->count, count_j = cj->count;
int parts_i = ci->parts, parts_j = cj->parts;
double xi[3];
float dx[3], ai[3], mi, mj, r2, w, ir;
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
__shared__ float4 parts_am[cell_maxparts];
/*if(threadIdx.x == 0)
printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split,
cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
/* Load particles of cell j into shared memory */
/*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
}*/
/* Loop over cell i.*/
for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
xi[0] = parts_cuda[i].loc[0];
xi[1] = parts_cuda[i].loc[1];
xi[2] = parts_cuda[i].loc[2];
for(k = 0; k < 3; k++) {
ai[k] = 0.0f;
}
mi = parts_cuda[i].m;
for(j = parts_j; j < parts_j + count_j; j++) {
r2 = 0.0f;
dx[0] = xi[0] - parts_cuda[j].loc[0];
dx[1] = xi[1] - parts_cuda[j].loc[1];
dx[2] = xi[2] - parts_cuda[j].loc[2];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
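/* Apply the (unsoftened) gravitational acceleration; rsqrtf is the fast
reciprocal square root, so w = G / r^3. */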
// ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_cuda[j].m;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
// atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
// atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
// atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
}
atomicAdd(&parts_cuda[i].a[0], ai[0]);
atomicAdd(&parts_cuda[i].a[1], ai[1]);
atomicAdd(&parts_cuda[i].a[2], ai[2]);
}
/* Load particles of cell i into shared memory */
/*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
}*/
/*Loop over cell j. */
for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
xi[0] = parts_cuda[i].loc[0];
xi[1] = parts_cuda[i].loc[1];
xi[2] = parts_cuda[i].loc[2];
for(k = 0; k < 3; k++) {
ai[k] = 0.0f;
}
mi = parts_cuda[i].m;
for(j = parts_i; j < parts_i + count_i; j++) {
r2 = 0.0f;
dx[0] = xi[0] - parts_cuda[j].loc[0];
dx[1] = xi[1] - parts_cuda[j].loc[1];
dx[2] = xi[2] - parts_cuda[j].loc[2];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_cuda[j].m;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
}
atomicAdd(&parts_cuda[i].a[0], ai[0]);
atomicAdd(&parts_cuda[i].a[1], ai[1]);
atomicAdd(&parts_cuda[i].a[2], ai[2]);
}
}
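/**
* @brief Interact all particles of a leaf with the monopole (centre of mass)
* of cell cj: a_i += G * M_j * dx / r^3, with dx pointing towards the com.
*/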
__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
int i, k;
double2 j_com_xy;
double j_com_z;
float j_com_mass;
int count = leaf->count;
int parts = leaf->parts;
int cell_j = cj - cells;
int temp;
float r2, dx[3], ir, w;
// if(cell_j < 0)
// {
// if(threadIdx.x == 0)
// printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
// __syncthreads();
// asm("trap;");
// }
// if(threadIdx.x == 0)
// printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
temp = cell_j;
/* Init the com's data.*/
j_com_xy = com_xy[cell_j];
j_com_z = com_z[cell_j];
j_com_mass = com_mass[cell_j];
for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
r2 = 0.0;
dx[0] = j_com_xy.x - parts_cuda[i].loc[0];
r2 += dx[0] * dx[0];
dx[1] = j_com_xy.y - parts_cuda[i].loc[1];
r2 += dx[1] * dx[1];
dx[2] = j_com_z - parts_cuda[i].loc[2];
r2 += dx[2] * dx[2];
ir = rsqrtf(r2);
w = j_com_mass * const_G * ir * ir * ir;
atomicAdd( &parts_cuda[i].a[0] , w * dx[0]);
atomicAdd( &parts_cuda[i].a[1] , w * dx[1]);
atomicAdd( &parts_cuda[i].a[2] , w * dx[2]);
}
//__syncthreads();
}
/**
* @brief Checks whether the cells are direct neighbours or not. The cells may differ in size.
*/
__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) {
int k;
float dx[3];
double cih = ci->h, cjh = cj->h;
float min_dist = 0.5*(cih+cjh);
float center_i = ci->loc_xy.x + 0.5*cih;
float center_j = cj->loc_xy.x + 0.5*cjh;
dx[0] = fabsf(center_i - center_j);
center_i = ci->loc_xy.y + 0.5*cih;
center_j = cj->loc_xy.y + 0.5*cjh;
dx[1] = fabsf(center_i - center_j);
center_i = ci->loc_z + 0.5*cih;
center_j = cj->loc_z + 0.5*cjh;
dx[2] = fabsf(center_i - center_j);
return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
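/**
* @brief Checks whether two cells of the same size are direct neighbours,
* comparing the offset of their lower corners along each axis with the cell
* edge length.
*/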
__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) {
int k;
float dx[3];
float min_dist = ci->h;
float center_i = ci->loc_xy.x;
float center_j = cj->loc_xy.x;
dx[0] = fabsf(center_i - center_j);
center_i = ci->loc_xy.y;
center_j = cj->loc_xy.y;
dx[1] = fabsf(center_i - center_j);
center_i = ci->loc_z;
center_j = cj->loc_z;
dx[2] = fabsf(center_i - center_j);
return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
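/* Particle indices are contiguous per subtree, so the leaf lies inside c
exactly when its first particle index falls within c's particle range. */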
__device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count);
}
__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
struct cell *cp ,*cps;
/*
Find child of ci containing leaf.
if child and cj are_neighbours_different_size
Loop through children of cj.
If child and childj are neighbours
If both split -> recurse
else
make_interact_pc
else
loop through children of cj
make interact pc
*/
struct cell *loopend;
cp = ci;
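/* Iterative form of the traversal sketched above: each pass of the while loop
descends one level towards the leaf, opening cj's children in place rather
than recursing. */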
while(cp->split)
{
for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
}
if(are_neighbours_different_size(cp, cj)) {
loopend = &cells[cj->sibling];
cps = &cells[cj->firstchild];
while(cps != loopend)
{
if(!are_neighbours(cp, cps)){
make_interact_pc(leaf, cps);
//__syncthreads();
cps = &cells[cps->sibling];
}else{
if(cps->split)
cps = &cells[cps->firstchild];
else
cps = &cells[cps->sibling];
}
}
}else{
for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
make_interact_pc(leaf, cps);
// __syncthreads();
}
break;
}
ci = cp;
}
/* for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
}
if(are_neighbours_different_size(cp, cj)) {
for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
if(!are_neighbours(cp, cps)) {
make_interact_pc(leaf, cps);
__syncthreads();
} else {
if(cp->split && cps->split) {
iact_pair_pc(cp, cps, leaf);
}
}
}
}else{
for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
make_interact_pc(leaf, cps);
}
}*/
// __syncthreads();
}
/**
* @brief Compute the interactions between all particles in a leaf and
* and all the monopoles in the cell c
*
* @param c The #cell containing the monopoles
* @param leaf The #cell containing the particles
*/
__device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
struct cell *cp, *cps;
/*if(leaf->split)
{
printf("Leaf split = 1, oh dear.");
asm("trap;");
}
if(c->split > 1)
{
printf("Cell had split > 1\n");
asm("trap;");
}*/
/* Find the subcell of c the leaf is in.*/
cp = c;
cps = c;
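/* Non-recursive descent: repeatedly step c down to the child cp that contains
the leaf, pairing cp against each of its split siblings on the way down. */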
while(c->split)
{
for(cp = &cells[cp->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]){
if(is_inside(leaf, cp)) break;
}
if(cp->split){
for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) {
if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
}
}
c = cp;
}
/*for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
}
if(cp->split) {
iact_self_pc(cp, leaf);
for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) {
if(cp != cps && cps->split) iact_pair_pc(cp,cps,leaf);
}
}*///TODO
}
/**
* @brief Compute the interactions between all particles in a cell.
*
* @param cellID The cell ID to compute interactions on.
*/
__device__ void iact_self_direct(int cellID) {
struct cell *c = &cells[cellID];
double xi[3] = {0.0, 0.0, 0.0};
float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
__shared__ float4 parts_am[cell_maxparts];
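/* Shared-memory staging buffers; the copy-in below is currently commented
out, so the loops read the particle data straight from global memory. */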
int parts;
int count;
int i,j,k;
//if(threadIdx.x == 0)
// printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
//If cell is split, interact each child with itself, and with each of its siblings.
/*if(c->split) {
//TODO
} else {*/
parts = c->parts;
count = c->count;
int z = threadIdx.x;
/* Load particle data into shared memory*/
/*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
parts_xy[z] = parts_pos_xy[k];
parts_z[z] = parts_pos_z[k];
parts_am[z] = parts_a_m[k];
}
__syncthreads();*/
for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
{
xi[0] = parts_cuda[i].loc[0];
xi[1] = parts_cuda[i].loc[1];
xi[2] = parts_cuda[i].loc[2];
for(k = 0; k < 3; k++) {
ai[k] = 0.0;
}
mi = parts_cuda[i].m;
//for(j = i+1; j!= i; j = (j+1)%count)
for(j = parts; j < parts+count; j++)
{
if(i != j){
/* Compute the pairwise distance. */
r2 = 0.0;
dx[0] = xi[0] - parts_cuda[j].loc[0];
r2 += dx[0]*dx[0];
dx[1] = xi[1] - parts_cuda[j].loc[1];
r2 += dx[1]*dx[1];
dx[2] = xi[2] - parts_cuda[j].loc[2];
r2 += dx[2]*dx[2];
/* Apply the gravitational acceleration. */
//ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_cuda[j].m;
for(k = 0; k < 3; k++) {
ai[k] -= w * dx[k] * mj;
}
}
}
//Update.
atomicAdd(&parts_cuda[i].a[0],ai[0] );
atomicAdd(&parts_cuda[i].a[1],ai[1] );
atomicAdd(&parts_cuda[i].a[2],ai[2] );
}
}
/**
* @brief Checks whether the cells are direct neighbours or not. Both cells have
* to be of the same size.
*/
static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
int k;
float dx[3];
#ifdef SANITY_CHECKS
if (ci->h != cj->h)
error(" Cells of different size in distance calculation.");
#endif
/* Maximum allowed distance */
float min_dist = ci->h;
/* Distance between the cells along each axis */
double2 loc1=ci->loc_xy, loc2=cj->loc_xy;
float center_i = loc1.x;
float center_j = loc2.x;
dx[0] = fabs(center_i - center_j);
center_i = loc1.y;
center_j = loc2.y;
dx[1] = fabs(center_i - center_j);
center_i = ci->loc_z;
center_j = cj->loc_z;
dx[2] = fabs(center_i - center_j);
return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
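/**
* @brief Get a new cell from the pool, growing the pool (and the matching
* centre-of-mass arrays) by a factor of CELL_STRETCH when it runs out.
*/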
struct cell *cell_get()
{
struct cell *res;
if(num_cells == 0)
{
cell_pool = (struct cell*) calloc(INITIAL_CELLS, sizeof(struct cell));
if(cell_pool == NULL)
error("Failed to allocate cell_pool");
com_xy_host = (double2*) calloc(INITIAL_CELLS, sizeof(double2));
if(com_xy_host == NULL)
error("Failed to allocate cell_pool");
com_z_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
if(com_z_host == NULL)
error("Failed to allocate cell_pool");
com_mass_host = (float*) calloc(INITIAL_CELLS, sizeof(float));
if(com_mass_host == NULL)
error("Failed to allocate cell_pool");
num_cells = INITIAL_CELLS;
}
if(used_cells >= num_cells)
{
/* Stretch */
struct cell *new_pool;
cell_size *= CELL_STRETCH;
new_pool = (struct cell*) calloc(num_cells*CELL_STRETCH, sizeof(struct cell));
if(new_pool == NULL)
error("Failed to allocate new_pool");
if(cell_pool != NULL)
memcpy(new_pool, cell_pool, num_cells*sizeof(struct cell));
double2 *tempxy = (double2*) calloc(num_cells*CELL_STRETCH, sizeof(double2));
if(tempxy == NULL)
error("Failed to allocate tempxy");
memcpy(tempxy, com_xy_host, sizeof(double2)*num_cells);
free(com_xy_host);
com_xy_host = tempxy;
double *tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
if(tempz == NULL)
error("Failed to allocate tempz");
memcpy(tempz, com_z_host, num_cells*sizeof(double));
free(com_z_host);
com_z_host = tempz;
float *tempm = (float*) calloc(num_cells*CELL_STRETCH, sizeof(float));
if(tempm == NULL)
error("Failed to allocate tempm");
memcpy(tempm, com_mass_host, num_cells*sizeof(float));
free(com_mass_host);
com_mass_host = tempm;
num_cells *= CELL_STRETCH;
free(cell_pool);
cell_pool = new_pool;
message("Increased size of arrays");
}
used_cells++;
cell_pool[used_cells-1].sibling = -1;
cell_pool[used_cells-1].firstchild = -1;
cell_pool[used_cells-1].res = qsched_res_none;
//cell_pool[used_cells-1].resz = qsched_res_none;
//cell_pool[used_cells-1].resm = qsched_res_none;
return &cell_pool[used_cells-1];
}
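/**
* @brief Compute the centre of mass of a cell, from its children's monopoles
* if it is split, otherwise directly from its particles.
*/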
void comp_com(struct cell *c){
int k, count = c->count;
int cpi;
struct cell *cp;
int parts = c->parts;
double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
if(c->split) {
for(cp = &cell_pool[(cpi = c->firstchild)]; cpi != c->sibling; cp = &cell_pool[(cpi = cp->sibling)]) {
float cp_mass = com_mass_host[cpi];
com[0] += com_xy_host[cpi].x * cp_mass;
com[1] += com_xy_host[cpi].y * cp_mass;
com[2] += com_z_host[cpi] * cp_mass;
mass += cp_mass;
}
/* Otherwise collect the multipole from the particles */
} else {
for(k = parts; k < parts+count; k++)
{
float p_mass = parts_host[k].m;
com[0] += parts_host[k].loc[0] * p_mass;
com[1] += parts_host[k].loc[1] * p_mass;
com[2] += parts_host[k].loc[2] * p_mass;
mass += p_mass;
}
}
k = c - cell_pool;
/* Store the COM data, if it was collected. */
if(mass > 0.0) {
float imass = 1.0f/mass;
com_xy_host[k].x = com[0] * imass;
com_xy_host[k].y = com[1] * imass;
com_z_host[k] = com[2] * imass;
com_mass_host[k] = mass;
}else
{
com_xy_host[k].x = 0.0;
com_xy_host[k].y = 0.0;
com_z_host[k] = 0.0;
com_mass_host[k] = 0.0f;
}
}
/**
* @brief Sort the parts into eight bins along the given pivots and
* fill the multipoles. Also adds the hierarchical resources
* to the sched (TODO).
*
* @param c The #cell to be split.
* @param N The total number of parts.
* @param s The #sched to store the resources.
*/
void cell_split(int c, struct qsched *s) {
int i, j, k, kk, count = cell_pool[c].count;
int parts = cell_pool[c].parts;
struct part temp_part;
double2 tempxy;
double tempxy1;
float4 tempxy2;
struct cell *cp, *cell;
int left[8], right[8];
double pivot[3];
static int root = -1;
// struct cell *progenitors[8];
int progenitors[8];
int c1 = c;
struct part *temp_xy;
double *temp_z;
float4 *temp_a_m;
/* Set the root cell. */
if (root < 0) {
root = c;
cell_pool[c].sibling = -1;
}
if(cell_pool[c].res == qsched_res_none)
{
if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[c].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
// printf("tempxy = %p\n", temp_xy);
cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
sizeof(struct part) * cell_pool[c].count, parts_temp + cell_pool[c].parts);
}
/*if(cell_pool[c].resz == qsched_res_none)
{
if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[c].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[c].resz = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_z,
sizeof(double) * cell_pool[c].count, parts_pos_z_temp + cell_pool[c].parts);
}
if(cell_pool[c].resm == qsched_res_none)
{
if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[c].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m,
sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts);
}*/
// error("Cell has no resource");*///TODO
if(count > cell_maxparts )
{
cell_pool[c].split = 1;
for(k = 0; k < 8; k++)
{
progenitors[k] = (cp = cell_get()) - cell_pool;
cell = &cell_pool[c];
cp->loc_xy = cell->loc_xy;
cp->loc_z = cell->loc_z;
cp->h = cell->h*0.5;
if(k & 4) cp->loc_xy.x += cp->h;
if(k & 2) cp->loc_xy.y += cp->h;
if(k & 1) cp->loc_z += cp->h;
}
/* Init the pivots.*/
pivot[0] = cell->loc_xy.x + cell->h * 0.5;
pivot[1] = cell->loc_xy.y + cell->h * 0.5;
pivot[2] = cell->loc_z + cell->h * 0.5;
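/* Partition the particles octant by octant: one pass along x below, then two
along y and four along z, so that bin k ends up holding the octant selected
by the bits (k&4, k&2, k&1). */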
/* Split along the x axis. */
i = parts;
j = parts+count-1;
while(i < j)
{
while(i <= parts+count-1 && parts_host[i].loc[0] < pivot[0]) i += 1;
while(j >= parts && parts_host[j].loc[0] >= pivot[0]) j -= 1;
if(i < j){
temp_part = parts_host[i];
parts_host[i] = parts_host[j];
parts_host[j] = temp_part;
}
}
left[1] = i;
right[1] = parts+count-1;
left[0] = parts;
right[0] = j;
/* Split along the y axis twice. */
for (k = 1; k >= 0; k--) {
i = left[k];
j = right[k];
while(i <= j){
while(i <= right[k] && parts_host[i].loc[1] < pivot[1]) i += 1;
while(j >= left[k] && parts_host[j].loc[1] >= pivot[1]) j -= 1;
if(i < j)
{
temp_part = parts_host[i];
parts_host[i] = parts_host[j];
parts_host[j] = temp_part;
}
}
left[2*k+1] = i;
right[2*k+1] = right[k];
left[2*k] = left[k];
right[2*k] = j;
}
/* Split along the z axis four times.*/
for(k = 3; k >=0; k--)
{
i = left[k];
j = right[k];
while(i <= j){
while(i <= right[k] && parts_host[i].loc[2] < pivot[2]) i += 1;
while(j >= left[k] && parts_host[j].loc[2] >= pivot[2]) j -= 1;
if(i < j)
{
temp_part = parts_host[i];
parts_host[i] = parts_host[j];
parts_host[j] = temp_part;
}
}
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
}
/* Store the counts and offsets. */
for(k = 0; k < 8; k++)
{
cell_pool[progenitors[k]].count = right[k]-left[k]+1;
cell_pool[progenitors[k]].parts = left[k];
if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy,
sizeof(struct part) * cell_pool[progenitors[k]].count, parts_temp + cell_pool[progenitors[k]].parts);
/*if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[progenitors[k]].resz = qsched_addres(s, qsched_owner_none, cell->resz, temp_z,
sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z_temp + cell_pool[progenitors[k]].parts);
if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[progenitors[k]].resm = qsched_addres(s, qsched_owner_none, cell->resm, temp_a_m,
sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m_temp + cell_pool[progenitors[k]].parts);*/
}
/* Find the first non-empty progenitor */
for(k = 0; k < 8; k++)
{
if(cell_pool[progenitors[k]].count > 0)
{
cell->firstchild = progenitors[k];
break;
}
}
#ifdef SANITY_CHECKS
if(cell->firstchild == -1)
error("Cell has been split but all children have 0 parts");
#endif
/*Prepare the pointers*/
for(k = 0; k < 8; k++)
{
/* Find the next non-empty sibling */
for(kk = k+1; kk < 8; ++kk){
if(cell_pool[progenitors[kk]].count > 0){
cell_pool[progenitors[k]].sibling = progenitors[kk];
break;
}
}
/* No non-empty sibling, go back a level.*/
if(kk == 8) cell_pool[progenitors[k]].sibling = cell->sibling;
}
/* Recurse */
for(k = 0; k < 8; k++)
if(cell_pool[progenitors[k]].count > 0) cell_split(progenitors[k], s);
/* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */
} else {
// struct cell *data[2] = {root, c};
int data[2] = {root, c};
int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
2 * sizeof(int), 3000);
/*qsched_adduse(s, tid, cell_pool[root].res);
qsched_adduse(s, tid, cell_pool[root].resz);
qsched_adduse(s, tid, cell_pool[root].resm);*/
qsched_addlock(s, tid, cell_pool[c].res);
// qsched_addlock(s, tid, cell_pool[c].resz);
// qsched_addlock(s, tid, cell_pool[c].resm);
}
#ifndef COM_AS_TASK
comp_com(&cell_pool[c]);
#endif
}
/**
* @brief Create the tasks for the cell pair/self.
*
* @param s The #sched in which to create the tasks.
* @param ci The first #cell.
* @param cj The second #cell.
*/
void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
qsched_task_t tid;
int data[2];
struct cell /**data[2],*/ *cp, *cps;
int cpi;
if(cj == NULL)
{
if(ci->split)
{
for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling])
{
//Self Interaction.
create_tasks(s, cp, NULL);
for(cps = &cell_pool[cp->sibling]; cps != &cell_pool[ci->sibling]; cps = &cell_pool[cps->sibling])
create_tasks(s, cp, cps);
}
}
/* Self task */
else{
data[0] = ci - cell_pool;
data[1] = -1;
tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, 2);
qsched_addlock(s, tid, ci->res);
//qsched_addlock(s, tid, ci->resz);
//qsched_addlock(s, tid, ci->resm);
}
}
/* Else it's a pair! */
else{
if(!are_neighbours_host(ci,cj)){/* Cells are not neighbours: nothing to do. */
}else{
/*Are both split? */
if(ci->split && cj->split)
{
/* Recurse over both cells. */
for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling])
for(cps = &cell_pool[cj->firstchild]; cps != &cell_pool[cj->sibling]; cps = &cell_pool[cps->sibling])
create_tasks(s, cp, cps);
/* Otherwise, at least one of the cells is not split, build a direct
* interaction. */
}else{
data[0] = ci-cell_pool;
data[1] = cj-cell_pool;
/* Create the task. */
tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
sizeof(int) * 2, 1);
/* Add the resources. */
qsched_addlock(s, tid, ci->res);
// qsched_addlock(s, tid, ci->resz);
// qsched_addlock(s, tid, ci->resm);
qsched_addlock(s, tid, cj->res);
// qsched_addlock(s, tid, cj->resz);
// qsched_addlock(s, tid, cj->resm);
}
}
}
}
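/**
* @brief Task dispatch function executed on the device. data carries two cell
* indices: {cell, -1} for a self task, {ci, cj} for a pair task and
* {root, leaf} for a particle-cell task.
*/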
__device__ void runner( int type , void *data ) {
int *idata = (int *)data;
int i = idata[0];
int j = idata[1];
switch ( type ) {
case task_type_self:
iact_self_direct(i);
break;
case task_type_pair:
iact_pair_direct(&cells[i], &cells[j]);
break;
case task_type_self_pc:
iact_self_pc( &cells[i], &cells[j] );
break;
default:
asm("trap;");
}
__syncthreads();
}
__device__ qsched_funtype function = runner;
qsched_funtype func;
/**
* @brief Set up and run a task-based Barnes-Hutt N-body solver.
*
* @param N The number of random particles to use.
* @param runs Number of force evaluations to use as a benchmark.
* @param fileName Input file name. If @c NULL or an empty string, random
* particle positions will be used.
*/
void test_bh(int N, int runs, char *fileName) {
int i, k;
struct cell *root;
struct part *parts;
FILE *file;
struct qsched s;
ticks tic, toc_run, tot_setup = 0, tot_run = 0;
int countMultipoles = 0, countPairs = 0, countCoMs = 0;
struct cell *gpu_ptr_cells;
cudaFree(0);
if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
error("Failed to copy function pointer from device");
/* Initialize the scheduler. */
qsched_init(&s, 1, qsched_flag_none);
//Create host particle arrays.
if( cudaMallocHost(&parts_host, sizeof(struct part) * N) != cudaSuccess)
error("Failed to allocate parts array");
if( cudaMalloc(&parts_temp, sizeof(struct part) * N) != cudaSuccess)
error("Failed to allocate device parts array");
if( cudaMemcpyToSymbol(parts_cuda, &parts_temp, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to set device symbol for parts array");
if(fileName == NULL || fileName[0] == 0) {
for(k = 0; k < N; k++) {
parts_host[k].loc[0] = ((double)rand())/ RAND_MAX;
parts_host[k].loc[1] = ((double)rand())/ RAND_MAX;
parts_host[k].loc[2] = ((double)rand())/ RAND_MAX;
parts_host[k].m = ((double)rand()) / RAND_MAX;
}
} else {
file = fopen(fileName, "r");
int tempxy;
if(file) {
for(k = 0; k < N; k++) {
if(fscanf(file, "%d", &tempxy) != 1)
error("Failed to read ID");
if(fscanf(file, "%lf%lf%lf", &parts_host[k].loc[0], &parts_host[k].loc[1], &parts_host[k].loc[2]) != 3)
error("Failed to read position of part %i.", k);
if(fscanf(file, "%f", &parts_host[k].m) != 1)
error("Failed to read mass of part %i.", k);
}
fclose(file);
}
}
/* Init the cells. */
root = cell_get();
root->loc_xy.x = 0.0;
root->loc_xy.y = 0.0;
root->loc_z = 0.0;
root->h = 1.0;
root->count = N;
root->parts = 0;
root->sibling = -1;
int c = root-cell_pool;
cell_split(root - cell_pool, &s);
root = &cell_pool[c];
int nr_leaves = 0;
int maxparts=0, minparts=1000000;
int number = 0;
while(c >= 0) {
if(cell_pool[c].count > 0)
{
number++;
if(cell_pool[c].res == qsched_res_none)
message("cell %i has no res", c);
//if(cell_pool[c].resz == qsched_res_none)
// message("cell %i has no resz", c);
//if(cell_pool[c].resm == qsched_res_none)
// message("cell %i has no resm", c);
}
if(!cell_pool[c].split) {
nr_leaves++;
if(cell_pool[c].count > maxparts)
{
maxparts = cell_pool[c].count;
}
if(cell_pool[c].count < minparts)
{
minparts = cell_pool[c].count;
}
c = cell_pool[c].sibling;
} else {
c = cell_pool[c].firstchild;
}
}
message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
/* for(k = 0; k < num_cells; k++)
if(cell_pool[k].split > 1 )
printf("Split > 1\n");*/
create_tasks(&s, root, NULL);
int self = 0, pair = 0, pc = 0;
for(k = 0; k < s.count; k++)
{
if(s.tasks[k].type == task_type_self)
self++;
else if (s.tasks[k].type == task_type_pair)
pair++;
else if (s.tasks[k].type >= 0)
pc++;
}
message("total number of tasks: %i.", s.count);
message("total number of pair tasks: %i.", pair);
message("total number of self tasks: %i.", self);
message("total number of pc tasks: %i.", pc);
message("total number of cells: %i.", number);
message("total number of deps: %i.", s.count_deps);
message("total number of res: %i.", s.count_res);
message("total number of locks: %i.", s.count_locks);
for(k = 0; k < runs; k++) {
for(i = 0; i < N; ++i) {
parts_host[i].a[0] = 0.0;
parts_host[i].a[1] = 0.0;
parts_host[i].a[2] = 0.0;
}
/* Copy the cells to the device. */
if( cudaMalloc( &gpu_ptr_cells , sizeof(struct cell) * used_cells) != cudaSuccess)
error("Failed to allocate cells on the GPU");
if( cudaMemcpy( gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy cells to the GPU");
if( cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell*), 0, cudaMemcpyHostToDevice) != cudaSuccess )
error("Failed to copy cell pointer to the GPU");
double2 *com_temp;
double *comz_temp;
float *comm_temp;
if(cudaMalloc( &com_temp, sizeof(double2) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
if(cudaMalloc( &comz_temp, sizeof(double) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
/* for(i = 0; i < s.count; i++){
int *idata = (int*)&s.data[s.tasks[i].data];
if(s.tasks[i].type == task_type_self)
printf("Self task with data[0] = %i\n", idata[0]);
if(s.tasks[i].type == task_type_pair) {
printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
}
if(s.tasks[i].type == task_type_self_pc) {
printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
}
}*/
// printf("com_mass_host[152] = %f\n", com_mass_host[152]);
//Run code.
// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
qsched_run_CUDA( &s , func );
qsched_print_cuda_timers(&s);
k = 0;
printf("%e, %e, %e, %e, %e, %e, %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
struct task* tasks = qsched_get_timers( &s , s.count );
for(i = 0; i < s.count; i++)
{
printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
// printf("\n");
}
}
/* Dump the particles to a file */
file = fopen("particle_dump.dat", "w");
/* fprintf(file,
"# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
"a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
for (k = 0; k < N; ++k)
fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n",
parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
fclose(file);
}
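/* Entry point. A typical invocation (binary name hypothetical) would be:
./test_bh_cuda -n 100000 -r 4 -f input.dat */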
int main(int argc, char *argv[]) {
int c, nr_threads = 1;
int N = 1000, runs = 1;
char fileName[100] = {0};
/* Parse the options */
while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) {
case 'n':
if (sscanf(optarg, "%d", &N) != 1)
error("Error parsing number of particles.");
break;
case 'r':
if (sscanf(optarg, "%d", &runs) != 1)
error("Error parsing number of runs.");
break;
case 't':
if (sscanf(optarg, "%d", &nr_threads) != 1)
error("Error parsing number of threads.");
// omp_set_num_threads(nr_threads);
break;
case 'f':
if (sscanf(optarg, "%s", &fileName[0]) != 1)
error("Error parsing file name.");
break;
case '?':
fprintf(stderr, "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file] "
"[-c Nparts] [-i Niterations] \n",
argv[0]);
fprintf(stderr, "Solves the N-body problem using a Barnes-Hut\n"
"tree code with N random particles read from a file in "
"[0,1]^3 using"
"nr_threads threads.\n"
"A test of the neighbouring cells interaction with "
"Nparts per cell is also run Niterations times.\n");
exit(EXIT_FAILURE);
}
/* Tree node information */
printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
/* Part information */
// printf("Size of part: %zu bytes.\n", sizeof(struct part));
/* Dump arguments. */
if (fileName[0] == 0) {
message("Computing the N-body problem over %i random particles using %i "
"threads (%i runs).",
N, nr_threads, runs);
} else {
message("Computing the N-body problem over %i particles read from '%s' "
"using %i threads (%i runs).",
N, fileName, nr_threads, runs);
}
/* Run the BH test. */
test_bh(N, runs, fileName);
}