diff --git a/README b/README
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..59ae0410860ec6e1fad951920ef78dd4c832793e 100644
--- a/README
+++ b/README
@@ -0,0 +1,6 @@
+Installation Instructions:
+
+1) Run ./autogen.sh
+2) Run ./configure
+3) Run make in the main directory.
+4) Navigate to the src/ directory and run the ./CUDACompile.sh script. This step assumes gcc-4.8 is the default compiler and that the nvcc command is available on your PATH.
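+
+For example, from a clean checkout (assuming nvcc and gcc-4.8 are installed;
+adjust paths to your setup):
+
+    ./autogen.sh
+    ./configure
+    make
+    cd src && ./CUDACompile.sh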
diff --git a/autogen.sh b/autogen.sh
index 6334558115f4bf217877cb630ef76842e3052cb4..c513213392e44e1b736fcb9d9f39a0e0cac9cb6e 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -31,4 +31,3 @@ autoheader -I m4
 
 # run automake
 automake --add-missing
-
diff --git a/examples/test_bh.cu b/examples/test_bh.cu
index cfc5af5e9802a1a85e1431b793c78eee7d4b5a73..6c8f7bfc6df1d7c4dc5621cc12350da446b5734c 100644
--- a/examples/test_bh.cu
+++ b/examples/test_bh.cu
@@ -1,7 +1,7 @@
 /*******************************************************************************
  * This file is part of QuickSched.
  * Coypright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
- *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *                    Aidan Chalk (aidan.chalk@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -17,6 +17,9 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  *
 * *****************************************************************************/
+//#define EXACT
+//#define ID
+#define NO_QUADRUPOLES
 
 /* Config parameters. */
 #include "../config.h"
@@ -32,868 +35,892 @@
 #include <omp.h>
 #include <fenv.h>
 
-
-
 /* Local includes. */
 extern "C"{
 #include "quicksched.h"
 #include "res.h"
-#include <cblas.h>
 }
 #include "cuda_queue.h"
-
-/* Some local constants. */
-#define cell_pool_grow 1000
-#define cell_maxparts 128
-#define task_limit 1e8
-#define const_G 1    // 6.6738e-8
-#define dist_min 0.5 /* Used for legacy walk only */
-#define dist_cutoff_ratio 1.5
-
-#define ICHECK -1
-#define NO_SANITY_CHECKS
-#define NO_COM_AS_TASK
-#define NO_COUNTERS
-
-/** Data structure for the particles. */
-struct part {
-  double x[3];
-  // union {
-  float a[3];
-  float a_legacy[3];
-  float a_exact[3];
-  // };
-  float mass;
-  int id;
-};  // __attribute__((aligned(64)));
-
-struct multipole {
-  double com[3];
-  float mass;
-};
-
-struct cell_device{
-    double loc[3];
-    double h;
-    int count;
-    unsigned short int split,sorted;
-    int part_loc; /*Index of particles in part array. */
-    int first_child; /* Index of first child in cell array. */
-    int sibling; /* Next node.*/
-    struct multipole mpole;
-    int res, com_tid;
-}
-
-/** Data structure for the BH tree cell. */
-struct cell {
-  double loc[3];
-  double h;
-
-  int count;
-  unsigned short int split, sorted;
-  struct part *parts;
-
-  struct cell *firstchild; /* Next node if opening */
-  struct cell *sibling;    /* Next node */
-
-/** Global variable for the pool of allocated cells. */
-
-  /* We keep both CoMs and masses to make sure the comp_com calculation is
-   * correct (use an anonymous union to keep variable names compact).  */
-//  union {
-
-    /* Information for the legacy walk */
-//    struct multipole legacy;
-
-    /* Information for the QuickShed walk */
-  struct multipole mpole;
-//  };
-
-  int res, com_tid;
-  struct index *indices;
-
-} __attribute__((aligned(128)));
-struct cell *cell_pool = NULL;
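+/* Precision switch: alias double/double2 to float/float2 so the whole tree
+ * walk runs in single precision. */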
+#define double float
+#define double2 float2
+//#define float double
+//#define float2 double2
 
 /** Task types. */
 enum task_type {
   task_type_self = 0,
   task_type_pair,
-  task_type_self_pc,
+  task_type_pair_pc,
+  task_type_pair_pc_split,
   task_type_com,
   task_type_count
 };
 
-__device__ struct part *parts;
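+/** Data structure for the BH tree cell; parts, firstchild and sibling are
+ * indices into the flat particle and cell arrays. */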
+struct cell{
 
+    double2 loc_xy;
+    double loc_z;
+    double h;
+    int count;
+    unsigned short int split, sorted;
+    int parts, firstchild, sibling;
+    int res, com_tid;
 
-#ifdef TODO
-__device__ double *part_x0;
-__device__ double *part_x1;
-__device__ double *part_x2;
-__device__ float  *part_a0;
-__device__ float  *part_a1;
-__device__ float  *part_a2;
-__device__ float *part_mass;
+};
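+/** Data structure for the particles. */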
+struct part {
+#ifdef ID
+    int id;
+#endif
+    double loc[3];
+    float a[3];
+    float m;
+};
 
 
-#define cuda_error(s, ...) {if(threadIdx.x == 0){fprintf( stderr , "%s:%s():%i: " s "\n" , __FILE__ , __FUNCTION__ , __LINE__ , ##__VA_ARGS__ );} __syncthreads(); asm("trap\n"); }
-
+#define const_G 1
+/* Required variables to obtain cells. */
+#define cell_maxparts 128
+#define CELL_STRETCH 2
+#define INITIAL_CELLS 256
+struct cell *cell_pool = NULL;
+int used_cells=0;
+int num_cells = 0;
+int cell_size = INITIAL_CELLS*sizeof(struct cell);
+
+/* Device locations for the particle values. */
+__device__ double2 *com_xy;
+__device__ double *com_z;
+__device__ float *com_mass;
+#ifdef QUADRUPOLES
+__device__ double *I_xx;
+__device__ double *I_yy;
+__device__ double *I_zz;
+__device__ double *I_xy;
+__device__ double *I_xz;
+__device__ double *I_yz;
+#endif
+__device__ struct part *parts_cuda;
+__device__ struct cell *cells;
+__device__ int pc_calcs = 0;
+
+
+/* Host locations for the particle values. */
+double2 *com_xy_host;
+double *com_z_host;
+float *com_mass_host;
+#ifdef QUADRUPOLES
+double *I_xx_host;
+double *I_yy_host;
+double *I_zz_host;
+double *I_xy_host;
+double *I_xz_host;
+double *I_yz_host;
+#endif
+double2 *parts_pos_xy_temp;
+double *parts_pos_z_temp;
+float4 *parts_a_m_temp;
 
-__device__ double atomicAdd(double* address, double val)
-{
-    unsigned long long int* address_as_ull =
-                                          (unsigned long long int*)address;
-    unsigned long long int old = *address_as_ull, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_ull, assumed, 
-                        __double_as_longlong(val + 
-                        __longlong_as_double(assumed)));
-    } while (assumed != old);
-    return __longlong_as_double(old);
-}
+struct part *parts_host;
+struct part *parts_temp;
 
-__device__ inline double __shfl_down_dbl(double var, unsigned int srcLane, int width)
-{
-  int2 a = *reinterpret_cast<int2*>(&var);
-  a.x = __shfl_down(a.x, srcLane, width);
-  a.y = __shfl_down(a.y, srcLane, width);
-  return *reinterpret_cast<double*>(&a);
-}
+#ifndef double
+__device__ double atomicAdd(double* address, double val) {
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(val + __longlong_as_double(assumed)));
+        /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+#endif
 
-__device__ inline void reduceSumDouble( double* value)
-{
-    for(int mask = 16; mask > 0; mask >>= 1)
-        *value += __shfl_down_dbl( *value, mask, 32);
-}
 
-__device__ inline void reduceSum( float* value )
-{
-    #pragma unroll
-    for(int mask = 16 ; mask > 0 ; mask >>= 1)
-        *value += __shfl_down((float)*value, mask);
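+/**
+ * @brief Compute the pairwise interactions between all particles in cell ci
+ *        and all particles in cell cj (N^2 algorithm), applying the forces
+ *        to both cells.
+ */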
+__device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
+    int i, j, k;
+    int count_i = ci->count, count_j = cj->count;
+    int parts_i = ci->parts, parts_j = cj->parts;
+    double xi[3];
+    float dx[3];
+    float ai[3], mj, r2, w, ir;
+
+    /* Loop over cell i.*/
+    for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
+        xi[0] = parts_cuda[i].loc[0];
+        xi[1] = parts_cuda[i].loc[1];
+        xi[2] = parts_cuda[i].loc[2];
+        for(k = 0; k < 3; k++) {
+            ai[k] = 0.0f;
+        }
+        
+        for(j = parts_j; j < parts_j + count_j; j++) {
+            r2 = 0.0f;
+            dx[0] = xi[0] - parts_cuda[j].loc[0];
+            dx[1] = xi[1] - parts_cuda[j].loc[1];
+            dx[2] = xi[2] - parts_cuda[j].loc[2];
+            r2 += dx[0] * dx[0];
+            r2 += dx[1] * dx[1];
+            r2 += dx[2] * dx[2];
 
-   // *value = __shfl((float)*value, 0);
-}
 
-/**
- * @brief Checks whether the cells are direct neighbours ot not
- */
-__device__ __forceinline__ int are_neighbours_different_size_CUDA(struct cell *ci,
-                                                struct cell *cj) {
+            ir = rsqrtf(r2);
+            w = const_G * ir * ir * ir;
+            mj = parts_cuda[j].m;
+            for(k = 0; k < 3; k++) {
+                ai[k] -= dx[k] * mj * w;
+            }
+        }            
+       atomicAdd(&parts_cuda[i].a[0], ai[0]);           
+       atomicAdd(&parts_cuda[i].a[1], ai[1]);
+       atomicAdd(&parts_cuda[i].a[2], ai[2]);
+        
+    }
 
-  int k;
-  float dx[3];
-  double cih = ci->h, cjh = cj->h;
+    /* Loop over cell j. */
+    for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
+        xi[0] = parts_cuda[i].loc[0];
+        xi[1] = parts_cuda[i].loc[1];
+        xi[2] = parts_cuda[i].loc[2];     
+        for(k = 0; k < 3; k++) {
+            ai[k] = 0.0f;
+        }
+        
+        for(j = parts_i; j < parts_i + count_i; j++) {
+            r2 = 0.0f;
+            dx[0] = xi[0] - parts_cuda[j].loc[0];
+            dx[1] = xi[1] - parts_cuda[j].loc[1];
+            dx[2] = xi[2] - parts_cuda[j].loc[2];
+            r2 += dx[0] * dx[0];
+            r2 += dx[1] * dx[1];
+            r2 += dx[2] * dx[2];
 
-  /* Maximum allowed distance */
-  float min_dist = 0.5 * (cih + cjh);
 
-  /* (Manhattan) Distance between the cells */
-  for (k = 0; k < 3; k++) {
-    float center_i = ci->loc[k] + 0.5 * cih;
-    float center_j = cj->loc[k] + 0.5 * cjh;
-    dx[k] = fabsf(center_i - center_j);
-  }
+            ir = rsqrtf(r2);
+            w = const_G * ir * ir * ir;
+            mj = parts_cuda[j].m;
+            for(k = 0; k < 3; k++) {
+                ai[k] -= dx[k] * mj * w;
+            }
+        }            
+       atomicAdd(&parts_cuda[i].a[0], ai[0]);           
+       atomicAdd(&parts_cuda[i].a[1], ai[1]);
+       atomicAdd(&parts_cuda[i].a[2], ai[2]);
+        
+    }
 
-  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
 }
-/**
- * @brief Compute the center of mass of a given cell.
- *
- * @param c The #cell.
- */
-__device__ void comp_com_CUDA(struct cell *c) {
 
-  int k, count = c->count;
-  struct cell *cp;
-  struct part *parts = c->parts;
-  double com[3] = {0.0, 0.0, 0.0};
-  float mass = 0.0;
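+/**
+ * @brief Interact all particles in the leaf cell with the monopole of cell cj.
+ */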
+__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
 
-  if (c->split) {
+    int i;
+    int count = leaf->count;    
+    int parts = leaf->parts;
+    int cell_j = cj - cells;
+    float r2;
+    float dx[3], ir, w;
 
-    /* Loop over the projenitors and collect the multipole information. */
     if(threadIdx.x == 0)
-    {
-        for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
-          float cp_mass = cp->mpole.mass;
-          com[0] += cp->mpole.com[0] * cp_mass;
-          com[1] += cp->mpole.com[1] * cp_mass;
-          com[2] += cp->mpole.com[2] * cp_mass;
-          mass += cp_mass;
-        }
-    /* Store the COM data, if any was collected. */
-      if (mass > 0.0) {
-        float imass = 1.0f / mass;
-        c->mpole.com[0] = com[0] * imass;
-        c->mpole.com[1] = com[1] * imass;
-        c->mpole.com[2] = com[2] * imass;
-        c->mpole.mass = mass;
-      } else {
-        c->mpole.com[0] = 0.0;
-        c->mpole.com[1] = 0.0;
-        c->mpole.com[2] = 0.0;
-        c->mpole.mass = 0.0;
-      }
-    }
-    /* Otherwise, collect the multipole from the particles. */
-  } else {
-
-    for (k = threadIdx.x; k < count; k+= blockDim.x) {
-      float p_mass = part_mass[k];
-      com[0] += part_x0[k] * p_mass;
-      com[1] += part_x1[k] * p_mass;
-      com[2] += part_x2[k] * p_mass;
-      mass += p_mass;
-    }
-    reduceSumDouble(com);
-    reduceSumDouble(&com[1]);
-    reduceSumDouble(&com[2]);
-    reduceSum(&mass);
-    if(threadIdx.x % 32 == 0)
-    {   
-        if (mass > 0.0) {
-        float imass = 1.0f / mass;
-        atomicAdd( &c->mpole.com[0] , com[0] * imass);
-        atomicAdd( &c->mpole.com[1] , com[1] * imass);
-        atomicAdd( &c->mpole.com[2] , com[2] * imass);
-        atomicAdd( &c->mpole.mass , mass);
-      } else if(threadIdx.x == 0) {
-        c->mpole.com[0] = 0.0;
-        c->mpole.com[1] = 0.0;
-        c->mpole.com[2] = 0.0;
-        c->mpole.mass = 0.0;
-      }
+        atomicAdd(&pc_calcs, 1);
+
+   /* int leaf_num = leaf - cells;
+    int cjnum = cj - cells;
+    if(threadIdx.x == 0 && leaf_num == 55)
+        printf("Calculating with %d %i whose firstchild is %i and cj->split == %i\n", (cjnum), cj->firstchild, cj->split);*/
+/*    if(threadIdx.x == 0)
+    printf("leaf = %i\n", leaf - cells);*/
+
+    for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
+        r2 = 0.0;
+        dx[0] = com_xy[cell_j].x - parts_cuda[i].loc[0];
+        r2 += dx[0] * dx[0];
+        dx[1] = com_xy[cell_j].y - parts_cuda[i].loc[1];
+        r2 += dx[1] * dx[1];
+        dx[2] = com_z[cell_j] - parts_cuda[i].loc[2];
+        r2 += dx[2] * dx[2];
+    
+        ir = rsqrtf(r2);
+        w = com_mass[cell_j] * const_G * ir * ir * ir;
+/*        printf("Doing make_interact_pc\n");*/
+/*        if(threadIdx.x == 0)
+            printf("part.a = %e w*dx = %e \n", parts_cuda[i].a[0], w*dx[0]);*/
+        atomicAdd(&parts_cuda[i].a[0], w*dx[0]);
+        atomicAdd(&parts_cuda[i].a[1], w*dx[1]);
+        atomicAdd(&parts_cuda[i].a[2], w*dx[2]);
+/*        (*accels).x+= w*dx[0];
+        (*accels).y+= w*dx[1];
+        (*accels).z+= w*dx[2];*/
     }
-  }
-
 }
 
-
 /**
- * @brief Checks whether the cells are direct neighbours ot not. Both cells have
- * to be of the same size
+ * @brief Checks whether the cells are direct neighbours or not
  */
-__device__ __forceinline__ int are_neighbours_CUDA(struct cell *ci, struct cell *cj) {
-
-  int k;
-  float dx[3];
-
-#ifdef SANITY_CHECKS
-  if (ci->h != cj->h)
-    error(" Cells of different size in distance calculation.");
-#endif
-
-  /* Maximum allowed distance */
-  float min_dist = ci->h;
+__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) {
 
-  /* (Manhattan) Distance between the cells */
-  for (k = 0; k < 3; k++) {
-    float center_i = ci->loc[k];
-    float center_j = cj->loc[k];
-    dx[k] = fabsf(center_i - center_j);
-  }
-
-  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+    float dx[3];
+    float cih = ci->h, cjh = cj->h;
+    
+    float min_dist = 0.5*(cih+cjh);
+
+    float center_i = ci->loc_xy.x + 0.5*cih;
+    float center_j = cj->loc_xy.x + 0.5*cjh;
+    dx[0] = fabsf(center_i - center_j);    
+    center_i = ci->loc_xy.y + 0.5*cih;
+    center_j = cj->loc_xy.y + 0.5*cjh;
+    dx[1] = fabsf(center_i - center_j);       
+    center_i = ci->loc_z + 0.5*cih;
+    center_j = cj->loc_z + 0.5*cjh;
+    dx[2] = fabsf(center_i - center_j);    
+    return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); 
 }
 
-/**
- * @brief Compute the interactions between all particles in a cell
- *        and all particles in the other cell (N^2 algorithm).
- * Only applies the forces to cell ci.
- *
- * @param ci The #cell containing the particles.
- * @param cj The #cell containing the other particles
- */
-__device__ __forceinline__ void iact_pair_direct_CUDA(struct cell_device *ci, struct cell_device *cj)
-{
-    int i, j, k;
-    int count_i = ci->count, count_j = cj->count;
-    //struct part *parts_i = ci->parts, *parts_j = cj->parts;
-    int parts_it = ci->part_loc, parts_jt = cj->part_loc;
-    double xi[3];
-    float dx[3], ai[3], r2, w, ir;
+/*__device__ __forceinline__ int are_neighbours_different_size(double cih, float ci_x, float ci_y, float ci_z, struct cell *cj) {
+    float dx_x, dx_y, dx_z;
+    double cjh = cj->h;
+    float min_dist = 0.5*(cih+cjh);
+    float center_i = ci_x + 0.5*cih;
+    float center_j = cj->loc_xy.x + 0.5*cjh;
+    dx_x = fabsf(center_i - center_j);
+    center_i = ci_y + 0.5*cih;
+    center_j = cj->loc_xy.y + 0.5*cjh;
+    dx_y = fabsf(center_i - center_j);
+    center_i = ci_z + 0.5*cih;
+    center_j = cj->loc_z + 0.5*cjh;
+    dx_z = fabsf(center_i - center_j);
+    return (dx_x <= min_dist) && (dx_y <= min_dist) && (dx_z <= min_dist); 
+}*/
+
+__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) {
+    float dx[3];
+    float min_dist = ci->h;
+    float center_i = ci->loc_xy.x;
+    float center_j = cj->loc_xy.x;
+    dx[0] = fabsf(center_i - center_j);    
+    center_i = ci->loc_xy.y;
+    center_j = cj->loc_xy.y;
+    dx[1] = fabsf(center_i - center_j);       
+    center_i = ci->loc_z;
+    center_j = cj->loc_z;
+    dx[2] = fabsf(center_i - center_j);  
+    return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); 
+}
 
-#ifdef SANITY_CHECKS
+/*__device__ __forceinline__ int are_neighbours(float cih, float ci_x, float ci_y, float ci_z, struct cell *cj) {
+    float dx_x, dx_y, dx_z;
+    float center_j = cj->loc_xy.x;
+    dx_x = fabsf(ci_x - center_j);
+    center_j = cj->loc_xy.y;
+    dx_y = fabsf(ci_y - center_j);
+    center_j = cj->loc_z;
+    dx_z = fabsf(ci_z - center_j);
+    return (dx_x <= cih) && (dx_y <= cih) && (dx_z <= cih); 
+}*/
+
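+/**
+ * @brief Checks whether the cell leaf is a subvolume of the cell c.
+ */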
+__device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
+    return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count);
+}
 
-  /* Bad stuff will happen if cell sizes are different */
-    if (ci->h != cj->h)
-        error("Non matching cell sizes !! h_i=%f h_j=%f\n", ci->h, cj->h);
 
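+/**
+ * @brief Interact all particles in the child cell with the count2 multipoles
+ *        listed in mpoles (monopoles only, plus quadrupole corrections when
+ *        QUADRUPOLES is defined).
+ */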
+__device__ __forceinline__ void make_interact_pc_new(struct cell *child, int *mpoles, int count2)
+{
+    int i, j;
+    int count = child->count;    
+    int parts = child->parts;
+//    int cell_j = cj - cells;
+    float r2;
+    float dx[3], ir, w;
+    float a[3];
+#ifdef QUADRUPOLES
+    __shared__ double I_xx_local[8];
+    __shared__ double I_xy_local[8];
+    __shared__ double I_xz_local[8];
+    __shared__ double I_yy_local[8];
+    __shared__ double I_yz_local[8];
+    __shared__ double I_zz_local[8];
+
+    if(threadIdx.x < count2){
+        i = threadIdx.x;
+        I_xx_local[i] = I_xx[mpoles[i]];
+        I_xy_local[i] = I_xy[mpoles[i]];
+        I_xz_local[i] = I_xz[mpoles[i]];
+        I_yy_local[i] = I_yy[mpoles[i]];
+        I_yz_local[i] = I_yz[mpoles[i]];
+        I_zz_local[i] = I_zz[mpoles[i]];
+    }
+    __syncthreads();
 #endif
-
-    /* Loop over particles in cell i */
-    for(i=0; i < count_i; i+= blockDim.x)
-    {
-        /* Init the ith particle's data. */
-          xi[0] = part_x0[parts_it+i];
-          xi[1] = part_x1[parts_it+i];
-          xi[2] = part_x2[parts_it+i];
-        for (k = 0; k < 3; k++) {
-          ai[k] = 0.0f;
-        }
-     //   mi = parts_i[i].mass;
-        /* Loop over particles in cell j */
-        for(j = 0; j < count_j; j++)
-        {
-            /* Compute the pairwise distance. */
-//            for (r2 = 0.0, k = 0; k < 3; k++) {
-            dx[0] = xi[0] - part_x0[parts_jt+j];
-            dx[1] = xi[1] - part_x1[parts_jt+j];
-            dx[2] = xi[2] - part_x2[parts_jt+j];
-            r2 = dx[0] * dx[0];
+    for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
+        a[0] = 0.0;
+        a[1] = 0.0;
+        a[2] = 0.0;
+        for(j = 0; j < count2; j++){
+           r2 = 0.0;
+            dx[0] = com_xy[mpoles[j]].x - parts_cuda[i].loc[0];
+            r2 += dx[0] * dx[0];
+            dx[1] = com_xy[mpoles[j]].y - parts_cuda[i].loc[1];
             r2 += dx[1] * dx[1];
+            dx[2] = com_z[mpoles[j]] - parts_cuda[i].loc[2];
             r2 += dx[2] * dx[2];
-  //          }
-            /* Apply the gravitational acceleration. */
-            ir = 1.0f / sqrtf(r2);
-            w = const_G * ir * ir * ir;
-            for(k = 0; k < 3; k++) {
-                ai[k] += w * dx[k] * part_mass[parts_jt+j];;
-            }
-        }
-        atomicAdd(&parts_a0[parts_it+i], ai[0]);
-        atomicAdd(&parts_a1[parts_it+i], ai[1]);
-        atomicAdd(&parts_a2[parts_it+i], ai[2]);
-    }/* Loop over particles in cell i.*/
-}
-
-/**
- * @brief Decides whether two cells use the direct summation interaction or the
- * multipole interactions
- *
- * @param ci The #cell.
- * @param cj The other #cell.
- */
-__device__ void iact_pair_CUDA(struct cell *ci, struct cell *cj) {
-
-  struct cell *cp, *cps;
-
-#ifdef SANITY_CHECKS
-
-  /* Early abort? */
-  if (ci->count == 0 || cj->count == 0) error("Empty cell !");
-
-  /* Bad stuff will happen if cell sizes are different */
-  if (ci->h != cj->h)
-    error("Non matching cell sizes !! h_i=%f h_j=%f\n", ci->h, cj->h);
-
-  /* Sanity check */
-  if (ci == cj)
-    error("The impossible has happened: pair interaction between a cell and "
-          "itself.");
-
+        
+            ir = rsqrtf(r2);
+            w = com_mass[mpoles[j]] * const_G * ir * ir * ir;
+#ifndef QUADRUPOLES
+            a[0] += w*dx[0];
+            a[1] += w*dx[1];
+            a[2] += w*dx[2];
+#else
+        float mrinv5 = w * ir * ir;
+        float mrinv7 = mrinv5 * ir * ir;
+        float D1 = -w;
+        float D2 =  3.f * mrinv5;
+        float D3 = -15.f * mrinv7;
+
+        float q = I_xx_local[j] + I_yy_local[j] + I_zz_local[j];
+        float qRx = I_xx_local[j] * dx[0] + I_xy_local[j] * dx[1] + I_xz_local[j] * dx[2];
+        float qRy = I_xy_local[j] * dx[0] + I_yy_local[j] * dx[1] + I_yz_local[j] * dx[2];
+        float qRz = I_xz_local[j] * dx[0] + I_yz_local[j] * dx[1] + I_zz_local[j] * dx[2];
+        float qRR = qRx * dx[0] + qRy * dx[1] + qRz * dx[2];
+        float C = D1 + 0.5f * D2 * q + 0.5f * D3 * qRR;
+
+        a[0] -= C * dx[0] + D2 * qRx;
+        a[1] -= C * dx[1] + D2 * qRy;
+        a[2] -= C * dx[2] + D2 * qRz;
+        
 #endif
+        }
+/*        if(threadIdx.x == 0)
+            printf("part.a = %e w*dx = %e \n", parts_cuda[i].a[0], w*dx[0]);*/
+        atomicAdd(&parts_cuda[i].a[0], a[0]);
+        atomicAdd(&parts_cuda[i].a[1], a[1]);
+        atomicAdd(&parts_cuda[i].a[2], a[2]);
+/*        (*accels).x+= w*dx[0];
+        (*accels).y+= w*dx[1];
+        (*accels).z+= w*dx[2];*/
+    }
 
-  /* Are the cells direct neighbours? */
-  if (are_neighbours_CUDA(ci, cj)) {
+}
 
-    /* Are both cells split ? */
-    if (ci->split && cj->split) {
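+/**
+ * @brief For every child of ci, interact its particles with the multipoles of
+ *        the non-neighbouring children of cj.
+ */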
+__device__ __forceinline__ void iact_pair_pc(struct cell *ci, struct cell *cj)
+{
+    struct cell *cp, *cps;
+    int icells[8];
+    int count = 0;
 
-      /* Let's split both cells and build all possible pairs */
-      for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
-        for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
+    /* Let's split both cells and build all possible non-neighbouring pairs */
+  for (cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
+    count = 0;
+    for (cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
 
-          /* If the cells are neighbours, recurse. */
-          if (are_neighbours_CUDA(cp, cps)) {
-            iact_pair_CUDA(cp, cps);
-          }
-        }
+      if ( ! are_neighbours(cp, cps) ) {
+        icells[count] = cps - cells;
+        count++;
       }
-    } else {/* Otherwise, compute the interactions at this level directly. */
-      iact_pair_direct_CUDA(ci, cj);
-      iact_pair_direct_CUDA(cj, ci);
     }
+    make_interact_pc_new(cp, icells, count);
   }
 }
 
-__device__ void iact_self_direct_CUDA(struct cell_device *c)
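+/**
+ * @brief Interact the particles of a single child cell with the multipoles of
+ *        the non-neighbouring children of cj.
+ */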
+__device__ __forceinline__ void iact_pair_pc_split(struct cell *child, struct cell *cj)
 {
-    int i, j, k, count = c->count;
-    double xi[3] = {0.0, 0.0, 0.0};
-   // float ai[3] = {0.0, 0.0, 0.0}, mj, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
-    struct part *parts = c->parts;
-    int partst = c->part_loc;
-    struct cell *cp, *cps;
-    
-#ifdef SANITY_CHECKS
-
-  /* Early abort? */
-    if (count == 0) 
-    {
-        cuda_error("Empty cell");
+    struct cell *cps;
+    int icells[8];
+    int count = 0;
+    for (cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
+        if(!are_neighbours(child, cps) ){
+            icells[count] = cps - cells;
+            count++;
+        }
     }
+    make_interact_pc_new(child, icells, count);
+}
 
-#endif
+#ifdef OLD
+__device__ __forceinline__ void iact_self_pc(struct cell *c, struct cell *leaf)
+{
     
-    /* If the cell is split, interact each progeny with itself, and with
-     each of its siblings. */    
-    if(c->split)
-    {
-        for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
-            iact_self_direct_CUDA(cp);
-            for (cps = cp->sibling; cps != c->sibling; cps = cps->sibling)
-                iact_pair_CUDA(cp, cps);
-        }
-    } else {
+    struct cell *ci, *cj, *cp, *cps;
+    float leafh = leaf->h;
+    float3 accelerations;
 
-        /* Loop over all particles. */
-        for(i = threadIdx.x; i < count; i+= blockDim.x)
-        {
-            /* Init the ith particle's data. */
-            for (k = 0; k < 3; k++) {
-                xi[k] = parts[i].x[k];
-                ai[k] = 0.0;
-            }
-//            mi = parts[i].mass;
+    int i;
 
-           /* Loop all particles - note this was an improvement in mdcore so we use it here. */
-            for(j = 0; j < count; j++)
+    if(threadIdx.x < leaf->count)
+{
+    accelerations.x = 0.0f;
+    accelerations.y = 0.0f;
+    accelerations.z = 0.0f;
+
+    while(c != leaf)
+    {
+        ci = NULL;
+        /* Loop over children of c. */
+        for(cj = &cells[c->firstchild]; cj != &cells[c->sibling]; cj = &cells[cj->sibling])
+        {
+            if(ci == NULL && is_inside(leaf, cj))
             {
-                /* Compute the pairwise distance. */
-                for (r2 = 0.0, k = 0; k < 3; k++) {
-                    dx[k] = xi[k] - parts[j].x[k];
-                    r2 += dx[k] * dx[k];
-                }
-                /* Apply the gravitational acceleration. */
-                ir = 1.0f / sqrtf(r2);
-                w = const_G * ir * ir * ir;
-                mj = parts[j].mass;
-                for(k = 0; k < 3; k++)
+                ci = cj;
+            }else{
+                //Depth first search of cj.
+                cp = cj;
+                while(cp != &cells[cj->sibling])
                 {
-                    ai[k] -= w * dx[k] * mj;
+                    if(!are_neighbours(cp, leaf))
+                    {
+//                        if(!cp->split)
+                            make_interact_pc(leaf, cp, &accelerations);
+//                        else
+  //                         for(cps = &cells[cp->firstchild]; cps != &cells[cp->sibling]; cps = &cells[cps->sibling])
+    //                            make_interact_pc(leaf, cps, &accelerations);
+
+                        cp = &cells[cp->sibling];
+                    }else{
+                        if(cp->split && cp->h > leafh)
+                            cp = &cells[cp->firstchild];
+                        else
+                            cp = &cells[cp->sibling];
+                    }
                 }
-            } /* Inner particle loop. */
-        /* Store the accumulated acceleration on the ith part. */
-      for (k = 0; k < 3; k++) atomicAdd( &parts[i].a[k] , ai[k]);
-        }/* Outer particle loop */
-    }/* All done!*/
+            }
+            
+        }
 
-}
+        c = ci;
 
-/**
- * @brief Checks whether the cell leaf is a subvolume of the cell c
- */
-__device__ __forceinline__ int is_inside_CUDA(struct cell *leaf, struct cell *c) {
-  return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count);
+    }
+    int parts = leaf->parts;
+    int count = leaf->count;
+    for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
+        atomicAdd( &parts_cuda[i].a[0] , accelerations.x);
+        atomicAdd( &parts_cuda[i].a[1] , accelerations.y);
+        atomicAdd( &parts_cuda[i].a[2] , accelerations.z);
+    }
 }
 
-
-/**
- * @brief Interacts all particles in ci with the monopole in cj
- */
-__device__ __forceinline__ void make_interact_pc_CUDA(struct cell *leaf, struct cell *cj) {
-
-  int j, k, count = leaf->count;
-  double com[3] = {0.0, 0.0, 0.0};
-  float mcom, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
-  struct part *parts = leaf->parts;
-
-#ifdef SANITY_CHECKS
-
-  /* Sanity checks */
-  if (leaf->count == 0) error("Empty cell!");
-
-  /* Sanity check. */
-  if (cj->new.mass == 0.0) {
-    message("%e %e %e %d %p %d %p", cj->new.com[0], cj->new.com[1],
-            cj->new.com[2], cj->count, cj, cj->split, cj->sibling);
-
-    for (j = 0; j < cj->count; ++j)
-      message("part %d mass=%e id=%d", j, cj->parts[j].mass, cj->parts[j].id);
-
-    error("com does not seem to have been set.");
-  }
-
+}
 #endif
 
-  /* Init the com's data. */
-  for (k = 0; k < 3; k++) com[k] = cj->mpole.com[k];
-  mcom = cj->mpole.mass;
 
-  /* Loop over every particle in leaf. */
-  for (j = threadIdx.x; j < count; j+=blockDim.x) {
 
-    /* Compute the pairwise distance. */
-    for (r2 = 0.0, k = 0; k < 3; k++) {
-      dx[k] = com[k] - parts[j].x[k];
-      r2 += dx[k] * dx[k];
-    }
 
-    /* Apply the gravitational acceleration. */
-    ir = 1.0f / sqrtf(r2);
-    w = mcom * const_G * ir * ir * ir;
-    
-    for (k = 0; k < 3; k++) atomicAdd( &parts[j].a[k] , w * dx[k]);
-
-#if ICHECK >= 0
-    if (leaf->parts[j].id == ICHECK)
-      printf("[DEBUG] cell [%f,%f,%f] interacting with cj->loc=[%f,%f,%f] "
-             "m=%f h=%f\n",
-             leaf->loc[0], leaf->loc[1], leaf->loc[2], cj->loc[0], cj->loc[1],
-             cj->loc[2], mcom, cj->h);
-
-    if (leaf->parts[j].id == ICHECK)
-      printf("[NEW] Interaction with monopole a=( %e %e %e ) h= %f Nj= %d m_j= "
-             "%f\n",
-             w * dx[0], w * dx[1], w * dx[2], cj->h, cj->count, mcom);
-#endif
 
-  } /* loop over every particle. */
-}
 
 /**
- * @brief Compute the interactions between all particles in a cell leaf
- *        and the center of mass of all the cells in a part of the tree
- * described by ci and cj
- *
- * @param ci The #cell containing the particle
- * @param cj The #cell containing the center of mass.
+ * @brief Checks whether the cells are direct neighbours or not. Both cells have
+ * to be of the same size.
  */
-__device__ __forceinline__ void iact_pair_pc_CUDA(struct cell *ci, struct cell *cj,
-                                struct cell *leaf) {
+static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
 
-  struct cell *cp, *cps;
+//  int k;
+  float dx[3];
 
 #ifdef SANITY_CHECKS
-
-  /* Early abort? */
-  if (ci->count == 0 || cj->count == 0) error("Empty cell !");
-
-  /* Sanity check */
-  if (ci == cj)
-    error("The impossible has happened: pair interaction between a cell and "
-          "itself.");
-
-  /* Sanity check */
-  if (!is_inside(leaf, ci))
-    error("The impossible has happened: The leaf is not within ci");
-
-  /* Are the cells direct neighbours? */
-  if (!are_neighbours(ci, cj)) error("Cells are not neighours");
-
-  /* Are both cells split ? */
-  if (!ci->split || !cj->split) error("One of the cells is not split !");
+  if (ci->h != cj->h)
+    error(" Cells of different size in distance calculation.");
 #endif
 
-  /* Let's find in which subcell of ci the leaf is */
-  for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
-
-    if (is_inside_CUDA(leaf, cp)) break;
-  }
-
-  if (are_neighbours_different_size_CUDA(cp, cj)) {
-
-    /* Now interact this subcell with all subcells of cj */
-    for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
-
-      /* Check whether we have to recurse or can directly jump to the multipole
-       * calculation */
-      if (are_neighbours_CUDA(cp, cps)) {
-
-        /* We only recurse if the children are split */
-        if (cp->split && cps->split) {
-          iact_pair_pc_CUDA(cp, cps, leaf);
-        }
-
-      } else {
-        make_interact_pc_CUDA(leaf, cps);
-      }
-    }
-  } else {
+  /* Maximum allowed distance */
+  float min_dist = ci->h;
 
-    /* If cp is not a neoghbour of cj, we can directly interact with the
-     * multipoles */
-    for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
+  /* (Manhattan) Distance between the cells */
+    double2 loc1=ci->loc_xy, loc2=cj->loc_xy;
+    float center_i = loc1.x;
+    float center_j = loc2.x;
+    dx[0] = fabs(center_i - center_j);
+    center_i = loc1.y;
+    center_j = loc2.y;
+    dx[1] = fabs(center_i - center_j);
+    center_i = ci->loc_z;
+    center_j = cj->loc_z;
+    dx[2] = fabs(center_i - center_j);
 
-      make_interact_pc_CUDA(leaf, cps);
-    }
-  }
+  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
 }
 
 
-__device__ void iact_self_pc_CUDA(struct cell *c, struct cell *leaf)
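+/**
+ * @brief Get a #cell from the pool, allocating it on first use and stretching
+ *        the pool (and the matching centre-of-mass arrays) when it runs out.
+ */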
+struct cell *cell_get()
 {
-
-struct cell *cp, *cps;
-
-    #ifdef SANITY_CHECKS
     
-      /* Early abort? */
-      if (c->count == 0) error("Empty cell !");
-    
-      if (!c->split) error("Cell is not split !");
-
-    #endif
-
-/* Find in which subcell of c the leaf is */
-  for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
-
-    /* Only recurse if the leaf is in this part of the tree */
-    if (is_inside_CUDA(leaf, cp)) break;
-  }
-
-if (cp->split) {
-
-    /* Recurse if the cell can be split */
-    iact_self_pc_CUDA(cp, leaf);
-
-    /* Now, interact with every other subcell */
-    for (cps = c->firstchild; cps != c->sibling; cps = cps->sibling) {
+    if(num_cells == 0)
+    {
+        cell_pool = (struct cell*) calloc(INITIAL_CELLS, sizeof(struct cell));
+        if(cell_pool == NULL)
+            error("Failed to allocate cell_pool");
+        com_xy_host = (double2*) calloc(INITIAL_CELLS, sizeof(double2));
+        if(com_xy_host == NULL)
+            error("Failed to allocate com_xy_host");
+        com_z_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
+        if(com_z_host == NULL)
+            error("Failed to allocate com_z_host");
+        com_mass_host = (float*) calloc(INITIAL_CELLS, sizeof(float));
+        if(com_mass_host == NULL)
+            error("Failed to allocate com_mass_host");
+        #ifdef QUADRUPOLES
+        I_xx_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
+        if(I_xx_host == NULL)
+            error("Failed to allocate quadrupoles");
+        I_xy_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
+        if(I_xy_host == NULL)
+            error("Failed to allocate quadrupoles");
+        I_xz_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
+        if(I_xz_host == NULL)
+            error("Failed to allocate quadrupoles");
+        I_yy_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
+        if(I_yy_host == NULL)
+            error("Failed to allocate quadrupoles");
+        I_yz_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
+        if(I_yz_host == NULL)
+            error("Failed to allocate quadrupoles");
+        I_zz_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
+        if(I_zz_host == NULL)
+            error("Failed to allocate quadrupoles");
+        #endif
+        num_cells = INITIAL_CELLS;
+    }
 
-      /* Since cp and cps will be direct neighbours it is only worth recursing
-       */
-      /* if the cells can both be split */
-      if (cp != cps && cps->split) iact_pair_pc_CUDA(cp, cps, leaf);
+    if(used_cells >= num_cells)
+    {
+        /* Stretch */
+        struct cell *new_pool;
+        cell_size *= CELL_STRETCH;
+        new_pool = (struct cell*) calloc(num_cells*CELL_STRETCH, sizeof(struct cell));
+        if(new_pool == NULL)
+            error("Failed to allocate new_pool");
+        if(cell_pool != NULL)
+            memcpy(new_pool, cell_pool, num_cells*sizeof(struct cell));
+
+        double2 *tempxy = (double2*) calloc(num_cells*CELL_STRETCH, sizeof(double2));
+        if(tempxy == NULL)
+            error("Failed to allocate tempxy");
+        memcpy(tempxy, com_xy_host, sizeof(double2)*num_cells);
+        free(com_xy_host);
+        com_xy_host = tempxy; 
+        double *tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
+        if(tempz == NULL)
+            error("Failed to allocate tempz");
+        memcpy(tempz, com_z_host, num_cells*sizeof(double));
+        free(com_z_host);
+        com_z_host = tempz;
+        float *tempm = (float*) calloc(num_cells*CELL_STRETCH, sizeof(float));
+        if(tempm == NULL)
+            error("Failed to allocate tempm");
+        memcpy(tempm, com_mass_host, num_cells*sizeof(float));
+        free(com_mass_host);
+        com_mass_host = tempm;
+
+        #ifdef QUADRUPOLES
+        tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
+        if(tempz == NULL)
+            error("Failed to allocate tempz");
+        memcpy(tempz, I_xx_host, num_cells*sizeof(double));
+        free(I_xx_host);
+        I_xx_host = tempz;
+
+        tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
+        if(tempz == NULL)
+            error("Failed to allocate tempz");
+        memcpy(tempz, I_xy_host, num_cells*sizeof(double));
+        free(I_xy_host);
+        I_xy_host = tempz;
+
+        tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
+        if(tempz == NULL)
+            error("Failed to allocate tempz");
+        memcpy(tempz, I_xz_host, num_cells*sizeof(double));
+        free(I_xz_host);
+        I_xz_host = tempz;
+
+        tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
+        if(tempz == NULL)
+            error("Failed to allocate tempz");
+        memcpy(tempz, I_yy_host, num_cells*sizeof(double));
+        free(I_yy_host);
+        I_yy_host = tempz;
+
+        tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
+        if(tempz == NULL)
+            error("Failed to allocate tempz");
+        memcpy(tempz, I_yz_host, num_cells*sizeof(double));
+        free(I_yz_host);
+        I_yz_host = tempz;
+
+        tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
+        if(tempz == NULL)
+            error("Failed to allocate tempz");
+        memcpy(tempz, I_zz_host, num_cells*sizeof(double));
+        free(I_zz_host);
+        I_zz_host = tempz;
+        #endif
+
+        num_cells *= CELL_STRETCH;
+        free(cell_pool);
+        cell_pool = new_pool;
+
+        message("Increased size of arrays");
     }
-  }
+    used_cells++;
+    cell_pool[used_cells-1].sibling = -1;
+    cell_pool[used_cells-1].firstchild = -1;
+    cell_pool[used_cells-1].res = qsched_res_none;
+    return &cell_pool[used_cells-1];
 }
 
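+/**
+ * @brief Compute the centre of mass (and quadrupole moments, if enabled) of a
+ *        given cell, from its progeny if split or from its particles otherwise.
+ */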
+void comp_com(struct cell *c){
+
+    int k, count = c->count;
+    int cpi;
+    struct cell *cp;
+    int parts = c->parts;
+    double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
+
+#ifdef QUADRUPOLES
+  double I_xx_local = 0.;
+  double I_yy_local = 0.;
+  double I_zz_local = 0.;
+  double I_xy_local = 0.;
+  double I_xz_local = 0.;
+  double I_yz_local = 0.;
+#endif
 
+    if(c->split) {
+        for(cp = &cell_pool[(cpi = c->firstchild)]; cpi != c->sibling; cp = &cell_pool[(cpi = cp->sibling)]) {
+            float cp_mass = com_mass_host[cpi];
+            com[0] += com_xy_host[cpi].x * cp_mass;
+            com[1] += com_xy_host[cpi].y * cp_mass;
+            com[2] += com_z_host[cpi] * cp_mass;
+            mass += cp_mass;
+            #ifdef QUADRUPOLES
+                I_xx_local += I_xx_host[cpi];
+                I_yy_local += I_yy_host[cpi];
+                I_zz_local += I_zz_host[cpi];
+                I_xy_local += I_xy_host[cpi];
+                I_xz_local += I_xz_host[cpi];
+                I_yz_local += I_yz_host[cpi];
+            #endif
+        }
 
 
-/**
- * @brief Get a #cell from the pool.
- */
-struct cell *cell_get() {
+     /* Otherwise collect the multipole from the particles */
+    } else {
 
-  struct cell *res;
-  int k;
+        for(k = parts; k < parts+count; k++)
+        {
+            float p_mass = parts_host[k].m;
+            com[0] += parts_host[k].loc[0] * p_mass;
+            com[1] += parts_host[k].loc[1] * p_mass;
+            com[2] += parts_host[k].loc[2] * p_mass;
+            mass += p_mass;
+
+            #ifdef QUADRUPOLES
+              I_xx_local += parts_host[k].loc[0] * parts_host[k].loc[0] * p_mass;
+              I_yy_local += parts_host[k].loc[1] * parts_host[k].loc[1] * p_mass;
+              I_zz_local += parts_host[k].loc[2] * parts_host[k].loc[2] * p_mass;
+              I_xy_local += parts_host[k].loc[0] * parts_host[k].loc[1] * p_mass;
+              I_xz_local += parts_host[k].loc[0] * parts_host[k].loc[2] * p_mass;
+              I_yz_local += parts_host[k].loc[1] * parts_host[k].loc[2] * p_mass;
+            #endif 
+        }
+    }
 
-  /* Allocate a new batch? */
-  if (cell_pool == NULL) {
 
-    /* Allocate the cell array. */
-    if ((cell_pool =
-             (struct cell *)calloc(cell_pool_grow, sizeof(struct cell))) ==
-        NULL)
-      error("Failed to allocate fresh batch of cells.");
+    k = c - cell_pool;
+    /* Store the COM data, if it was collected. */
+    if(mass > 0.0f) {
+        float imass = 1.0f/mass;
+        com_xy_host[k].x = com[0] * imass;
+        com_xy_host[k].y = com[1] * imass;
+        com_z_host[k] = com[2] * imass;
+        com_mass_host[k] = mass;
+    }else
+    {
+        com_xy_host[k].x = 0.0;
+        com_xy_host[k].y = 0.0;
+        com_z_host[k] = 0.0;
+        com_mass_host[k] = 0.0f;
+    }
 
-    /* Link them up via their progeny pointers. */
-    for (k = 1; k < cell_pool_grow; k++)
-      cell_pool[k - 1].firstchild = &cell_pool[k];
-  }
+#ifdef QUADRUPOLES
+    I_xx_host[k] = I_xx_local;
+    I_yy_host[k] = I_yy_local;
+    I_zz_host[k] = I_zz_local;
+    I_xy_host[k] = I_xy_local;
+    I_xz_host[k] = I_xz_local;
+    I_yz_host[k] = I_yz_local;
+#endif
 
-  /* Pick a cell off the pool. */
-  res = cell_pool;
-  cell_pool = cell_pool->firstchild;
 
-  /* Clean up a few things. */
-  res->res = qsched_res_none;
-  res->firstchild = 0;
 
-  /* Return the cell. */
-  return res;
 }
 
-
 /**
  * @brief Sort the parts into eight bins along the given pivots and
  *        fill the multipoles. Also adds the hierarchical resources
- *        to the sched.
+ *        to the sched (TODO).
  *
  * @param c The #cell to be split.
  * @param N The total number of parts.
  * @param s The #sched to store the resources.
  */
-void cell_split(struct cell *c, struct qsched *s) {
-
-  int i, j, k, kk, count = c->count;
-  struct part temp, *parts = c->parts;
-  struct cell *cp;
-  int left[8], right[8];
-  double pivot[3];
-  static struct cell *root = NULL;
-  struct cell *progenitors[8];
-
-  /* Set the root cell. */
-  if (root == NULL) {
-    root = c;
-    c->sibling = 0;
-  }
-
-  /* Add a resource for this cell if it doesn't have one yet. */
-  if (c->res == qsched_res_none)
-    c->res = qsched_addres(s, qsched_owner_none, qsched_res_none, NULL, 0 ,NULL);
-
-/* Attach a center-of-mass task to the cell. */
-#ifdef COM_AS_TASK
-  if (count > 0)
-    c->com_tid = qsched_addtask(s, task_type_com, task_flag_none, &c,
-                                sizeof(struct cell *), 1);
-#endif
-
-  /* Does this cell need to be split? */
-  if (count > cell_maxparts) {
-
-    /* Mark this cell as split. */
-    c->split = 1;
-
-    /* Create the progeny. */
-    for (k = 0; k < 8; k++) {
-      progenitors[k] = cp = cell_get();
-      cp->loc[0] = c->loc[0];
-      cp->loc[1] = c->loc[1];
-      cp->loc[2] = c->loc[2];
-      cp->h = c->h * 0.5;
-      cp->res = qsched_addres(s, qsched_owner_none, c->res, NULL, 0, NULL);
-      if (k & 4) cp->loc[0] += cp->h;
-      if (k & 2) cp->loc[1] += cp->h;
-      if (k & 1) cp->loc[2] += cp->h;
-    }
-
-    /* Init the pivots. */
-    for (k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->h * 0.5;
-
-    /* Split along the x-axis. */
-    i = 0;
-    j = count - 1;
-    while (i <= j) {
-      while (i <= count - 1 && parts[i].x[0] < pivot[0]) i += 1;
-      while (j >= 0 && parts[j].x[0] >= pivot[0]) j -= 1;
-      if (i < j) {
-        temp = parts[i];
-        parts[i] = parts[j];
-        parts[j] = temp;
-      }
-    }
-    left[1] = i;
-    right[1] = count - 1;
-    left[0] = 0;
-    right[0] = j;
-
-    /* Split along the y axis, twice. */
-    for (k = 1; k >= 0; k--) {
-      i = left[k];
-      j = right[k];
-      while (i <= j) {
-        while (i <= right[k] && parts[i].x[1] < pivot[1]) i += 1;
-        while (j >= left[k] && parts[j].x[1] >= pivot[1]) j -= 1;
-        if (i < j) {
-          temp = parts[i];
-          parts[i] = parts[j];
-          parts[j] = temp;
-        }
-      }
-      left[2 * k + 1] = i;
-      right[2 * k + 1] = right[k];
-      left[2 * k] = left[k];
-      right[2 * k] = j;
+void cell_split(int c, struct qsched *s) {
+    int i, j, k, kk, count = cell_pool[c].count;
+    int parts = cell_pool[c].parts;
+    struct part temp_part;
+    struct cell *cp, *cell;
+    int left[8], right[8];
+    double pivot[3];
+    static int root = -1;
+    int progenitors[8];
+    struct part *temp_xy;
+
+    /* Set the root cell. */
+    if (root < 0) {
+        root = c;
+        cell_pool[c].sibling = -1;
     }
 
-    /* Split along the z axis, four times. */
-    for (k = 3; k >= 0; k--) {
-      i = left[k];
-      j = right[k];
-      while (i <= j) {
-        while (i <= right[k] && parts[i].x[2] < pivot[2]) i += 1;
-        while (j >= left[k] && parts[j].x[2] >= pivot[2]) j -= 1;
-        if (i < j) {
-          temp = parts[i];
-          parts[i] = parts[j];
-          parts[j] = temp;
-        }
-      }
-      left[2 * k + 1] = i;
-      right[2 * k + 1] = right[k];
-      left[2 * k] = left[k];
-      right[2 * k] = j;
-    }
-
-    /* Store the counts and offsets. */
-    for (k = 0; k < 8; k++) {
-      progenitors[k]->count = right[k] - left[k] + 1;
-      progenitors[k]->parts = &c->parts[left[k]];
+    if(cell_pool[c].res == qsched_res_none)
+    {
+        if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[c].parts], 0) != cudaSuccess )
+            error("Failed to get host device pointer.");
+        cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
+                                        sizeof(struct part) * cell_pool[c].count, parts_temp + cell_pool[c].parts);
     }
 
-    /* Find the first non-empty progenitor */
-    for (k = 0; k < 8; k++)
-      if (progenitors[k]->count > 0) {
-        c->firstchild = progenitors[k];
-        break;
-      }
+    if(count > cell_maxparts )
+    {
+        cell_pool[c].split = 1;
 
-#ifdef SANITY_CHECKS
-    if (c->firstchild == NULL)
-      error("Cell has been split but all progenitors have 0 particles");
-#endif
+        for(k = 0; k < 8; k++)
+        {
+            progenitors[k] = (cp = cell_get()) - cell_pool;
+            cell = &cell_pool[c];
+            cp->loc_xy = cell->loc_xy;
+            cp->loc_z = cell->loc_z;
+            cp->h = cell->h*0.5;
+            if(k & 4) cp->loc_xy.x += cp->h;
+            if(k & 2) cp->loc_xy.y += cp->h;
+            if(k & 1) cp->loc_z += cp->h;
+        }
 
-    /* Prepare the pointers. */
-    for (k = 0; k < 8; k++) {
+        /* Init the pivots.*/
+        pivot[0] = cell->loc_xy.x + cell->h * 0.5;
+        pivot[1] = cell->loc_xy.y + cell->h * 0.5;
+        pivot[2] = cell->loc_z + cell->h * 0.5;
 
-      /* Find the next non-empty sibling */
-      for (kk = k + 1; kk < 8; ++kk) {
-        if (progenitors[kk]->count > 0) {
-          progenitors[k]->sibling = progenitors[kk];
-          break;
+        /* Split along the x axis. */
+        i = parts;
+        j = parts+count-1;
+        while(i < j)
+        {
+            while(i <= parts+count-1 && parts_host[i].loc[0] < pivot[0]) i += 1;
+            while(j >= parts && parts_host[j].loc[0] >= pivot[0]) j -= 1;
+            if(i < j){
+                temp_part = parts_host[i];
+                parts_host[i] = parts_host[j];
+                parts_host[j] = temp_part;
+            }
+        }
+        left[1] = i;
+        right[1] = parts+count-1;
+        left[0] = parts;
+        right[0] = j;
+
+        /* Split along the y axis twice. */
+        for (k = 1; k >= 0; k--) {
+            i = left[k];
+            j = right[k];
+            while(i <= j){
+                while(i <= right[k] && parts_host[i].loc[1] < pivot[1]) i += 1;
+                while(j >= left[k] && parts_host[j].loc[1] >= pivot[1]) j -= 1;
+                if(i < j)
+                {
+                    temp_part = parts_host[i];
+                    parts_host[i] = parts_host[j];
+                    parts_host[j] = temp_part;
+                }
+            }
+            left[2*k+1] = i;
+            right[2*k+1] = right[k];
+            left[2*k] = left[k];
+            right[2*k] = j;
+        }
+        /* Split along the z axis four times.*/
+        for(k = 3; k >=0; k--)
+        {
+            i = left[k];
+            j = right[k];
+            while(i <= j){
+                while(i <= right[k] && parts_host[i].loc[2] < pivot[2]) i += 1;
+                while(j >= left[k] && parts_host[j].loc[2] >= pivot[2]) j -= 1;
+                if(i < j)
+                {
+                    temp_part = parts_host[i];
+                    parts_host[i] = parts_host[j];
+                    parts_host[j] = temp_part;
+                }
+            }
+            left[2 * k + 1] = i;
+            right[2 * k + 1] = right[k];
+            left[2 * k] = left[k];
+            right[2 * k] = j;
         }
-      }
 
-      /* No non-empty sibling ? Go back a level */
-      if (kk == 8) progenitors[k]->sibling = c->sibling;
-    }
+        /* Store the counts and offsets. */
+        for(k = 0; k < 8; k++)
+        {
+            cell_pool[progenitors[k]].count = right[k]-left[k]+1;
+            cell_pool[progenitors[k]].parts = left[k];
+            if(cell_pool[progenitors[k]].count > 0){
+                if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
+                    error("Failed to get host device pointer.");
+                cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy,
+                                        sizeof(struct part) * cell_pool[progenitors[k]].count, parts_temp + cell_pool[progenitors[k]].parts);
+            }
+        }
 
-    /* Recurse. */
-    for (k = 0; k < 8; k++)
-      if (progenitors[k]->count > 0) cell_split(progenitors[k], s);
+        /* Find the first non-empty progenitor */
+        for(k = 0; k < 8; k++)
+        {
+            if(cell_pool[progenitors[k]].count > 0)
+            {
+                cell->firstchild = progenitors[k];
+                break;
+            }
+        }
 
-/* Link the COM tasks. */
-#ifdef COM_AS_TASK
-    for (k = 0; k < 8; k++)
-      if (progenitors[k]->count > 0)
-        qsched_addunlock(s, progenitors[k]->com_tid, c->com_tid);
-#endif
+        #ifdef SANITY_CHECKS
+            if(cell->firstchild == -1)
+                error("Cell has been split but all children have 0 parts");
+        #endif
 
-    /* Otherwise, we're at a leaf, so create the cell's particle-cell task. */
-  } else {
-    struct cell *data[2] = {root, c};
-    int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
-                             2 * sizeof(struct cell *), 1);
-    qsched_addlock(s, tid, c->res);
-#ifdef COM_AS_TASK
-    qsched_addunlock(s, root->com_tid, tid);
-#endif
-  } /* does the cell need to be split? */
+        /*Prepare the pointers*/
+        for(k = 0; k < 8; k++)
+        {
+            /* Find the next non-empty sibling */
+            for(kk = k+1; kk < 8; ++kk){
+                if(cell_pool[progenitors[kk]].count > 0){
+                    cell_pool[progenitors[k]].sibling = progenitors[kk];
+                    break;
+                }
+            }
 
-/* Compute the cell's center of mass. */
-#ifndef COM_AS_TASK
-  comp_com(c);
-#endif
+            /* No non-empty sibling, go back a level.*/
+            if(kk == 8) cell_pool[progenitors[k]].sibling = cell->sibling;
 
-  /* Set this cell's resources ownership. */
-  qsched_res_own(s, c->res,
-                 s->nr_queues * (c->parts - root->parts) / root->count);
-}
+        }
 
-/**
- * @brief Checks whether the cells are direct neighbours ot not. Both cells have
- * to be of the same size
- */
-static inline int are_neighbours(struct cell *ci, struct cell *cj) {
+        /* Recurse */
+        for(k = 0; k < 8; k++)
+            if(cell_pool[progenitors[k]].count > 0) cell_split(progenitors[k], s);
+     
+    /* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */   
+    } /*else {
 
-  int k;
-  float dx[3];
+//    struct cell *data[2] = {root, c};
+    int data[2] = {root, c};
+        int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
+                                 2 * sizeof(int), 3000);
+        qsched_addlock(s, tid, cell_pool[c].res);
+    }*/
 
-#ifdef SANITY_CHECKS
-  if (ci->h != cj->h)
-    error(" Cells of different size in distance calculation.");
+#ifndef COM_AS_TASK
+    comp_com(&cell_pool[c]);
 #endif
-
-  /* Maximum allowed distance */
-  float min_dist = ci->h;
-
-  /* (Manhattan) Distance between the cells */
-  for (k = 0; k < 3; k++) {
-    float center_i = ci->loc[k];
-    float center_j = cj->loc[k];
-    dx[k] = fabs(center_i - center_j);
-  }
-
-  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
 }
 
 /**
@@ -906,7 +933,8 @@ static inline int are_neighbours(struct cell *ci, struct cell *cj) {
 void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
 
   qsched_task_t tid;
-  struct cell *data[2], *cp, *cps;
+  int   data[2];
+  struct cell *cp, *cps;
 
 #ifdef SANITY_CHECKS
 
@@ -919,16 +947,16 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
   if (cj == NULL) {
 
     /* Is this cell split and above the task limit ? */
-    if (ci->split && ci->count > task_limit / ci->count) {
+    if (ci->split /*&& ci->count > task_limit / ci->count*/) {
 
       /* Loop over each of this cell's progeny. */
-      for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
+      for (cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]) {
 
         /* Make self-interaction task. */
         create_tasks(s, cp, NULL);
 
         /* Make all pair-interaction tasks. */
-        for (cps = cp->sibling; cps != ci->sibling; cps = cps->sibling)
+        for (cps = &cell_pool[cp->sibling]; cps != &cell_pool[ci->sibling]; cps = &cell_pool[cps->sibling])
           create_tasks(s, cp, cps);
       }
 
@@ -936,18 +964,16 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
     } else {
 
       /* Set the data. */
-      data[0] = ci;
-      data[1] = NULL;
+      data[0] = ci - cell_pool;
+      data[1] = -1;
 
       /* Create the task. */
-      tid =
-          qsched_addtask(s, task_type_self, task_flag_none, data,
-                         sizeof(struct cell *) * 2, ci->count * ci->count / 2);
-
-      /* Add the resource (i.e. the cell) to the mpole task. */
+      tid = qsched_addtask(s, task_type_self, task_flag_none, data,
+                           sizeof(int) * 2, ci->count * ci->count * 0.05);
+      /* Add the resource (i.e. the cell) to the new task. */
       qsched_addlock(s, tid, ci->res);
 
-/* If this call might recurse, add a dependency on the cell's COM
+/* Since this call might recurse, add a dependency on the cell's COM
    task. */
 #ifdef COM_AS_TASK
       if (ci->split) qsched_addunlock(s, ci->com_tid, tid);
@@ -957,304 +983,583 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
     /* Otherwise, it's a pair. */
   } else {
 
-    /* Are the cells NOT neighbours ? */
-    if (!are_neighbours(ci, cj)) {
+    /* Are both cells split ? */
+    if (ci->split && cj->split) {
+      
+      /* Let's split both cells and build all possible pairs */
+      for (cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]) {
+        for (cps = &cell_pool[cj->firstchild]; cps != &cell_pool[cj->sibling]; cps = &cell_pool[cps->sibling]) {
+
+          /* Recurse */
+          if (are_neighbours_host(cp, cps)) {
+            create_tasks(s, cp, cps);
+          }
+        }
+      }
 
-    } else {/* Cells are direct neighbours */
+      if (ci->count > 64 * cell_maxparts) {
+
+        /* Let's also build a particle-monopole task per child. */
+        for (cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]) {
+          /* Create the task. */
+          data[0] = cp - cell_pool;
+          data[1] = cj - cell_pool;
+          tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data,
+                               sizeof(int) * 2, cp->count * cj->count);
+
+          /* Add the resource and dependency. */
+          qsched_addlock(s, tid, cp->res);
+//        qsched_addlock(s, tid, cj->res);
+        }
+        for (cp = &cell_pool[cj->firstchild]; cp != &cell_pool[cj->sibling]; cp = &cell_pool[cp->sibling]) {
+          /* Create the task. */
+          data[0] = cp - cell_pool;
+          data[1] = ci - cell_pool;
+          tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data,
+                               sizeof(int) * 2, cp->count * ci->count);
+
+          /* Add the resource and dependency. */
+          qsched_addlock(s, tid, cp->res);
+//        qsched_addlock(s, tid, ci->res);
+        }
+      } else {
+        data[0] = ci - cell_pool;
+        data[1] = cj - cell_pool;
+        tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data,
+                             sizeof(int) * 2, ci->count * cj->count);
+        /* Add the resource and dependency. */
+        qsched_addlock(s, tid, ci->res);
+        data[0] = cj - cell_pool;
+        data[1] = ci - cell_pool;
+        tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data,
+                             sizeof(int) * 2, ci->count * cj->count);
+        qsched_addlock(s, tid, cj->res);
+      }
 
-      /* Are both cells split ? */
-      if (ci->split && cj->split && ci->count > task_limit / cj->count) {
+#ifdef COM_AS_TASK
+      qsched_addunlock(s, cj->com_tid, tid);
+      qsched_addunlock(s, ci->com_tid, tid);
+#endif
 
-        /* Let's split both cells and build all possible pairs */
-        for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
-          for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
-            /* Recurse */
-            create_tasks(s, cp, cps);
-          }
-        }
-        /* Otherwise, at least one of the cells is not split, build a direct
-         * interaction. */
-      } else {
+
+    } else { /* Otherwise, at least one of the cells is not split, build a
+                direct interaction. */
 
-        /* Set the data. */
-        data[0] = ci;
-        data[1] = cj;
+      /* Set the data. */
+      data[0] = ci - cell_pool;
+      data[1] = cj - cell_pool;
+
+      /* Create the task. */
+      tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
+                           sizeof(int) * 2, ci->count * cj->count * 0.1);
+
+      /* Add the resources. */
+      qsched_addlock(s, tid, ci->res);
+      qsched_addlock(s, tid, cj->res);
 
-        /* Create the task. */
-        tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
-                             sizeof(struct cell *) * 2, ci->count * cj->count);
+    }
 
-        /* Add the resources. */
-        qsched_addlock(s, tid, ci->res);
-        qsched_addlock(s, tid, cj->res);
-      }
-    } /* Cells are direct neighbours */
   }   /* Otherwise it's a pair */
 }
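+
+/* Note on the traversal above: cells reference their children and siblings by
+ * index into cell_pool rather than by pointer, so the same structures remain
+ * valid once they are copied wholesale to the GPU. The two conversions used
+ * throughout (illustrative only):
+ *
+ *   int idx = ci - cell_pool;            // pointer -> pool index
+ *   struct cell *cp = &cell_pool[idx];   // pool index -> pointer
+ */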
 
+
+/**
+ * @brief Compute the interactions between all particles in a cell.
+ *
+ * @param cellID The cell ID to compute interactions on.
+ */
+__device__ __forceinline__ void iact_self_direct(int cellID) {
+    struct cell *c = &cells[cellID];
+    double xi[3] = {0.0, 0.0, 0.0};
+    float ai[3] = {0.0, 0.0, 0.0 }, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
+    int parts;
+    int count;
+    int i,j,k;
+
+    parts = c->parts;
+    count = c->count;
+    for (i = parts + threadIdx.x; i < parts + count; i += blockDim.x) {
+        xi[0] = parts_cuda[i].loc[0];
+        xi[1] = parts_cuda[i].loc[1];
+        xi[2] = parts_cuda[i].loc[2];
+        for (k = 0; k < 3; k++) {
+            ai[k] = 0.0;
+        }
+
+        for (j = parts; j < parts + count; j++) {
+            if (i != j) {
+
+                /* Compute the pairwise distance. */
+                r2 = 0.0;
+                dx[0] = xi[0] - parts_cuda[j].loc[0];
+                r2 += dx[0] * dx[0];
+                dx[1] = xi[1] - parts_cuda[j].loc[1];
+                r2 += dx[1] * dx[1];
+                dx[2] = xi[2] - parts_cuda[j].loc[2];
+                r2 += dx[2] * dx[2];
+
+                /* Apply the gravitational acceleration. */
+                ir = rsqrtf(r2);
+                w = const_G * ir * ir * ir;
+                mj = parts_cuda[j].m;
+                for (k = 0; k < 3; k++) {
+                    ai[k] -= w * dx[k] * mj;
+                }
+            }
+        }
+
+        /* Update. */
+        atomicAdd(&parts_cuda[i].a[0], ai[0]);
+        atomicAdd(&parts_cuda[i].a[1], ai[1]);
+        atomicAdd(&parts_cuda[i].a[2], ai[2]);
+    }
+}
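+
+/* The loop above is block-strided: thread t of the block handles particles
+ * parts + t, parts + t + blockDim.x, ..., so a whole thread block cooperates
+ * on a single self task. The accumulated accelerations are committed with
+ * atomicAdd since, presumably, pair and particle-monopole tasks running in
+ * other blocks may update the same particles concurrently. */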
+
+#ifdef EXACT
+__global__ void interact_exact(int count){
+    
+    int number = blockIdx.x * blockDim.x + threadIdx.x;
+    int j, k;
+    double dx[3], ir, r2, w;
+    if(number < count)
+    {
+        double pix[3] = {parts_cuda[number].loc[0], parts_cuda[number].loc[1], parts_cuda[number].loc[2]};
+        //float mi = parts_cuda[number].m;
+        for(j = 0; j < count; j++)
+        {
+            if(j != number)
+            {
+                for (r2 = 0.0, k = 0; k < 3; k++) {
+                    dx[k] = parts_cuda[j].loc[k] - pix[k];
+                    r2 += dx[k] * dx[k];
+                }
+                ir = rsqrt(r2);
+                w = const_G * ir * ir * ir;
+
+                for(k = 0; k < 3; k++)
+                {
+                    atomicAdd(&parts_cuda[number].a[k] , w * dx[k] * parts_cuda[j].m);
+                }
+            }
+        }
+
+    }
+}
+#endif
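+
+/* interact_exact bypasses the scheduler entirely; it is launched further down
+ * in test_bh() with one thread per particle:
+ *
+ *   int numblocks = N / 128 + 1;
+ *   interact_exact<<<numblocks, 128>>>(N);
+ */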
+
+__device__ __forceinline__ void runner( int type , void *data ) {
+
+    int *idata = (int *)data;
+    int i = idata[0];
+    int j = idata[1];
+    switch ( type ) {
+        case task_type_self:
+            iact_self_direct(i);
+            break;
+        case task_type_pair:
+            iact_pair_direct(&cells[i], &cells[j]);
+            break;
+        case task_type_pair_pc:
+            iact_pair_pc(&cells[i], &cells[j]);
+            break;
+        case task_type_pair_pc_split:
+            iact_pair_pc_split(&cells[i], &cells[j]);
+            break;
+        default:
+            printf("Unknown task type %i in runner.\n", type);
+            asm("trap;");
+    }
+    __syncthreads();
+}
+
+__device__ qsched_funtype function = runner;
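+
+/* The host cannot take the address of a __device__ function directly, so the
+ * runner is stored in the device symbol above and fetched on the host with
+ * cudaMemcpyFromSymbol() at the start of test_bh() below, before being handed
+ * to qsched_run_CUDA(). */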
+
+int calcMaxDepth(struct cell *c, int depth)
+{
+    struct cell *cp;
+    if(c->split)
+    {
+        int maxd = 0, temp;
+        for(cp = &cell_pool[c->firstchild]; cp != &cell_pool[c->sibling]; cp = &cell_pool[cp->sibling])
+        {
+            temp = calcMaxDepth(cp, depth+1);
+            if(temp > maxd)
+                maxd = temp;
+        }
+        return maxd;
+    } else {
+        return depth+1;
+    }
+}
+
+#ifdef QUADRUPOLES
+void update_quads() {
+    int k;
+    for( k = 0; k < used_cells; k++)
+    {
+        double imass = 1. / com_mass_host[k];
+        I_xx_host[k] = I_xx_host[k] * imass - com_xy_host[k].x * com_xy_host[k].x;    
+        I_xy_host[k] = I_xy_host[k] * imass - com_xy_host[k].x * com_xy_host[k].y;    
+        I_xz_host[k] = I_xz_host[k] * imass - com_xy_host[k].x * com_z_host[k];    
+        I_yy_host[k] = I_yy_host[k] * imass - com_xy_host[k].y * com_xy_host[k].y;    
+        I_yz_host[k] = I_yz_host[k] * imass - com_xy_host[k].y * com_z_host[k];    
+        I_zz_host[k] = I_zz_host[k] * imass - com_z_host[k] * com_z_host[k];    
+    }
+}
+#endif
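+
+/* update_quads() converts the raw second moments accumulated during the tree
+ * build (presumably sum_i m_i x_a x_b in the I_ab_host arrays, alongside the
+ * centre-of-mass data in com_*_host) into quadrupole terms about the centre
+ * of mass, component by component:
+ *
+ *   I_ab = (sum_i m_i x_a x_b) / M  -  com_a * com_b
+ */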
+qsched_funtype func;
 /**
  * @brief Set up and run a task-based Barnes-Hutt N-body solver.
  *
  * @param N The number of random particles to use.
- * @param nr_threads Number of threads to use.
  * @param runs Number of force evaluations to use as a benchmark.
  * @param fileName Input file name. If @c NULL or an empty string, random
  *        particle positions will be used.
  */
-void test_bh(int N, int nr_threads, int runs, char *fileName) {
-
+void test_bh(int N, int runs, char *fileName) {
   int i, k;
   struct cell *root;
-  struct part *parts;
   FILE *file;
   struct qsched s;
-  ticks tic, toc_run, tot_setup = 0, tot_run = 0;
-  int countMultipoles = 0, countPairs = 0, countCoMs = 0;
+  struct cell *gpu_ptr_cells;
+  double2 *com_temp;
+  double *comz_temp;
+  float *comm_temp;
 
-  /* Initialize the per-task type timers. */
-//  for (k = 0; k < task_type_count; k++) task_timers[k] = 0;
+  cudaFree(0);
+  cudaThreadSetCacheConfig(cudaFuncCachePreferL1);
+  if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
+    error("Failed to copy function pointer from device");
 
   /* Initialize the scheduler. */
   qsched_init(&s, 1, qsched_flag_none);
 
-  /* Init and fill the particle array. */
-  if ((parts = (struct part *)malloc(sizeof(struct part) * N)) == NULL)
-    error("Failed to allocate particle buffer.");
-
-    double *partsx0, *partsx1, *partsx2;
-    float *partsa0, *partsa1, *partsa2, *partsmass;
-
-    if ((partsx0 = (double*)malloc(sizeof(double)*N)) == NULL)
-        error("Failed to allocate particle buffer.");
-    if ((partsx1 = (double*)malloc(sizeof(double)*N)) == NULL)
-        error("Failed to allocate particle buffer.");
-    if ((partsx2 = (double*)malloc(sizeof(double)*N)) == NULL)
-        error("Failed to allocate particle buffer.");
-
-    if ((partsa0 = (float*)malloc(sizeof(float)*N)) == NULL)
-        error("Failed to allocate particle buffer.");
-    if ((partsa1 = (float*)malloc(sizeof(float)*N)) == NULL)
-        error("Failed to allocate particle buffer.");
-    if ((partsa2 = (float*)malloc(sizeof(float)*N)) == NULL)
-        error("Failed to allocate particle buffer.");
-    if ((partsmass = (float*)malloc(sizeof(float)*N)) == NULL)
-        error("Failed to allocate particle buffer.");
-
-
-
-
-  /* If no input file was specified, generate random particle positions. */
-  if (fileName == NULL || fileName[0] == 0) {
-    for (k = 0; k < N; k++) {
-      parts[k].id = k;
-      parts[k].x[0] = ((double)rand()) / RAND_MAX;
-      parts[k].x[1] = ((double)rand()) / RAND_MAX;
-      parts[k].x[2] = ((double)rand()) / RAND_MAX;
-      parts[k].mass = ((double)rand()) / RAND_MAX;
-      parts[k].a_legacy[0] = 0.0;
-      parts[k].a_legacy[1] = 0.0;
-      parts[k].a_legacy[2] = 0.0;
+    //Create host particle arrays.
+    if( cudaMallocHost(&parts_host, sizeof(struct part) * N) != cudaSuccess)
+        error("Failed to allocate parts array");
+
+    if( cudaMalloc(&parts_temp, sizeof(struct part) * N) != cudaSuccess)
+        error("Failed to allocate device parts array");
+    if( cudaMemcpyToSymbol(parts_cuda, &parts_temp, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to set device symbol for parts array");
+
+    if(fileName == NULL || fileName[0] == 0) {
+        for(k = 0; k < N; k++) {
+            parts_host[k].loc[0] = ((double)rand())/ RAND_MAX;
+            parts_host[k].loc[1] = ((double)rand())/ RAND_MAX;
+            parts_host[k].loc[2] = ((double)rand())/ RAND_MAX;
+            parts_host[k].m = ((double)rand()) / RAND_MAX;
+        }
+    
+    } else {
+        file = fopen(fileName, "r");
+        #ifndef ID
+        long long int tempxy;
+        #endif
+//        double temp;
+        printf("Reading input from file\n");
+        if(file) {
+            for(k = 0; k < N; k++) {
+#ifdef ID
+                if(fscanf(file, "%d", &parts_host[k].id) != 1)
+#else
+                if(fscanf(file, "%Ld", &tempxy) != 1)
+#endif
+                    error("Failed to read ID");
+#ifndef double
+                if(fscanf(file, "%f", &parts_host[k].m) != 1)
+                    error("Failed to read mass of part %i.", k);
+                if(fscanf(file, "%lf %lf %lf", &parts_host[k].loc[0], &parts_host[k].loc[1], &parts_host[k].loc[2]) != 3)
+                    error("Failed to read position of part %i.", k);
+                /* if(fscanf(file,"%lf %lf %lf %lf %lf %lf %lf %lf %lf", &temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp ) != 9)
+                       error("Failed to read extra stuff"); */
+#else
+                if(fscanf(file, "%f", &parts_host[k].m) != 1)
+                    error("Failed to read mass of part %i.", k);
+                if(fscanf(file, "%f %f %f", &parts_host[k].loc[0], &parts_host[k].loc[1], &parts_host[k].loc[2]) != 3)
+                    error("Failed to read position of part %i.", k);
+                /* if(fscanf(file,"%f %f %f %f %f %f %f %f %f", &temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp ) != 9)
+                       error("Failed to read extra stuff"); */
+#endif
+            }
+            fclose(file);
+        }
     }
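+
+    /* Each input row is thus expected to hold "id m x y z"; the ID column is
+     * stored only when ID is defined, and read into a scratch variable
+     * otherwise. */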
 
-    /* Otherwise, read them from a file. */
-  } else {
-    file = fopen(fileName, "r");
-    if (file) {
-      for (k = 0; k < N; k++) {
-        if (fscanf(file, "%d", &parts[k].id) != 1)
-          error("Failed to read ID of part %i.", k);
-        if (fscanf(file, "%lf%lf%lf", &parts[k].x[0], &parts[k].x[1],
-                   &parts[k].x[2]) !=
-            3)
-          error("Failed to read position of part %i.", k);
-        if (fscanf(file, "%f", &parts[k].mass) != 1)
-          error("Failed to read mass of part %i.", k);
-
-          partsx0[k] = parts[k].x[0];
-          partsx1[k] = parts[k].x[1];
-          partsx2[k] = parts[k].x[2];
-          partsmass[k] = parts[k].mass;
-          partsa0[k] = 0.0f;
-          partsa1[k] = 0.0f;
-          partsa2[k] = 0.0f;
-      }
-      fclose(file);
+    /* Init the cells. */
+    root = cell_get();
+    root->loc_xy.x = 0.0;
+    root->loc_xy.y = 0.0;
+    root->loc_z = 0.0;
+    root->h = 1.0;
+    root->count = N;
+    root->parts = 0;
+    root->sibling = -1;
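+    /* cell_split() may grow, and therefore reallocate, cell_pool, which would
+     * leave the root pointer dangling; keep the pool index and refresh the
+     * pointer afterwards. */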
+    int c = root - cell_pool;
+    cell_split(root - cell_pool, &s);
+    root = &cell_pool[c];
+    int nr_leaves = 0;
+    int maxparts = 0, minparts = 1000000;
+    int number = 0;
+    while(c >= 0) {
+        if(cell_pool[c].count > 0)
+        {
+            number++;
+            if(cell_pool[c].res == qsched_res_none)
+               message("cell %i has no res", c);
+        }
+        if(!cell_pool[c].split) {
+            nr_leaves++;
+            if(cell_pool[c].count > maxparts)
+            {
+                maxparts = cell_pool[c].count;
+            }
+            if(cell_pool[c].count < minparts)
+            {
+                minparts = cell_pool[c].count;
+            }
+            c = cell_pool[c].sibling;
+        } else {
+            c = cell_pool[c].firstchild;
+        }
     }
-  }
+    message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
+    message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
+    
+    create_tasks(&s, root, NULL);    
+    #ifdef QUADRUPOLES
+    update_quads();
+    /* Copy the quadrupoles to the device. */
+
+    if( cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess)
+        error("Failed to allocate Quads on the GPU");
+    if(cudaMemcpy(comz_temp, I_xx_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy Quads to the GPU");
+    if( cudaMemcpyToSymbol( I_xx, &comz_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy Quads pointer to the GPU");
+
+    if( cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess)
+        error("Failed to allocate Quads on the GPU");
+    if(cudaMemcpy(comz_temp, I_xy_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy Quads to the GPU");
+    if( cudaMemcpyToSymbol( I_xy, &comz_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy Quads pointer to the GPU");
+
+    if( cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess)
+        error("Failed to allocate Quads on the GPU");
+    if(cudaMemcpy(comz_temp, I_xz_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy Quads to the GPU");
+    if( cudaMemcpyToSymbol( I_xz, &comz_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy Quads pointer to the GPU");
+
+    if( cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess)
+        error("Failed to allocate Quads on the GPU");
+    if(cudaMemcpy(comz_temp, I_yy_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy Quads to the GPU");
+    if( cudaMemcpyToSymbol( I_yy, &comz_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy Quads pointer to the GPU");
+
+    if( cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess)
+        error("Failed to allocate Quads on the GPU");
+    if(cudaMemcpy(comz_temp, I_yz_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy Quads to the GPU");
+    if( cudaMemcpyToSymbol( I_yz, &comz_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy Quads pointer to the GPU");
+
+    if( cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess)
+        error("Failed to allocate Quads on the GPU");
+    if(cudaMemcpy(comz_temp, I_zz_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy Quads to the GPU");
+    if( cudaMemcpyToSymbol( I_zz, &comz_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy Quads pointer to the GPU");
+    #endif
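+    /* The six uploads above repeat the same malloc/memcpy/memcpyToSymbol
+     * pattern; a small helper (a sketch, not part of the original code) could
+     * cut the repetition, taking the address of the __device__ pointer as the
+     * symbol argument:
+     *
+     *   static void upload_quad(const void *symbol, double *host, int n) {
+     *       double *dev;
+     *       if (cudaMalloc(&dev, sizeof(double) * n) != cudaSuccess ||
+     *           cudaMemcpy(dev, host, sizeof(double) * n,
+     *                      cudaMemcpyHostToDevice) != cudaSuccess ||
+     *           cudaMemcpyToSymbol(symbol, &dev, sizeof(double *), 0,
+     *                              cudaMemcpyHostToDevice) != cudaSuccess)
+     *           error("Failed to upload a quadrupole array");
+     *   }
+     *
+     *   upload_quad(&I_xx, I_xx_host, used_cells);
+     */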
 
-  /* Init the cells. */
-  root = cell_get();
-  root->loc[0] = 0.0;
-  root->loc[1] = 0.0;
-  root->loc[2] = 0.0;
-  root->h = 1.0;
-  root->count = N;
-  root->parts = parts;
-  cell_split(root, &s);
-
-  /* Iterate over the cells and get the average number of particles
-     per leaf. */
-  struct cell *c = root;
-  int nr_leaves = 0;
-  while (c != NULL) {
-    if (!c->split) {
-      nr_leaves++;
-      c = c->sibling;
-    } else {
-      c = c->firstchild;
+    int self = 0, pair = 0, pc = 0;
+
+    for(k = 0; k < s.count; k++)
+    {
+        if(s.tasks[k].type == task_type_self)
+            self++;
+        else if (s.tasks[k].type == task_type_pair)
+            pair++;
+        else if (s.tasks[k].type >= 0)
+            pc++;
     }
-  }
-  message("Average number of parts per leaf is %f.", ((double)N) / nr_leaves);
-
-  for(k = 0; k < N; k++)
-  {
-      partsx0[k] = parts[k].x[0];
-      partsx1[k] = parts[k].x[1];
-      partsx2[k] = parts[k].x[2];
-      partsmass[k] = parts[k].mass;
-      partsa0[k] = 0.0f;
-      partsa1[k] = 0.0f;
-      partsa2[k] = 0.0f;
-  }
 
-#if ICHECK > 0
-  printf("----------------------------------------------------------\n");
+    message("total number of tasks: %i.", s.count);
+    message("total number of pair tasks: %i.", pair);
+    message("total number of self tasks: %i.", self);
+    message("total number of pc tasks: %i.", pc);
+    message("total number of cells: %i.", number);
+    message("total number of deps: %i.", s.count_deps);
+    message("total number of res: %i.", s.count_res);
+    message("total number of locks: %i.", s.count_locks);
+
+    for(k = 0; k < runs; k++) {
+        for(i = 0; i < N; ++i) {
+            parts_host[i].a[0] = 0.0;
+            parts_host[i].a[1] = 0.0;
+            parts_host[i].a[2] = 0.0;
+        }
 
-  /* Do a N^2 interactions calculation */
 
-  ticks tic_exact = getticks();
-  interact_exact(N, parts, ICHECK);
-  ticks toc_exact = getticks();
+    /* Copy the cells to the device. */
+    if( cudaMalloc( &gpu_ptr_cells , sizeof(struct cell) * used_cells) != cudaSuccess)
+        error("Failed to allocate cells on the GPU");
+    if( cudaMemcpy( gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy cells to the GPU");
+    if( cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell*), 0, cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy cell pointer to the GPU");
 
-  printf("Exact calculation (1 thread) took %lli (= %e) ticks\n",
-         toc_exact - tic_exact, (float)(toc_exact - tic_exact));
 
-  printf("----------------------------------------------------------\n");
-#endif
 
-  /* Create the tasks. */
-  tic = getticks();
-  create_tasks(&s, root, NULL);
-  tot_setup += getticks() - tic;
-
-  /* Dump the number of tasks. */
-  message("total nr of tasks: %i.", s.count);
-  message("total nr of deps: %i.", s.count_deps);
-  message("total nr of res: %i.", s.count_res);
-  message("total nr of locks: %i.", s.count_locks);
-  message("total nr of uses: %i.", s.count_uses);
-  int counts[task_type_count];
-  for (k = 0; k < task_type_count; k++) counts[k] = 0;
-  for (k = 0; k < s.count; k++) counts[s.tasks[k].type] += 1;
-
-  // char buffer[200];
-  // sprintf(buffer, "timings_legacy_%d_%d.dat", cell_maxparts, nr_threads);
-  // FILE *fileTime = fopen(buffer, "w");
-
-// fclose(fileTime);
-
-#if ICHECK >= 0
-  message("[check] accel of part %i is [%.3e,%.3e,%.3e]", ICHECK,
-          root->parts[ICHECK].a[0], root->parts[ICHECK].a[1],
-          root->parts[ICHECK].a[2]);
-#endif
-  printf("task counts: [ %8s %8s %8s %8s %8s ]\n", "self", "pair", "m-poles",
-         "direct", "CoMs");
-  printf("task counts: [ %8i %8i %8i %8i %8i ] (legacy).\n", 0, 0,
-         countMultipoles, countPairs, countCoMs);
-  printf("task counts: [ ");
-  for (k = 0; k < task_type_count; k++) printf("%8i ", counts[k]);
-  printf("] (mpole).\n");
-
-  /* Loop over the number of runs. */
-  for (k = 0; k < runs; k++) {
-
-    for (i = 0; i < N; ++i) {
-      parts[i].a[0] = 0.0;
-      parts[i].a[1] = 0.0;
-      parts[i].a[2] = 0.0;
-      partsa0[k] = 0.0f;
-      partsa1[k] = 0.0f;
-      partsa2[k] = 0.0f;
-    }
-    
-    /* Execute the tasks. */
-    tic = getticks();
-//    qsched_run(&s, nr_threads, runner);
-    toc_run = getticks();
-    message("%ith run took %lli (= %e) ticks...", k, toc_run - tic,
-            (float)(toc_run - tic));
-    tot_run += toc_run - tic;
-  }
+    if(cudaMalloc( &com_temp, sizeof(double2) * used_cells) != cudaSuccess)
+        error("Failed to allocate com_xy on the GPU");
+    if( cudaMemcpy( com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy com_xy to the GPU");
+    if( cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy com_xy pointer to the GPU");
 
-//  message("[check] root mass= %f %f", root->legacy.mass, root->mpole.mass);
-//  message("[check] root CoMx= %f %f", root->legacy.com[0], root->mpole.com[0]);
-//  message("[check] root CoMy= %f %f", root->legacy.com[1], root->mpole.com[1]);
-//  message("[check] root CoMz= %f %f", root->legacy.com[2], root->mpole.com[2]);
-#if ICHECK >= 0
-  for (i = 0; i < N; ++i)
-    if (root->parts[i].id == ICHECK)
-      message("[check] accel of part %i is [%.3e,%.3e,%.3e]", ICHECK,
-              root->parts[i].a[0], root->parts[i].a[1], root->parts[i].a[2]);
-#endif
 
-  /* Dump the tasks. */
-  /* for ( k = 0 ; k < s.count ; k++ )
-      printf( " %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid ,
-     s.tasks[k].tic , s.tasks[k].toc ); */
+    if(cudaMalloc( &comz_temp, sizeof(double) * used_cells) != cudaSuccess)
+        error("Failed to allocate com_z on the GPU");
+    if( cudaMemcpy( comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy com_z to the GPU");
+    if( cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy com_z pointer to the GPU");
+
+    if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
+        error("Failed to allocate com_mass on the GPU");
+    if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy com_mass to the GPU");
+    if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy com_mass pointer to the GPU");
+    printf("Max depth is %i\n", calcMaxDepth(root, 0));
+
+    #ifdef NO_LOADS
+    ticks tic_loads, tic;
+    double itpms = 1000.0 / CPU_TPS;
+    tic = getticks();
+    if( cudaMemcpy( parts_temp, parts_host, sizeof(struct part) * N, cudaMemcpyHostToDevice ) != cudaSuccess)
+        error("Failed to copy the particles to the device");
+    tic_loads = getticks() - tic;
+    printf("load %.3f \n", tic_loads * itpms );
+    #endif
+        qsched_run_CUDA( &s , func );
+        qsched_print_cuda_timers(&s);
+    #ifdef NO_LOADS
+    tic = getticks();
+    if(cudaMemcpy( parts_host, parts_temp, sizeof(struct part) * N, cudaMemcpyDeviceToHost) != cudaSuccess)
+        error("Failed to copy the particles from the device");
+    tic_loads = getticks() - tic;
+    printf("unload %.3f \n", tic_loads * itpms );
+    #endif
+
+    int pcs;
+    if( cudaMemcpyFromSymbol( &pcs, pc_calcs, sizeof(int), 0, cudaMemcpyDeviceToHost) != cudaSuccess)
+        error("Failed to copy the particle-cell interaction counter from the device");
+    printf("pc calcs = %i\n", pcs);
+
+    /* Dump the per-task timers: type, tic, toc and the block that ran each task. */
+    struct task *tasks = qsched_get_timers( &s , s.count );
+    for(i = 0; i < s.count; i++)
+    {
+        printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc, tasks[i].blockID);
+    }
 
-  /* Dump the costs. */
-  message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup,
-          tot_run / runs);
+    }
+#ifdef EXACT
+    struct part *parts2;
+    parts2 = (struct part*) malloc(sizeof(struct part) * N );
+    for(k = 0; k < N; k++)
+    {
+        parts2[k].loc[0] = parts_host[k].loc[0];
+        parts2[k].loc[1] = parts_host[k].loc[1];
+        parts2[k].loc[2] = parts_host[k].loc[2];
+        parts2[k].m = parts_host[k].m;
+        parts2[k].a[0] = 0.0f;
+        parts2[k].a[1] = 0.0f;
+        parts2[k].a[2] = 0.0f;
+    }
+//    cudaDeviceReset();
+    /* Reuse com_temp as scratch storage for the exact-solution particles. */
+    if(cudaMalloc(&com_temp, sizeof(struct part) * N) != cudaSuccess)
+        error("Couldn't allocate com_temp");
+    if(cudaMemcpy(com_temp, parts2, sizeof(struct part) * N, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Couldn't copy part data");
+    if(cudaMemcpyToSymbol(parts_cuda, &com_temp, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to set the device parts pointer");
+
+    int numblocks = N / 128 + 1;
+    double itpms = 1000.0 / CPU_TPS;
+    ticks tic, toc_run;
+    tic = getticks();
+    interact_exact<<<numblocks, 128>>>(N);
+    cudaDeviceSynchronize();
+    toc_run = getticks();
+    printf("Cuda kernel took %.3f ms to run exact solution\n", ((double)(toc_run - tic)) * itpms);
+    if( cudaMemcpy( parts2, com_temp, sizeof(struct part) * N, cudaMemcpyDeviceToHost) != cudaSuccess)
+        error("Failed to copy data back from device");    
 
-  /* Dump the timers. */
-  for (k = 0; k < qsched_timer_count; k++)
-    message("timer %s is %lli ticks.", qsched_timer_names[k],
-            s.timers[k] / runs);
+    printf("%e\n", parts_host[0].m);
 
-  /* Dump the per-task type timers. */
-//  printf("task timers: [ ");
-//  for (k = 0; k < task_type_count; k++) printf("%lli ", task_timers[k] / runs);
-  //printf("] ticks.\n");
 
+    k = 0;
+    printf("%e, %e, %e, %e, %e, %e, %e, %e, %e, %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
+           parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
+    #endif
   /* Dump the particles to a file */
   file = fopen("particle_dump.dat", "w");
-  fprintf(file,
+/*  fprintf(file,
           "# ID m x y z a_exact.x   a_exact.y    a_exact.z    a_legacy.x    "
-          "a_legacy.y    a_legacy.z    a_mpole.x     a_mpole.y    a_mpole.z\n");
+          "a_legacy.y    a_legacy.z    a_new.x     a_new.y    a_new.z\n");*/
+#ifdef EXACT
+    printf("Printing exact\n");
+    for(k = 0; k < N; ++k)
+    fprintf(file, "%i %e %e %e %e %e %e %e %e %e %e %e %e %e\n", 
+            #ifdef ID
+            parts_host[k].id,
+            #else
+            k,
+            #endif
+            parts_host[k].m,parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
+            parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts_host[k].a[0],
+            parts_host[k].a[1], parts_host[k].a[2]);
+
+#else
   for (k = 0; k < N; ++k)
-    fprintf(file, "%d %e %e %e %e %e %e %e %e %e %e %e %e %e\n", parts[k].id,
-            parts[k].mass, parts[k].x[0], parts[k].x[1], parts[k].x[2],
-            parts[k].a_exact[0], parts[k].a_exact[1], parts[k].a_exact[2],
-            parts[k].a_legacy[0], parts[k].a_legacy[1], parts[k].a_legacy[2],
-            parts[k].a[0], parts[k].a[1], parts[k].a[2]);
+    fprintf(file, "%i %e %e %e %e %e %e %e\n",
+            k, parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
+            parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
+#endif
   fclose(file);
-
-  /* Clean up. */
-  qsched_free(&s);
-  free(parts);
+    file = fopen("particle_pos.dat", "w");
+    fprintf(file, "m x[1] x[2] x[3]\n");
+    for(k = 0; k < N; k++)
+        fprintf(file, "%e %e %e %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2]);
+
+    fclose(file);
+    cudaFreeHost(parts_host);
+    //free(parts_host);
 }
 
-/**
- * @brief Main function.
- */
 
 int main(int argc, char *argv[]) {
-
-  int c, nr_threads;
-  int N = 1000, runs = 1;
-  char fileName[100] = {0};
-
-  /* Die on FP-exceptions. */
-  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
-
-/* Get the number of threads. */
-#pragma omp parallel shared(nr_threads)
-  {
-    if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
-  }
-
-  /* Parse the options */
+  int c, nr_threads;
+  int N = 1000, runs = 1;
+  char fileName[100];
+  fileName[0] = 0;
+
+  /* Parse the options */
   while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) {
       case 'n':
         if (sscanf(optarg, "%d", &N) != 1)
@@ -1267,7 +1572,6 @@ int main(int argc, char *argv[]) {
       case 't':
         if (sscanf(optarg, "%d", &nr_threads) != 1)
           error("Error parsing number of threads.");
-        omp_set_num_threads(nr_threads);
         break;
       case 'f':
         if (sscanf(optarg, "%s", &fileName[0]) != 1)
@@ -1304,7 +1608,5 @@ int main(int argc, char *argv[]) {
   }
 
   /* Run the BH test. */
-  test_bh(N, nr_threads, runs, fileName);
-
-  return 0;
+  test_bh(N, runs, fileName);
 }
diff --git a/examples/test_qr.cu b/examples/test_qr.cu
index 5446870ae5d44da45b7dd971e633e8f042782907..c37277ab7f51b7efd9f1ac4bb26bfb6261cd4a23 100644
--- a/examples/test_qr.cu
+++ b/examples/test_qr.cu
@@ -7,20 +7,22 @@
 #include <string.h>
 #include <unistd.h>
 #include <math.h>
-#include <gperftools/profiler.h>
+//#include <gperftools/profiler.h>
 
 
 //#define WITH_CULA
-#ifdef WITH_CULA
-#include <cula.h>
-#endif
+//#ifdef WITH_CULA
+//#include <cula.h>
+//#endif
 
 
 /* Local includes. */
 extern "C"{
 #include "quicksched.h"
 #include "res.h"
+#ifdef WITH_CBLAS
 #include <cblas.h>
+#endif
 }
 #include "cuda_queue.h"
 
@@ -95,6 +97,7 @@ void printMatrix(float *matrix, int m, int n, int tilesize)
  * This function is simply for validation and is implemented naively as we know
  * of no implementation to retrieve Q from the tiled QR.
  */
+#ifdef WITH_CBLAS
 float* computeQ(float* HR, int size, int tilesize, float* tau, int tauNum) {
   float* Q = (float*)malloc(sizeof(float) * size * size);
   float* Qtemp = (float*)malloc(sizeof(float) * size * size);
@@ -163,6 +166,7 @@ float* computeQ(float* HR, int size, int tilesize, float* tau, int tauNum) {
   free(temp);
   return Q;
 }
+#endif
 
 float* getR(float* HR, int size) {
   float* R = (float*)malloc(sizeof(float) * size * size);
@@ -693,7 +697,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
     int data[3];
 //    ticks tic, tot_run = 0;
     #ifdef NO_LOADS
-    ticks tic_loads;
+    ticks tic_loads, tic;
     #endif
  /* Initialize the scheduler. */
     qsched_init( &s , 1 , qsched_flag_none );
@@ -730,10 +734,12 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
     if( cudaMalloc( &device_array , sizeof(float) * m * n * K * K ) != cudaSuccess )
         error("Failed to allocate the matrix on the device");
     #ifdef NO_LOADS
+    double itpms = 1000.0 / CPU_TPS;
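+    /* itpms converts raw ticks to milliseconds; CPU_TPS is the tick rate in
+     * ticks per second. */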
     tic = getticks();
     if( cudaMemcpy( A, device_array, sizeof(float) * m * n * K *K, cudaMemcpyHostToDevice ) != cudaSuccess)
         error("Failed to copy matrix to device");
     tic_loads = getticks() - tic;
+    printf("load %.3f \n", tic_loads * itpms );
     #endif
     if( cudaMemcpyToSymbol( GPU_matrix , &device_array , sizeof(float *) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
         error("Failed to copy matrix pointer to the device");
@@ -816,14 +822,16 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 
 
         qsched_run_CUDA( &s , func );
-    qsched_print_cuda_timers(&s);
     #ifdef NO_LOADS
     tic = getticks();
     if( cudaMemcpy( device_array, A, sizeof(float) * m * n * K *K, cudaMemcpyDeviceToHost ) != cudaSuccess)
         error("Failed to copy matrix from device");
-    tic_loads += getticks() - tic;double itpms = 1000.0 / CPU_TPS;
-    printf("%.3f\n", ((double)(tic_loads)) * itpms );
+    tic_loads = getticks() - tic;
+    printf("unload %.3f\n", ((double)(tic_loads)) * itpms );
     #endif
+
+    qsched_print_cuda_timers(&s);
+
     if(cudaMemcpy(  tau , tau_device , sizeof(float) * m * n * K , cudaMemcpyDeviceToHost ) != cudaSuccess )
         error("Failed to copy the tau data from the device.");
 
diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu
index 71d7dbebe725866fdcecbfbc40cdb7a8a1f6665b..3d4e5f3bd10d50f10df3ff9cecee6c2055525e58 100644
--- a/src/cuda_queue.cu
+++ b/src/cuda_queue.cu
@@ -20,11 +20,11 @@
 #include "../config.h"
 
 /* Standard includes. */
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <gperftools/profiler.h>
-#include <math.h>
+
 
 extern "C"{
 #include "quicksched.h"