Commit 342961e3 authored by aidan

Added test_bh_4, which contains the final version of the Barnes-Hut code. Everything seems to work correctly.

parent 1614dd27
/*******************************************************************************
* This file is part of QuickSched.
* Copyright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
* Aidan Chalk (aidan.chalk@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* *****************************************************************************/
//#define EXACT
//#define ID
/* Config parameters. */
#include "../config.h"
/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <omp.h>
#include <fenv.h>
/* Local includes. */
extern "C"{
#include "quicksched.h"
#include "res.h"
}
#include "cuda_queue.h"
//#define double float
//#define double2 float2
//#define float double
//#define float2 double2
/** Task types. */
enum task_type {
task_type_self = 0,
task_type_pair,
task_type_pair_pc,
task_type_pair_pc_split,
task_type_com,
task_type_count
};
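/* A cell in the flat tree: loc_xy/loc_z is the lower corner, h the edge
 * length, parts the index of its first particle and count the number of
 * particles. firstchild/sibling are indices into the cell pool (-1 when
 * unset), res is the QuickSched resource and com_tid the COM task id. */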
struct cell{
double2 loc_xy;
double loc_z;
double h;
int count;
unsigned short int split, sorted;
int parts, firstchild, sibling;
int res, com_tid;
};
struct part{
#ifdef ID
int id;
#endif
double loc[3];
float a[3];
float m;
};
#define const_G 1
/* Required variables to obtain cells. */
#define cell_maxparts 128
#define CELL_STRETCH 2
#define INITIAL_CELLS 256
struct cell *cell_pool = NULL;
int used_cells=0;
int num_cells = 0;
int cell_size = INITIAL_CELLS*sizeof(struct cell);
/* Device locations for the particle values. */
__device__ double2 *com_xy;
__device__ double *com_z;
__device__ float *com_mass;
__device__ struct part *parts_cuda;
__device__ struct cell *cells;
__device__ int pc_calcs = 0;
/* Host locations for the particle values. */
double2 *com_xy_host;
double *com_z_host;
float *com_mass_host;
double2 *parts_pos_xy_temp;
double *parts_pos_z_temp;
float4 *parts_a_m_temp;
struct part *parts_host;
struct part *parts_temp;
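/* Double-precision atomicAdd implemented with a compare-and-swap loop;
 * native double atomicAdd only exists on devices of compute capability
 * 6.0 and above. */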
__device__ double atomicAdd(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
/* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
} while (assumed != old);
return __longlong_as_double(old);
}
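/**
 * @brief Direct particle-particle interactions between two distinct cells.
 *
 * Each thread strides over the particles of one cell, accumulates the
 * force from every particle of the other cell in registers, and flushes
 * the result with one atomicAdd per component; the second loop repeats
 * this with the roles of ci and cj swapped.
 */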
__device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) {
int i, j, k;
int count_i = ci->count, count_j = cj->count;
int parts_i = ci->parts, parts_j = cj->parts;
double xi[3];
float dx[3];
float ai[3], mj, r2, w, ir;
/* Loop over cell i.*/
for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
xi[0] = parts_cuda[i].loc[0];
xi[1] = parts_cuda[i].loc[1];
xi[2] = parts_cuda[i].loc[2];
for(k = 0; k < 3; k++) {
ai[k] = 0.0f;
}
for(j = parts_j; j < parts_j + count_j; j++) {
r2 = 0.0f;
dx[0] = xi[0] - parts_cuda[j].loc[0];
dx[1] = xi[1] - parts_cuda[j].loc[1];
dx[2] = xi[2] - parts_cuda[j].loc[2];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_cuda[j].m;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
}
atomicAdd(&parts_cuda[i].a[0], ai[0]);
atomicAdd(&parts_cuda[i].a[1], ai[1]);
atomicAdd(&parts_cuda[i].a[2], ai[2]);
}
/* Second pass: loop over cell j, accumulating forces from cell i. */
for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
xi[0] = parts_cuda[i].loc[0];
xi[1] = parts_cuda[i].loc[1];
xi[2] = parts_cuda[i].loc[2];
for(k = 0; k < 3; k++) {
ai[k] = 0.0f;
}
for(j = parts_i; j < parts_i + count_i; j++) {
r2 = 0.0f;
dx[0] = xi[0] - parts_cuda[j].loc[0];
dx[1] = xi[1] - parts_cuda[j].loc[1];
dx[2] = xi[2] - parts_cuda[j].loc[2];
r2 += dx[0] * dx[0];
r2 += dx[1] * dx[1];
r2 += dx[2] * dx[2];
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_cuda[j].m;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
}
atomicAdd(&parts_cuda[i].a[0], ai[0]);
atomicAdd(&parts_cuda[i].a[1], ai[1]);
atomicAdd(&parts_cuda[i].a[2], ai[2]);
}
}
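/**
 * @brief Particle-monopole interaction: apply the centre of mass of cell
 * cj to every particle of the leaf, one particle per thread. The
 * pc_calcs counter is only used for statistics.
 */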
__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
int i;
int count = leaf->count;
int parts = leaf->parts;
int cell_j = cj - cells;
float r2;
float dx[3], ir, w;
if(threadIdx.x == 0)
atomicAdd(&pc_calcs, 1);
/* int leaf_num = leaf - cells;
int cjnum = cj - cells;
if(threadIdx.x == 0 && leaf_num == 55)
printf("Calculating with %d %i whose firstchild is %i and cj->split == %i\n", (cjnum), cj->firstchild, cj->split);*/
/* if(threadIdx.x == 0)
printf("leaf = %i\n", leaf - cells);*/
for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
r2 = 0.0;
dx[0] = com_xy[cell_j].x - parts_cuda[i].loc[0];
r2 += dx[0] * dx[0];
dx[1] = com_xy[cell_j].y - parts_cuda[i].loc[1];
r2 += dx[1] * dx[1];
dx[2] = com_z[cell_j] - parts_cuda[i].loc[2];
r2 += dx[2] * dx[2];
ir = rsqrtf(r2);
w = com_mass[cell_j] * const_G * ir * ir * ir;
/* if(threadIdx.x == 0)
printf("part.a = %e w*dx = %e \n", parts_cuda[i].a[0], w*dx[0]);*/
atomicAdd(&parts_cuda[i].a[0], w*dx[0]);
atomicAdd(&parts_cuda[i].a[1], w*dx[1]);
atomicAdd(&parts_cuda[i].a[2], w*dx[2]);
/* (*accels).x+= w*dx[0];
(*accels).y+= w*dx[1];
(*accels).z+= w*dx[2];*/
}
}
/**
* @brief Checks whether the cells are direct neighbours or not
*/
__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) {
float dx[3];
float cih = ci->h, cjh = cj->h;
float min_dist = 0.5*(cih+cjh);
float center_i = ci->loc_xy.x + 0.5*cih;
float center_j = cj->loc_xy.x + 0.5*cjh;
dx[0] = fabsf(center_i - center_j);
center_i = ci->loc_xy.y + 0.5*cih;
center_j = cj->loc_xy.y + 0.5*cjh;
dx[1] = fabsf(center_i - center_j);
center_i = ci->loc_z + 0.5*cih;
center_j = cj->loc_z + 0.5*cjh;
dx[2] = fabsf(center_i - center_j);
return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
/*__device__ __forceinline__ int are_neighbours_different_size(double cih, float ci_x, float ci_y, float ci_z, struct cell *cj) {
float dx_x, dx_y, dx_z;
double cjh = cj->h;
float min_dist = 0.5*(cih+cjh);
float center_i = ci_x + 0.5*cih;
float center_j = cj->loc_xy.x + 0.5*cjh;
dx_x = fabsf(center_i - center_j);
center_i = ci_y + 0.5*cih;
center_j = cj->loc_xy.y + 0.5*cjh;
dx_y = fabsf(center_i - center_j);
center_i = ci_z + 0.5*cih;
center_j = cj->loc_z + 0.5*cjh;
dx_z = fabsf(center_i - center_j);
return (dx_x <= min_dist) && (dx_y <= min_dist) && (dx_z <= min_dist);
}*/
__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) {
float dx[3];
float min_dist = ci->h;
float center_i = ci->loc_xy.x;
float center_j = cj->loc_xy.x;
dx[0] = fabsf(center_i - center_j);
center_i = ci->loc_xy.y;
center_j = cj->loc_xy.y;
dx[1] = fabsf(center_i - center_j);
center_i = ci->loc_z;
center_j = cj->loc_z;
dx[2] = fabsf(center_i - center_j);
return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
/*__device__ __forceinline__ int are_neighbours(float cih, float ci_x, float ci_y, float ci_z, struct cell *cj) {
float dx_x, dx_y, dx_z;
float center_j = cj->loc_xy.x;
dx_x = fabsf(ci_x - center_j);
center_j = cj->loc_xy.y;
dx_y = fabsf(ci_y - center_j);
center_j = cj->loc_z;
dx_z = fabsf(ci_z - center_j);
return (dx_x <= cih) && (dx_y <= cih) && (dx_z <= cih);
}*/
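/* A leaf lies inside c iff its particle range is contained in c's. */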
__device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count);
}
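/**
 * @brief Interact the particles of a cell with a list of multipoles,
 * accumulating all monopole contributions in registers before a single
 * atomicAdd per component.
 *
 * @param child The #cell whose particles are updated.
 * @param mpoles Indices of the cells acting as monopoles.
 * @param count2 Number of entries in mpoles.
 */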
__device__ __forceinline__ void make_interact_pc_new(struct cell *child, int *mpoles, int count2)
{
int i,j;
int count = child->count;
int parts = child->parts;
// int cell_j = cj - cells;
float r2;
float dx[3], ir, w;
float a[3];
for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
a[0] = 0.0;
a[1] = 0.0;
a[2] = 0.0;
for(j = 0; j < count2; j++){
r2 = 0.0;
dx[0] = com_xy[mpoles[j]].x - parts_cuda[i].loc[0];
r2 += dx[0] * dx[0];
dx[1] = com_xy[mpoles[j]].y - parts_cuda[i].loc[1];
r2 += dx[1] * dx[1];
dx[2] = com_z[mpoles[j]] - parts_cuda[i].loc[2];
r2 += dx[2] * dx[2];
ir = rsqrtf(r2);
w = com_mass[mpoles[j]] * const_G * ir * ir * ir;
a[0] += w*dx[0];
a[1] += w*dx[1];
a[2] += w*dx[2];
}
/* if(threadIdx.x == 0)
printf("part.a = %e w*dx = %e \n", parts_cuda[i].a[0], w*dx[0]);*/
atomicAdd(&parts_cuda[i].a[0], a[0]);
atomicAdd(&parts_cuda[i].a[1], a[1]);
atomicAdd(&parts_cuda[i].a[2], a[2]);
/* (*accels).x+= w*dx[0];
(*accels).y+= w*dx[1];
(*accels).z+= w*dx[2];*/
}
}
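/**
 * @brief For each child of ci, collect the children of cj that are not
 * direct neighbours (at most 8) and interact them as monopoles.
 */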
__device__ __forceinline__ void iact_pair_pc(struct cell *ci, struct cell *cj)
{
struct cell *cp, *cps;
int icells[8];
int count = 0;
/* Let's split both cells and build all possible non-neighbouring pairs */
for (cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
count = 0;
for (cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
if ( ! are_neighbours(cp, cps) ) {
icells[count] = cps - cells;
count++;
}
}
make_interact_pc_new(cp, icells, count);
}
}
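/**
 * @brief As iact_pair_pc, but for a single child of the first cell; the
 * split over the children is done on the host when the task is created.
 */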
__device__ __forceinline__ void iact_pair_pc_split(struct cell *child, struct cell *cj)
{
struct cell *cps;
int icells[8];
int count = 0;
for (cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
if(!are_neighbours(child, cps) ){
icells[count] = cps - cells;
count++;
}
}
make_interact_pc_new(child, icells, count);
}
#ifdef OLD
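/* Legacy single-leaf tree walk, kept for reference; note it still calls
 * the old three-argument make_interact_pc and will not compile if OLD is
 * defined. */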
__device__ __forceinline__ void iact_self_pc(struct cell *c, struct cell *leaf)
{
struct cell *ci, *cj, *cp, *cps;
float leafh = leaf->h;
float3 accelerations;
int i;
if(threadIdx.x < leaf->count)
{
accelerations.x = 0.0f;
accelerations.y = 0.0f;
accelerations.z = 0.0f;
while(c != leaf)
{
ci = NULL;
/* Loop over children of c. */
for(cj = &cells[c->firstchild]; cj != &cells[c->sibling]; cj = &cells[cj->sibling])
{
if(ci == NULL && is_inside(leaf, cj))
{
ci = cj;
}else{
//Depth first search of cj.
cp = cj;
while(cp != &cells[cj->sibling])
{
if(!are_neighbours(cp, leaf))
{
// if(!cp->split)
make_interact_pc(leaf, cp, &accelerations);
// else
// for(cps = &cells[cp->firstchild]; cps != &cells[cp->sibling]; cps = &cells[cps->sibling])
// make_interact_pc(leaf, cps, &accelerations);
cp = &cells[cp->sibling];
}else{
if(cp->split && cp->h > leafh)
cp = &cells[cp->firstchild];
else
cp = &cells[cp->sibling];
}
}
}
}
c = ci;
}
int parts = leaf->parts;
int count = leaf->count;
for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
atomicAdd( &parts_cuda[i].a[0] , accelerations.x);
atomicAdd( &parts_cuda[i].a[1] , accelerations.y);
atomicAdd( &parts_cuda[i].a[2] , accelerations.z);
}
}
}
#endif
/**
* @brief Checks whether the cells are direct neighbours or not. Both cells have
* to be of the same size
*/
static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
// int k;
float dx[3];
#ifdef SANITY_CHECKS
if (ci->h != cj->h)
error(" Cells of different size in distance calculation.");
#endif
/* Maximum allowed distance */
float min_dist = ci->h;
/* (Manhattan) Distance between the cells */
double2 loc1=ci->loc_xy, loc2=cj->loc_xy;
float center_i = loc1.x;
float center_j = loc2.x;
dx[0] = fabs(center_i - center_j);
center_i = loc1.y;
center_j = loc2.y;
dx[1] = fabs(center_i - center_j);
center_i = ci->loc_z;
center_j = cj->loc_z;
dx[2] = fabs(center_i - center_j);
return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
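/**
 * @brief Fetch a new cell from the global pool, growing the pool and the
 * matching centre-of-mass arrays by CELL_STRETCH when it is exhausted.
 */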
struct cell *cell_get()
{
if(num_cells == 0)
{
cell_pool = (struct cell*) calloc(INITIAL_CELLS, sizeof(struct cell));
if(cell_pool == NULL)
error("Failed to allocate cell_pool");
com_xy_host = (double2*) calloc(INITIAL_CELLS, sizeof(double2));
if(com_xy_host == NULL)
error("Failed to allocate cell_pool");
com_z_host = (double*) calloc(INITIAL_CELLS, sizeof(double));
if(com_z_host == NULL)
error("Failed to allocate cell_pool");
com_mass_host = (float*) calloc(INITIAL_CELLS, sizeof(float));
if(com_mass_host == NULL)
error("Failed to allocate cell_pool");
num_cells = INITIAL_CELLS;
}
if(used_cells >= num_cells)
{
/* Stretch */
struct cell *new_pool;
cell_size *= CELL_STRETCH;
new_pool = (struct cell*) calloc(num_cells*CELL_STRETCH, sizeof(struct cell));
if(new_pool == NULL)
error("Failed to allocate new_pool");
if(cell_pool != NULL)
memcpy(new_pool, cell_pool, num_cells*sizeof(struct cell));
double2 *tempxy = (double2*) calloc(num_cells*CELL_STRETCH, sizeof(double2));
if(tempxy == NULL)
error("Failed to allocate tempxy");
memcpy(tempxy, com_xy_host, sizeof(double2)*num_cells);
free(com_xy_host);
com_xy_host = tempxy;
double *tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double));
if(tempz == NULL)
error("Failed to allocate tempz");
memcpy(tempz, com_z_host, num_cells*sizeof(double));
free(com_z_host);
com_z_host = tempz;
float *tempm = (float*) calloc(num_cells*CELL_STRETCH, sizeof(float));
if(tempm == NULL)
error("Failed to allocate tempm");
memcpy(tempm, com_mass_host, num_cells*sizeof(float));
free(com_mass_host);
com_mass_host = tempm;
num_cells *= CELL_STRETCH;
free(cell_pool);
cell_pool = new_pool;
message("Increased size of arrays");
}
used_cells++;
cell_pool[used_cells-1].sibling = -1;
cell_pool[used_cells-1].firstchild = -1;
cell_pool[used_cells-1].res = qsched_res_none;
return &cell_pool[used_cells-1];
}
void comp_com(struct cell *c){
int k, count = c->count;
int cpi;
struct cell *cp;
int parts = c->parts;
double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
if(c->split) {
for(cp = &cell_pool[(cpi = c->firstchild)]; cpi != c->sibling; cp = &cell_pool[(cpi = cp->sibling)]) {
float cp_mass = com_mass_host[cpi];
com[0] += com_xy_host[cpi].x * cp_mass;
com[1] += com_xy_host[cpi].y * cp_mass;
com[2] += com_z_host[cpi] * cp_mass;
mass += cp_mass;
}
/* Otherwise collect the multipole from the particles */
} else {
for(k = parts; k < parts+count; k++)
{
float p_mass = parts_host[k].m;
com[0] += parts_host[k].loc[0] * p_mass;
com[1] += parts_host[k].loc[1] * p_mass;
com[2] += parts_host[k].loc[2] * p_mass;
mass += p_mass;
}
}
k = c - cell_pool;
/* Store the COM data, if it was collected. */
if(mass > 0.0f) {
float imass = 1.0f/mass;
com_xy_host[k].x = com[0] * imass;
com_xy_host[k].y = com[1] * imass;
com_z_host[k] = com[2] * imass;
com_mass_host[k] = mass;
}else
{
com_xy_host[k].x = 0.0;
com_xy_host[k].y = 0.0;
com_z_host[k] = 0.0;
com_mass_host[k] = 0.0f;
}
}
/**
* @brief Sort the parts into eight bins along the given pivots and
* fill the multipoles. Also adds the hierarchical resources
* to the sched (TODO).
*
* @param c Index in cell_pool of the cell to be split.
* @param s The #qsched in which to store the resources.
*/
void cell_split(int c, struct qsched *s) {
int i, j, k, kk, count = cell_pool[c].count;
int parts = cell_pool[c].parts;
struct part temp_part;
struct cell *cp, *cell;
int left[8], right[8];
double pivot[3];
static int root = -1;
int progenitors[8];
struct part *temp_xy;
/* Set the root cell. */
if (root < 0) {
root = c;
cell_pool[c].sibling = -1;
}
if(cell_pool[c].res == qsched_res_none)
{
if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[c].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
sizeof(struct part) * cell_pool[c].count, parts_temp + cell_pool[c].parts);
}
if(count > cell_maxparts )
{
cell_pool[c].split = 1;
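/* Create the eight progeny: bits 2, 1 and 0 of k select the upper half
 * of the parent along x, y and z respectively. cell is re-fetched each
 * iteration since cell_get() may reallocate the pool. */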
for(k = 0; k < 8; k++)
{
progenitors[k] = (cp = cell_get()) - cell_pool;
cell = &cell_pool[c];
cp->loc_xy = cell->loc_xy;
cp->loc_z = cell->loc_z;
cp->h = cell->h*0.5;
if(k & 4) cp->loc_xy.x += cp->h;
if(k & 2) cp->loc_xy.y += cp->h;
if(k & 1) cp->loc_z += cp->h;
}
/* Init the pivots.*/
pivot[0] = cell->loc_xy.x + cell->h * 0.5;
pivot[1] = cell->loc_xy.y + cell->h * 0.5;
pivot[2] = cell->loc_z + cell->h * 0.5;
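/* The particle range is now partitioned in place, Hoare-style, along x,
 * then y, then z, yielding the eight octant sub-ranges in left/right. */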
/* Split along the x axis. */
i = parts;
j = parts+count-1;
while(i < j)
{
while(i <= parts+count-1 && parts_host[i].loc[0] < pivot[0]) i += 1;
while(j >= parts && parts_host[j].loc[0] >= pivot[0]) j -= 1;
if(i < j){
temp_part = parts_host[i];
parts_host[i] = parts_host[j];
parts_host[j] = temp_part;
}
}
left[1] = i;
right[1] = parts+count-1;
left[0] = parts;
right[0] = j;
/* Split along the y axis twice. */
for (k = 1; k >= 0; k--) {
i = left[k];
j = right[k];
while(i <= j){
while(i <= right[k] && parts_host[i].loc[1] < pivot[1]) i += 1;
while(j >= left[k] && parts_host[j].loc[1] >= pivot[1]) j -= 1;
if(i < j)
{
temp_part = parts_host[i];
parts_host[i] = parts_host[j];
parts_host[j] = temp_part;
}
}
left[2*k+1] = i;
right[2*k+1] = right[k];
left[2*k] = left[k];
right[2*k] = j;
}
/* Split along the z axis four times.*/
for(k = 3; k >=0; k--)
{
i = left[k];
j = right[k];
while(i <= j){
while(i <= right[k] && parts_host[i].loc[2] < pivot[2]) i += 1;
while(j >= left[k] && parts_host[j].loc[2] >= pivot[2]) j -= 1;
if(i < j)
{
temp_part = parts_host[i];
parts_host[i] = parts_host[j];
parts_host[j] = temp_part;
}
}
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
}
/* Store the counts and offsets. */
for(k = 0; k < 8; k++)
{
cell_pool[progenitors[k]].count = right[k]-left[k]+1;
cell_pool[progenitors[k]].parts = left[k];
if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy,
sizeof(struct part) * cell_pool[progenitors[k]].count, parts_temp + cell_pool[progenitors[k]].parts);
}
/* Find the first non-empty progenitor */
for(k = 0; k < 8; k++)
{
if(cell_pool[progenitors[k]].count > 0)
{
cell->firstchild = progenitors[k];
break;
}
}
#ifdef SANITY_CHECKS
if(cell->firstchild == -1)
error("Cell has been split but all children have 0 parts");
#endif
/*Prepare the pointers*/
for(k = 0; k < 8; k++)
{
/* Find the next non-empty sibling */
for(kk = k+1; kk < 8; ++kk){
if(cell_pool[progenitors[kk]].count > 0){
cell_pool[progenitors[k]].sibling = progenitors[kk];
break;
}
}
/* No non-empty sibling, go back a level.*/
if(kk == 8) cell_pool[progenitors[k]].sibling = cell->sibling;
}
/* Recurse */
for(k = 0; k < 8; k++)
if(cell_pool[progenitors[k]].count > 0) cell_split(progenitors[k], s);
/* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */
} /*else {
// struct cell *data[2] = {root, c};
int data[2] = {root, c};
int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
2 * sizeof(int), 3000);
qsched_addlock(s, tid, cell_pool[c].res);
}*/
#ifndef COM_AS_TASK
comp_com(&cell_pool[c]);
#endif
}
/**
* @brief Create the tasks for the cell pair/self.
*
* @param s The #sched in which to create the tasks.
* @param ci The first #cell.
* @param cj The second #cell.
*/
void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
qsched_task_t tid;
int data[2];
struct cell *cp, *cps;
#ifdef SANITY_CHECKS
/* If either cell is empty, stop. */
if (ci->count == 0 || (cj != NULL && cj->count == 0)) error("Empty cell !");
#endif
/* Single cell? */
if (cj == NULL) {
/* Is this cell split and above the task limit ? */
if (ci->split /*&& ci->count > task_limit / ci->count*/) {
/* Loop over each of this cell's progeny. */
for (cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]) {
/* Make self-interaction task. */
create_tasks(s, cp, NULL);
/* Make all pair-interaction tasks. */
for (cps = &cell_pool[cp->sibling]; cps != &cell_pool[ci->sibling]; cps = &cell_pool[cps->sibling])
create_tasks(s, cp, cps);
}
/* Otherwise, add a self-interaction task. */
} else {
/* Set the data. */
data[0] = ci -cell_pool;
data[1] = -1;
/* Create the task. */
tid = qsched_addtask(s, task_type_self, task_flag_none, data,
sizeof(int) * 2, ci->count * ci->count / 2);
/* Add the resource (i.e. the cell) to the new task. */
qsched_addlock(s, tid, ci->res);
/* Since this call might recurse, add a dependency on the cell's COM
task. */
#ifdef COM_AS_TASK
if (ci->split) qsched_addunlock(s, ci->com_tid, tid);
#endif
}
/* Otherwise, it's a pair. */
} else {
/* Are both cells split and we are above the task limit ? */
if (ci->split && cj->split ) {
/* Let's split both cells and build all possible pairs */
for (cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]) {
for (cps = &cell_pool[cj->firstchild]; cps != &cell_pool[cj->sibling]; cps = &cell_pool[cps->sibling]) {
/* Recurse */
if (are_neighbours_host(cp, cps)) {
create_tasks(s, cp, cps);
}
}
}
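/* The per-child particle-monopole path below is currently disabled
 * (note the 0 &&); all non-neighbouring pairs go through the plain
 * task_type_pair_pc tasks in the else branch. */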
if( 0 && ci->count > 64*cell_maxparts)
{
/* Let's also build a particle-monopole task */
for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]){
/* Create the task. */
data[0] = cp-cell_pool;
data[1] = cj-cell_pool;
tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data,
sizeof(int) * 2, ci->count + cj->count);
/* Add the resource and dependance */
qsched_addlock(s, tid, cp->res);
// qsched_addlock(s, tid, cj->res);
}
for(cp = &cell_pool[cj->firstchild]; cp != &cell_pool[cj->sibling]; cp = &cell_pool[cp->sibling]){
/* Create the task. */
data[0] = cp-cell_pool;
data[1] = ci-cell_pool;
tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data,
sizeof(int) * 2, ci->count + cj->count);
/* Add the resource and dependance */
qsched_addlock(s, tid, cp->res);
// qsched_addlock(s, tid, cj->res);
}
}else
{
data[0] = ci -cell_pool;
data[1] = cj - cell_pool;
tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count + cj->count);
/* Add the resource and dependance */
qsched_addlock(s, tid, ci->res);
data[0] = cj - cell_pool;
data[1] = ci - cell_pool;
tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count + cj->count);
qsched_addlock(s, tid, cj->res);
}
#ifdef COM_AS_TASK
qsched_addunlock(s, cj->com_tid, tid);
qsched_addunlock(s, ci->com_tid, tid);
#endif
} else { /* Otherwise, at least one of the cells is not split, build a direct
* interaction. */
/* Set the data. */
data[0] = ci-cell_pool;
data[1] = cj-cell_pool;
/* Create the task. */
tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
sizeof(int) * 2, ci->count * cj->count);
/* Add the resources. */
qsched_addlock(s, tid, ci->res);
qsched_addlock(s, tid, cj->res);
}
} /* Otherwise it's a pair */
}
/**
* @brief Compute the interactions between all particles in a cell.
*
* @param cellID The cell ID to compute interactions on.
*/
__device__ __forceinline__ void iact_self_direct(int cellID) {
struct cell *c = &cells[cellID];
double xi[3] = {0.0, 0.0, 0.0};
float ai[3] = {0.0, 0.0, 0.0 }, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
int parts;
int count;
int i,j,k;
parts = c->parts;
count = c->count;
for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
{
xi[0] = parts_cuda[i].loc[0];
xi[1] = parts_cuda[i].loc[1];
xi[2] = parts_cuda[i].loc[2];
for(k = 0; k < 3; k++) {
ai[k] = 0.0;
}
for(j = parts; j < parts+count; j++)
{
if(i != j){
/* Compute the pairwise distance. */
r2 = 0.0;
dx[0] = xi[0] - parts_cuda[j].loc[0];
r2 += dx[0]*dx[0];
dx[1] = xi[1] - parts_cuda[j].loc[1];
r2 += dx[1]*dx[1];
dx[2] = xi[2] - parts_cuda[j].loc[2];
r2 += dx[2]*dx[2];
/* Apply the gravitational acceleration. */
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_cuda[j].m;
for(k = 0; k < 3; k++) {
ai[k] -= w * dx[k] * mj;
}
}
}
//Update.
atomicAdd(&parts_cuda[i].a[0],ai[0] );
atomicAdd(&parts_cuda[i].a[1],ai[1] );
atomicAdd(&parts_cuda[i].a[2],ai[2] );
}
}
#ifdef EXACT
__global__ void interact_exact(int count){
int number = blockIdx.x * blockDim.x + threadIdx.x;
int i, j, k;
double dx[3], ir, r2, w;
if(number < count)
{
double pix[3] = {parts_cuda[number].loc[0], parts_cuda[number].loc[1], parts_cuda[number].loc[2]};
float mi = parts_cuda[number].m;
for(j = 0; j < count; j++)
{
if(j != number)
{
for (r2 = 0.0, k = 0; k < 3; k++) {
dx[k] = parts_cuda[j].loc[k] - pix[k];
r2 += dx[k] * dx[k];
}
ir = rsqrt(r2);
w = const_G * ir * ir * ir;
for(k = 0; k < 3; k++)
{
atomicAdd(&parts_cuda[number].a[k] , w * dx[k] * parts_cuda[j].m);
}
}
}
}
}
#endif
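/**
 * @brief Device-side task dispatch: decode the two cell indices stored in
 * the task payload and call the matching interaction function.
 */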
__device__ __forceinline__ void runner( int type , void *data ) {
int *idata = (int *)data;
int i = idata[0];
int j = idata[1];
switch ( type ) {
case task_type_self:
iact_self_direct(i);
break;
case task_type_pair:
iact_pair_direct(&cells[i], &cells[j]);
break;
case task_type_pair_pc:
iact_pair_pc( &cells[i], &cells[j] );
break;
case task_type_pair_pc_split:
iact_pair_pc_split(&cells[i], &cells[j]);
break;
default:
printf("Got to default?\n");
asm("trap;");
}
__syncthreads();
}
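/* Device-resident pointer to the runner; copied back to the host in
 * test_bh() and passed to qsched_run_CUDA(). */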
__device__ qsched_funtype function = runner;
int calcMaxDepth(struct cell *c, int depth)
{
struct cell *cp;
if(c->split)
{
int maxd = 0, temp;
for(cp = &cell_pool[c->firstchild]; cp != &cell_pool[c->sibling]; cp = &cell_pool[cp->sibling])
{
temp = calcMaxDepth(cp, depth+1);
if(temp > maxd)
maxd = temp;
}
return maxd;
}else{
return depth+1;
}
}
qsched_funtype func;
/**
* @brief Set up and run a task-based Barnes-Hut N-body solver.
*
* @param N The number of random particles to use.
* @param runs Number of force evaluations to use as a benchmark.
* @param fileName Input file name. If @c NULL or an empty string, random
* particle positions will be used.
*/
void test_bh(int N, int runs, char *fileName) {
int i, k;
struct cell *root;
FILE *file;
struct qsched s;
struct cell *gpu_ptr_cells;
double2 *com_temp;
double *comz_temp;
float *comm_temp;
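/* Force CUDA context creation and prefer L1 cache over shared memory. */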
cudaFree(0);
cudaThreadSetCacheConfig(cudaFuncCachePreferL1);
if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
error("Failed to copy function pointer from device");
/* Initialize the scheduler. */
qsched_init(&s, 1, qsched_flag_none);
//Create host particle arrays.
if( cudaMallocHost(&parts_host, sizeof(struct part) * N) != cudaSuccess)
error("Failed to allocate parts array");
if( cudaMalloc(&parts_temp, sizeof(struct part) * N) != cudaSuccess)
error("Failed to allocate device parts array");
if( cudaMemcpyToSymbol(parts_cuda, &parts_temp, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to set device symbol for parts array");
if(fileName == NULL || fileName[0] == 0) {
for(k = 0; k < N; k++) {
parts_host[k].loc[0] = ((double)rand())/ RAND_MAX;
parts_host[k].loc[1] = ((double)rand())/ RAND_MAX;
parts_host[k].loc[2] = ((double)rand())/ RAND_MAX;
parts_host[k].m = ((double)rand()) / RAND_MAX;
}
} else {
file = fopen(fileName, "r");
#ifndef ID
int tempxy;
#endif
double temp;
printf("Reading input from file\n");
if(file) {
for(k = 0; k < N; k++) {
#ifdef ID
if(fscanf(file, "%d", &parts_host[k].id) != 1)
#else
if(fscanf(file, "%d", &tempxy) != 1)
#endif
error("Failed to read ID");
if(fscanf(file, "%f", &parts_host[k].m) != 1)
error("Failed to read mass of part %i.", k);
if(fscanf(file, "%lf%lf%lf", &parts_host[k].loc[0], &parts_host[k].loc[1], &parts_host[k].loc[2]) != 3)
error("Failed to read position of part %i.", k);
if(fscanf(file,"%lf %lf %lf %lf %lf %lf %lf %lf %lf", &temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp,&temp ) != 9)
error("Failed to read extra stuff");
}
fclose(file);
} else {
error("Failed to open input file '%s'.", fileName);
}
}
/* Init the cells. */
root = cell_get();
root->loc_xy.x = 0.0;
root->loc_xy.y = 0.0;
root->loc_z = 0.0;
root->h = 1.0;
root->count = N;
root->parts = 0;
root->sibling = -1;
int c = root-cell_pool;
cell_split(root - cell_pool, &s);
root = &cell_pool[c];
int nr_leaves = 0;
int maxparts=0, minparts=1000000;
int number = 0;
while(c >= 0) {
if(cell_pool[c].count > 0)
{
number++;
if(cell_pool[c].res == qsched_res_none)
message("cell %i has no res", c);
}
if(!cell_pool[c].split) {
nr_leaves++;
if(cell_pool[c].count > maxparts)
{
maxparts = cell_pool[c].count;
}
if(cell_pool[c].count < minparts)
{
minparts = cell_pool[c].count;
}
c = cell_pool[c].sibling;
} else {
c = cell_pool[c].firstchild;
}
}
message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
create_tasks(&s, root, NULL);
int self = 0, pair = 0, pc = 0;
for(k = 0; k < s.count; k++)
{
if(s.tasks[k].type == task_type_self)
self++;
else if (s.tasks[k].type == task_type_pair)
pair++;
else if (s.tasks[k].type >= 0)
pc++;
}
message("total number of tasks: %i.", s.count);
message("total number of pair tasks: %i.", pair);
message("total number of self tasks: %i.", self);
message("total number of pc tasks: %i.", pc);
message("total number of cells: %i.", number);
message("total number of deps: %i.", s.count_deps);
message("total number of res: %i.", s.count_res);
message("total number of locks: %i.", s.count_locks);
for(k = 0; k < runs; k++) {
for(i = 0; i < N; ++i) {
parts_host[i].a[0] = 0.0;
parts_host[i].a[1] = 0.0;
parts_host[i].a[2] = 0.0;
}
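/* NB: the device buffers allocated below are re-created on every run and
 * never freed, so the loop leaks device memory when runs > 1. */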
/* Copy the cells to the device. */
if( cudaMalloc( &gpu_ptr_cells , sizeof(struct cell) * used_cells) != cudaSuccess)
error("Failed to allocate cells on the GPU");
if( cudaMemcpy( gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy cells to the GPU");
if( cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell*), 0, cudaMemcpyHostToDevice) != cudaSuccess )
error("Failed to copy cell pointer to the GPU");
if(cudaMalloc( &com_temp, sizeof(double2) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
if(cudaMalloc( &comz_temp, sizeof(double) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
error("Failed to allocate com on the GPU");
if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
error("failed to copy com to the GPU");
if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
printf("Max depth is %i\n", calcMaxDepth(root, 0));
qsched_run_CUDA( &s , func );
qsched_print_cuda_timers(&s);
int pcs;
if( cudaMemcpyFromSymbol( &pcs, pc_calcs, sizeof(int), 0, cudaMemcpyDeviceToHost) != cudaSuccess)
error("Failed");
printf("pc calcs = %i\n", pcs);
/*struct task* tasks = qsched_get_timers( &s , s.count );
for(i = 0; i < s.count; i++)
{
printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
// printf("\n");
}*/
}
#ifdef EXACT
struct part *parts2;
parts2 = (struct part*) malloc(sizeof(struct part) * N );
for(k = 0; k < N; k++)
{
parts2[k].loc[0] = parts_host[k].loc[0];
parts2[k].loc[1] = parts_host[k].loc[1];
parts2[k].loc[2] = parts_host[k].loc[2];
parts2[k].m = parts_host[k].m;
parts2[k].a[0] = 0.0f;
parts2[k].a[1] = 0.0f;
parts2[k].a[2] = 0.0f;
}
// cudaDeviceReset();
struct part *parts_exact;
if(cudaMalloc(&parts_exact, sizeof(struct part) * N) != cudaSuccess)
error("Couldn't allocate parts_exact");
if(cudaMemcpy(parts_exact, parts2, sizeof(struct part) * N, cudaMemcpyHostToDevice) != cudaSuccess)
error("Couldn't copy part data");
if(cudaMemcpyToSymbol(parts_cuda, &parts_exact, sizeof(struct part*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy part data");
int numblocks = N / 128 + 1;
double itpms = 1000.0 / CPU_TPS;
ticks tic, toc_run ;
tic = getticks();
interact_exact<<<numblocks, 128>>>(N);
cudaDeviceSynchronize();
toc_run = getticks();
printf("Cuda kernel took %.3f ms to run exact solution\n", ((double)(toc_run - tic)) * itpms);
if( cudaMemcpy( parts2, parts_exact, sizeof(struct part) * N, cudaMemcpyDeviceToHost) != cudaSuccess)
error("Failed to copy data back from device");
printf("%e\n", parts_host[0].m);
k = 0;
printf("%e, %e, %e, %e, %e, %e, %e, %e, %e, %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
#endif
/* Dump the particles to a file */
file = fopen("particle_dump.dat", "w");
/* fprintf(file,
"# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
"a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
#ifdef EXACT
printf("Printing exact\n");
for(k = 0; k < N; ++k)
fprintf(file, "%i %e %e %e %e %e %e %e %e %e %e %e %e %e\n",
#ifdef ID
parts_host[k].id,
#else
k,
#endif
parts_host[k].m,parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts2[k].a[0], parts2[k].a[1], parts2[k].a[2], parts_host[k].a[0],
parts_host[k].a[1], parts_host[k].a[2]);
#else
for (k = 0; k < N; ++k)
fprintf(file, "%i %e %e %e %e %e %e %e\n",
k, parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
#endif
fclose(file);
file = fopen("particle_pos.dat", "w");
fprintf(file, "m x[1] x[2] x[3]\n");
for(k = 0; k < N; k++)
fprintf(file, "%e %e %e %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2]);
fclose(file);
cudaFreeHost(parts_host);
//free(parts_host);
}
int main(int argc, char *argv[]) {
int c, nr_threads = 0;
int N = 1000, runs = 1;
char fileName[100] = {0};
/* Parse the options */
while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) {
case 'n':
if (sscanf(optarg, "%d", &N) != 1)
error("Error parsing number of particles.");
break;
case 'r':
if (sscanf(optarg, "%d", &runs) != 1)
error("Error parsing number of runs.");
break;
case 't':
if (sscanf(optarg, "%d", &nr_threads) != 1)
error("Error parsing number of threads.");
break;
case 'f':
if (sscanf(optarg, "%s", &fileName[0]) != 1)
error("Error parsing file name.");
break;
case '?':
fprintf(stderr, "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file] "
"[-c Nparts] [-i Niterations] \n",
argv[0]);
fprintf(stderr, "Solves the N-body problem using a Barnes-Hut\n"
"tree code with N random particles read from a file in "
"[0,1]^3 using"
"nr_threads threads.\n"
"A test of the neighbouring cells interaction with "
"Nparts per cell is also run Niterations times.\n");
exit(EXIT_FAILURE);
}
/* Tree node information */
printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
/* Part information */
printf("Size of part: %zu bytes.\n", sizeof(struct part));
/* Dump arguments. */
if (fileName[0] == 0) {
message("Computing the N-body problem over %i random particles using %i "
"threads (%i runs).",
N, nr_threads, runs);
} else {
message("Computing the N-body problem over %i particles read from '%s' "
"using %i threads (%i runs).",
N, fileName, nr_threads, runs);
}
/* Run the BH test. */
test_bh(N, runs, fileName);
}