diff --git a/tests/test_pair.cu b/tests/test_pair.cu
deleted file mode 100644
index 5f1d1b565a14031aff038f1cda51e62a38e2f917..0000000000000000000000000000000000000000
--- a/tests/test_pair.cu
+++ /dev/null
@@ -1,1905 +0,0 @@
-/*******************************************************************************
- * This file is part of SWIFT.
- * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- ******************************************************************************/
-
-// TODO: after kernel launches, check for errors with
-//   gpuErrchk( cudaPeekAtLastError() );
-//   gpuErrchk( cudaDeviceSynchronize() );
-
-#ifndef static
-#define static
-#endif
-#ifndef restrict
-#define restrict __restrict__
-#endif
-//#define FULL_COMP
-extern "C" {
-#include "testcuda.h"
-/* Some standard headers. */
-#include <fenv.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-}
-
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-
-
-/* Host function to check cuda functions don't return errors */
-__host__ inline void cudaErrCheck(cudaError_t status) {
-  if (status != cudaSuccess) {
-    printf("%s\n", cudaGetErrorString(status));
-  }
-}
-
-/* Task function to execute a force task. Uses a naive N^2 algorithm without
- * symmetry. */
-/* To compute the forces between cell i and cell j this must be called twice,
- * once for each ordering of the pair. */
-__device__ void dopair_force(struct cell_cuda *ci, struct cell_cuda *cj) {
-
-  /* Are these cells active? */
-  if (!cuda_cell_is_active(ci) && !cuda_cell_is_active(cj)) return;
-
-  const int count_i = ci->part_count;
-  const int count_j = cj->part_count;
-  int part_i = ci->first_part;
-  int part_j = cj->first_part;
-
-  float3 a_hydro;
-  float h_dt, v_sig_stor, entropy_dt;
-
-  double shift[3] = {0.0, 0.0, 0.0};
-  /* Deal with periodicity concerns. */
-  for (int k = 0; k < 3; k++) {
-    if (cj->loc[k] - ci->loc[k] < -dim[k] / 2)
-      shift[k] = dim[k];
-    else if (cj->loc[k] - ci->loc[k] > dim[k] / 2)
-      shift[k] = -dim[k];
-  }
-
-  /* Loop over the parts in cell ci */
-  for (int pid = part_i + threadIdx.x; pid < part_i + count_i;
-       pid += blockDim.x) {
-
-    const float hi = cuda_parts.h[pid];
-    if (!cuda_part_is_active(pid)) continue;
-    /* Reset the values. */
-    a_hydro.x = 0.0f;
-    a_hydro.y = 0.0f;
-    a_hydro.z = 0.0f;
-    h_dt = 0.0f;
-    v_sig_stor = cuda_parts.v_sig[pid];
-    entropy_dt = 0.0f;
-
-    double pix[3];
-    pix[0] = cuda_parts.x_x[pid] - shift[0];
-    pix[1] = cuda_parts.x_y[pid] - shift[1];
-    pix[2] = cuda_parts.x_z[pid] - shift[2];
-
-    const float hig2 = hi * hi * kernel_gamma2;
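-    /* hig2 is the squared kernel support radius, (kernel_gamma * h_i)^2. */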
-
-    /* Loop over the particles in cj. */
-    for (int pjd = part_j; pjd < part_j + count_j; pjd++) {
-
-      /* Compute the pairwise distance. */
-      float r2 = 0.0f;
-      float dx[3];
-      dx[0] = pix[0] - cuda_parts.x_x[pjd];
-      r2 += dx[0] * dx[0];
-      dx[1] = pix[1] - cuda_parts.x_y[pjd];
-      r2 += dx[1] * dx[1];
-      dx[2] = pix[2] - cuda_parts.x_z[pjd];
-      r2 += dx[2] * dx[2];
-
-      const float hj = cuda_parts.h[pjd];
-      if (r2 < hig2 || r2 < hj * hj * kernel_gamma2) {
-        float wi, wj, wi_dx, wj_dx;
-        const float fac_mu = 1.f;
-
-        const float r = sqrtf(r2);
-        const float r_inv = 1.0f / r;
-
-        /* Load some data.*/
-        const float mj = cuda_parts.mass[pjd];
-        const float rhoi = cuda_parts.rho[pid];
-        const float rhoj = cuda_parts.rho[pjd];
-
-        /* Get the kernel for hi. */
-        const float hi_inv = 1.0f / hi;
-        const float hid_inv = cuda_pow_dimension_plus_one(hi_inv);
-        const float ui = r * hi_inv;
-        cuda_kernel_deval(ui, &wi, &wi_dx);
-        const float wi_dr = hid_inv * wi_dx;
-
-        /* Get the kernel for hj. */
-        const float hj_inv = 1.0f / hj;
-        const float hjd_inv = cuda_pow_dimension_plus_one(hj_inv);
-        const float xj = r * hj_inv;
-        cuda_kernel_deval(xj, &wj, &wj_dx);
-        const float wj_dr = hjd_inv * wj_dx;
-
-        /* Compute h-gradient terms */
-        const float f_i = cuda_parts.f[pid];
-        const float f_j = cuda_parts.f[pjd];
-
-        /* Compute pressure terms */
-        const float P_over_rho2_i = cuda_parts.P_over_rho2[pid];
-        const float P_over_rho2_j = cuda_parts.P_over_rho2[pjd];
-
-        /* Compute sound speeds*/
-        const float ci = cuda_parts.soundspeed[pid];
-        const float cj = cuda_parts.soundspeed[pjd];
-
-        /* Compute dv dot r. */
-        const float dvdr = (cuda_parts.v[pid].x - cuda_parts.v[pjd].x) * dx[0] +
-                           (cuda_parts.v[pid].y - cuda_parts.v[pjd].y) * dx[1] +
-                           (cuda_parts.v[pid].z - cuda_parts.v[pjd].z) * dx[2];
-
-        /* Balsara term */
-        const float balsara_i = cuda_parts.balsara[pid];
-        const float balsara_j = cuda_parts.balsara[pjd];
-
-        /* Are the particles moving towards each other? */
-        const float omega_ij = (dvdr < 0.f) ? dvdr : 0.f;
-        const float mu_ij = fac_mu * r_inv * omega_ij;
-
-        /* Signal velocity */
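-        /* v_sig = c_i + c_j - beta * mu_ij, with beta = 3 here. */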
-        const float v_sig = ci + cj - 3.f * mu_ij;
-
-        /* Now construct the full viscosity term */
-        const float rho_ij = 0.5f * (rhoi + rhoj);
-        const float visc = -0.25 * const_viscosity_alpha * v_sig * mu_ij *
-                           (balsara_i + balsara_j) / rho_ij;
-
-        /* Now convolve with the kernel */
-        const float visc_term = 0.5f * visc * (wi_dr + wj_dr) * r_inv;
-        const float sph_term =
-            (f_i * P_over_rho2_i * wi_dr + f_j * P_over_rho2_j * wj_dr) * r_inv;
-
-        /* Compute the acceleration */
-        const float acc = visc_term + sph_term;
-
-        /* Compute the force */
-        a_hydro.x -= mj * acc * dx[0];
-        a_hydro.y -= mj * acc * dx[1];
-        a_hydro.z -= mj * acc * dx[2];
-
-        /* Get the time derivative for h. */
-        h_dt -= mj * dvdr * r_inv / rhoj * wi_dr;
-
-        /* Update the signal velocity. */
-        v_sig_stor = (v_sig_stor > v_sig) ? v_sig_stor : v_sig;
-
-        /* Change in entropy */
-        entropy_dt += mj * visc_term * dvdr;
-        // INTERACT
-      }
-
-    }  // Loop over cell cj.
-
-    /* Flush to global stores.*/
-    atomicAdd(&cuda_parts.a_hydro[pid].x, a_hydro.x);
-    atomicAdd(&cuda_parts.a_hydro[pid].y, a_hydro.y);
-    atomicAdd(&cuda_parts.a_hydro[pid].z, a_hydro.z);
-    atomicAdd(&cuda_parts.h_dt[pid], h_dt);
-    atomicAdd(&cuda_parts.entropy_dt[pid], entropy_dt);
-
-    /* Update the signal velocity */
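-    /* CUDA has no atomicMax() for floats, so emulate one: reinterpret the
-     * float as an int (ordering is preserved for the positive signal
-     * velocities used here) and retry atomicCAS until the stored value is
-     * already >= v_sig_stor or the swap succeeds. */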
-    float global_vsig = cuda_parts.v_sig[pid];
-    int *address_as_int = (int *)&cuda_parts.v_sig[pid];
-    int old = *address_as_int;
-    int assumed;
-    do {
-      global_vsig = cuda_parts.v_sig[pid];
-      assumed = old;
-      if (v_sig_stor > global_vsig)
-        old = atomicCAS(address_as_int, assumed, __float_as_int(v_sig_stor));
-    } while (assumed != old && v_sig_stor > global_vsig);
-
-  }  // Loop over cell ci.
-}
-
-/**
- * @brief Compute the density interactions between a sorted cell pair
- * (non-symmetric).
- *
- * @param ci The first #cell.
- * @param cj The second #cell.
- * @param sid The direction of the pair.
- */
-__device__ void dopair_density_sorted(struct cell_cuda *ci, struct cell_cuda *cj,
-				      const int sid) {
-
-  /* Pick-out the sorted lists. */
-  const long long int f_i = ci->first_part;
-  const long long int f_j = cj->first_part;
-  const struct entry *sort_i = &cuda_parts.sort[sid][f_i];
-  const struct entry *sort_j = &cuda_parts.sort[sid][f_j];
-
-  /* Get some other useful values. */
-  const double hi_max = ci->h_max * kernel_gamma;
-  const double hj_max = cj->h_max * kernel_gamma;
-  const int count_i = ci->part_count;
-  const int count_j = cj->part_count;
-  const double dj_min = sort_j[0].d;
-  const double di_max = sort_i[count_i - 1].d;
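-
-  /* dj_min and di_max bound the sorted positions along the pair axis sid: a
-   * particle in ci can only reach cj if its position plus kernel_gamma * h
-   * crosses dj_min, and vice versa, so both loops below terminate early. */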
-  
-  float rho, rho_dh, div_v, wcount, wcount_dh;
-  float3 rot_v;
-
-  if (cuda_cell_is_active(ci)) {
-
-    /* Loop over the parts in ci. */
-    for (int pid = count_i - 1 - threadIdx.x;
-         pid >= 0 && sort_i[pid].d + hi_max > dj_min; pid-= blockDim.x) {
-      /* Get a hold of the ith part in ci. */
-      const int pi = sort_i[pid].i + f_i;
-      if (!cuda_part_is_active(pi)) continue;
-      const float hi = cuda_parts.h[pi];
-      const double di = sort_i[pid].d + hi * kernel_gamma;
-      if (di < dj_min) continue;
-      
-      double pix[3], piv[3];
-      pix[0] = cuda_parts.x_x[pi];
-      pix[1] = cuda_parts.x_y[pi];
-      pix[2] = cuda_parts.x_z[pi];
-      
-      piv[0] = cuda_parts.v[pi].x;
-      piv[1] = cuda_parts.v[pi].y;
-      piv[2] = cuda_parts.v[pi].z;
-
-      const float hig2 = hi * hi * kernel_gamma2;
-
-      rho = 0.0f;
-      rho_dh = 0.0f;
-      div_v = 0.0f;
-      wcount = 0.0f;
-      wcount_dh = 0.0f;
-      rot_v.x = 0.0f;
-      rot_v.y = 0.0f;
-      rot_v.z = 0.0f;
-
-
-      /* Loop over the parts in cj. */
-      for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
-	//printf("pjd : %lli\n", pjd);
-        /* Get a pointer to the jth particle. */
-        const int pj = sort_j[pjd].i + f_j;
-
-        /* Compute the pairwise distance. */
-        float r2 = 0.0f;
-        float dx[3];
-	dx[0] = pix[0] - cuda_parts.x_x[pj];
-	dx[1] = pix[1] - cuda_parts.x_y[pj];
-	dx[2] = pix[2] - cuda_parts.x_z[pj];
-	for (int k = 0; k < 3; k++) {
-	  r2 += dx[k] * dx[k];
-	}
-	
-        /* Hit or miss? */
-        if (r2 < hig2) {
-	  
-	  float wi, wi_dx;
-	  float dv[3], curlvr[3];
-	    
-	  /* Get the masses. */
-	  const float mj = cuda_parts.mass[pj];
-	    
-	  /* Get r and r inverse. */
-	  const float r = sqrtf(r2);
-	  const float r_inv = 1.0f / r;
-
-	  /* Compute the kernel function */
-	  const float hi_inv = 1.0f / hi;
-	  const float ui = r * hi_inv;
-	  cuda_kernel_deval(ui, &wi, &wi_dx);
-
-	  /* Compute contribution to the density */
-	  rho += mj * wi;
-	  rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx);
-
-	  /* Compute contribution to the number of neighbours */
-	  wcount += wi;
-	  wcount_dh -= (hydro_dimension * wi + ui * wi_dx);
-
-	  const float fac = mj * wi_dx * r_inv;
-
-	  /* Compute dv dot r */
-	  dv[0] = piv[0] - cuda_parts.v[pj].x;
-	  dv[1] = piv[1] - cuda_parts.v[pj].y;
-	  dv[2] = piv[2] - cuda_parts.v[pj].z;
-	  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
-	  div_v -= fac * dvdr;
-
-	  /* Compute dv cross r */
-	  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
-	  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
-	  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
-	  
-	  rot_v.x += fac * curlvr[0];
-	  rot_v.y += fac * curlvr[1];
-	  rot_v.z += fac * curlvr[2];
-	}
-
-      }
-      const int pid_write = pi; /* index of the particle we accumulated for */
-      atomicAdd(&cuda_parts.rho[pid_write], rho);
-      atomicAdd(&cuda_parts.rho_dh[pid_write], rho_dh);
-      atomicAdd(&cuda_parts.wcount[pid_write], wcount);
-      atomicAdd(&cuda_parts.wcount_dh[pid_write], wcount_dh);
-      atomicAdd(&cuda_parts.div_v[pid_write], div_v);
-      atomicAdd(&cuda_parts.rot_v[pid_write].x, rot_v.x);
-      atomicAdd(&cuda_parts.rot_v[pid_write].y, rot_v.y);
-      atomicAdd(&cuda_parts.rot_v[pid_write].z, rot_v.z);
-    }
-    
-  }
-
-  if (cuda_cell_is_active(cj)) {
-
-    /* Loop over the parts in cj. */
-    for (int pjd = threadIdx.x; pjd < count_j && sort_j[pjd].d - hj_max < di_max; pjd+=blockDim.x) {
-      /* Get a hold of the jth part in cj. */
-      const int pj = sort_j[pjd].i + f_j;
-      if (!cuda_part_is_active(pj)) continue;
-      const float hj = cuda_parts.h[pj];
-      const double dj = sort_j[pjd].d - hj * kernel_gamma;
-      if (dj > di_max) continue;
-      
-      double pjx[3], pjv[3];
-      pjx[0] = cuda_parts.x_x[pj];
-      pjx[1] = cuda_parts.x_y[pj];
-      pjx[2] = cuda_parts.x_z[pj];
-      
-      pjv[0] = cuda_parts.v[pj].x;
-      pjv[1] = cuda_parts.v[pj].y;
-      pjv[2] = cuda_parts.v[pj].z;
-
-      const float hjg2 = hj * hj * kernel_gamma2;
-
-      rho = 0.0f;
-      rho_dh = 0.0f;
-      div_v = 0.0f;
-      wcount = 0.0f;
-      wcount_dh = 0.0f;
-      rot_v.x = 0.0f;
-      rot_v.y = 0.0f;
-      rot_v.z = 0.0f;
-
-
-      /* Loop over the parts in ci. */
-      for (int pid = count_i - 1;
-	   pid >= 0 && sort_i[pid].d > dj; pid--) {
-	//printf("pjd : %lli\n", pjd);
-        /* Get a hold of the ith part in ci. */
-        const int pi = sort_i[pid].i + f_i;
-
-        /* Compute the pairwise distance. */
-        float r2 = 0.0f;
-        float dx[3];
-	dx[0] = pjx[0] - cuda_parts.x_x[pi];
-	dx[1] = pjx[1] - cuda_parts.x_y[pi];
-	dx[2] = pjx[2] - cuda_parts.x_z[pi];
-	for (int k = 0; k < 3; k++) {
-	  r2 += dx[k] * dx[k];
-	}
-	
-        /* Hit or miss? */
-        if (r2 < hjg2) {
-	  
-	  float wj, wj_dx;
-	  float dv[3], curlvr[3];
-	    
-	  /* Get the masses. */
-	  const float mi = cuda_parts.mass[pi];
-	    
-	  /* Get r and r inverse. */
-	  const float r = sqrtf(r2);
-	  const float r_inv = 1.0f / r;
-
-	  /* Compute the kernel function */
-	  const float hj_inv = 1.0f / hj;
-	  const float uj = r * hj_inv;
-	  cuda_kernel_deval(uj, &wj, &wj_dx);
-
-	  /* Compute contribution to the density */
-	  rho += mi * wj;
-	  rho_dh -= mi * (hydro_dimension * wj + uj * wj_dx);
-
-	  /* Compute contribution to the number of neighbours */
-	  wcount += wj;
-	  wcount_dh -= (hydro_dimension * wj + uj * wj_dx);
-
-	  const float fac = mi * wj_dx * r_inv;
-
-	  /* Compute dv dot r */
-	  dv[0] = pjv[0] - cuda_parts.v[pi].x;
-	  dv[1] = pjv[1] - cuda_parts.v[pi].y;
-	  dv[2] = pjv[2] - cuda_parts.v[pi].z;
-	  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
-	  div_v -= fac * dvdr;
-
-	  /* Compute dv cross r */
-	  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
-	  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
-	  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
-	  
-	  rot_v.x += fac * curlvr[0];
-	  rot_v.y += fac * curlvr[1];
-	  rot_v.z += fac * curlvr[2];
-	}
-
-      }
-      const int pjd_write = pj; /* index of the particle we accumulated for */
-      atomicAdd(&cuda_parts.rho[pjd_write], rho);
-      atomicAdd(&cuda_parts.rho_dh[pjd_write], rho_dh);
-      atomicAdd(&cuda_parts.wcount[pjd_write], wcount);
-      atomicAdd(&cuda_parts.wcount_dh[pjd_write], wcount_dh);
-      atomicAdd(&cuda_parts.div_v[pjd_write], div_v);
-      atomicAdd(&cuda_parts.rot_v[pjd_write].x, rot_v.x);
-      atomicAdd(&cuda_parts.rot_v[pjd_write].y, rot_v.y);
-      atomicAdd(&cuda_parts.rot_v[pjd_write].z, rot_v.z);
-    }
-    
-  }
-
-
-}
-
-/* Task function to execute a density task. Uses a naive N^2 algorithm without
- * symmetry. */
-/* To compute the density between cell i and cell j this must be called twice,
- * once for each ordering of the pair. */
-__device__ void dopair_density(struct cell_cuda *ci, struct cell_cuda *cj) {
-
-  /* Are these cells active? */
-  if (!cuda_cell_is_active(ci) && !cuda_cell_is_active(cj)) return;
-
-  const int count_i = ci->part_count;
-  const int count_j = cj->part_count;
-  int part_i = ci->first_part;
-  int part_j = cj->first_part;
-  float rho, rho_dh, div_v, wcount, wcount_dh;
-  float3 rot_v;
-  double shift[3] = {0.0, 0.0, 0.0};
-
-  /* Deal with periodicity concerns. */
-  for (int k = 0; k < 3; k++) {
-    if (cj->loc[k] - ci->loc[k] < -dim[k] / 2)
-      shift[k] = dim[k];
-    else if (cj->loc[k] - ci->loc[k] > dim[k] / 2)
-      shift[k] = -dim[k];
-  }
-
-  /* Loop over the parts in cell ci */
-  for (int pid = part_i + threadIdx.x; pid < part_i + count_i;
-       pid += blockDim.x) {
-
-    const float hi = cuda_parts.h[pid];
-
-    double pix[3];
-    pix[0] = cuda_parts.x_x[pid] - shift[0];
-    pix[1] = cuda_parts.x_y[pid] - shift[1];
-    pix[2] = cuda_parts.x_z[pid] - shift[2];
-    const float hig2 = hi * hi * kernel_gamma2;
-
-    if (!cuda_part_is_active(pid)) continue;
-
-    /* Reset local values. */
-    rho = 0.0f;
-    rho_dh = 0.0f;
-    div_v = 0.0f;
-    wcount = 0.0f;
-    wcount_dh = 0.0f;
-    rot_v.x = 0.0f;
-    rot_v.y = 0.0f;
-    rot_v.z = 0.0f;
-
-    /* Loop over the parts in cj. */
-    /* TODO May be possible to optimize this loop ordering.*/
-    for (int pjd = part_j; pjd < part_j + count_j; pjd++) {
-
-      /* Compute the pairwise distance. */
-      float r2 = 0.0f;
-      float dx[3];
-      dx[0] = pix[0] - cuda_parts.x_x[pjd];
-      r2 += dx[0] * dx[0];
-      dx[1] = pix[1] - cuda_parts.x_y[pjd];
-      r2 += dx[1] * dx[1];
-      dx[2] = pix[2] - cuda_parts.x_z[pjd];
-      r2 += dx[2] * dx[2];
-      /* If in range then interact. */
-      if (r2 < hig2) {
-        float w, dw_dx;
-        float dv[3], curlvr[3];
-
-        /* Load mass on particle pj. */
-        const float mj = cuda_parts.mass[pjd];
-
-        /* Get r and 1/r */
-        const float r = sqrtf(r2);
-        const float ri = 1.0f / r;
-
-        /* Compute the kernel function */
-        const float hi_inv = 1.0f / hi;
-        const float ui = r * hi_inv;
-
-        cuda_kernel_deval(ui, &w, &dw_dx);
-
-        /* Compute contribution to the density. */
-        rho += mj * w;
-        rho_dh -= mj * (hydro_dimension * w + ui * dw_dx);
-
-        /* Compute contribution to the number of neighbours */
-        wcount += w;
-        wcount_dh -= (hydro_dimension * w + ui * dw_dx);
-
-        const float fac = mj * dw_dx * ri;
-
-        /* Compute dv dot r */
-        float3 piv, pjv;
-        piv = cuda_parts.v[pid];
-        pjv = cuda_parts.v[pjd];
-        dv[0] = piv.x - pjv.x;
-        dv[1] = piv.y - pjv.y;
-        dv[2] = piv.z - pjv.z;
-        const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
-
-        div_v -= fac * dvdr;
-
-        curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
-        curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
-        curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
-        
-        rot_v.x += fac * curlvr[0];
-        rot_v.y += fac * curlvr[1];
-        rot_v.z += fac * curlvr[2];
-      }
-    }  // Loop over cj.
-    /* Write data for particle pid back to global stores. */
-    atomicAdd(&cuda_parts.rho[pid], rho);
-    atomicAdd(&cuda_parts.rho_dh[pid], rho_dh);
-    atomicAdd(&cuda_parts.wcount[pid], wcount);
-    atomicAdd(&cuda_parts.wcount_dh[pid], wcount_dh);
-    atomicAdd(&cuda_parts.div_v[pid], div_v);
-    atomicAdd(&cuda_parts.rot_v[pid].x, rot_v.x);
-    atomicAdd(&cuda_parts.rot_v[pid].y, rot_v.y);
-    atomicAdd(&cuda_parts.rot_v[pid].z, rot_v.z);
-
-  }  // Loop over ci.
-}
-
-/* Task function to perform a self force task. No symmetry */
-__device__ void doself_force(struct cell_cuda *ci) {
-  /* Is the cell active? */
-  if (!cuda_cell_is_active(ci)) return;
-
-  const int count_i = ci->part_count;
-  int part_i = ci->first_part;
-
-  float3 a_hydro;
-  float h_dt, v_sig_stor, entropy_dt;
-
-  /* Loop over the particles */
-  for (int pid = part_i + threadIdx.x; pid < part_i + count_i;
-       pid += blockDim.x) {
-
-    const float hi = cuda_parts.h[pid];
-    if (!cuda_part_is_active(pid)) continue;
-    /* Reset the values. */
-    a_hydro.x = 0.0f;
-    a_hydro.y = 0.0f;
-    a_hydro.z = 0.0f;
-    h_dt = 0.0f;
-    v_sig_stor = cuda_parts.v_sig[pid];
-    entropy_dt = 0.0f;
-
-    double pix[3];
-    pix[0] = cuda_parts.x_x[pid];
-    pix[1] = cuda_parts.x_y[pid];
-    pix[2] = cuda_parts.x_z[pid];
-
-    const float hig2 = hi * hi * kernel_gamma2;
-
-    /* Loop over the particles in cj. */
-    for (int pjd = part_i; pjd < part_i + count_i; pjd++) {
-
-      if (pid == pjd) continue;
-      /* Compute the pairwise distance. */
-      float r2 = 0.0f;
-      float dx[3];
-      dx[0] = pix[0] - cuda_parts.x_x[pjd];
-      r2 += dx[0] * dx[0];
-      dx[1] = pix[1] - cuda_parts.x_y[pjd];
-      r2 += dx[1] * dx[1];
-      dx[2] = pix[2] - cuda_parts.x_z[pjd];
-      r2 += dx[2] * dx[2];
-      const float hj = cuda_parts.h[pjd];
-      if (r2 < hig2 || r2 < hj * hj * kernel_gamma2) {
-        float wi, wj, wi_dx, wj_dx;
-        const float fac_mu = 1.f;
-
-        const float r = sqrtf(r2);
-        const float r_inv = 1.0f / r;
-
-        /* Load some data.*/
-        const float mj = cuda_parts.mass[pjd];
-        const float rhoi = cuda_parts.rho[pid];
-        const float rhoj = cuda_parts.rho[pjd];
-
-        /* Get the kernel for hi. */
-        const float hi_inv = 1.0f / hi;
-        const float hid_inv = cuda_pow_dimension_plus_one(hi_inv);
-        const float ui = r * hi_inv;
-        cuda_kernel_deval(ui, &wi, &wi_dx);
-        const float wi_dr = hid_inv * wi_dx;
-
-        /* Get the kernel for hj. */
-        const float hj_inv = 1.0f / hj;
-        const float hjd_inv = cuda_pow_dimension_plus_one(hj_inv);
-        const float xj = r * hj_inv;
-        cuda_kernel_deval(xj, &wj, &wj_dx);
-        const float wj_dr = hjd_inv * wj_dx;
-
-        /* Compute h-gradient terms */
-        const float f_i = cuda_parts.f[pid];
-        const float f_j = cuda_parts.f[pjd];
-
-        /* Compute pressure terms */
-        const float P_over_rho2_i = cuda_parts.P_over_rho2[pid];
-        const float P_over_rho2_j = cuda_parts.P_over_rho2[pjd];
-
-        /* Compute sound speeds*/
-        const float ci = cuda_parts.soundspeed[pid];
-        const float cj = cuda_parts.soundspeed[pjd];
-
-        /* Compute dv dot r. */
-        const float dvdr = (cuda_parts.v[pid].x - cuda_parts.v[pjd].x) * dx[0] +
-                           (cuda_parts.v[pid].y - cuda_parts.v[pjd].y) * dx[1] +
-                           (cuda_parts.v[pid].z - cuda_parts.v[pjd].z) * dx[2];
-
-        /* Balsara term */
-        const float balsara_i = cuda_parts.balsara[pid];
-        const float balsara_j = cuda_parts.balsara[pjd];
-
-        /* Are the particles moving towards each other? */
-        const float omega_ij = (dvdr < 0.f) ? dvdr : 0.f;
-        const float mu_ij = fac_mu * r_inv * omega_ij;
-
-        /* Signal velocity */
-        const float v_sig = ci + cj - 3.f * mu_ij;
-        const float rho_ij = 0.5f * (rhoi + rhoj);
-        const float visc = -0.25 * const_viscosity_alpha * v_sig * mu_ij *
-                           (balsara_i + balsara_j) / rho_ij;
-
-        /* Now convolve with the kernel */
-        const float visc_term = 0.5f * visc * (wi_dr + wj_dr) * r_inv;
-        const float sph_term =
-            (f_i * P_over_rho2_i * wi_dr + f_j * P_over_rho2_j * wj_dr) * r_inv;
-
-        /* Compute the acceleration */
-        const float acc = visc_term + sph_term;
-
-        /* Compute the force */
-        a_hydro.x -= mj * acc * dx[0];
-        a_hydro.y -= mj * acc * dx[1];
-        a_hydro.z -= mj * acc * dx[2];
-
-
-        /* Get the time derivative for h. */
-        h_dt -= mj * dvdr * r_inv / rhoj * wi_dr;
-
-        /* Update the signal velocity. */
-        v_sig_stor = (v_sig_stor > v_sig) ? v_sig_stor : v_sig;
-
-        /* Change in entropy */
-        entropy_dt += mj * visc_term * dvdr;
-      }
-
-    }  // Inner loop
-
-    /* Flush to global stores.*/
-    atomicAdd(&cuda_parts.a_hydro[pid].x, a_hydro.x);
-    atomicAdd(&cuda_parts.a_hydro[pid].y, a_hydro.y);
-    atomicAdd(&cuda_parts.a_hydro[pid].z, a_hydro.z);
-    atomicAdd(&cuda_parts.h_dt[pid], h_dt);
-    atomicAdd(&cuda_parts.entropy_dt[pid], entropy_dt);
-
-    /* Update the signal velocity */
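-    /* Same float atomic-max-via-atomicCAS pattern as in dopair_force(). */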
-    float global_vsig = cuda_parts.v_sig[pid];
-    int *address_as_int = (int *)&cuda_parts.v_sig[pid];
-    int old = *address_as_int;
-    int assumed;
-    do {
-      global_vsig = cuda_parts.v_sig[pid];  // Scary line.
-      assumed = old;
-      if (v_sig_stor > global_vsig)
-        old = atomicCAS(address_as_int, assumed, __float_as_int(v_sig_stor));
-    } while (assumed != old && v_sig_stor > global_vsig);
-
-  }  // Outer loop
-}
-
-/* Task function to execute a self-density task, exploiting pairwise symmetry. */
-__device__ void doself_density_symmetric(struct cell_cuda *ci) {
-
-  /* Is the cell active? */
-  if (!cuda_cell_is_active(ci)) {
-    printf(
-        "Cell isn't active..., ti_end_min=%i, ti_current=%i, "
-        "max_active_bin=%i\n",
-        ci->ti_end_min, ti_current, max_active_bin);
-    return;
-  }
-
-  const int count_i = ci->part_count;
-  int part_i = ci->first_part;
-  float rho, rho_dh, div_v, wcount, wcount_dh;
-  float3 rot_v;
-
-  for (int pid = part_i + threadIdx.x; pid < part_i + count_i;
-       pid += blockDim.x) {
-    double pix[3];
-    pix[0] = cuda_parts.x_x[pid];
-    pix[1] = cuda_parts.x_y[pid];
-    pix[2] = cuda_parts.x_z[pid];
-    const float hi = cuda_parts.h[pid];
-    const float hig2 = hi * hi * kernel_gamma2;
-
-    /* Reset local values. */
-    rho = 0.0f;
-    rho_dh = 0.0f;
-    div_v = 0.0f;
-    wcount = 0.0f;
-    wcount_dh = 0.0f;
-    rot_v.x = 0.0f;
-    rot_v.y = 0.0f;
-    rot_v.z = 0.0f;
-
-    /* Search for the neighbours: start at pid + 1 so each unordered pair is
-     * visited once and both particles can be updated (symmetric version). */
-    for (int pjd = pid + 1; pjd < part_i + count_i; pjd++) {
-      float dx[3], r2 = 0.0f;
-      dx[0] = pix[0] - cuda_parts.x_x[pjd];
-      r2 += dx[0] * dx[0];
-      dx[1] = pix[1] - cuda_parts.x_y[pjd];
-      r2 += dx[1] * dx[1];
-      dx[2] = pix[2] - cuda_parts.x_z[pjd];
-      r2 += dx[2] * dx[2];
-
-      const float hj = cuda_parts.h[pjd];
-      const float hjg2 = hj * hj * kernel_gamma2;
-      /* If in range then interact. */
-      if (r2 < hig2 || r2 < hjg2) {
-        float w, dw_dx;
-        float dv[3], curlvr[3];
-        /* Get r and 1/r */
-        const float r = sqrtf(r2);
-        const float ri = 1.0f / r;
-
-	if (r2 < hig2 && cuda_part_is_active(pid)) {
-	  /* Load mass on particle pj. */
-	  const float mj = cuda_parts.mass[pjd];
-
-
-	  /* Compute the kernel function */
-	  const float hi_inv = 1.0f / hi;
-	  const float ui = r * hi_inv;
-
-	  cuda_kernel_deval(ui, &w, &dw_dx);
-	  /* Compute contribution to the density. */
-	  rho = mj * w;
-	  rho_dh = - mj * (hydro_dimension * w + ui * dw_dx);
-
-	  /* Compute contribution to the number of neighbours */
-	  wcount = w;
-	  wcount_dh = -(hydro_dimension * w + ui * dw_dx);
-
-	  const float fac = mj * dw_dx * ri;
-
-	  /* Compute dv dot r */
-	  float3 piv, pjv;
-	  piv = cuda_parts.v[pid];
-	  pjv = cuda_parts.v[pjd];
-	  dv[0] = piv.x - pjv.x;
-	  dv[1] = piv.y - pjv.y;
-	  dv[2] = piv.z - pjv.z;
-	  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
-
-	  div_v = - fac * dvdr;
-
-	  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
-	  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
-	  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
-	  
-	  rot_v.x = fac * curlvr[0];
-	  rot_v.y = fac * curlvr[1];
-	  rot_v.z = fac * curlvr[2];
-
-	  atomicAdd(&cuda_parts.rho[pid], rho);
-	  atomicAdd(&cuda_parts.rho_dh[pid], rho_dh);
-	  atomicAdd(&cuda_parts.wcount[pid], wcount);
-	  atomicAdd(&cuda_parts.wcount_dh[pid], wcount_dh);
-	  atomicAdd(&cuda_parts.div_v[pid], div_v);
-	  atomicAdd(&cuda_parts.rot_v[pid].x, rot_v.x);
-	  atomicAdd(&cuda_parts.rot_v[pid].y, rot_v.y);
-	  atomicAdd(&cuda_parts.rot_v[pid].z, rot_v.z);
-	}
-	
-	if (r2 < hjg2 && cuda_part_is_active(pjd)) {
-	  /* Load mass on particle pi. */
-	  const float mi = cuda_parts.mass[pid];
-
-
-	  /* Compute the kernel function */
-	  const float hj_inv = 1.0f / hj;
-	  const float uj = r * hj_inv;
-
-	  cuda_kernel_deval(uj, &w, &dw_dx);
-	  /* Compute contribution to the density. */
-	  rho = mi * w;
-	  rho_dh = - mi * (hydro_dimension * w + uj * dw_dx);
-
-	  /* Compute contribution to the number of neighbours */
-	  wcount = w;
-	  wcount_dh = -(hydro_dimension * w + uj * dw_dx);
-
-	  const float fac = mi * dw_dx * ri;
-
-	  /* Compute dv dot r */
-	  float3 piv, pjv;
-	  piv = cuda_parts.v[pid];
-	  pjv = cuda_parts.v[pjd];
-	  dv[0] = pjv.x - piv.x;
-	  dv[1] = pjv.y - piv.y;
-	  dv[2] = pjv.z - piv.z;
-	  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
-
-	  div_v = - fac * dvdr;
-
-	  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
-	  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
-	  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
-	  
-	  rot_v.x = fac * curlvr[0];
-	  rot_v.y = fac * curlvr[1];
-	  rot_v.z = fac * curlvr[2];
-
-	  atomicAdd(&cuda_parts.rho[pjd], rho);
-	  atomicAdd(&cuda_parts.rho_dh[pjd], rho_dh);
-	  atomicAdd(&cuda_parts.wcount[pjd], wcount);
-	  atomicAdd(&cuda_parts.wcount_dh[pjd], wcount_dh);
-	  atomicAdd(&cuda_parts.div_v[pjd], div_v);
-	  atomicAdd(&cuda_parts.rot_v[pjd].x, rot_v.x);
-	  atomicAdd(&cuda_parts.rot_v[pjd].y, rot_v.y);
-	  atomicAdd(&cuda_parts.rot_v[pjd].z, rot_v.z);
-	}
-      }
-    }  // Loop over cj.
-    /* Write data for particle pid back to global stores. */
-  }
-}
-
-/* Task function to execute a self-density task. */
-__device__ void doself_density(struct cell_cuda *ci) {
-
-  /* Is the cell active? */
-  if (!cuda_cell_is_active(ci)) {
-    printf(
-        "Cell isn't active..., ti_end_min=%i, ti_current=%i, "
-        "max_active_bin=%i\n",
-        ci->ti_end_min, ti_current, max_active_bin);
-    return;
-  }
-
-  const int count_i = ci->part_count;
-  int part_i = ci->first_part;
-  float rho, rho_dh, div_v, wcount, wcount_dh;
-  float3 rot_v;
-
-  for (int pid = part_i + threadIdx.x; pid < part_i + count_i;
-       pid += blockDim.x) {
-    double pix[3];
-    pix[0] = cuda_parts.x_x[pid];
-    pix[1] = cuda_parts.x_y[pid];
-    pix[2] = cuda_parts.x_z[pid];
-    const float hi = cuda_parts.h[pid];
-    const float hig2 = hi * hi * kernel_gamma2;
-
-    /* Reset local values. */
-    rho = 0.0f;
-    rho_dh = 0.0f;
-    div_v = 0.0f;
-    wcount = 0.0f;
-    wcount_dh = 0.0f;
-    rot_v.x = 0.0f;
-    rot_v.y = 0.0f;
-    rot_v.z = 0.0f;
-
-    /* If the particle isn't active skip it. */
-    if (!cuda_part_is_active(pid)) {
-      continue;
-    }
-
-    /* Search for the neighbours! */
-    for (int pjd = part_i; pjd < part_i + count_i; pjd++) {
-      /* Particles don't interact with themselves */
-      if (pid == pjd) continue;
-      float dx[3], r2 = 0.0f;
-      dx[0] = pix[0] - cuda_parts.x_x[pjd];
-      r2 += dx[0] * dx[0];
-      dx[1] = pix[1] - cuda_parts.x_y[pjd];
-      r2 += dx[1] * dx[1];
-      dx[2] = pix[2] - cuda_parts.x_z[pjd];
-      r2 += dx[2] * dx[2];
-
-      /* If in range then interact. */
-      if (r2 < hig2) {
-        float w, dw_dx;
-        float dv[3], curlvr[3];
-        /* Load mass on particle pj. */
-        const float mj = cuda_parts.mass[pjd];
-
-        /* Get r and 1/r */
-        const float r = sqrtf(r2);
-        const float ri = 1.0f / r;
-
-        /* Compute the kernel function */
-        const float hi_inv = 1.0f / hi;
-        const float ui = r * hi_inv;
-
-        cuda_kernel_deval(ui, &w, &dw_dx);
-        /* Compute contribution to the density. */
-        rho += mj * w;
-        rho_dh -= mj * (hydro_dimension * w + ui * dw_dx);
-
-        /* Compute contribution to the number of neighbours */
-        wcount += w;
-        wcount_dh -= (hydro_dimension * w + ui * dw_dx);
-
-        const float fac = mj * dw_dx * ri;
-
-        /* Compute dv dot r */
-        float3 piv, pjv;
-        piv = cuda_parts.v[pid];
-        pjv = cuda_parts.v[pjd];
-        dv[0] = piv.x - pjv.x;
-        dv[1] = piv.y - pjv.y;
-        dv[2] = piv.z - pjv.z;
-        const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
-
-        div_v -= fac * dvdr;
-
-        curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
-        curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
-        curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
-
-        rot_v.x += fac * curlvr[0];
-        rot_v.y += fac * curlvr[1];
-        rot_v.z += fac * curlvr[2];
-      }
-    }  // Loop over cj.
-    /* Write data for particle pid back to global stores. */
-    atomicAdd(&cuda_parts.rho[pid], rho);
-    atomicAdd(&cuda_parts.rho_dh[pid], rho_dh);
-    atomicAdd(&cuda_parts.wcount[pid], wcount);
-    atomicAdd(&cuda_parts.wcount_dh[pid], wcount_dh);
-    atomicAdd(&cuda_parts.div_v[pid], div_v);
-    atomicAdd(&cuda_parts.rot_v[pid].x, rot_v.x);
-    atomicAdd(&cuda_parts.rot_v[pid].y, rot_v.y);
-    atomicAdd(&cuda_parts.rot_v[pid].z, rot_v.z);
-  }
-}
-
-
-__global__ void do_test_self_density_symmetric(struct cell_cuda *ci) {
-
-  doself_density_symmetric(ci);
-
-}
-
-__global__ void do_test_self_density(struct cell_cuda *ci) {
-
-  doself_density(ci);
-
-}
-
-__global__ void do_test_self_force(struct cell_cuda *ci) {
-
-  doself_force(ci);
-
-}
-__global__ void do_test_pair_density(struct cell_cuda *ci, struct cell_cuda *cj) {
-
-  dopair_density(ci,cj);
-  dopair_density(cj,ci);
-
-}
-
-/* The cells must be passed in the correct order for the given sid! */
-__global__ void do_test_pair_density_sorted(struct cell_cuda *ci, struct cell_cuda *cj, const int sid) {
-
-  dopair_density_sorted(ci, cj, sid);
-
-}
-
-__global__ void do_test_pair_force(struct cell_cuda *ci, struct cell_cuda *cj) {
-
-  dopair_force(ci,cj);
-  dopair_force(cj,ci);
-}
-
-void allocate_parts(struct particle_arrays *p, size_t num_part) { 
-  p->id = (long long int*) malloc (sizeof(long long int) * num_part);
-  p->x_x = (double*) malloc(sizeof(double) * num_part);
-  p->x_y = (double*) malloc(sizeof(double) * num_part);
-  p->x_z = (double*) malloc(sizeof(double) * num_part);
-  p->v = (float3*) malloc(sizeof(float3) * num_part);
-  p->a_hydro = (float3*) malloc(sizeof(float3) * num_part);
-  p->h = (float*) malloc(sizeof(float) * num_part);
-  p->mass = (float*) malloc(sizeof(float) * num_part);
-  p->rho = (float*) malloc(sizeof(float) * num_part);
-  p->entropy = (float*) malloc(sizeof(float) * num_part);
-  p->entropy_dt = (float*) malloc(sizeof(float) * num_part);
-
-  // density
-  p->wcount = (float*) malloc(sizeof(float) * num_part);
-  p->wcount_dh = (float*) malloc(sizeof(float) * num_part);
-  p->rho_dh = (float*) malloc(sizeof(float) * num_part);
-  p->rot_v = (float3*) malloc(sizeof(float3) * num_part);
-  p->div_v = (float*) malloc(sizeof(float) * num_part);
-
-  // force
-  p->balsara = (float*) malloc(sizeof(float) * num_part);
-  p->f = (float*) malloc(sizeof(float) * num_part);
-  p->P_over_rho2 = (float*) malloc(sizeof(float) * num_part);
-  p->soundspeed = (float*) malloc(sizeof(float) * num_part);
-  p->v_sig = (float*) malloc(sizeof(float) * num_part);
-  p->h_dt = (float*) malloc(sizeof(float) * num_part);
-  
-  p->time_bin = (timebin_t*) malloc(sizeof(timebin_t) * num_part);
-
-  for (int i=0; i<NBRE_DIR; i++)
-    p->sort[i] = (struct entry *)malloc(sizeof(struct entry)*(num_part + 1));
-
-}
-
-void allocate_device_parts(size_t num_part) {
-  struct particle_arrays c_parts;
-  
-  cudaErrCheck(cudaMalloc(&c_parts.id, sizeof(long long int) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.x_x, sizeof(double) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.x_y, sizeof(double) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.x_z, sizeof(double) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.v, sizeof(float3) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.a_hydro, sizeof(float3) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.h, sizeof(float) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.mass, sizeof(float) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.rho, sizeof(float) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.entropy, sizeof(float) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.entropy_dt, sizeof(float) * num_part));
-
-  // density
-  cudaErrCheck(cudaMalloc(&c_parts.wcount, sizeof(float) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.wcount_dh, sizeof(float) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.rho_dh, sizeof(float) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.rot_v, sizeof(float3) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.div_v, sizeof(float) * num_part));
-
-  // force
-  cudaErrCheck(cudaMalloc(&c_parts.balsara, sizeof(float) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.f, sizeof(float) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.P_over_rho2, sizeof(float) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.soundspeed, sizeof(float) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.v_sig, sizeof(volatile float) * num_part));
-  cudaErrCheck(cudaMalloc(&c_parts.h_dt, sizeof(float) * num_part));
-
-  cudaErrCheck(cudaMalloc(&c_parts.time_bin, sizeof(timebin_t) * num_part));
-
-  for (int i=0; i<NBRE_DIR; i++)
-    cudaErrCheck(cudaMalloc(&c_parts.sort[i], sizeof(struct entry) * num_part));
-
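-  /* Publish the struct of device pointers to the device-side symbol
-   * cuda_parts so that kernels can address the arrays directly. */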
-   cudaErrCheck(cudaMemcpyToSymbol(cuda_parts, &c_parts, sizeof(struct particle_arrays)));
-
-}
-
-void free_parts(struct particle_arrays p) {
-  free(p.id); p.id = NULL;
-  free(p.x_x); p.x_x = NULL;
-  free(p.x_y); p.x_y = NULL;
-  free(p.x_z); p.x_z = NULL;
-  free(p.v); p.v = NULL;
-  free(p.a_hydro); p.a_hydro = NULL;
-  free(p.h); p.h = NULL;
-  free(p.mass); p.mass = NULL;
-  free(p.rho); p.rho = NULL;
-  free(p.entropy); p.entropy = NULL;
-  free(p.entropy_dt); p.entropy_dt = NULL;
-
-  // density
-  free(p.wcount); p.wcount = NULL;
-  free(p.wcount_dh); p.wcount_dh = NULL;
-  free(p.rho_dh); p.rho_dh = NULL;
-  free(p.rot_v); p.rot_v = NULL;
-  free(p.div_v); p.div_v = NULL;
-
-  // force
-  free(p.balsara); p.balsara = NULL;
-  free(p.f); p.f = NULL;
-  free(p.P_over_rho2); p.P_over_rho2 = NULL;
-  free(p.soundspeed); p.soundspeed = NULL;
-  free((void *)p.v_sig); p.v_sig = NULL;
-  free(p.h_dt); p.h_dt = NULL;
-  
-  free(p.time_bin); p.time_bin = NULL;
-  /* Also free the sort arrays allocated in allocate_parts(). */
-  for (int i = 0; i < NBRE_DIR; i++) { free(p.sort[i]); p.sort[i] = NULL; }
-}
-
-void free_device_parts() {
-  struct particle_arrays parts;
-  cudaErrCheck(cudaMemcpyFromSymbol(&parts, cuda_parts,
-				    sizeof(struct particle_arrays)));
-
-  cudaErrCheck(cudaFree(parts.id));
-  cudaErrCheck(cudaFree(parts.x_x));
-  cudaErrCheck(cudaFree(parts.x_y));
-  cudaErrCheck(cudaFree(parts.x_z));
-  cudaErrCheck(cudaFree(parts.v));
-  cudaErrCheck(cudaFree(parts.a_hydro));
-  cudaErrCheck(cudaFree(parts.h));
-  cudaErrCheck(cudaFree(parts.mass));
-  cudaErrCheck(cudaFree(parts.rho));
-  cudaErrCheck(cudaFree(parts.entropy));
-  cudaErrCheck(cudaFree(parts.entropy_dt));
-
-  // density
-  cudaErrCheck(cudaFree(parts.wcount));
-  cudaErrCheck(cudaFree(parts.wcount_dh));
-  cudaErrCheck(cudaFree(parts.rho_dh));
-  cudaErrCheck(cudaFree(parts.rot_v));
-  cudaErrCheck(cudaFree(parts.div_v));
-
-  // force
-  cudaErrCheck(cudaFree(parts.balsara));
-  cudaErrCheck(cudaFree(parts.f));
-  cudaErrCheck(cudaFree(parts.P_over_rho2));
-  cudaErrCheck(cudaFree(parts.soundspeed));
-  cudaErrCheck(cudaFree((void*)parts.v_sig));
-  cudaErrCheck(cudaFree(parts.h_dt));
-  
-  for (int i=0; i<NBRE_DIR; i++)
-    cudaErrCheck(cudaFree(parts.sort[i]));
-
-  cudaErrCheck(cudaFree(parts.time_bin));
-}
-
-void copy_to_device_array(struct cell *ci, int offset) {
-
-  struct particle_arrays h_p;
-    
-  int num_part = ci->count;
-  
-  allocate_parts(&h_p, num_part);
-  
-  //copy particles data
-  for(int i=0;i<num_part;i++) {
-    struct part p = ci->parts[i];
-    h_p.id[i] = p.id;
-    h_p.x_x[i] = p.x[0];
-    h_p.x_y[i] = p.x[1];
-    h_p.x_z[i] = p.x[2];
-    h_p.v[i].x = p.v[0];
-    h_p.v[i].y = p.v[1];
-    h_p.v[i].z = p.v[2];
-    h_p.a_hydro[i].x = p.a_hydro[0];
-    h_p.a_hydro[i].y = p.a_hydro[1];
-    h_p.a_hydro[i].z = p.a_hydro[2];
-    h_p.h[i] = p.h;
-    h_p.mass[i] = p.mass;
-    h_p.rho[i] = p.rho;
-    h_p.entropy[i] = p.entropy;
-    h_p.entropy_dt[i] = p.entropy_dt;
-
-    // density
-    h_p.wcount[i] = p.density.wcount;
-    h_p.wcount_dh[i] = p.density.wcount_dh;
-    h_p.rho_dh[i] = p.density.rho_dh;
-    h_p.rot_v[i].x = p.density.rot_v[0];
-    h_p.rot_v[i].y = p.density.rot_v[1];
-    h_p.rot_v[i].z = p.density.rot_v[2];
-    h_p.div_v[i] = p.density.div_v;
-
-    // force
-    h_p.balsara[i] = p.force.balsara;
-    h_p.f[i] = p.force.f;
-    h_p.P_over_rho2[i] = p.force.P_over_rho2;
-    h_p.soundspeed[i] = p.force.soundspeed;
-    h_p.v_sig[i] = p.force.v_sig;
-    h_p.h_dt[i] = p.force.h_dt;
-
-    for (int j=0; j<NBRE_DIR; j++) {
-      h_p.sort[j][i].d = ci->sort[j][i].d;
-      h_p.sort[j][i].i = ci->sort[j][i].i;
-    }
-      
-    h_p.time_bin[i] = p.time_bin;
-  }
-
-  struct particle_arrays c_parts;
-  cudaErrCheck(cudaMemcpyFromSymbol(&c_parts, cuda_parts,
-                                    sizeof(struct particle_arrays)));
-
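-  /* cuda_parts is a device-side symbol holding the device pointers; the host
-   * copy fetched above lets us offset into each array and copy the particle
-   * data field by field. */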
-  void *p_data = c_parts.id + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.id, sizeof(long long int) * num_part,cudaMemcpyHostToDevice));
-
-  p_data = c_parts.x_x + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.x_x, sizeof(double) * num_part, cudaMemcpyHostToDevice));
-  
-  p_data = c_parts.x_y + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.x_y, sizeof(double) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.x_z + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.x_z, sizeof(double) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.v + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.v, sizeof(float3) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.a_hydro + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.a_hydro,sizeof(float3) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.h + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.h,sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.mass + offset;
-  cudaErrCheck(cudaMemcpy(p_data,h_p.mass, sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.rho + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.rho,sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.entropy + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.entropy,sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.entropy_dt + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.entropy_dt,sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  // density
-  p_data = c_parts.wcount + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.wcount, sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.wcount_dh + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.wcount_dh, sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.rho_dh + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.rho_dh, sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.rot_v + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.rot_v, sizeof(float3) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.div_v + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.div_v, sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  // force
-  p_data = c_parts.balsara + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.balsara, sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.f + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.f, sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.P_over_rho2 + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.P_over_rho2, sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.soundspeed + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.soundspeed, sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = (void*)(c_parts.v_sig + offset);
-  cudaErrCheck(cudaMemcpy(p_data, (void*)h_p.v_sig, sizeof(volatile float) * num_part, cudaMemcpyHostToDevice));
-
-  p_data = c_parts.h_dt + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.h_dt, sizeof(float) * num_part, cudaMemcpyHostToDevice));
-
-  
-  for (int j=0; j<NBRE_DIR; j++) {
-    p_data = c_parts.sort[j] + offset;
-    cudaErrCheck(cudaMemcpy(p_data, h_p.sort[j], sizeof(struct entry) * num_part, cudaMemcpyHostToDevice));
-  }
-  
-  p_data = c_parts.time_bin + offset;
-  cudaErrCheck(cudaMemcpy(p_data, h_p.time_bin,sizeof(timebin_t) * num_part, cudaMemcpyHostToDevice));
-
-  free_parts(h_p);
-}
-
-void copy_from_device_array(struct particle_arrays *h_p, int offset, size_t num_part) {
-
-  struct particle_arrays c_parts;
-  cudaErrCheck(cudaMemcpyFromSymbol(&c_parts, cuda_parts,
-                                    sizeof(struct particle_arrays)));
-
-  void *p_data = c_parts.id + offset;
-  cudaErrCheck(cudaMemcpy(h_p->id, p_data, sizeof(long long int) * num_part, cudaMemcpyDeviceToHost));
-
-  p_data = c_parts.x_x + offset;
-  cudaErrCheck(cudaMemcpy(h_p->x_x, p_data, sizeof(double) * num_part, cudaMemcpyDeviceToHost));
-  
-  p_data = c_parts.x_y + offset;
-  cudaErrCheck(cudaMemcpy(h_p->x_y, p_data, sizeof(double) * num_part, cudaMemcpyDeviceToHost));
-
-  p_data = c_parts.x_z + offset;
-  cudaErrCheck(cudaMemcpy(h_p->x_z, p_data, sizeof(double) * num_part, cudaMemcpyDeviceToHost));
-
-  p_data = c_parts.v + offset;
-  cudaErrCheck(cudaMemcpy(h_p->v, p_data, sizeof(float3) * num_part, cudaMemcpyDeviceToHost));
-
-  p_data = c_parts.a_hydro + offset;
-  cudaErrCheck(cudaMemcpy(h_p->a_hydro, p_data, sizeof(float3) * num_part, cudaMemcpyDeviceToHost));
-
-  p_data = c_parts.h + offset;
-  cudaErrCheck(cudaMemcpy(h_p->h, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-
-  p_data = c_parts.mass + offset;
-  cudaErrCheck(cudaMemcpy(h_p->mass, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-
-  p_data = c_parts.rho + offset;
-  cudaErrCheck(cudaMemcpy(h_p->rho, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-
-  p_data = c_parts.entropy + offset;
-  cudaErrCheck(cudaMemcpy(h_p->entropy, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-
-  p_data = c_parts.entropy_dt + offset;
-  cudaErrCheck(cudaMemcpy(h_p->entropy_dt, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-
-  // density
-  p_data = c_parts.wcount + offset;
-  cudaErrCheck(cudaMemcpy(h_p->wcount, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-  
-  p_data = c_parts.wcount_dh + offset;
-  cudaErrCheck(cudaMemcpy(h_p->wcount_dh, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-  
-  p_data = c_parts.rho_dh + offset;
-  cudaErrCheck(cudaMemcpy(h_p->rho_dh, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-  
-  p_data = c_parts.rot_v + offset;
-  cudaErrCheck(cudaMemcpy(h_p->rot_v, p_data, sizeof(float3) * num_part, cudaMemcpyDeviceToHost));
-  
-  p_data = c_parts.div_v + offset;
-  cudaErrCheck(cudaMemcpy(h_p->div_v, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-  
-
-  // force
-  p_data = c_parts.balsara + offset;
-  cudaErrCheck(cudaMemcpy(h_p->balsara, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-  
-  p_data = c_parts.f + offset;
-  cudaErrCheck(cudaMemcpy(h_p->f, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-  
-  p_data = c_parts.P_over_rho2 + offset;
-  cudaErrCheck(cudaMemcpy(h_p->P_over_rho2, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-  
-  p_data = c_parts.soundspeed + offset;
-  cudaErrCheck(cudaMemcpy(h_p->soundspeed, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-  
-  p_data = (void*)(c_parts.v_sig + offset);
-  cudaErrCheck(cudaMemcpy((void*)h_p->v_sig, p_data, sizeof(volatile float) * num_part, cudaMemcpyDeviceToHost));
-  
-  p_data = c_parts.h_dt + offset;
-  cudaErrCheck(cudaMemcpy(h_p->h_dt, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
-  
-
-  for (int j=0; j<NBRE_DIR; j++) {
-    p_data = c_parts.sort[j] + offset;
-    cudaErrCheck(cudaMemcpy(h_p->sort[j], p_data, sizeof(struct entry) * num_part, cudaMemcpyDeviceToHost));
-  }
-
-  p_data = c_parts.time_bin + offset;
-  cudaErrCheck(cudaMemcpy(h_p->time_bin, p_data, sizeof(timebin_t) * num_part, cudaMemcpyDeviceToHost));
-}
-
-struct cell_cuda* copy_from_host(struct cell *ci, int offset) {
-  copy_to_device_array(ci, offset);
-  /* Fill a host-side copy of the CUDA cell. */
-  struct cell_cuda c2;
-
-  for (int k = 0; k < DIM; k++) {
-    c2.loc[k] = ci->loc[k];
-    c2.width[k] = ci->width[k];
-  }
-  c2.h_max = ci->h_max;
-  c2.first_part = offset;
-  c2.part_count = ci->count;
-  if (ci->parent != NULL) {
-    c2.parent = ci->parent->cuda_ID;
-  } else {
-    c2.parent = -1;
-  }
-  if (ci->super != NULL) {
-    c2.super = ci->super->cuda_ID;
-  } else {
-    c2.super = -1;
-  }
-  c2.ti_end_min = ci->ti_end_min;
-  c2.ti_end_max = ci->ti_end_max;
-  c2.dmin = ci->dmin;
-  c2.nr_links = 0;
-  c2.split = ci->split;
-
-  struct cell_cuda *ret;
-  cudaErrCheck( cudaMalloc(&ret, sizeof(struct cell_cuda)) );
-  cudaErrCheck( cudaMemcpy(ret, &c2, sizeof(struct cell_cuda), cudaMemcpyHostToDevice) );
-  return ret;
-}
-
-void copy_to_host(struct cell_cuda *cuda_c, struct cell *c) {
-  struct cell_cuda *h_cuda_cell = (struct cell_cuda*) malloc(sizeof(struct cell_cuda));
-
-  cudaMemcpy(h_cuda_cell, cuda_c, sizeof(struct cell_cuda), cudaMemcpyDeviceToHost);
-  int N = h_cuda_cell->part_count;
-  int p_i = h_cuda_cell->first_part;
-
-  struct particle_arrays h_cuda_part;
-  allocate_parts(&h_cuda_part, N);
-  copy_from_device_array(&h_cuda_part, p_i, N);
-
-  for (int i=0; i<N; i++) {
-    struct part *p = &c->parts[i];
-
-    p->id = h_cuda_part.id[i];
-    p->h = h_cuda_part.h[i];
-    p->rho = h_cuda_part.rho[i];
-    p->entropy = h_cuda_part.entropy[i];
-    p->entropy_dt = h_cuda_part.entropy_dt[i];
-    //density
-    p->density.wcount = h_cuda_part.wcount[i];
-    p->density.wcount_dh = h_cuda_part.wcount_dh[i];
-    p->density.rho_dh = h_cuda_part.rho_dh[i];
-    p->density.div_v = h_cuda_part.div_v[i];
-
-    //force
-    p->force.balsara = h_cuda_part.balsara[i];
-    p->force.f = h_cuda_part.f[i];
-    p->force.P_over_rho2 = h_cuda_part.P_over_rho2[i];
-    p->force.soundspeed = h_cuda_part.soundspeed[i];
-    p->force.v_sig = h_cuda_part.v_sig[i];
-    p->force.h_dt = h_cuda_part.h_dt[i];
-    
-    p->x[0] = h_cuda_part.x_x[i];
-    p->v[0] = h_cuda_part.v[i].x;
-    p->a_hydro[0] = h_cuda_part.a_hydro[i].x;
-    p->density.rot_v[0] = h_cuda_part.rot_v[i].x;
-#if DIM > 1
-    p->x[1] = h_cuda_part.x_y[i];
-    p->v[1] = h_cuda_part.v[i].y;
-    p->a_hydro[1] = h_cuda_part.a_hydro[i].y;
-    p->density.rot_v[1] = h_cuda_part.rot_v[i].y;
-#if DIM > 2
-    p->x[2] = h_cuda_part.x_z[i];
-    p->v[2] = h_cuda_part.v[i].z;
-    p->a_hydro[2] = h_cuda_part.a_hydro[i].z;
-    p->density.rot_v[2] = h_cuda_part.rot_v[i].z;
-#endif
-#endif
-
-    for (int j=0; j<NBRE_DIR; j++)
-      c->sort[j][i] = h_cuda_part.sort[j][i];
-  }
-  free_parts(h_cuda_part); free(h_cuda_cell); /* release temporary host copies */
-}
-
-
-/* n is both particles per axis and box size:
- * particles are generated on a mesh with unit spacing
- */
-struct cell *make_cell(size_t n, double *offset, double size, double h,
-                       double density, unsigned long long *partId,
-                       double pert) {
-  const size_t count = n * n * n;
-  const double volume = size * size * size;
-  struct cell *cell = (struct cell*)malloc(sizeof(struct cell));
-
-  bzero(cell, sizeof(struct cell));
-
-  if (posix_memalign((void **)&cell->parts, part_align,
-                     count * sizeof(struct part)) != 0) {
-    error("couldn't allocate particles, no. of particles: %d", (int)count);
-  }
-  bzero(cell->parts, count * sizeof(struct part));
-
-  /* Construct the parts */
-  struct part *part = cell->parts;
-  for (size_t x = 0; x < n; ++x) {
-    for (size_t y = 0; y < n; ++y) {
-      for (size_t z = 0; z < n; ++z) {
-        part->x[0] = offset[0] +
-	  size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
-        part->x[1] = offset[1] +
-	  size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
-        part->x[2] = offset[2] +
-	  size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
-
-	part->v[0] = random_uniform(-0.05, 0.05);
-        part->v[1] = random_uniform(-0.05, 0.05);
-        part->v[2] = random_uniform(-0.05, 0.05);
-        part->h = size * h / (float)n;
-        part->id = ++(*partId);
-        part->mass = density * volume / count;
-        part->time_bin = (int) random_uniform(1, 500);
-
-	part->density.wcount = 0.;
-	part->density.wcount_dh = 0.;
-	part->density.rho_dh = 0.;
-	part->density.rot_v[0] = 0.;
-	part->density.rot_v[1] = 0.;
-	part->density.rot_v[2] = 0.;
-	part->density.div_v = 0.;
-
-	part->force.balsara = 0.;
-	part->force.f = 0.;
-	part->force.P_over_rho2 = 0.;
-	part->force.soundspeed = 0.;
-	part->force.v_sig = 0.;
-	part->force.h_dt = 0.;
-
-        ++part;
-      }
-    }
-  }
-
-  /* Cell properties */
-  cell->split = 0;
-  cell->h_max = h;
-  cell->count = count;
-  //cell->dx_max_part = 0.;
-  //cell->dx_max_sort = 0.;
-  cell->width[0] = n;
-  cell->width[1] = n;
-  cell->width[2] = n;
-  cell->loc[0] = offset[0];
-  cell->loc[1] = offset[1];
-  cell->loc[2] = offset[2];
-
-  cell->ti_old_part = 8;
-  cell->ti_end_min = 8;
-  cell->ti_end_max = 8;
-
-  //shuffle_particles(cell->parts, cell->count);
-
-  return cell;
-}
-
-void clean_up(struct cell *ci) {
-  free(ci->parts);
-  //for (int k = 0; k < 13; k++)
-  //if (ci->sort[k] != NULL) free(ci->sort[k]);
-  free(ci);
-}
-
-/**
- * @brief Initializes all particle fields ready for a density calculation.
- */
-void zero_particle_fields(struct cell *c) {
-  for (int pid = 0; pid < c->count; pid++) {
-    struct part* p = &c->parts[pid];
-    p->rho = 0.f;
-
-    // density
-    p->density.wcount = 0.f;
-    p->density.wcount_dh = 0.f;
-    p->density.rho_dh = 0.f;
-    p->density.div_v = 0.f;
-    p->density.rot_v[0] = 0.f;
-    p->density.rot_v[1] = 0.f;
-    p->density.rot_v[2] = 0.f;
-
-    // force
-    p->force.balsara = 0.5f;
-    p->force.f = 0.9f;
-    p->force.P_over_rho2 = 1.2f;
-    p->force.soundspeed = 0.f;
-    p->force.v_sig = 0.f;
-    p->force.h_dt = 0.f;
-
-  }
-  for(int i=0; i<NBRE_DIR; i++) {
-    free(c->sort[i]);
-    c->sort[i] = NULL;
-  }
-}
-
-/**
- * @brief Dump all the particles to a file
- */
-void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
-  FILE *file = fopen(fileName, "w");
-
-  /* Write header */
-#ifdef FULL_COMP
-  fprintf(file,
-          "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s "
-          "%13s %13s %13s %13s %13s %13s %13s %13s\n",
-          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh",
-          "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz",
-	  "a_hydro.x","a_hydro.y","a_hydro.z","h_dt","entropy_dt");
-#else
-  fprintf(file,
-          "# %4s %10s %10s %10s %10s %10s %10s %13s\n",
-          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho");
-#endif
-
-  fprintf(file, "# ci --------------------------------------------\n");
-
-  for (int pid = 0; pid < ci->count; pid++) {
-#ifdef FULL_COMP
-    fprintf(file,
-            "%6llu %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
-            "%13e %13e %13e %13e %13e %13e %13e %13e %13e\n",
-            ci->parts[pid].id, ci->parts[pid].x[0], ci->parts[pid].x[1],
-            ci->parts[pid].x[2], ci->parts[pid].v[0], ci->parts[pid].v[1],
-            ci->parts[pid].v[2], ci->parts[pid].rho,
-            ci->parts[pid].density.rho_dh,
-            ci->parts[pid].density.wcount, ci->parts[pid].density.wcount_dh,
-            ci->parts[pid].density.div_v, ci->parts[pid].density.rot_v[0],
-            ci->parts[pid].density.rot_v[1], ci->parts[pid].density.rot_v[2],
-            ci->parts[pid].a_hydro[0], ci->parts[pid].a_hydro[1], ci->parts[pid].a_hydro[2],
-            ci->parts[pid].force.h_dt, ci->parts[pid].entropy_dt
-            );
-#else
-    fprintf(file,
-            "%6llu %10f %10f %10f %10f %10f %13e %13e\n",
-            ci->parts[pid].id, ci->parts[pid].x[0], ci->parts[pid].x[1],
-            ci->parts[pid].x[2], ci->parts[pid].v[0], ci->parts[pid].v[1],
-            ci->parts[pid].v[2], ci->parts[pid].rho
-            );
-    
-#endif
-  }
-
-  fprintf(file, "# cj --------------------------------------------\n");
-
-  for (int pjd = 0; pjd < cj->count; pjd++) {
-#ifdef FULL_COMP
-    fprintf(file,
-            "%6llu %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
-            "%13e %13e %13e %13e %13e %13e %13e %13e %13e\n",
-            cj->parts[pjd].id, cj->parts[pjd].x[0], cj->parts[pjd].x[1],
-            cj->parts[pjd].x[2], cj->parts[pjd].v[0], cj->parts[pjd].v[1],
-            cj->parts[pjd].v[2], cj->parts[pjd].rho,
-            cj->parts[pjd].density.rho_dh,
-            cj->parts[pjd].density.wcount, cj->parts[pjd].density.wcount_dh,
-            cj->parts[pjd].density.div_v, cj->parts[pjd].density.rot_v[0],
-            cj->parts[pjd].density.rot_v[1], cj->parts[pjd].density.rot_v[2],
-            cj->parts[pjd].a_hydro[0], cj->parts[pjd].a_hydro[1], cj->parts[pjd].a_hydro[2],
-            cj->parts[pjd].force.h_dt, cj->parts[pjd].entropy_dt
-            );
-#else
-    fprintf(file,
-            "%6llu %10f %10f %10f %10f %10f %13e %13e\n",
-            cj->parts[pjd].id, cj->parts[pjd].x[0], cj->parts[pjd].x[1],
-            cj->parts[pjd].x[2], cj->parts[pjd].v[0], cj->parts[pjd].v[1],
-            cj->parts[pjd].v[2], cj->parts[pjd].rho
-            );
-#endif
-  }
-  fclose(file);
-}
-
-int main(int argc, char *argv[]) {
-  cudaEvent_t gpu_start, gpu_stop;
-  cudaStream_t stream;
-  cudaEventCreate(&gpu_start);
-  cudaEventCreate(&gpu_stop);
-
-
-  float host_kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)];
-  for (int a = 0; a < (kernel_degree + 1) * (kernel_ivals + 1); a++) {
-    host_kernel_coeffs[a] = kernel_coeffs[a];
-  }
-  cudaErrCheck(cudaMemcpyToSymbol(
-    cuda_kernel_coeffs, &host_kernel_coeffs,
-    sizeof(float) * (kernel_degree + 1) * (kernel_ivals + 1)));
-
-  size_t particles = 0, runs = 0, volume, type = 0;
-  double offset[3] = {0, 0, 0}, h = 1.1255, size = 1., rho = 1.;
-  double perturbation = 0.;
-  struct cell *ci, *cj;
-
-  char c;
-  static unsigned long long partId = 0;
-  char outputFileNameExtension[200] = "";
-  char outputFileName[200] = "";
-
-  //  ticks tic, toc, time;
-
-  /* Initialize CPU frequency, this also starts time. */
-  //unsigned long long cpufreq = 0;
-  //clocks_set_cpufreq(cpufreq);
-
-  /* Choke on FP-exceptions */
-  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
-
-  srand(0);
-
-  while ((c = getopt(argc, argv, "h:p:r:t:d:f:")) != -1) {
-    switch (c) {
-      case 'h':
-        sscanf(optarg, "%lf", &h);
-        break;
-      case 'p':
-        sscanf(optarg, "%zu", &particles);
-        break;
-      case 'r':
-        sscanf(optarg, "%zu", &runs);
-        break;
-      case 't':
-        sscanf(optarg, "%zu", &type);
-        break;
-      case 'd':
-        sscanf(optarg, "%lf", &perturbation);
-        break;
-      case 'f':
-        strcpy(outputFileNameExtension, optarg);
-        break;
-      case '?':
-        error("Unknown option.");
-        break;
-    }
-  }
-
-  if (h < 0 || particles == 0 || runs == 0 || type > 2) {
-    printf(
-        "\nUsage: %s -p PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n"
-        "\nGenerates a cell pair, filled with particles on a Cartesian grid."
-        "\nThese are then interacted using runner_dopair1_density."
-        "\n\nOptions:"
-        "\n-t TYPE=0          - cells share face (0), edge (1) or corner (2)"
-        "\n-h DISTANCE=1.1255 - smoothing length"
-        "\n-d pert            - perturbation to apply to the particles [0,1["
-        "\n-f fileName        - part of the file name used to save the dumps\n",
-        argv[0]);
-    exit(1);
-  }
-
-  volume = particles * particles * particles;
-  message("particles: %zu B\npositions: 0 B", 2 * volume * sizeof(struct part));
-
-  ci = make_cell(particles, offset, size, h, rho, &partId, perturbation);
-  for (size_t i = 0; i < type + 1; ++i) offset[i] = 1.;
-  cj = make_cell(particles, offset, size, h, rho, &partId, perturbation);
-  int dir;
-  int ind;
-  int half = particles / 2;
-  switch(type) {
-    case 2:
-      ind = volume - 1;
-      dir = 0;
-      break;
-    case 1:
-      ind = (particles-1)*particles*particles + (particles-1)*particles + half;
-      dir = 1;
-      break;
-    case 0:
-      ind = (particles-1)*particles*particles + half*particles + half;
-      dir = 4;
-      break;
-  }
-
-  sprintf(outputFileName, "swift_gpu_init_%s.dat", outputFileNameExtension);
-  dump_particle_fields(outputFileName, ci, cj);
-
-  allocate_device_parts(2*volume);
-
-  struct cell_cuda *cuda_ci;
-  struct cell_cuda *cuda_cj;
-  
-  //  time = 0;
-  for (size_t i = 0; i < runs; ++i) {
-    /* Zero the fields */
-    zero_particle_fields(ci);
-    zero_particle_fields(cj);
-
-    do_sort(ci);
-    do_sort(cj);
-
-    cuda_ci = copy_from_host(ci, 0);
-    cuda_cj = copy_from_host(cj, ci->count);
-    //exit(1);
-    //tic = getticks();
-
-    /* Run the test */
-    
-    //cudaEventRecord(gpu_start,stream);
-    do_test_pair_density<<<1,CUDA_THREADS>>>(cuda_ci, cuda_cj);
-
-    copy_to_host(cuda_ci, ci);
-    copy_to_host(cuda_cj, cj);
-    float rho = ci->parts[ind].rho;
-
-    zero_particle_fields(ci);
-    zero_particle_fields(cj);
-
-    do_sort(ci);
-    do_sort(cj);
-
-    cuda_ci = copy_from_host(ci, 0);
-    cuda_cj = copy_from_host(cj, ci->count);
-
-    do_test_pair_density_sorted<<<1,CUDA_THREADS>>>(cuda_ci, cuda_cj, dir);
-    //cudaEventRecord(gpu_stop,0);
-    cudaErrCheck( cudaPeekAtLastError() );
-    cudaErrCheck( cudaDeviceSynchronize() );
-    //cudaEventSynchronize(gpu_stop);
-    //float gpu_time;
-    //cudaEventElapsedTime(&gpu_time, gpu_start, gpu_stop);
-    //printf("%g\n", gpu_time);
-    //do_test_self_density<<<1,CUDA_THREADS>>>(cuda_ci);
-    //do_test_self_density_symmetric<<<1,CUDA_THREADS>>>(cuda_ci);
-    //cudaErrCheck( cudaPeekAtLastError() );
-    //cudaErrCheck( cudaDeviceSynchronize() );
-
-    // do_test_pair_force<<<1,CUDA_THREADS>>>(cuda_ci, cuda_cj);
-    // cudaErrCheck( cudaPeekAtLastError() );
-    // cudaErrCheck( cudaDeviceSynchronize() );
-    
-    // do_test_self_force<<<1,CUDA_THREADS>>>(cuda_ci);
-    // cudaErrCheck( cudaPeekAtLastError() );
-    // cudaErrCheck( cudaDeviceSynchronize() );
-    //toc = getticks();
-    //time += toc - tic;
-
-    copy_to_host(cuda_ci, ci);
-    copy_to_host(cuda_cj, cj);
-    rho -= ci->parts[ind].rho;
-    printf("diff %g value %g\n", rho, ci->parts[ind].rho);
-    /* Dump if necessary */
-    if (i % 50 == 0) {
-      sprintf(outputFileName, "swift_gpu_%s.dat", outputFileNameExtension);
-      dump_particle_fields(outputFileName, ci, cj);
-    }
-  }
-
-  printf("rho %g\n", ci->parts[0].rho);
-  /* Output timing */
-//  message("SWIFT calculation took       %lli ticks.", time / runs);
-
-  /* Now perform a brute-force version for accuracy tests */
-
-  /* Zero the fields */
-  //zero_particle_fields(ci);
-  //zero_particle_fields(cj);
-
-//  tic = getticks();
-
-  /* Run the brute-force test */
-  //pairs_all_density(&runner, ci, cj);
-
-//  toc = getticks();
-
-  /* Dump */
-  sprintf(outputFileName, "swift_gpu_end_%s.dat", outputFileNameExtension);
-  dump_particle_fields(outputFileName, ci, cj);
-
-  /* Output timing */
-//  message("Brute force calculation took %lli ticks.", toc - tic);
-
-  /* Clean things to make the sanitizer happy ... */
-  clean_up(ci);
-  clean_up(cj);
-
-  return 0;
-}
diff --git a/tests/testcuda.cu b/tests/testcuda.cu
index 861cdd3a864cb19732d16cce8e4febeaafb92ed8..4bc90c169707475d3522048b624c239231fa65f7 100644
--- a/tests/testcuda.cu
+++ b/tests/testcuda.cu
@@ -42,6 +42,7 @@ extern "C" {
 }
 
 #include <cuda.h>
+#include <cuda_runtime_api.h>
 
 
 /* Host function to check cuda functions don't return errors */
@@ -342,6 +343,245 @@ __device__ void dopair_density(struct cell_cuda *ci, struct cell_cuda *cj) {
   }  // Loop over ci.
 }
 
+__device__ void dopair_density_sorted(struct cell_cuda *ci, struct cell_cuda *cj,
+				      const int sid) {
+
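+  /* This kernel interacts the particles of ci with those of cj (density only,
+   * no symmetry) using the lists sorted along direction sid: walking the
+   * particles in sorted order lets both loops stop early once the remaining
+   * particles are guaranteed to be out of kernel range. */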
+  /* Pick-out the sorted lists. */
+  const long long int f_i = ci->first_part;
+  const long long int f_j = cj->first_part;
+  const struct entry *sort_i = &cuda_parts.sort[sid][f_i];
+  const struct entry *sort_j = &cuda_parts.sort[sid][f_j];
+
+  /* Get some other useful values. */
+  const double hi_max = ci->h_max * kernel_gamma;
+  const double hj_max = cj->h_max * kernel_gamma;
+  const int count_i = ci->part_count;
+  const int count_j = cj->part_count;
+  const double dj_min = sort_j[0].d;
+  const double di_max = sort_i[count_i - 1].d;
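+  /* dj_min is the smallest sorted position in cj and di_max the largest in ci;
+   * they are used below to skip particles that cannot reach the other cell. */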
+  
+  float rho, rho_dh, div_v, wcount, wcount_dh;
+  float3 rot_v;
+
+  if (cuda_cell_is_active(ci)) {
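+    /* First half of the pair: the active parts of ci gather density from cj. */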
+
+    /* Loop over the parts in ci. */
+    for (int pid = count_i - 1 - threadIdx.x;
+         pid >= 0 && sort_i[pid].d + hi_max > dj_min; pid -= blockDim.x) {
+      /* Get a hold of the ith part in ci. */
+      const int pi = sort_i[pid].i + f_i;
+      if (!cuda_part_is_active(pi)) continue;
+      const float hi = cuda_parts.h[pi];
+      const double di = sort_i[pid].d + hi * kernel_gamma;
+      if (di < dj_min) continue;
+      
+      double pix[3], piv[3];
+      pix[0] = cuda_parts.x_x[pi];
+      pix[1] = cuda_parts.x_y[pi];
+      pix[2] = cuda_parts.x_z[pi];
+      
+      piv[0] = cuda_parts.v[pi].x;
+      piv[1] = cuda_parts.v[pi].y;
+      piv[2] = cuda_parts.v[pi].z;
+
+      const float hig2 = hi * hi * kernel_gamma2;
+
+      rho = 0.0f;
+      rho_dh = 0.0f;
+      div_v = 0.0f;
+      wcount = 0.0f;
+      wcount_dh = 0.0f;
+      rot_v.x = 0.0f;
+      rot_v.y = 0.0f;
+      rot_v.z = 0.0f;
+
+
+      /* Loop over the parts in cj. */
+      for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+        /* Get the index of the jth particle. */
+        const int pj = sort_j[pjd].i + f_j;
+
+        /* Compute the pairwise distance. */
+        float r2 = 0.0f;
+        float dx[3];
+	dx[0] = pix[0] - cuda_parts.x_x[pj];
+	dx[1] = pix[1] - cuda_parts.x_y[pj];
+	dx[2] = pix[2] - cuda_parts.x_z[pj];
+	for (int k = 0; k < 3; k++) {
+	  r2 += dx[k] * dx[k];
+	}
+	
+        /* Hit or miss? */
+        if (r2 < hig2) {
+	  
+	  float wi, wi_dx;
+	  float dv[3], curlvr[3];
+	    
+	  /* Get the masses. */
+	  const float mj = cuda_parts.mass[pj];
+	    
+	  /* Get r and r inverse. */
+	  const float r = sqrtf(r2);
+	  const float r_inv = 1.0f / r;
+
+	  /* Compute the kernel function */
+	  const float hi_inv = 1.0f / hi;
+	  const float ui = r * hi_inv;
+	  cuda_kernel_deval(ui, &wi, &wi_dx);
+
+	  /* Compute contribution to the density */
+	  rho += mj * wi;
+	  rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx);
+
+	  /* Compute contribution to the number of neighbours */
+	  wcount += wi;
+	  wcount_dh -= (hydro_dimension * wi + ui * wi_dx);
+
+	  const float fac = mj * wi_dx * r_inv;
+
+	  /* Compute dv dot r */
+	  dv[0] = piv[0] - cuda_parts.v[pj].x;
+	  dv[1] = piv[1] - cuda_parts.v[pj].y;
+	  dv[2] = piv[2] - cuda_parts.v[pj].z;
+	  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
+	  div_v -= fac * dvdr;
+
+	  /* Compute dv cross r */
+	  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
+	  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
+	  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
+	  
+	  rot_v.x += fac * curlvr[0];
+	  rot_v.y += fac * curlvr[1];
+	  rot_v.z += fac * curlvr[2];
+	}
+
+      }
+      /* Write the accumulated values back to the particle we interacted, pi. */
+      const int pid_write = pi;
+      atomicAdd(&cuda_parts.rho[pid_write], rho);
+      atomicAdd(&cuda_parts.rho_dh[pid_write], rho_dh);
+      atomicAdd(&cuda_parts.wcount[pid_write], wcount);
+      atomicAdd(&cuda_parts.wcount_dh[pid_write], wcount_dh);
+      atomicAdd(&cuda_parts.div_v[pid_write], div_v);
+      atomicAdd(&cuda_parts.rot_v[pid_write].x, rot_v.x);
+      atomicAdd(&cuda_parts.rot_v[pid_write].y, rot_v.y);
+      atomicAdd(&cuda_parts.rot_v[pid_write].z, rot_v.z);
+    }
+    
+  }
+
+  if (cuda_cell_is_active(cj)) {
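+    /* Second half of the pair: the active parts of cj gather density from ci. */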
+
+    /* Loop over the parts in cj. */
+    for (int pjd = threadIdx.x; pjd < count_j && sort_j[pjd].d - hj_max < di_max; pjd += blockDim.x) {
+      /* Get a hold of the jth part in cj. */
+      const int pj = sort_j[pjd].i + f_j;
+      if (!cuda_part_is_active(pj)) continue;
+      const float hj = cuda_parts.h[pj];
+      const double dj = sort_j[pjd].d - hj * kernel_gamma;
+      if (dj > di_max) continue;
+      
+      double pjx[3], pjv[3];
+      pjx[0] = cuda_parts.x_x[pj];
+      pjx[1] = cuda_parts.x_y[pj];
+      pjx[2] = cuda_parts.x_z[pj];
+      
+      pjv[0] = cuda_parts.v[pj].x;
+      pjv[1] = cuda_parts.v[pj].y;
+      pjv[2] = cuda_parts.v[pj].z;
+
+      const float hjg2 = hj * hj * kernel_gamma2;
+
+      rho = 0.0f;
+      rho_dh = 0.0f;
+      div_v = 0.0f;
+      wcount = 0.0f;
+      wcount_dh = 0.0f;
+      rot_v.x = 0.0f;
+      rot_v.y = 0.0f;
+      rot_v.z = 0.0f;
+
+
+      /* Loop over the parts in ci. */
+      for (int pid = count_i - 1;
+	   pid >= 0 && sort_i[pid].d > dj; pid--) {
+        /* Get the index of the ith particle. */
+        const int pi = sort_i[pid].i + f_i;
+
+        /* Compute the pairwise distance. */
+        float r2 = 0.0f;
+        float dx[3];
+	dx[0] = pjx[0] - cuda_parts.x_x[pi];
+	dx[1] = pjx[1] - cuda_parts.x_y[pi];
+	dx[2] = pjx[2] - cuda_parts.x_z[pi];
+	for (int k = 0; k < 3; k++) {
+	  r2 += dx[k] * dx[k];
+	}
+	
+        /* Hit or miss? */
+        if (r2 < hjg2) {
+	  
+	  float wj, wj_dx;
+	  float dv[3], curlvr[3];
+	    
+	  /* Get the masses. */
+	  const float mi = cuda_parts.mass[pi];
+	    
+	  /* Get r and r inverse. */
+	  const float r = sqrtf(r2);
+	  const float r_inv = 1.0f / r;
+
+	  /* Compute the kernel function */
+	  const float hj_inv = 1.0f / hj;
+	  const float uj = r * hj_inv;
+	  cuda_kernel_deval(uj, &wj, &wj_dx);
+
+	  /* Compute contribution to the density */
+	  rho += mi * wj;
+	  rho_dh -= mi * (hydro_dimension * wj + uj * wj_dx);
+
+	  /* Compute contribution to the number of neighbours */
+	  wcount += wj;
+	  wcount_dh -= (hydro_dimension * wj + uj * wj_dx);
+
+	  const float fac = mi * wj_dx * r_inv;
+
+	  /* Compute dv dot r */
+	  dv[0] = pjv[0] - cuda_parts.v[pi].x;
+	  dv[1] = pjv[1] - cuda_parts.v[pi].y;
+	  dv[2] = pjv[2] - cuda_parts.v[pi].z;
+	  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
+	  div_v -= fac * dvdr;
+
+	  /* Compute dv cross r */
+	  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
+	  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
+	  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
+	  
+	  rot_v.x += fac * curlvr[0];
+	  rot_v.y += fac * curlvr[1];
+	  rot_v.z += fac * curlvr[2];
+	}
+
+      }
+      /* Write the accumulated values back to the particle we interacted, pj. */
+      const int pjd_write = pj;
+      atomicAdd(&cuda_parts.rho[pjd_write], rho);
+      atomicAdd(&cuda_parts.rho_dh[pjd_write], rho_dh);
+      atomicAdd(&cuda_parts.wcount[pjd_write], wcount);
+      atomicAdd(&cuda_parts.wcount_dh[pjd_write], wcount_dh);
+      atomicAdd(&cuda_parts.div_v[pjd_write], div_v);
+      atomicAdd(&cuda_parts.rot_v[pjd_write].x, rot_v.x);
+      atomicAdd(&cuda_parts.rot_v[pjd_write].y, rot_v.y);
+      atomicAdd(&cuda_parts.rot_v[pjd_write].z, rot_v.z);
+    }
+    
+  }
+
+
+}
+
 /* Task function to perform a self force task. No symmetry */
 __device__ void doself_force(struct cell_cuda *ci) {
   /* Is the cell active? */
@@ -604,6 +844,165 @@ __device__ void doself_density(struct cell_cuda *ci) {
   }
 }
 
+/* Task function to execute a self-density task, exploiting pairwise symmetry. */
+__device__ void doself_density_symmetric(struct cell_cuda *ci) {
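+  /* Each thread strides over the particles pid of ci; the inner loop starts at
+   * pid + 1 so every unordered pair is visited exactly once, and the
+   * contributions are written to both particles with atomics. */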
+
+  /* Is the cell active? */
+  if (!cuda_cell_is_active(ci)) {
+    printf(
+        "Cell isn't active..., ti_end_min=%i, ti_current=%i, "
+        "max_active_bin=%i\n",
+        ci->ti_end_min, ti_current, max_active_bin);
+    return;
+  }
+
+  const int count_i = ci->part_count;
+  int part_i = ci->first_part;
+  float rho, rho_dh, div_v, wcount, wcount_dh;
+  float3 rot_v;
+
+  for (int pid = part_i + threadIdx.x; pid < part_i + count_i;
+       pid += blockDim.x) {
+    double pix[3];
+    pix[0] = cuda_parts.x_x[pid];
+    pix[1] = cuda_parts.x_y[pid];
+    pix[2] = cuda_parts.x_z[pid];
+    const float hi = cuda_parts.h[pid];
+    const float hig2 = hi * hi * kernel_gamma2;
+
+    /* Reset local values. */
+    rho = 0.0f;
+    rho_dh = 0.0f;
+    div_v = 0.0f;
+    wcount = 0.0f;
+    wcount_dh = 0.0f;
+    rot_v.x = 0.0f;
+    rot_v.y = 0.0f;
+    rot_v.z = 0.0f;
+
+    /* Search for the neighbours! */
+    for (int pjd = pid + 1; pjd < part_i + count_i; pjd++) {
+      /* Particles don't interact with themselves */
+      float dx[3], r2 = 0.0f;
+      dx[0] = pix[0] - cuda_parts.x_x[pjd];
+      r2 += dx[0] * dx[0];
+      dx[1] = pix[1] - cuda_parts.x_y[pjd];
+      r2 += dx[1] * dx[1];
+      dx[2] = pix[2] - cuda_parts.x_z[pjd];
+      r2 += dx[2] * dx[2];
+
+      const float hj = cuda_parts.h[pjd];
+      const float hjg2 = hj * hj * kernel_gamma2;
+      /* If in range then interact. */
+      if (r2 < hig2 || r2 < hjg2) {
+        float w, dw_dx;
+        float dv[3], curlvr[3];
+        /* Get r and 1/r */
+        const float r = sqrtf(r2);
+        const float ri = 1.0f / r;
+
+	if (r2 < hig2 && cuda_part_is_active(pid)) {
+	  /* Load mass on particle pj. */
+	  const float mj = cuda_parts.mass[pjd];
+
+
+	  /* Compute the kernel function */
+	  const float hi_inv = 1.0f / hi;
+	  const float ui = r * hi_inv;
+
+	  cuda_kernel_deval(ui, &w, &dw_dx);
+	  /* Compute contribution to the density. */
+	  rho = mj * w;
+	  rho_dh = - mj * (hydro_dimension * w + ui * dw_dx);
+
+	  /* Compute contribution to the number of neighbours */
+	  wcount = w;
+	  wcount_dh = -(hydro_dimension * w + ui * dw_dx);
+
+	  const float fac = mj * dw_dx * ri;
+
+	  /* Compute dv dot r */
+	  float3 piv, pjv;
+	  piv = cuda_parts.v[pid];
+	  pjv = cuda_parts.v[pjd];
+	  dv[0] = piv.x - pjv.x;
+	  dv[1] = piv.y - pjv.y;
+	  dv[2] = piv.z - pjv.z;
+	  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
+
+	  div_v = - fac * dvdr;
+
+	  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
+	  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
+	  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
+	  
+	  rot_v.x = fac * curlvr[0];
+	  rot_v.y = fac * curlvr[1];
+	  rot_v.z = fac * curlvr[2];
+
+	  atomicAdd(&cuda_parts.rho[pid], rho);
+	  atomicAdd(&cuda_parts.rho_dh[pid], rho_dh);
+	  atomicAdd(&cuda_parts.wcount[pid], wcount);
+	  atomicAdd(&cuda_parts.wcount_dh[pid], wcount_dh);
+	  atomicAdd(&cuda_parts.div_v[pid], div_v);
+	  atomicAdd(&cuda_parts.rot_v[pid].x, rot_v.x);
+	  atomicAdd(&cuda_parts.rot_v[pid].y, rot_v.y);
+	  atomicAdd(&cuda_parts.rot_v[pid].z, rot_v.z);
+	}
+	
+	if (r2 < hjg2 && cuda_part_is_active(pjd)) {
+	  /* Load mass on particle pj. */
+	  const float mi = cuda_parts.mass[pid];
+
+
+	  /* Compute the kernel function */
+	  const float hj_inv = 1.0f / hj;
+	  const float uj = r * hj_inv;
+
+	  cuda_kernel_deval(uj, &w, &dw_dx);
+	  /* Compute contribution to the density. */
+	  rho = mi * w;
+	  rho_dh = - mi * (hydro_dimension * w + uj * dw_dx);
+
+	  /* Compute contribution to the number of neighbours */
+	  wcount = w;
+	  wcount_dh = -(hydro_dimension * w + uj * dw_dx);
+
+	  const float fac = mi * dw_dx * ri;
+
+	  /* Compute dv dot r */
+	  float3 piv, pjv;
+	  piv = cuda_parts.v[pid];
+	  pjv = cuda_parts.v[pjd];
+	  dv[0] = pjv.x - piv.x;
+	  dv[1] = pjv.y - piv.y;
+	  dv[2] = pjv.z - piv.z;
+	  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
+
+	  div_v = - fac * dvdr;
+
+	  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
+	  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
+	  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
+	  
+	  rot_v.x = fac * curlvr[0];
+	  rot_v.y = fac * curlvr[1];
+	  rot_v.z = fac * curlvr[2];
+
+	  atomicAdd(&cuda_parts.rho[pjd], rho);
+	  atomicAdd(&cuda_parts.rho_dh[pjd], rho_dh);
+	  atomicAdd(&cuda_parts.wcount[pjd], wcount);
+	  atomicAdd(&cuda_parts.wcount_dh[pjd], wcount_dh);
+	  atomicAdd(&cuda_parts.div_v[pjd], div_v);
+	  atomicAdd(&cuda_parts.rot_v[pjd].x, rot_v.x);
+	  atomicAdd(&cuda_parts.rot_v[pjd].y, rot_v.y);
+	  atomicAdd(&cuda_parts.rot_v[pjd].z, rot_v.z);
+	}
+      }
+    }  /* Loop over the other parts of ci; results were written via atomics above. */
+  }
+}
 
 __global__ void do_test_self_density(struct cell_cuda *ci) {
 
@@ -621,6 +1020,16 @@ __global__ void do_test_pair_density(struct cell_cuda *ci, struct cell_cuda *cj)
   dopair_density(ci,cj);
   dopair_density(cj,ci);
 
+}
+__global__ void do_test_pair_density_sorted(struct cell_cuda *ci, struct cell_cuda *cj, const int sid) {
+
+  dopair_density_sorted(ci, cj, sid);
+
+}
+__global__ void do_test_self_density_symmetric(struct cell_cuda *ci) {
+
+  doself_density_symmetric(ci);
+
 }
 
 __global__ void do_test_pair_force(struct cell_cuda *ci, struct cell_cuda *cj) {
@@ -656,7 +1065,10 @@ void allocate_parts(struct particle_arrays *p, size_t num_part) {
   p->soundspeed = (float*) malloc(sizeof(float) * num_part);
   p->v_sig = (float*) malloc(sizeof(float) * num_part);
   p->h_dt = (float*) malloc(sizeof(float) * num_part);
-  
+
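+  /* One sorted-index list per sort direction. */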
+  for (int i=0; i<NBRE_DIR; i++)
+    p->sort[i] = (struct entry *)malloc(sizeof(struct entry)*(num_part + 1));
+
   p->time_bin = (timebin_t*) malloc(sizeof(timebin_t) * num_part);
 }
 
@@ -692,6 +1104,9 @@ void allocate_device_parts(size_t num_part) {
 
   cudaErrCheck(cudaMalloc(&c_parts.time_bin, sizeof(timebin_t) * num_part));
 
+  for (int i=0; i<NBRE_DIR; i++)
+    cudaErrCheck(cudaMalloc(&c_parts.sort[i], sizeof(struct entry) * num_part));
+
   cudaErrCheck(cudaMemcpyToSymbol(cuda_parts, &c_parts, sizeof(struct particle_arrays)));
 }
 
@@ -758,11 +1173,12 @@ void free_device_parts() {
   cudaErrCheck(cudaFree((void*)parts.v_sig));
   cudaErrCheck(cudaFree(parts.h_dt));
 
+  for (int i=0; i<NBRE_DIR; i++)
+    cudaErrCheck(cudaFree(parts.sort[i]));
   
   cudaErrCheck(cudaFree(parts.time_bin));
 }
 
-
 void copy_to_device_array(struct cell *ci, int offset) {
 
   struct particle_arrays h_p;
@@ -807,6 +1223,11 @@ void copy_to_device_array(struct cell *ci, int offset) {
     h_p.v_sig[i] = p.force.v_sig;
     h_p.h_dt[i] = p.force.h_dt;
     
+    for (int j=0; j<NBRE_DIR; j++) {
+      h_p.sort[j][i].d = ci->sort[j][i].d;
+      h_p.sort[j][i].i = ci->sort[j][i].i;
+    }
+    
     h_p.time_bin[i] = p.time_bin;
   }
 
@@ -882,6 +1303,10 @@ void copy_to_device_array(struct cell *ci, int offset) {
   p_data = c_parts.h_dt + offset;
   cudaErrCheck(cudaMemcpy(p_data, h_p.h_dt, sizeof(float) * num_part, cudaMemcpyHostToDevice));
 
+  for (int j=0; j<NBRE_DIR; j++) {
+    p_data = c_parts.sort[j] + offset;
+    cudaErrCheck(cudaMemcpy(p_data, h_p.sort[j], sizeof(struct entry) * num_part, cudaMemcpyHostToDevice));
+  }
   
   p_data = c_parts.time_bin + offset;
   cudaErrCheck(cudaMemcpy(p_data, h_p.time_bin,sizeof(timebin_t) * num_part, cudaMemcpyHostToDevice));
@@ -964,6 +1389,10 @@ void copy_from_device_array(struct particle_arrays *h_p, int offset, size_t num_
   p_data = c_parts.h_dt + offset;
   cudaErrCheck(cudaMemcpy(h_p->h_dt, p_data, sizeof(float) * num_part, cudaMemcpyDeviceToHost));
   
+  for (int j=0; j<NBRE_DIR; j++) {
+    p_data = c_parts.sort[j] + offset;
+    cudaErrCheck(cudaMemcpy(h_p->sort[j], p_data, sizeof(struct entry) * num_part, cudaMemcpyDeviceToHost));
+  }
 
   p_data = c_parts.time_bin + offset;
   cudaErrCheck(cudaMemcpy(h_p->time_bin, p_data, sizeof(timebin_t) * num_part, cudaMemcpyDeviceToHost));
@@ -1018,43 +1447,44 @@ void copy_to_host(struct cell_cuda *cuda_c, struct cell *c) {
   for (int i=0; i<N; i++) {
     struct part *p = &c->parts[i];
 
-    p->id = h_cuda_part.id[p_i];
-    p->h = h_cuda_part.h[p_i];
-    p->rho = h_cuda_part.rho[p_i];
-    p->entropy = h_cuda_part.entropy[p_i];
-    p->entropy_dt = h_cuda_part.entropy_dt[p_i];
+    p->id = h_cuda_part.id[i];
+    p->h = h_cuda_part.h[i];
+    p->rho = h_cuda_part.rho[i];
+    p->entropy = h_cuda_part.entropy[i];
+    p->entropy_dt = h_cuda_part.entropy_dt[i];
     //density
-    p->density.wcount = h_cuda_part.wcount[p_i];
-    p->density.wcount_dh = h_cuda_part.wcount_dh[p_i];
-    p->density.rho_dh = h_cuda_part.rho_dh[p_i];
-    p->density.div_v = h_cuda_part.div_v[p_i];
+    p->density.wcount = h_cuda_part.wcount[i];
+    p->density.wcount_dh = h_cuda_part.wcount_dh[i];
+    p->density.rho_dh = h_cuda_part.rho_dh[i];
+    p->density.div_v = h_cuda_part.div_v[i];
 
     //force
-    p->force.balsara = h_cuda_part.balsara[p_i];
-    p->force.f = h_cuda_part.f[p_i];
-    p->force.P_over_rho2 = h_cuda_part.P_over_rho2[p_i];
-    p->force.soundspeed = h_cuda_part.soundspeed[p_i];
-    p->force.v_sig = h_cuda_part.v_sig[p_i];
-    p->force.h_dt = h_cuda_part.h_dt[p_i];
+    p->force.balsara = h_cuda_part.balsara[i];
+    p->force.f = h_cuda_part.f[i];
+    p->force.P_over_rho2 = h_cuda_part.P_over_rho2[i];
+    p->force.soundspeed = h_cuda_part.soundspeed[i];
+    p->force.v_sig = h_cuda_part.v_sig[i];
+    p->force.h_dt = h_cuda_part.h_dt[i];
     
-    p->x[0] = h_cuda_part.x_x[p_i];
-    p->v[0] = h_cuda_part.v[p_i].x;
-    p->a_hydro[0] = h_cuda_part.a_hydro[p_i].x;
-    p->density.rot_v[0] = h_cuda_part.rot_v[p_i].x;
+    p->x[0] = h_cuda_part.x_x[i];
+    p->v[0] = h_cuda_part.v[i].x;
+    p->a_hydro[0] = h_cuda_part.a_hydro[i].x;
+    p->density.rot_v[0] = h_cuda_part.rot_v[i].x;
 #if DIM > 1
-    p->x[1] = h_cuda_part.x_y[p_i];
-    p->v[1] = h_cuda_part.v[p_i].y;
-    p->a_hydro[1] = h_cuda_part.a_hydro[p_i].y;
-    p->density.rot_v[1] = h_cuda_part.rot_v[p_i].y;
+    p->x[1] = h_cuda_part.x_y[i];
+    p->v[1] = h_cuda_part.v[i].y;
+    p->a_hydro[1] = h_cuda_part.a_hydro[i].y;
+    p->density.rot_v[1] = h_cuda_part.rot_v[i].y;
 #if DIM > 2
-    p->x[2] = h_cuda_part.x_z[p_i];
-    p->v[2] = h_cuda_part.v[p_i].z;
-    p->a_hydro[2] = h_cuda_part.a_hydro[p_i].z;
-    p->density.rot_v[2] = h_cuda_part.rot_v[p_i].z;
+    p->x[2] = h_cuda_part.x_z[i];
+    p->v[2] = h_cuda_part.v[i].z;
+    p->a_hydro[2] = h_cuda_part.a_hydro[i].z;
+    p->density.rot_v[2] = h_cuda_part.rot_v[i].z;
 #endif
 #endif
 
-    p_i++;
+    for (int j=0; j<NBRE_DIR; j++)
+      c->sort[j][i] = h_cuda_part.sort[j][i];
   }
 }
 
@@ -1121,8 +1551,8 @@ struct cell *make_cell(size_t n, double *offset, double size, double h,
   cell->split = 0;
   cell->h_max = h;
   cell->count = count;
-  cell->dx_max_part = 0.;
-  cell->dx_max_sort = 0.;
+  //cell->dx_max_part = 0.;
+  //cell->dx_max_sort = 0.;
   cell->width[0] = n;
   cell->width[1] = n;
   cell->width[2] = n;
@@ -1171,6 +1601,10 @@ void zero_particle_fields(struct cell *c) {
     p->force.v_sig = 0.f;
     p->force.h_dt = 0.f;
   }
+  for (int i = 0; i < NBRE_DIR; i++) {
+    free(c->sort[i]);
+    c->sort[i] = NULL;
+  }
 }
 
 /**
@@ -1252,14 +1686,10 @@ void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
 
 
 int main(int argc, char *argv[]) {
-
-  bool runPair = true;
-  int tmp = 8;
-  cudaErrCheck(
-      cudaMemcpyToSymbol(ti_current, &tmp, sizeof(int)));
-  tmp = 1000;
-  cudaErrCheck(
-      cudaMemcpyToSymbol(max_active_bin, &tmp, sizeof(int)));
+  cudaEvent_t gpu_start, gpu_stop;
+  cudaStream_t stream;
+  cudaEventCreate(&gpu_start);
+  cudaEventCreate(&gpu_stop);
 
   float host_kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)];
   for (int a = 0; a < (kernel_degree + 1) * (kernel_ivals + 1); a++) {
@@ -1337,6 +1767,24 @@ int main(int argc, char *argv[]) {
   for (size_t i = 0; i < type + 1; ++i) offset[i] = 1.;
   cj = make_cell(particles, offset, size, h, rho, &partId, perturbation);
 
+  int dir;
+  int ind;
+  int half = particles / 2;
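+  /* Pick a reference particle of ci adjacent to cj (ind) and the sort
+   * direction used for the sorted pair kernel (dir); the loop below compares
+   * the density of that particle between the unsorted and sorted kernels. */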
+  switch(type) {
+    case 2:
+      ind = volume - 1;
+      dir = 0;
+      break;
+    case 1:
+      ind = (particles-1)*particles*particles + (particles-1)*particles + half;
+      dir = 1;
+      break;
+    case 0:
+      ind = (particles-1)*particles*particles + half*particles + half;
+      dir = 4;
+      break;
+  }
+
   sprintf(outputFileName, "swift_gpu_init_%s.dat", outputFileNameExtension);
   dump_particle_fields(outputFileName, ci, cj);
 
@@ -1349,43 +1797,67 @@ int main(int argc, char *argv[]) {
   for (size_t i = 0; i < runs; ++i) {
     /* Zero the fields */
     zero_particle_fields(ci);
-    if (runPair)
-      zero_particle_fields(cj);
-    
+    zero_particle_fields(cj);
+
+    do_sort(ci);
+    do_sort(cj);
+
     cuda_ci = copy_from_host(ci, 0);
-    if (runPair)
-      cuda_cj = copy_from_host(cj, ci->count);
+    cuda_cj = copy_from_host(cj, ci->count);
     //exit(1);
     //tic = getticks();
 
     /* Run the test */
-    if (runPair) {
-      do_test_pair_density<<<1,CUDA_THREADS>>>(cuda_ci, cuda_cj);
-      cudaErrCheck( cudaPeekAtLastError() );
-      cudaErrCheck( cudaDeviceSynchronize() );
-    }
+    
+    //cudaEventRecord(gpu_start,stream);
+    do_test_pair_density<<<1,CUDA_THREADS>>>(cuda_ci, cuda_cj);
+
+    copy_to_host(cuda_ci, ci);
+    copy_to_host(cuda_cj, cj);
+    float rho = ci->parts[ind].rho;
+
+    zero_particle_fields(ci);
+    zero_particle_fields(cj);
+
+    do_sort(ci);
+    do_sort(cj);
+
+    cuda_ci = copy_from_host(ci, 0);
+    cuda_cj = copy_from_host(cj, ci->count);
+
+    do_test_pair_density_sorted<<<1,CUDA_THREADS>>>(cuda_ci, cuda_cj, dir);
+    //cudaEventRecord(gpu_stop,0);
+    cudaErrCheck( cudaPeekAtLastError() );
+    cudaErrCheck( cudaDeviceSynchronize() );
+    //cudaEventSynchronize(gpu_stop);
+    //float gpu_time;
+    //cudaEventElapsedTime(&gpu_time, gpu_start, gpu_stop);
+    //printf("%g\n", gpu_time);
+    copy_to_host(cuda_ci, ci);
+    copy_to_host(cuda_cj, cj);
+    rho -= ci->parts[ind].rho;
+    printf("diff %g value %g\n", rho, ci->parts[ind].rho);
 
     do_test_self_density<<<1,CUDA_THREADS>>>(cuda_ci);
-    //do_test_self_density_symmetric<<<1,CUDA_THREADS>>>(cuda_ci);
     cudaErrCheck( cudaPeekAtLastError() );
     cudaErrCheck( cudaDeviceSynchronize() );
 
-    if (runPair) {
-      do_test_pair_force<<<1,CUDA_THREADS>>>(cuda_ci, cuda_cj);
-      cudaErrCheck( cudaPeekAtLastError() );
-      cudaErrCheck( cudaDeviceSynchronize() );
-    }
+    do_test_self_density_symmetric<<<1,CUDA_THREADS>>>(cuda_ci);
+    cudaErrCheck( cudaPeekAtLastError() );
+    cudaErrCheck( cudaDeviceSynchronize() );
 
-    do_test_self_force<<<1,CUDA_THREADS>>>(cuda_ci);
+    do_test_pair_force<<<1,CUDA_THREADS>>>(cuda_ci, cuda_cj);
     cudaErrCheck( cudaPeekAtLastError() );
     cudaErrCheck( cudaDeviceSynchronize() );
     
+    do_test_self_force<<<1,CUDA_THREADS>>>(cuda_ci);
+    cudaErrCheck( cudaPeekAtLastError() );
+    cudaErrCheck( cudaDeviceSynchronize() );
     //toc = getticks();
     //time += toc - tic;
 
     copy_to_host(cuda_ci, ci);
-    if (runPair)
-      copy_to_host(cuda_cj, cj);
+    copy_to_host(cuda_cj, cj);
     /* Dump if necessary */
     if (i % 50 == 0) {
       sprintf(outputFileName, "swift_gpu_%s.dat", outputFileNameExtension);
diff --git a/tests/testcuda.h b/tests/testcuda.h
index 24e7ed3ca3776793a8b67c9de4c004d206faafc3..515bdb1f56089915c3f2ab94cab273a4b7049e51 100644
--- a/tests/testcuda.h
+++ b/tests/testcuda.h
@@ -302,7 +302,7 @@ struct cell {
   float dmin;
 
   /*! Maximum particle movement in this cell since the last sort. */
-  float dx_max_sort;
+  //float dx_max_sort;
 
   /*! Maximum part movement in this cell since last construction. */
   float dx_max_part;