diff --git a/examples/SedovBlast/makeIC.py b/examples/SedovBlast/makeIC.py
index 5440df7b8f2b1d3d8b1ec748227c3cf2b8bb779a..1dd440dd9953d638f1fe0945326da8f4442cdf56 100644
--- a/examples/SedovBlast/makeIC.py
+++ b/examples/SedovBlast/makeIC.py
@@ -67,7 +67,7 @@ for i in range(L):
             v[index,1] = 0.
             v[index,2] = 0.
             m[index] = mass
-            h[index] = 2.251 * boxSize / L
+            h[index] = 2.251 / 2 * boxSize / L
             u[index] = internalEnergy
             ids[index] = index
             if sqrt((x - boxSize/2.)**2 + (y - boxSize/2.)**2 + (z - boxSize/2.)**2) < 2.01 * boxSize/L:
diff --git a/examples/SedovBlast/rdf.py b/examples/SedovBlast/rdf.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8474939e2732f1b5291fc088f4a19f51d52ce5c
--- /dev/null
+++ b/examples/SedovBlast/rdf.py
@@ -0,0 +1,120 @@
+###############################################################################
+ # This file is part of SWIFT.
+ # Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+ #                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ # 
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ # 
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ # GNU General Public License for more details.
+ # 
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ # 
+ ##############################################################################
+
+import h5py
+import random
+import sys
+import math
+from numpy import *
+
+# Reads the HDF5 output of SWIFT and generates a radial density profile
+# of the different physical values.
+
+# Input values?
+if len(sys.argv) < 3 :
+    print "Useage: " , sys.argv[0] , " <filename> <nr. bins>"
+    exit()
+    
+# Get the input arguments
+fileName = sys.argv[1];
+nr_bins = int( sys.argv[2] );
+
+
+# Open the file
+fileName = sys.argv[1];
+file = h5py.File( fileName , 'r' )
+
+# Get the space dimensions.
+grp = file[ "/Header" ]
+boxsize = grp.attrs[ 'BoxSize' ]
+boxsize = boxsize[0]
+
+# Get the particle data
+grp = file.get( '/PartType0' )
+ds = grp.get( 'Coordinates' )
+coords = ds[...]
+ds = grp.get( 'Velocities' )
+v = ds[...]
+ds = grp.get( 'Masses' )
+m = ds[...]
+ds = grp.get( 'SmoothingLength' )
+h = ds[...]
+ds = grp.get( 'InternalEnergy' )
+u = ds[...]
+ds = grp.get( 'ParticleIDs' )
+ids = ds[...]
+ds = grp.get( 'Density' )
+rho = ds[...]
+
+# Set the origin to be the middle of the box
+origin = [ boxsize / 2 , boxsize / 2 , boxsize / 2 ]
+
+# Get the maximum radius
+r_max = boxsize / 2
+
+# Init the bins
+nr_parts = coords.shape[0]
+bins_v = zeros( nr_bins )
+bins_m = zeros( nr_bins )
+bins_h = zeros( nr_bins )
+bins_u = zeros( nr_bins )
+bins_rho = zeros( nr_bins )
+bins_count = zeros( nr_bins )
+bins_P = zeros( nr_bins )
+
+# Loop over the particles and fill the bins.
+for i in range( nr_parts ):
+
+    # Get the box index.
+    r = coords[i] - origin
+    r = sqrt( r[0]*r[0] + r[1]*r[1] + r[2]*r[2] )
+    ind = floor( r / r_max * nr_bins )
+    
+    # Update the bins
+    if ( ind < nr_bins ):
+        bins_count[ind] += 1
+        bins_v[ind] += sqrt( v[i,0]*v[i,0] + v[i,1]*v[i,1] + v[i,2]*v[i,2] )
+        bins_m[ind] += m[i]
+        bins_h[ind] += h[i]
+        bins_u[ind] += u[i]
+        bins_rho[ind] += rho[i]
+        bins_P[ind] += (2.0/3)*u[i]*rho[i]
+    
+# Loop over the bins and dump them
+print "# bucket left right count v m h u rho"
+for i in range( nr_bins ):
+
+    # Normalize by the bin volume.
+    r_left = r_max * i / nr_bins
+    r_right = r_max * (i+1) / nr_bins
+    vol = 4/3*math.pi*(r_right*r_right*r_right - r_left*r_left*r_left)
+    ivol = 1.0 / vol
+
+    print "%i %.3e %.3e %.3e %.3e %.3e %.3e %.3e %.3e %.3e" % \
+        ( i , r_left , r_right , \
+          bins_count[i] * ivol , \
+          bins_v[i] / bins_count[i] , \
+          bins_m[i] * ivol , \
+          bins_h[i] / bins_count[i] , \
+          bins_u[i] / bins_count[i] , \
+          bins_rho[i] / bins_count[i] ,
+          bins_P[i] / bins_count[i] )
+    
+    
diff --git a/examples/SedovBlast/solution.py b/examples/SedovBlast/solution.py
index cd26aa4d408de58ac71c570cd131a017f13ec792..1b2dcf5957bc6684ebaa88d4ea08bee68e56cd6f 100644
--- a/examples/SedovBlast/solution.py
+++ b/examples/SedovBlast/solution.py
@@ -33,7 +33,7 @@ P_0 = 1.e-5         # Background Pressure
 E_0 = 1.e2          # Energy of the explosion
 gamma = 5./3.     # Gas polytropic index
 
-t = 0.15           # Time of the solution
+t = 0.2           # Time of the solution
 
 N = 1000          # Number of radial points
 R_max = 3.        # Maximal radius
diff --git a/examples/SodShock/rhox.py b/examples/SodShock/rhox.py
new file mode 100644
index 0000000000000000000000000000000000000000..36771c5c0d9dcdec99403249ea8a7bb1a4bc58c6
--- /dev/null
+++ b/examples/SodShock/rhox.py
@@ -0,0 +1,115 @@
+###############################################################################
+ # This file is part of SWIFT.
+ # Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+ #                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ # 
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ # 
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ # GNU General Public License for more details.
+ # 
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ # 
+ ##############################################################################
+
+import h5py
+import random
+import sys
+import math
+from numpy import *
+
+# Reads the HDF5 output of SWIFT and generates a radial density profile
+# of the different physical values.
+
+# Input values?
+if len(sys.argv) < 3 :
+    print "Useage: " , sys.argv[0] , " <filename> <nr. bins>"
+    exit()
+    
+# Get the input arguments
+fileName = sys.argv[1];
+nr_bins = int( sys.argv[2] );
+
+
+# Open the file
+fileName = sys.argv[1];
+file = h5py.File( fileName , 'r' )
+
+# Get the space dimensions.
+grp = file[ "/Header" ]
+boxsize = grp.attrs[ 'BoxSize' ]
+boxsize = boxsize[0]
+
+# Get the particle data
+grp = file.get( '/PartType0' )
+ds = grp.get( 'Coordinates' )
+coords = ds[...]
+ds = grp.get( 'Velocities' )
+v = ds[...]
+# ds = grp.get( 'Mass' )
+# m = ds[...]
+ds = grp.get( 'SmoothingLength' )
+h = ds[...]
+ds = grp.get( 'InternalEnergy' )
+u = ds[...]
+ds = grp.get( 'ParticleIDs' )
+ids = ds[...]
+ds = grp.get( 'Density' )
+rho = ds[...]
+
+# Get the maximum radius
+r_max = boxsize
+
+# Init the bins
+nr_parts = coords.shape[0]
+bins_v = zeros( nr_bins )
+bins_m = zeros( nr_bins )
+bins_h = zeros( nr_bins )
+bins_u = zeros( nr_bins )
+bins_rho = zeros( nr_bins )
+bins_count = zeros( nr_bins )
+bins_P = zeros( nr_bins )
+
+# Loop over the particles and fill the bins.
+for i in range( nr_parts ):
+
+    # Get the box index.
+    r = coords[i,0]
+    ind = floor( r / r_max * nr_bins )
+    
+    # Update the bins
+    bins_count[ind] += 1
+    bins_v[ind] += v[i,0] # sqrt( v[i,0]*v[i,0] + v[i,1]*v[i,1] + v[i,2]*v[i,2] )
+    # bins_m[ind] += m[i]
+    bins_h[ind] += h[i]
+    bins_u[ind] += u[i]
+    bins_rho[ind] += rho[i]
+    bins_P[ind] += (2.0/3)*u[i]*rho[i]
+    
+# Loop over the bins and dump them
+print "# bucket left right count v m h u rho"
+for i in range( nr_bins ):
+
+    # Normalize by the bin volume.
+    r_left = r_max * i / nr_bins
+    r_right = r_max * (i+1) / nr_bins
+    vol = 4/3*math.pi*(r_right*r_right*r_right - r_left*r_left*r_left)
+    ivol = 1.0 / vol
+
+    print "%i %.3e %.3e %.3e %.3e %.3e %.3e %.3e %.3e %.3e" % \
+        ( i , r_left , r_right , \
+          bins_count[i] * ivol , \
+          bins_v[i] / bins_count[i] , \
+          bins_m[i] * ivol , \
+          bins_h[i] / bins_count[i] , \
+          bins_u[i] / bins_count[i] , \
+          bins_rho[i] / bins_count[i] ,
+          bins_P[i] / bins_count[i] )
+    
+    
diff --git a/examples/test.c b/examples/test.c
index bf9b05da642e54c536823236b8316c74842761fd..7155f233aaa4bde4ea1e5624bfb731e51d478ad3 100644
--- a/examples/test.c
+++ b/examples/test.c
@@ -646,7 +646,7 @@ void kernel_dump ( int N ) {
     // float dw_dx4[4] __attribute__ ((aligned (16)));
 
     for ( k = 0 ; k <= N ; k++ ) {
-        x = ((float)k) / N * kernel_igamma;
+        x = ((float)k) / N;
         x4[3] = x4[2]; x4[2] = x4[1]; x4[1] = x4[0]; x4[0] = x;
         kernel_deval( x , &w , &dw_dx );
         // kernel_deval_vec( (vector *)x4 , (vector *)w4 , (vector *)dw_dx4 );
@@ -790,13 +790,13 @@ int main ( int argc , char *argv[] ) {
 
     /* Read particles and space information from (GADGET) IC */
     tic = getticks();
-    read_ic(ICfileName, dim, &parts, &N, &periodic);
+    read_ic( ICfileName , dim , &parts , &N , &periodic );
     printf( "main: reading particle properties took %.3f ms.\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); fflush(stdout);
     
     /* Apply h scaling */
     if(scaling != 1.0)
       for ( k = 0 ; k < N ; k++ )
-	parts[k].h *= scaling;
+	    parts[k].h *= scaling;
     
     /* Apply shift */
     if(shift[0] !=0 || shift[1] !=0 || shift[2] !=0 )
diff --git a/examples/test_single.c b/examples/test_single.c
index 895fd4cf31cfde621f9bc688ae7c91f808649918..3d1657e1fe481992d5cf9a6b24bfdda383bf4d85 100644
--- a/examples/test_single.c
+++ b/examples/test_single.c
@@ -64,15 +64,15 @@ int main ( int argc , char *argv[] ) {
 
     int k, N = 100;
     struct part p1, p2;
-    float r2, dx[3] = { 0.0f , 0.0f , 0.0f };
+    float x, w, dwdx, r2, dx[3] = { 0.0f , 0.0f , 0.0f };
     
     /* Init the particles. */
     for ( k = 0 ; k < 3 ; k++ ) {
         p1.a[k] = 0.0f; p1.v[k] = 0.0f; p1.x[k] = 0.0;
         p2.a[k] = 0.0f; p2.v[k] = 0.0f; p2.x[k] = 0.0;
         }
-    p1.rho = 1.0f; p1.mass = 9.7059e-4; p1.h = 0.222871287;
-    p2.rho = 1.0f; p2.mass = 9.7059e-4; p2.h = 0.222871287;
+    p1.rho = 1.0f; p1.mass = 9.7059e-4; p1.h = 0.222871287 / 2;
+    p2.rho = 1.0f; p2.mass = 9.7059e-4; p2.h = 0.222871287 / 2;
     p1.force.c = 0.0f; p1.force.balsara = 0.0f;
     p2.force.c = 0.0f; p2.force.balsara = 0.0f;
     p1.u = 1.e-5 / ((const_gamma - 1.)*p1.rho);
@@ -87,19 +87,32 @@ int main ( int argc , char *argv[] ) {
     for ( k = 1 ; k <= N ; k++ ) {
     
         /* Set the distance/radius. */
-        dx[0] = -((float)k)/N * fmaxf( p1.h , p2.h );
+        dx[0] = -((float)k)/N * fmaxf( p1.h , p2.h ) * kernel_gamma;
         r2 = dx[0]*dx[0];
         
         /* Clear the particle fields. */
-        p1.a[0] = 0.0f; p1.force.u_dt = 0.0f;
-        p2.a[0] = 0.0f; p2.force.u_dt = 0.0f;
+        /* p1.a[0] = 0.0f; p1.force.u_dt = 0.0f;
+        p2.a[0] = 0.0f; p2.force.u_dt = 0.0f; */
         
         /* Interact the particles. */
-        runner_iact_force( r2 , dx , p1.h , p2.h , &p1 , &p2 );
+        // runner_iact_force( r2 , dx , p1.h , p2.h , &p1 , &p2 );
+        
+        /* Clear the particle fields. */
+        p1.rho = 0.0f; p1.density.wcount = 0.0f;
+        p2.rho = 0.0f; p2.density.wcount = 0.0f;
+        
+        /* Interact the particles. */
+        runner_iact_density( r2 , dx , p1.h , p2.h , &p1 , &p2 );
+        
+        /* Evaluate just the kernel. */
+        x = fabsf( dx[0] ) / p1.h;
+        kernel_deval( x , &w , &dwdx );
         
         /* Output the results. */
-        printf( "%.3e %.3e %.3e %.3e %.3e\n" ,
-            -dx[0] , p1.a[0] , p1.force.u_dt , p2.a[0] , p2.force.u_dt );
+        printf( "%.3e %.3e %.3e %.3e %.3e %.3e %.3e\n" ,
+            // -dx[0] , p1.a[0] , p1.force.u_dt , p2.a[0] , p2.force.u_dt ,
+            -dx[0] , p1.rho , p1.density.wcount , p2.rho , p2.density.wcount ,
+            w , dwdx );
     
         } /* loop over radii. */
     
diff --git a/src/const.h b/src/const.h
index e2d7409006e96a26c064cbe5a761e68de773417e..98476d129fbfb9be7c1b48e041d38fa23010e417 100644
--- a/src/const.h
+++ b/src/const.h
@@ -21,7 +21,7 @@
 
 /* Hydrodynamical constants. */
 #define const_gamma             (5.0f/3.0f)
-#define const_viscosity_alpha   0.8
+#define const_viscosity_alpha   0.8f
 
 /* Time integration constants. */
 #define const_cfl               0.3f
diff --git a/src/engine.c b/src/engine.c
index 686e14f1fd7fcef844d81936baf0b02d9839b174..451cb558c3242d5cce99ff9ccd2bc3c9c0786049 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -229,7 +229,7 @@ void engine_map_kick_first ( struct cell *c , void *data ) {
                         (x[1] - x_old[1])*(x[1] - x_old[1]) +
                         (x[2] - x_old[2])*(x[2] - x_old[2]) );
             dx_max = fmaxf( dx_max , dx );
-            h2dx_max = fmaxf( h2dx_max , dx*2 + h );
+            h2dx_max = fmaxf( h2dx_max , dx*2 + h*kernel_gamma );
 
             /* Update positions and energies at the half-step. */
             p->v[0] = v[0] += dt * a[0];
@@ -362,12 +362,63 @@ void engine_collect_kick2 ( struct cell *c ) {
  * @brief Compute the force on a single particle brute-force.
  */
 
-void engine_single ( double *dim , long long int pid , struct part *__restrict__ parts , int N , int periodic ) {
+void engine_single_density ( double *dim , long long int pid , struct part *__restrict__ parts , int N , int periodic ) {
 
     int i, k;
     double r2, dx[3];
-    float fdx[3], ihg;
-    struct part p, p2;
+    float fdx[3], ih;
+    struct part p;
+    
+    /* Find "our" part. */
+    for ( k = 0 ; k < N && parts[k].id != pid ; k++ );
+    if ( k == N )
+        error( "Part not found." );
+    p = parts[k];
+    
+    /* Clear accumulators. */
+    ih = 1.0f / p.h;
+    p.rho = 0.0f; p.rho_dh = 0.0f;
+    p.density.wcount = 0.0f; p.density.wcount_dh = 0.0f;
+	p.density.div_v = 0.0;
+	for ( k=0 ; k < 3 ; k++)
+		p.density.curl_v[k] = 0.0;
+            
+    /* Loop over all particle pairs (force). */
+    for ( k = 0 ; k < N ; k++ ) {
+        if ( parts[k].id == p.id )
+            continue;
+        for ( i = 0 ; i < 3 ; i++ ) {
+            dx[i] = p.x[i] - parts[k].x[i];
+            if ( periodic ) {
+                if ( dx[i] < -dim[i]/2 )
+                    dx[i] += dim[i];
+                else if ( dx[i] > dim[i]/2 )
+                    dx[i] -= dim[i];
+                }
+            fdx[i] = dx[i];
+            }
+        r2 = fdx[0]*fdx[0] + fdx[1]*fdx[1] + fdx[2]*fdx[2];
+        if ( r2 < p.h*p.h*kernel_gamma2 ) {
+            runner_iact_nonsym_density( r2 , fdx , p.h , parts[k].h , &p , &parts[k] );
+            }
+        }
+        
+    /* Dump the result. */
+    p.rho = ih * ih * ih * ( p.rho + p.mass*kernel_root );
+    p.rho_dh = p.rho_dh * ih * ih * ih * ih;
+    p.density.wcount = ( p.density.wcount + kernel_root ) * ( 4.0f / 3.0 * M_PI * kernel_gamma3 );
+    printf( "pairs_single: part %lli (h=%e) has wcount=%e, rho=%e, rho_dh=%e.\n" , p.id , p.h , p.density.wcount , p.rho , p.rho_dh );
+    fflush(stdout);
+    
+    }
+
+
+void engine_single_force ( double *dim , long long int pid , struct part *__restrict__ parts , int N , int periodic ) {
+
+    int i, k;
+    double r2, dx[3];
+    float fdx[3];
+    struct part p;
     
     /* Find "our" part. */
     for ( k = 0 ; k < N && parts[k].id != pid ; k++ );
@@ -376,12 +427,12 @@ void engine_single ( double *dim , long long int pid , struct part *__restrict__
     p = parts[k];
     
     /* Clear accumulators. */
-    ihg = kernel_igamma / p.h;
     p.a[0] = 0.0f; p.a[1] = 0.0f; p.a[2] = 0.0f;
     p.force.u_dt = 0.0f; p.force.h_dt = 0.0f; p.force.v_sig = 0.0f;
             
     /* Loop over all particle pairs (force). */
-    for ( k = 0 ; k < N ; k++ ) {
+    // for ( k = 0 ; k < N ; k++ ) {
+    for ( k = N-1 ; k >= 0 ; k-- ) {
         if ( parts[k].id == p.id )
             continue;
         for ( i = 0 ; i < 3 ; i++ ) {
@@ -395,21 +446,19 @@ void engine_single ( double *dim , long long int pid , struct part *__restrict__
             fdx[i] = dx[i];
             }
         r2 = fdx[0]*fdx[0] + fdx[1]*fdx[1] + fdx[2]*fdx[2];
-        if ( r2 < p.h*p.h || r2 < parts[k].h*parts[k].h ) {
+        if ( r2 < p.h*p.h*kernel_gamma2 || r2 < parts[k].h*parts[k].h*kernel_gamma2 ) {
             p.a[0] = 0.0f; p.a[1] = 0.0f; p.a[2] = 0.0f;
             p.force.u_dt = 0.0f; p.force.h_dt = 0.0f; p.force.v_sig = 0.0f;
-            p2 = parts[k];
-            runner_iact_nonsym_force( r2 , fdx , p.h , p2.h , &p , &p2 );
-            printf( "pairs_simple: interacting particles %lli and %lli (rho=%.3e,r=%e): a=[%.3e,%.3e,%.3e], u_dt=%.3e, h_dt=%.3e, v_sig=%.3e.\n" ,
+            runner_iact_nonsym_force( r2 , fdx , p.h , parts[k].h , &p , &parts[k] );
+            /* printf( "pairs_simple: interacting particles %lli and %lli (rho=%.3e,r=%e): a=[%.3e,%.3e,%.3e], u_dt=%.3e, h_dt=%.3e, v_sig=%.3e.\n" ,
                 pid , p2.id , p2.rho , sqrtf(r2) ,
                 p.a[0] , p.a[1] , p.a[2] ,
-                p.force.u_dt , p.force.h_dt , p.force.v_sig );
+                p.force.u_dt , p.force.h_dt , p.force.v_sig ); */
             }
         }
         
     /* Dump the result. */
-    ihg = kernel_igamma / p.h;
-    printf( "pairs_single: part %lli (h=%e) has wcount=%f, rho=%f.\n" , p.id , p.h , p.density.wcount + 32.0/3 , ihg * ihg * ihg * ( p.rho + p.mass*kernel_root ) );
+    printf( "pairs_single: part %lli (h=%e) has a=[%.3e,%.3e,%.3e], udt=%e.\n" , p.id , p.h , p.a[0] , p.a[1] , p.a[2] , p.force.u_dt );
     fflush(stdout);
     
     }
@@ -472,6 +521,8 @@ void engine_step ( struct engine *e , int sort_queues ) {
             }
         }
         
+    // engine_single_density( e->s->dim , 6178 , e->s->parts , e->s->nr_parts , e->s->periodic );
+
     /* Start the clock. */
     TIMER_TIC_ND
     
@@ -488,11 +539,12 @@ void engine_step ( struct engine *e , int sort_queues ) {
     /* Stop the clock. */
     TIMER_TOC(timer_runners);
     
+    // engine_single_force( e->s->dim , 6178 , e->s->parts , e->s->nr_parts , e->s->periodic );
+    
     // for(k=0; k<10; ++k)
     //   printParticle(parts, k);
-    // engine_single( e->s->dim , 432626 , e->s->parts , e->s->nr_parts , e->s->periodic );
     // printParticle( parts , 432626 );
-    // printParticle( parts , 432628 );
+    // printParticle( e->s->parts , 6178 , e->s->nr_parts );
 
     /* Collect the cell data from the second kick. */
     for ( k = 0 ; k < e->s->nr_cells ; k++ ) {
diff --git a/src/kernel.h b/src/kernel.h
index 09781ee080263eb6e69694cdff30a8afd54e5b0b..5996e182611ecaf985e3b7f05fad1d7e4bd1bc29 100644
--- a/src/kernel.h
+++ b/src/kernel.h
@@ -32,16 +32,15 @@
 /* Coefficients for the kernel. */ 
 #define kernel_degree 3
 #define kernel_ivals 2
-#define kernel_gamma 0.5f
-#define kernel_igamma 2.0f
-#define kernel_igamma3 8.0f
-#define kernel_igamma4 16.0f
+#define kernel_gamma 2.0f
+#define kernel_gamma2 4.0f
+#define kernel_gamma3 8.0f
 static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] __attribute__ ((aligned (16))) =
     { 3.0/4.0*M_1_PI , -3.0/2.0*M_1_PI , 0.0 , M_1_PI , 
       -0.25*M_1_PI , 3.0/2.0*M_1_PI , -3.0*M_1_PI , M_2_PI , 
       0.0 , 0.0 , 0.0 , 0.0 };
 #define kernel_root ( kernel_coeffs[ kernel_degree ] )
-#define kernel_wroot ( 4.0/3.0*M_PI*kernel_igamma3*kernel_coeffs[ kernel_degree ] )
+#define kernel_wroot ( 4.0/3.0*M_PI*kernel_coeffs[ kernel_degree ] )
       
       
 /**
diff --git a/src/runner.c b/src/runner.c
index 2ee122e82c9cff49993500bb750983b0e4f1adb7..7e72ef0574b28edeab1ed31ad07dbb85c6761dd3 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -335,7 +335,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
     struct cell *finger;
     int i, k, redo, count = c->count;
     int *pid;
-    float h, hg, ihg, ihg2, ihg4, h_corr, rho, rho_dh, u, fc;
+    float h, ih, ih2, ih4, h_corr, rho, wcount, rho_dh, wcount_dh, u, fc;
     float normDiv_v, normCurl_v;
     float dt_step = r->e->dt_step;
     TIMER_TIC
@@ -372,23 +372,21 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
             
   	            /* Some smoothing length multiples. */
 	            h = p->h;
-                hg = kernel_gamma * h;
-                ihg = 1.0f / hg;
-                ihg2 = ihg * ihg;
-                ihg4 = ihg2 * ihg2;
-
-                /* Adjust the computed weighted number of neighbours. */
-                p->density.wcount += kernel_wroot;
+                ih = 1.0f / h;
+                ih2 = ih * ih;
+                ih4 = ih2 * ih2;
 
 		        /* Final operation on the density. */
-                p->rho = rho = ihg * ihg2 * ( p->rho + p->mass*kernel_root );
-                p->rho_dh = rho_dh = p->rho_dh * ihg4;
+                p->rho = rho = ih * ih2 * ( p->rho + p->mass*kernel_root );
+                p->rho_dh = rho_dh = p->rho_dh * ih4;
+                wcount = ( p->density.wcount + kernel_root ) * ( 4.0f / 3.0 * M_PI * kernel_gamma3 );
+                wcount_dh = p->density.wcount_dh * ih * ( 4.0f / 3.0 * M_PI * kernel_gamma3 );
                     
                 /* Compute the smoothing length update (Newton step). */
-                if ( p->density.wcount_dh == 0.0f )
+                if ( wcount_dh == 0.0f )
                     h_corr = p->h;
                 else {
-                    h_corr = ( const_nwneigh - p->density.wcount ) / p->density.wcount_dh;
+                    h_corr = ( const_nwneigh - wcount ) / wcount_dh;
 
                     /* Truncate to the range [ -p->h/2 , p->h ]. */
                     h_corr = fminf( h_corr , h );
@@ -400,9 +398,9 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
                 cp->h = p->h;
 
                 /* Did we get the right number density? */
-                if ( p->density.wcount > const_nwneigh + const_delta_nwneigh ||
-                     p->density.wcount < const_nwneigh - const_delta_nwneigh ) {
-                    // printf( "runner_doghost: particle %lli (h=%e,depth=%i) has bad wcount=%.3f.\n" , p->id , p->h , c->depth , p->density.wcount ); fflush(stdout);
+                if ( wcount > const_nwneigh + const_delta_nwneigh ||
+                     wcount < const_nwneigh - const_delta_nwneigh ) {
+                    // printf( "runner_doghost: particle %lli (h=%e,depth=%i) has bad wcount=%.3f.\n" , p->id , p->h , c->depth , wcount ); fflush(stdout);
                     // p->h += ( p->density.wcount + kernel_root - const_nwneigh ) / p->density.wcount_dh;
                     pid[redo] = pid[i];
                     redo += 1;
@@ -417,8 +415,8 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
                     }
 
                 /* Pre-compute some stuff for the balsara switch. */
-		        normDiv_v = fabs( p->density.div_v / rho * ihg4 );
-		        normCurl_v = sqrtf( p->density.curl_v[0] * p->density.curl_v[0] + p->density.curl_v[1] * p->density.curl_v[1] + p->density.curl_v[2] * p->density.curl_v[2] ) / rho * ihg4;
+		        normDiv_v = fabs( p->density.div_v / rho * ih4 );
+		        normCurl_v = sqrtf( p->density.curl_v[0] * p->density.curl_v[0] + p->density.curl_v[1] * p->density.curl_v[1] + p->density.curl_v[2] * p->density.curl_v[2] ) / rho * ih4;
                 
                 /* As of here, particle force variables will be set. Do _NOT_
                    try to read any particle density variables! */
@@ -428,10 +426,10 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
                 p->force.c = fc = sqrtf( const_gamma * ( const_gamma - 1.0f ) * u );
 
                 /* Compute the P/Omega/rho2. */
-                p->force.POrho2 = u * ( const_gamma - 1.0f ) / ( rho + hg * rho_dh / 3.0f );
+                p->force.POrho2 = u * ( const_gamma - 1.0f ) / ( rho + h * rho_dh / 3.0f );
 
-		/* Balsara switch */
-		p->force.balsara = normDiv_v / ( normDiv_v + normCurl_v + 0.0001f * fc * ihg );
+		        /* Balsara switch */
+		        p->force.balsara = normDiv_v / ( normDiv_v + normCurl_v + 0.0001f * fc * ih );
                 
                 /* Reset the acceleration. */
                 for ( k = 0 ; k < 3 ; k++ )
@@ -543,7 +541,7 @@ void runner_dokick2 ( struct runner *r , struct cell *c ) {
             }
 
         /* Update the particle's time step. */
-        p->dt = pdt = const_cfl * kernel_gamma * h / ( p->force.v_sig );
+        p->dt = pdt = const_cfl * h / p->force.v_sig;
 
         /* Update positions and energies at the half-step. */
         p->v[0] = v[0] = xp->v_old[0] + hdt * p->a[0];
@@ -565,9 +563,9 @@ void runner_dokick2 ( struct runner *r , struct cell *c ) {
         mom[2] += m * p->v[2];
 
 	    /* Collect angular momentum */
-	    ang[0] += m * ( x[1]*v[2] - v[2]*x[1] );
-	    ang[1] += m * ( x[2]*v[0] - v[0]*x[2] );
-	    ang[2] += m * ( x[0]*v[1] - v[1]*x[0] );
+	    ang[0] += m * ( x[1]*v[2] - x[2]*v[1] );
+	    ang[1] += m * ( x[2]*v[0] - x[0]*v[2] );
+	    ang[2] += m * ( x[0]*v[1] - x[1]*v[0] );
 
 	    /* Collect entropic function */
 	    // lent += u * pow( p->rho, 1.f-const_gamma );
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index dd304a8bb93094ea6692168e5f61c399ba036fda..9efb74f364288cb07edc200b309346ca27cab99e 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -105,7 +105,7 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r
     struct cpart *restrict cpj, *restrict cparts_j = cj->cparts;
     struct cpart *restrict cpi, *restrict cparts_i = ci->cparts;
     double pix[3];
-    float dx[3], hi, hi2, r2;
+    float dx[3], hi, hig2, r2;
     float dt_step = e->dt_step;
     #ifdef VECTORIZE
         int icount = 0;
@@ -142,7 +142,7 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r
         for ( k = 0 ; k < 3 ; k++ )
             pix[k] = cpi->x[k] - shift[k];
         hi = cpi->h;
-        hi2 = hi * hi;
+        hig2 = hi * hi * kernel_gamma2;
         
         /* Loop over the parts in cj. */
         for ( pjd = 0 ; pjd < count_j ; pjd++ ) {
@@ -158,7 +158,7 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r
                 }
                 
             /* Hit or miss? */
-            if ( r2 < hi2 || r2 < cpj->h*cpj->h ) {
+            if ( r2 < hig2 || r2 < cpj->h*cpj->h*kernel_gamma2 ) {
             
                 #ifndef VECTORIZE
                         
@@ -214,7 +214,7 @@ void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) {
     struct part *restrict parts = c->parts;
     struct cpart *restrict cpi,  *restrict cpj,*restrict cparts = c->cparts;
     double pix[3];
-    float dx[3], hi, hi2, r2;
+    float dx[3], hi, hig2, r2;
     float dt_step = r->e->dt_step;
     #ifdef VECTORIZE
         int icount = 0;
@@ -243,7 +243,7 @@ void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) {
         for ( k = 0 ; k < 3 ; k++ )
             pix[k] = cpi->x[k];
         hi = cpi->h;
-        hi2 = hi * hi;
+        hig2 = hi * hi * kernel_gamma2;
         
         /* Loop over the parts in cj. */
         for ( pjd = pid+1 ; pjd < count ; pjd++ ) {
@@ -259,7 +259,7 @@ void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) {
                 }
                 
             /* Hit or miss? */
-            if ( r2 < hi2 || r2 < cpj->h*cpj->h ) {
+            if ( r2 < hig2 || r2 < cpj->h*cpj->h*kernel_gamma2 ) {
             
                 #ifndef VECTORIZE
                         
@@ -328,7 +328,7 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
     struct part *restrict pi, *restrict parts_j = cj->parts;
     struct cpart *restrict cpj, *restrict cparts_j = cj->cparts;
     double pix[3];
-    float dx[3], hi, hi2, r2, di, dxj;
+    float dx[3], hi, hig2, r2, di, dxj;
     struct entry *sort_j;
     #ifdef VECTORIZE
         int icount = 0;
@@ -376,8 +376,8 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
             for ( k = 0 ; k < 3 ; k++ )
                 pix[k] = pi->x[k] - shift[k];
             hi = pi->h;
-            hi2 = hi * hi;
-            di = hi + dxj + pix[0]*runner_shift[ 3*sid + 0 ] + pix[1]*runner_shift[ 3*sid + 1 ] + pix[2]*runner_shift[ 3*sid + 2 ];
+            hig2 = hi * hi * kernel_gamma2;
+            di = hi*kernel_gamma + dxj + pix[0]*runner_shift[ 3*sid + 0 ] + pix[1]*runner_shift[ 3*sid + 1 ] + pix[2]*runner_shift[ 3*sid + 2 ];
 
             /* Loop over the parts in cj. */
             for ( pjd = 0 ; pjd < count_j && sort_j[ pjd ].d < di ; pjd++ ) {
@@ -393,7 +393,7 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
                     }
 
                 /* Hit or miss? */
-                if ( r2 < hi2 ) {
+                if ( r2 < hig2 ) {
 
                     #ifndef VECTORIZE
                         
@@ -439,8 +439,8 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
             for ( k = 0 ; k < 3 ; k++ )
                 pix[k] = pi->x[k] - shift[k];
             hi = pi->h;
-            hi2 = hi * hi;
-            di = -hi - dxj + pix[0]*runner_shift[ 3*sid + 0 ] + pix[1]*runner_shift[ 3*sid + 1 ] + pix[2]*runner_shift[ 3*sid + 2 ];
+            hig2 = hi * hi * kernel_gamma2;
+            di = -hi*kernel_gamma - dxj + pix[0]*runner_shift[ 3*sid + 0 ] + pix[1]*runner_shift[ 3*sid + 1 ] + pix[2]*runner_shift[ 3*sid + 2 ];
 
             /* Loop over the parts in cj. */
             for ( pjd = count_j-1 ; pjd >= 0 && di < sort_j[ pjd ].d ; pjd-- ) {
@@ -456,7 +456,7 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
                     }
 
                 /* Hit or miss? */
-                if ( r2 < hi2 ) {
+                if ( r2 < hig2 ) {
 
                     #ifndef VECTORIZE
                         
@@ -526,7 +526,7 @@ void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
     struct part *restrict pi;
     struct cpart *restrict cpj, *restrict cparts = ci->cparts;
     double pix[3];
-    float dx[3], hi, hi2, r2;
+    float dx[3], hi, hig2, r2;
     #ifdef VECTORIZE
         int icount = 0;
         float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
@@ -550,7 +550,7 @@ void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
         for ( k = 0 ; k < 3 ; k++ )
             pix[k] = pi->x[k];
         hi = pi->h;
-        hi2 = hi * hi;
+        hig2 = hi * hi * kernel_gamma2;
         
         /* Loop over the parts in cj. */
         for ( pjd = 0 ; pjd < count_i ; pjd++ ) {
@@ -566,7 +566,7 @@ void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
                 }
                 
             /* Hit or miss? */
-            if ( r2 > 0.0f && r2 < hi2 ) {
+            if ( r2 > 0.0f && r2 < hig2 ) {
             
                 #ifndef VECTORIZE
 
@@ -634,7 +634,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
     struct cpart *restrict cpi, *restrict cparts_i;
     struct cpart *restrict cpj, *restrict cparts_j;
     double pix[3], pjx[3], di, dj;
-    float dx[3], hi, hi2, hj, hj2, r2, dx_max;
+    float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
     double hi_max, hj_max;
     double di_max, dj_min;
     int count_i, count_j;
@@ -669,7 +669,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
     sort_j = &cj->sort[ sid*(cj->count + 1) ];
     
     /* Get some other useful values. */
-    hi_max = ci->h_max - rshift; hj_max = cj->h_max;
+    hi_max = ci->h_max*kernel_gamma - rshift; hj_max = cj->h_max*kernel_gamma;
     count_i = ci->count; count_j = cj->count;
     parts_i = ci->parts; parts_j = cj->parts;
     cparts_i = ci->cparts; cparts_j = cj->cparts;
@@ -690,11 +690,11 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
         if ( cpi->dt > dt_step )
             continue;
         hi = cpi->h;
-        di = sort_i[pid].d + hi + dx_max - rshift;
+        di = sort_i[pid].d + hi*kernel_gamma + dx_max - rshift;
         if ( di < dj_min )
             continue;
             
-        hi2 = hi * hi;
+        hig2 = hi * hi * kernel_gamma2;
         for ( k = 0 ; k < 3 ; k++ )
             pix[k] = cpi->x[k] - shift[k];
         
@@ -712,7 +712,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
                 }
                 
             /* Hit or miss? */
-            if ( r2 < hi2 ) {
+            if ( r2 < hig2 ) {
             
                 #ifndef VECTORIZE
                         
@@ -757,13 +757,13 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
         if ( cpj->dt > dt_step )
             continue;
         hj = cpj->h;
-        dj = sort_j[pjd].d - hj - dx_max - rshift;
+        dj = sort_j[pjd].d - hj*kernel_gamma - dx_max - rshift;
         if ( dj > di_max )
             continue;
             
         for ( k = 0 ; k < 3 ; k++ )
             pjx[k] = cpj->x[k] + shift[k];
-        hj2 = hj * hj;
+        hjg2 = hj * hj * kernel_gamma2;
         
         /* Loop over the parts in ci. */
         for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d > dj ; pid-- ) {
@@ -779,7 +779,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
                 }
                 
             /* Hit or miss? */
-            if ( r2 < hj2 ) {
+            if ( r2 < hjg2 ) {
             
                 #ifndef VECTORIZE
                         
@@ -840,7 +840,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
     struct cpart *restrict cpi, *restrict cparts_i;
     struct cpart *restrict cpj, *restrict cparts_j;
     double pix[3], pjx[3], di, dj;
-    float dx[3], hi, hi2, hj, hj2, r2, dx_max;
+    float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
     double hi_max, hj_max;
     double di_max, dj_min;
     int count_i, count_j;
@@ -881,7 +881,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
     sort_j = &cj->sort[ sid*(cj->count + 1) ];
     
     /* Get some other useful values. */
-    hi_max = ci->h_max - rshift; hj_max = cj->h_max;
+    hi_max = ci->h_max*kernel_gamma - rshift; hj_max = cj->h_max*kernel_gamma;
     count_i = ci->count; count_j = cj->count;
     parts_i = ci->parts; parts_j = cj->parts;
     cparts_i = ci->cparts; cparts_j = cj->cparts;
@@ -924,11 +924,11 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
         pi = &parts_i[ sort_i[ pid ].i ];
         cpi = &cparts_i[ sort_i[ pid ].i ];
         hi = cpi->h;
-        di = sort_i[pid].d + hi + dx_max - rshift;
+        di = sort_i[pid].d + hi*kernel_gamma + dx_max - rshift;
         if ( di < dj_min )
             continue;
             
-        hi2 = hi * hi;
+        hig2 = hi * hi * kernel_gamma2;
         for ( k = 0 ; k < 3 ; k++ )
             pix[k] = cpi->x[k] - shift[k];
             
@@ -949,7 +949,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
                     }
 
                 /* Hit or miss? */
-                if ( r2 < hi2 ) {
+                if ( r2 < hig2 ) {
 
                     #ifndef VECTORIZE
 
@@ -999,7 +999,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
                     }
 
                 /* Hit or miss? */
-                if ( r2 < hi2 ) {
+                if ( r2 < hig2 ) {
 
                     #ifndef VECTORIZE
 
@@ -1074,13 +1074,13 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
         pj = &parts_j[ sort_j[ pjd ].i ];
         cpj = &cparts_j[ sort_j[ pjd ].i ];
         hj = cpj->h;
-        dj = sort_j[pjd].d - hj - dx_max - rshift;
+        dj = sort_j[pjd].d - hj*kernel_gamma - dx_max - rshift;
         if ( dj > di_max )
             continue;
             
         for ( k = 0 ; k < 3 ; k++ )
             pjx[k] = cpj->x[k] + shift[k];
-        hj2 = hj * hj;
+        hjg2 = hj * hj * kernel_gamma2;
         
         /* Is this particle outside the dt? */
         if ( cpj->dt > dt_step ) {
@@ -1099,7 +1099,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
                     }
 
                 /* Hit or miss? */
-                if ( r2 < hj2 && r2 > cpi->h*cpi->h ) {
+                if ( r2 < hjg2 && r2 > cpi->h*cpi->h*kernel_gamma2 ) {
 
                     #ifndef VECTORIZE
 
@@ -1148,7 +1148,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
                     }
 
                 /* Hit or miss? */
-                if ( r2 < hj2 && r2 > cpi->h*cpi->h ) {
+                if ( r2 < hjg2 && r2 > cpi->h*cpi->h*kernel_gamma2 ) {
 
                     #ifndef VECTORIZE
 
@@ -1243,7 +1243,7 @@ void DOSELF1 ( struct runner *r , struct cell *restrict c ) {
 
     int k, pid, pjd, count = c->count;
     double pix[3];
-    float dx[3], hi, hi2, r2;
+    float dx[3], hi, hig2, r2;
     struct part *restrict parts = c->parts, *restrict pi;
     struct cpart *restrict cpi, *restrict cpj, *restrict cparts = c->cparts;
     float dt_step = r->e->dt_step;
@@ -1274,7 +1274,7 @@ void DOSELF1 ( struct runner *r , struct cell *restrict c ) {
         for ( k = 0 ; k < 3 ; k++ )
             pix[k] = cpi->x[k];
         hi = cpi->h;
-        hi2 = hi * hi;
+        hig2 = hi * hi * kernel_gamma2;
             
         /* Loop over the other particles .*/
         for ( pjd = 0 ; pjd < count ; pjd++ ) {
@@ -1294,7 +1294,7 @@ void DOSELF1 ( struct runner *r , struct cell *restrict c ) {
                 }
                 
             /* Hit or miss? */
-            if ( r2 < hi2 ) {
+            if ( r2 < hig2 ) {
             
                 #ifndef VECTORIZE
                         
@@ -1347,7 +1347,7 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) {
 
     int k, pid, pjd, count = c->count;
     double pix[3];
-    float dx[3], hi, hi2, r2;
+    float dx[3], hi, hig2, r2;
     struct part *restrict parts = c->parts, *restrict pi;
     struct cpart *restrict cpi, *restrict cpj, *restrict cparts = c->cparts;
     float dt_step = r->e->dt_step;
@@ -1392,7 +1392,7 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) {
         for ( k = 0 ; k < 3 ; k++ )
             pix[k] = cpi->x[k];
         hi = cpi->h;
-        hi2 = hi * hi;
+        hig2 = hi * hi * kernel_gamma2;
         
         /* Is the ith particle not active? */
         if ( cpi->dt > dt_step ) {
@@ -1411,7 +1411,7 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) {
                     }
 
                 /* Hit or miss? */
-                if ( r2 < hi2 || r2 < cpj->h*cpj->h ) {
+                if ( r2 < hig2 || r2 < cpj->h*cpj->h*kernel_gamma2 ) {
 
                     #ifndef VECTORIZE
 
@@ -1464,7 +1464,7 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) {
                     }
 
                 /* Hit or miss? */
-                if ( r2 < hi2 || r2 < cpj->h*cpj->h ) {
+                if ( r2 < hig2 || r2 < cpj->h*cpj->h*kernel_gamma2 ) {
 
                     #ifndef VECTORIZE
 
@@ -2120,7 +2120,7 @@ void DOSUB2 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) {
     }
 
 
-void DOSUB_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *parts , int *ind , int count , struct cell *restrict cj , int sid ) {
+void DOSUB_SUBSET ( struct runner *r , struct cell *ci , struct part *parts , int *ind , int count , struct cell *cj , int sid ) {
 
     int j, k;
     double shift[3];
@@ -2178,31 +2178,8 @@ void DOSUB_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *p
              ci->h2dx_max*2 < h && cj->h2dx_max*2 < h ) {
              
             /* Get the type of pair if not specified explicitly. */
-            if ( sid < 0 ) {
-
-                /* Get the relative distance between the pairs, wrapping. */
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    if ( cj->loc[k] - ci->loc[k] < -s->dim[k]/2 )
-                        shift[k] = s->dim[k];
-                    else if ( cj->loc[k] - ci->loc[k] > s->dim[k]/2 )
-                        shift[k] = -s->dim[k];
-                    else
-                        shift[k] = 0.0;
-                    }
-
-                /* Get the sorting index. */
-                for ( sid = 0 , k = 0 ; k < 3 ; k++ )
-                    sid = 3*sid + ( (cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1 );
-
-                /* Flip? */
-                if ( sid < 13 ) {
-                    struct cell *temp = cj; cj = ci; ci = temp;
-                    }
-                else
-                    sid = 26 - sid;
+            sid = space_getsid( s , &ci , &cj , shift );
 
-                }
-    
             /* Different types of flags. */
             switch ( sid ) {
 
diff --git a/src/runner_iact.h b/src/runner_iact.h
index a8090d160687935a94bcd8ced8d720053fd85863..ab0887be2674accafc510c814cfd833d09118bf1 100644
--- a/src/runner_iact.h
+++ b/src/runner_iact.h
@@ -42,7 +42,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r
 
     float r = sqrtf( r2 ), ri = 1.0f / r;
     float xi, xj;
-    float h_inv, hg_inv;
+    float h_inv;
     float wi, wj, wi_dx, wj_dx;
     float mi, mj;
     float dvdr;
@@ -66,18 +66,16 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r
     for ( k = 0 ; k < 3 ; k++ )
         curlvr[k] *= ri;
             
-    if ( r2 < hi*hi ) {
+    if ( r2 < hi*hi*kernel_gamma2 ) {
         
         h_inv = 1.0 / hi;
-        hg_inv = kernel_igamma * h_inv;
-        xi = r * hg_inv;
+        xi = r * h_inv;
         kernel_deval( xi , &wi , &wi_dx );
         
         pi->rho += mj * wi;
         pi->rho_dh -= mj * ( 3.0*wi + xi*wi_dx );
-        pi->density.wcount += wi * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
-        pi->density.wcount_dh -= xi * h_inv * wi_dx * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
-        // pi->icount += 1;
+        pi->density.wcount += wi;
+        pi->density.wcount_dh -= xi * wi_dx;
 
 	    pi->density.div_v += mj * dvdr * wi_dx;
 	    for ( k = 0 ; k < 3 ; k++ )
@@ -85,18 +83,16 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r
             
         }
 
-    if ( r2 < hj*hj ) {
+    if ( r2 < hj*hj*kernel_gamma2 ) {
         
         h_inv = 1.0 / hj;
-        hg_inv = kernel_igamma * h_inv;
-        xj = r * hg_inv;
+        xj = r * h_inv;
         kernel_deval( xj , &wj , &wj_dx );
         
         pj->rho += mi * wj;
         pj->rho_dh -= mi * ( 3.0*wj + xj*wj_dx );
-        pj->density.wcount += wj * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
-        pj->density.wcount_dh -= xj * h_inv * wj_dx * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
-        // pj->icount += 1;
+        pj->density.wcount += wj;
+        pj->density.wcount_dh -= xj * wj_dx;
         
 	    pj->density.div_v += mi * dvdr * wj_dx;
 	    for ( k = 0 ; k < 3 ; k++ )
@@ -115,7 +111,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo
 
     vector r, ri, xi, xj, hi, hj, hi_inv, hj_inv, hig_inv, hjg_inv, wi, wj, wi_dx, wj_dx;
     vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
-    vector mi, mj, wscale;
+    vector mi, mj;
     vector dx[3], dv[3];
     vector vi[3], vj[3];    
     vector dvdr, div_vi, div_vj;
@@ -144,19 +140,15 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo
             dx[k].v = _mm_set_ps( Dx[9+k] , Dx[6+k] , Dx[3+k] , Dx[0+k] );
     #endif
 
-    wscale.v = vec_set1( 4.0f * M_PI / 3.0f * kernel_igamma3 );
-    
     hi.v = vec_load( Hi );
     hi_inv.v = vec_rcp( hi.v );
     hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v  - vec_set1( 1.0f ) );
-    hig_inv.v = hi_inv.v * vec_set1( kernel_igamma );
-    xi.v = r.v * hig_inv.v;
+    xi.v = r.v * hi_inv.v;
 
     hj.v = vec_load( Hj );
     hj_inv.v = vec_rcp( hj.v );
     hj_inv.v = hj_inv.v - hj_inv.v * ( hj_inv.v * hj.v  - vec_set1( 1.0f ) );
-    hjg_inv.v = hj_inv.v * vec_set1( kernel_igamma );
-    xj.v = r.v * hjg_inv.v;
+    xj.v = r.v * hj_inv.v;
     
     kernel_deval_vec( &xi , &wi , &wi_dx );
     kernel_deval_vec( &xj , &wj , &wj_dx );
@@ -179,16 +171,16 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo
 
     rhoi.v = mj.v * wi.v;
     rhoi_dh.v = mj.v * ( vec_set1( 3.0f ) * wi.v + xi.v * wi_dx.v );
-    wcounti.v = wi.v * wscale.v;
-    wcounti_dh.v = xi.v * hi_inv.v * wi_dx.v * wscale.v;
+    wcounti.v = wi.v;
+    wcounti_dh.v = xi.v * wi_dx.v;
     div_vi.v = mj.v * dvdr.v * wi_dx.v;
     for ( k = 0 ; k < 3 ; k++ )
         curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;
         
     rhoj.v = mi.v * wj.v;
     rhoj_dh.v = mi.v * ( vec_set1( 3.0f ) * wj.v + xj.v * wj_dx.v );
-    wcountj.v = wj.v * wscale.v;
-    wcountj_dh.v = xj.v * hj_inv.v * wj_dx.v * wscale.v;
+    wcountj.v = wj.v;
+    wcountj_dh.v = xj.v * wj_dx.v;
     div_vj.v = mi.v * dvdr.v * wj_dx.v;
     for ( k = 0 ; k < 3 ; k++ )
         curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v;
@@ -230,14 +222,14 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_density (
 
     float r, ri;
     float xi;
-    float h_inv, hg_inv;
+    float h_inv;
     float wi, wi_dx;
     float mj;
     float dvdr;
     float dv[3], curlvr[3];
     int k;
 
-    if ( r2 < hi*hi ) {
+    if ( r2 < hi*hi*kernel_gamma2 ) {
         
         /* Get the masses. */
         mj = pj->mass;
@@ -261,15 +253,13 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_density (
             curlvr[k] *= ri;
             
         h_inv = 1.0 / hi;
-        hg_inv = kernel_igamma * h_inv;
-        xi = r * hg_inv;
+        xi = r * h_inv;
         kernel_deval( xi , &wi , &wi_dx );
         
         pi->rho += mj * wi;
         pi->rho_dh -= mj * ( 3.0*wi + xi*wi_dx );
-        pi->density.wcount += wi * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
-        pi->density.wcount_dh -= xi * h_inv * wi_dx * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
-        // pi->icount += 1;
+        pi->density.wcount += wi;
+        pi->density.wcount_dh -= xi * wi_dx;
 
 	    pi->density.div_v += mj * dvdr * wi_dx;
 	    for ( k = 0 ; k < 3 ; k++ )
@@ -289,7 +279,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit
 
     vector r, ri, xi, hi, hi_inv, hig_inv, wi, wi_dx;
     vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi;
-    vector mj, wscale;
+    vector mj;
     vector dx[3], dv[3];
     vector vi[3], vj[3];
     vector dvdr;
@@ -315,13 +305,11 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit
         for ( k = 0 ; k < 3 ; k++ )
             dx[k].v = _mm_set_ps( Dx[9+k] , Dx[6+k] , Dx[3+k] , Dx[0+k] );
     #endif
-    wscale.v = vec_set1( 4.0f * M_PI / 3.0f * kernel_igamma3 );
     
     hi.v = vec_load( Hi );
     hi_inv.v = vec_rcp( hi.v );
     hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v  - vec_set1( 1.0f ) );
-    hig_inv.v = hi_inv.v * vec_set1( kernel_igamma );
-    xi.v = r.v * hig_inv.v;
+    xi.v = r.v * hi_inv.v;
 
     kernel_deval_vec( &xi , &wi , &wi_dx );
     
@@ -343,8 +331,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit
 
     rhoi.v = mj.v * wi.v;
     rhoi_dh.v = mj.v * ( vec_set1( 3.0f ) * wi.v + xi.v * wi_dx.v );
-    wcounti.v = wi.v * wscale.v;
-    wcounti_dh.v = xi.v * hi_inv.v * wi_dx.v * wscale.v;
+    wcounti.v = wi.v;
+    wcounti_dh.v = xi.v * wi_dx.v;
     div_vi.v = mj.v * dvdr.v * wi_dx.v;
     for ( k = 0 ; k < 3 ; k++ )
         curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;
@@ -392,14 +380,14 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2
     POrho2j = pj->force.POrho2;
     
     /* Get the kernel for hi. */
-    hi_inv = kernel_igamma / hi;
+    hi_inv = 1.0f / hi;
     hi2_inv = hi_inv * hi_inv;
     xi = r * hi_inv;
     kernel_deval( xi , &wi , &wi_dx );
     wi_dr = hi2_inv * hi2_inv * wi_dx;
         
     /* Get the kernel for hj. */
-    hj_inv = kernel_igamma / hj;
+    hj_inv = 1.0f / hj;
     hj2_inv = hj_inv * hj_inv;
     xj = r * hj_inv;
     kernel_deval( xj , &wj , &wj_dx );
@@ -522,7 +510,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
     r.v = r2.v * ri.v;
     
     /* Get the kernel for hi. */
-    hi.v = vec_set1( kernel_gamma ) * vec_load( Hi );
+    hi.v = vec_load( Hi );
     hi_inv.v = vec_rcp( hi.v );
     hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) );
     hi2_inv.v = hi_inv.v * hi_inv.v;
@@ -531,7 +519,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
     wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
         
     /* Get the kernel for hj. */
-    hj.v = vec_set1( kernel_gamma ) * vec_load( Hj );
+    hj.v = vec_load( Hj );
     hj_inv.v = vec_rcp( hj.v );
     hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) );
     hj2_inv.v = hj_inv.v * hj_inv.v;
@@ -622,14 +610,14 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( fl
     POrho2j = pj->force.POrho2;
     
     /* Get the kernel for hi. */
-    hi_inv = kernel_igamma / hi;
+    hi_inv = 1.0f / hi;
     hi2_inv = hi_inv * hi_inv;
     xi = r * hi_inv;
     kernel_deval( xi , &wi , &wi_dx );
     wi_dr = hi2_inv * hi2_inv * wi_dx;
         
     /* Get the kernel for hj. */
-    hj_inv = kernel_igamma / hj;
+    hj_inv = 1.0f / hj;
     hj2_inv = hj_inv * hj_inv;
     xj = r * hj_inv;
     kernel_deval( xj , &wj , &wj_dx );
@@ -747,7 +735,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
     r.v = r2.v * ri.v;
     
     /* Get the kernel for hi. */
-    hi.v = vec_set1( kernel_gamma ) * vec_load( Hi );
+    hi.v = vec_load( Hi );
     hi_inv.v = vec_rcp( hi.v );
     hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) );
     hi2_inv.v = hi_inv.v * hi_inv.v;
@@ -756,7 +744,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
     wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
         
     /* Get the kernel for hj. */
-    hj.v = vec_set1( kernel_gamma ) * vec_load( Hj );
+    hj.v = vec_load( Hj );
     hj_inv.v = vec_rcp( hj.v );
     hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) );
     hj2_inv.v = hj_inv.v * hj_inv.v;
diff --git a/src/space.c b/src/space.c
index 3bbdcb993ff998d79ba86dadb6b47795aa716695..a5f9c211210ce218cb4253d705d30c50749116e7 100644
--- a/src/space.c
+++ b/src/space.c
@@ -34,6 +34,7 @@
 #include "atomic.h"
 #include "lock.h"
 #include "task.h"
+#include "kernel.h"
 #include "part.h"
 #include "cell.h"
 #include "space.h"
@@ -320,123 +321,6 @@ void space_rebuild_recycle ( struct space *s , struct cell *c ) {
     
     }
 
-/**
- * @brief Recursively rebuild a cell tree.
- *
- */
- 
-int space_rebuild_recurse ( struct space *s , struct cell *c ) {
-    
-    int k, count, changes = 0, wasmt[8];
-    float h, h_limit, h_max = 0.0f, dt_min = c->parts[0].dt, dt_max = dt_min;
-    struct cell *temp;
-    
-    /* Clean out the task pointers. */
-    c->sorts = NULL;
-    c->nr_tasks = 0;
-    c->nr_density = 0;
-    c->dx_max = 0.0;
-    
-    /* If the cell is already split, check that the split is still ok. */
-    if ( c->split ) {
-    
-        /* Check the depth. */
-        if ( c->depth > s->maxdepth )
-            s->maxdepth = c->depth;
-
-        /* Set the minimum cutoff. */
-        h_limit = fmin( c->h[0] , fmin( c->h[1] , c->h[2] ) ) / 2;
-
-        /* Count the particles below that. */
-        for ( count = 0 , k = 0 ; k < c->count ; k++ ) {
-            h = c->parts[k].h;
-            if ( h <= h_limit )
-                count += 1;
-            if ( h > h_max )
-                h_max = h;
-            if ( c->parts[k].dt < dt_min )
-                dt_min = c->parts[k].dt;
-            if ( c->parts[k].dt > dt_max )
-                dt_max = c->parts[k].dt;
-            }
-        c->h_max = h_max;
-        c->dt_min = dt_min;
-        c->dt_max = dt_max;
-        c->dx_max = 0;
-            
-        /* Un-split? */
-        if ( count < c->count*space_splitratio || c->count < space_splitsize ) {
-        
-            /* Get rid of the progeny. */
-            space_rebuild_recycle( s , c );
-            
-            /* Re-set the split flag. */
-            c->split = 0;
-        
-            }
-        
-        /* Otherwise, recurse on the kids. */
-        else {
-        
-            /* Populate all progeny. */
-            for ( k = 0 ; k < 8 ; k++ )
-                if ( ( wasmt[k] = ( c->progeny[k] == NULL ) ) ) {
-                    temp = space_getcell( s );
-                    temp->count = 0;
-                    temp->loc[0] = c->loc[0];
-                    temp->loc[1] = c->loc[1];
-                    temp->loc[2] = c->loc[2];
-                    temp->h[0] = c->h[0]/2;
-                    temp->h[1] = c->h[1]/2;
-                    temp->h[2] = c->h[2]/2;
-                    temp->dmin = c->dmin/2;
-                    if ( k & 4 )
-                        temp->loc[0] += temp->h[0];
-                    if ( k & 2 )
-                        temp->loc[1] += temp->h[1];
-                    if ( k & 1 )
-                        temp->loc[2] += temp->h[2];
-                    temp->depth = c->depth + 1;
-                    temp->split = 0;
-                    temp->h_max = 0.0;
-                    temp->dx_max = 0.0;
-                    temp->parent = c;
-                    c->progeny[k] = temp;
-                    }
-        
-            /* Make sure each part is in its place. */
-            cell_split( c );
-            
-            /* Remove empty progeny. */
-            for ( k = 0 ; k < 8 ; k++ )
-                if ( c->progeny[k]->count == 0 ) {
-                    changes += !wasmt[k];
-                    space_recycle( s , c->progeny[k] );
-                    c->progeny[k] = NULL;
-                    }
-                else
-                    changes += wasmt[k];
-        
-            /* Recurse. */
-            for ( k = 0 ; k < 8 ; k++ )
-                if ( c->progeny[k] != NULL )
-                    changes += space_rebuild_recurse( s , c->progeny[k] );
-                    
-            }
-    
-        }
-        
-    /* Otherwise, try to split it anyway. */
-    else {
-        space_split( s , c );
-        changes += c->split;
-        }
-        
-    /* Return the grand total. */
-    return changes;
-    
-    }
-
 /**
  * @brief Re-build the cells as well as the tasks.
  *
@@ -447,7 +331,7 @@ int space_rebuild_recurse ( struct space *s , struct cell *c ) {
  
 void space_rebuild ( struct space *s , double cell_max ) {
 
-    float h_max = s->cell_min, dmin;
+    float h_max = s->cell_min / kernel_gamma, dmin;
     int i, j, k, cdim[3], nr_parts = s->nr_parts;
     struct cell *restrict c;
     struct part *restrict finger, *restrict p, *parts = s->parts;
@@ -473,12 +357,12 @@ void space_rebuild ( struct space *s , double cell_max ) {
             }
         s->h_max = h_max;
         }
-    // printf( "space_rebuild: h_max is %.3e.\n" , h_max );
+    // printf( "space_rebuild: h_max is %.3e (cell_max=%.3e).\n" , h_max , cell_max );
     // printf( "space_rebuild: getting h_min and h_max took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
     
     /* Get the new putative cell dimensions. */
     for ( k = 0 ; k < 3 ; k++ )
-        cdim[k] = floor( s->dim[k] / fmax( h_max*space_stretch , cell_max ) );
+        cdim[k] = floor( s->dim[k] / fmax( h_max*kernel_gamma*space_stretch , cell_max ) );
         
     /* Do we need to re-build the upper-level cells? */
     // tic = getticks();
@@ -1014,7 +898,8 @@ void space_splittasks ( struct space *s ) {
                 
             /* Should this task be split-up? */
             if ( ci->split && cj->split &&
-                 ci->h_max*space_stretch < hi/2 && cj->h_max*space_stretch < hj/2 ) {
+                 ci->h_max*kernel_gamma*space_stretch < hi/2 &&
+                 cj->h_max*kernel_gamma*space_stretch < hj/2 ) {
                  
                 /* Replace by a single sub-task? */
                 if ( space_dosub &&
@@ -1421,6 +1306,7 @@ void space_split ( struct space *s , struct cell *c ) {
             temp->split = 0;
             temp->h_max = 0.0;
             temp->dx_max = 0.0;
+            temp->h2dx_max = 0.0;
             temp->parent = c;
             c->progeny[k] = temp;
             }
@@ -1428,10 +1314,6 @@ void space_split ( struct space *s , struct cell *c ) {
         /* Split the cell data. */
         cell_split( c );
             
-        /* Recurse? */
-        // for ( k = 0 ; k < 8 ; k++ )
-        //     space_split( s , c->progeny[k] );
-            
         /* Remove any progeny with zero parts. */
         for ( k = 0 ; k < 8 ; k++ )
             if ( c->progeny[k]->count == 0 ) {
diff --git a/src/space.h b/src/space.h
index d5cb42f14dff889d9c3f885461099bb6014f4ff8..51ddbd5c01e945176679529f11e7bc9729e5f0c6 100644
--- a/src/space.h
+++ b/src/space.h
@@ -26,7 +26,7 @@
 #define space_splitratio                0.875
 #define space_splitsize_default         400
 #define space_subsize_default           5000
-#define space_dosub                     1
+#define space_dosub                     0
 #define space_stretch                   1.05
 #define space_maxtaskspercell           31