diff --git a/src/Makefile.am b/src/Makefile.am
index 6acf9e47dd85e34e2d123496879d439c791ed60a..53fc189f38d20616bdc75a8ef8a2abcf49e8c87e 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -20,11 +20,11 @@
 AUTOMAKE_OPTIONS=gnu
 
 # Add the debug flag to the whole thing
-# AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
-#     -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
-#     -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
-AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \
+AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
+    -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
     -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
+# AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \
+#     -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
 
 # Assign a "safe" version number
 AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0
diff --git a/src/debug.c b/src/debug.c
index 4d885521bb814cdeb89c4494036d5ae3b5854a48..c11edffc943fd9f4af1856ae5060489138f23a13 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -1,6 +1,7 @@
 /*******************************************************************************
  * This file is part of SWIFT.
- * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ * Coypright (c) 2013 Matthieu Schaller (matthieu.schaller@durham.ac.uk),
+ *                    Pedro Gonnet (pedro.gonnet@durham.ac.uk).
  * 
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -29,7 +30,7 @@ void printParticle ( struct part *parts , long long int id ) {
     /* Look for the particle. */
     for ( i = 0 ; parts[i].id != id ; i++ );
 
-  printf("## Particle[%d]: id=%lld, x=(%f,%f,%f), v=(%f,%f,%f), a=(%f,%f,%f), h=%f, h_dt=%f, wcount=%f, m=%f, rho=%f, u=%f, dudt=%f, dt=%.3e\n",
+  printf("## Particle[%d]: id=%lld, x=[%f,%f,%f], v=[%f,%f,%f], a=[%f,%f,%f], h=%.3e, h_dt=%.3e, wcount=%f, m=%.3e, rho=%f, u=%f, dudt=%f, dt=%.3e\n",
 	 i,
 	 parts[i].id,
 	 parts[i].x[0], parts[i].x[1], parts[i].x[2],
diff --git a/src/engine.c b/src/engine.c
index 7513914b11e0ecf2d738764a181dc298ac49e04a..b101a1e34c94cc5a460fb23d457356e55eb6c5b1 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -66,7 +66,7 @@ void engine_prepare ( struct engine *e ) {
     int j, k, qid;
     struct space *s = e->s;
     struct queue *q;
-    float dt_max = e->dt_max;
+    float dt_step = e->dt_step;
     
     TIMER_TIC
 
@@ -95,7 +95,7 @@ void engine_prepare ( struct engine *e ) {
     // tic = getticks();
     #pragma omp parallel for schedule(static) 
     for ( k = 0 ; k < s->nr_parts ; k++ )
-        if ( s->parts[k].dt <= dt_max ) {
+        if ( s->parts[k].dt <= dt_step ) {
             s->parts[k].wcount = 0.0f;
             s->parts[k].wcount_dh = 0.0f;
             s->parts[k].rho = 0.0f;
@@ -179,71 +179,70 @@ void engine_step ( struct engine *e , int sort_queues ) {
 
     int k, nr_parts = e->s->nr_parts;
     struct part *restrict parts = e->s->parts, *restrict p;
-    float *restrict v_bar;
-    float dt = e->dt, hdt = 0.5*dt, dt_max, dt_min, ldt_min, ldt_max;
-    double etot = 0.0, letot, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 };
-    int threadID, nthreads;
+    struct xpart *restrict xp;
+    float dt = e->dt, hdt = 0.5*dt, dt_step, dt_max, dt_min, ldt_min, ldt_max;
+    double epot = 0.0, ekin = 0.0, lepot, lekin, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 };
+    int threadID, nthreads, count = 0, lcount;
     // #ifdef __SSE2__
     //     VEC_MACRO(4,float) hdtv = _mm_set1_ps( hdt );
     // #endif
 
     /* Get the maximum dt. */
-    dt_max = 2.0f*dt;
+    dt_step = 2.0f*dt;
     for ( k = 0 ; k < 32 && (e->step & (1 << k)) == 0 ; k++ )
-        dt_max *= 2;
-    dt_max = 1;
+        dt_step *= 2;
+    // dt_step = FLT_MAX;
         
     /* Set the maximum dt. */
-    e->dt_max = dt_max;
-    e->s->dt_max = dt_max;
-    printf( "engine_step: dt_max set to %.3e.\n" , dt_max ); fflush(stdout);
+    e->dt_step = dt_step;
+    e->s->dt_step = dt_step;
+    printf( "engine_step: dt_step set to %.3e.\n" , dt_step ); fflush(stdout);
     
-    /* Allocate a buffer for the old velocities. */
-    if ( posix_memalign( (void **)&v_bar , 16 , sizeof(float) * nr_parts * 4 ) != 0 )
-        error( "Failed to allocate v_old buffer." );
-        
     /* First kick. */
     TIMER_TIC
-    // #pragma omp parallel for schedule(static) private(p)
+    #pragma omp parallel for schedule(static) private(p,xp)
     for ( k = 0 ; k < nr_parts ; k++ ) {
         
         /* Get a handle on the part. */
         p = &parts[k];
+        xp = p->xtras;
         
         /* Step and store the velocity and internal energy. */
         // #ifdef __SSE__
-        //     _mm_store_ps( &v_bar[4*k] , _mm_add_ps( _mm_load_ps( &p->v[0] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) );
+        //     _mm_store_ps( &v_bar[4*k] , _mm_load_ps( &p->v[0] ) + hdtv * _mm_load_ps( &p->a[0] ) );
         // #else
-            v_bar[4*k+0] = p->v[0] + hdt * p->a[0];
-            v_bar[4*k+1] = p->v[1] + hdt * p->a[1];
-            v_bar[4*k+2] = p->v[2] + hdt * p->a[2];
+            xp->v_old[0] = p->v[0] + hdt * p->a[0];
+            xp->v_old[1] = p->v[1] + hdt * p->a[1];
+            xp->v_old[2] = p->v[2] + hdt * p->a[2];
         // #endif
-        v_bar[4*k+3] = p->u + hdt * p->u_dt;
+        // xp->u_old = fmaxf( p->u + hdt * p->u_dt , FLT_EPSILON );
+        xp->u_old = p->u + hdt * p->u_dt;
         
         /* Move the particles with the velocitie at the half-step. */
-        p->x[0] += dt * v_bar[4*k+0];
-        p->x[1] += dt * v_bar[4*k+1];
-        p->x[2] += dt * v_bar[4*k+2];
+        p->x[0] += dt * xp->v_old[0];
+        p->x[1] += dt * xp->v_old[1];
+        p->x[2] += dt * xp->v_old[2];
         
         /* Update positions and energies at the half-step. */
         p->v[0] += dt * p->a[0];
         p->v[1] += dt * p->a[1];
         p->v[2] += dt * p->a[2];
         p->u *= expf( p->u_dt / p->u * dt );
-        // p->h *= expf( p->h_dt / p->h * dt );
+        p->h *= expf( p->h_dt / p->h * dt );
         
         /* Integrate other values if this particle will not be updated. */
-        // if ( p->dt > dt_max ) {
-        //     p->rho *= expf( -3.0f * p->h_dt / p->h * dt );
-        //     p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f );
-        //     }
+        if ( p->dt > dt_step ) {
+            p->rho *= expf( -3.0f * p->h_dt / p->h * dt );
+            p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f );
+            }
         
         }
     TIMER_TOC( timer_kick1 );
         
     // for(k=0; k<10; ++k)
     //   printParticle(parts, k);
-
+    // printParticle( parts , 494849 );
+ 
     /* Prepare the space. */
     engine_prepare( e );
     
@@ -274,44 +273,52 @@ void engine_step ( struct engine *e , int sort_queues ) {
     
     // for(k=0; k<10; ++k)
     //   printParticle(parts, k);
+    // printParticle( parts , 494849 );
 
     /* Second kick. */
     TIMER_TIC_ND
     dt_min = FLT_MAX; dt_max = 0.0f;
-    #pragma omp parallel private(p,k,ldt_min,ldt_max,lmom,letot,threadID,nthreads)
+    #pragma omp parallel private(p,xp,k,ldt_min,lcount,ldt_max,lmom,lekin,lepot,threadID,nthreads)
     {
         threadID = omp_get_thread_num();
         nthreads = omp_get_num_threads();
         ldt_min = FLT_MAX; ldt_max = 0.0f;
         lmom[0] = 0.0; lmom[1] = 0.0; lmom[2] = 0.0;
-        letot = 0.0;
+        lekin = 0.0; lepot = 0.0; lcount = 0;
         for ( k = nr_parts * threadID / nthreads ; k < nr_parts * (threadID + 1) / nthreads ; k++ ) {
 
             /* Get a handle on the part. */
             p = &parts[k];
+            xp = p->xtras;
 
             /* Scale the derivatives if they're freshly computed. */
-            if ( p->dt <= dt_max ) {
+            if ( p->dt <= dt_step ) {
                 p->u_dt *= p->POrho2;
                 p->h_dt *= p->h * 0.333333333f;
+                lcount += 1;
                 }
+                
+            /* Update the particle's time step. */
+            p->dt = const_cfl * p->h / ( p->c + p->v_sig );
 
             /* Update positions and energies at the half-step. */
             // #ifdef __SSE__
-            //     _mm_store_ps( &p->v[0] , _mm_add_ps( _mm_load_ps( &v_bar[4*k] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) );
+            //     _mm_store_ps( &p->v[0] , _mm_load_ps( &v_bar[4*k] ) + hdtv * _mm_load_ps( &p->a[0] )  );
             // #else
-                p->v[0] = v_bar[4*k+0] + hdt * p->a[0];
-                p->v[1] = v_bar[4*k+1] + hdt * p->a[1];
-                p->v[2] = v_bar[4*k+2] + hdt * p->a[2];
+                p->v[0] = xp->v_old[0] + hdt * p->a[0];
+                p->v[1] = xp->v_old[1] + hdt * p->a[1];
+                p->v[2] = xp->v_old[2] + hdt * p->a[2];
             // #endif
-            p->u = v_bar[4*k+3] + hdt * p->u_dt;
-
+            // p->u = fmaxf( xp->u_old + hdt * p->u_dt , FLT_EPSILON );
+            p->u = xp->u_old + hdt * p->u_dt;
+            
             /* Get the smallest/largest dt. */
             ldt_min = fminf( ldt_min , p->dt );
             ldt_max = fmaxf( ldt_max , p->dt );
             
             /* Collect total energy. */
-            letot += 0.5 * p->mass * ( p->v[0]*p->v[0] + p->v[1]*p->v[1] + p->v[2]*p->v[2] ) + p->mass * p->u;
+            lekin += 0.5 * p->mass * ( p->v[0]*p->v[0] + p->v[1]*p->v[1] + p->v[2]*p->v[2] );
+            lepot += p->mass * p->u;
 
             /* Collect momentum */
             lmom[0] += p->mass * p->v[0];
@@ -326,31 +333,33 @@ void engine_step ( struct engine *e , int sort_queues ) {
             mom[0] += lmom[0];
             mom[1] += lmom[1];
             mom[2] += lmom[2];
-            etot += letot;
+            epot += lepot;
+            ekin += lekin;
+            count += lcount;
         }
     }
     TIMER_TOC( timer_kick2 );
     e->dt_min = dt_min;
     printf( "engine_step: dt_min/dt_max is %e/%e.\n" , dt_min , dt_max ); fflush(stdout);
-    printf( "engine_step: etot is %e.\n" , etot ); fflush(stdout);
+    printf( "engine_step: etot is %e (ekin=%e, epot=%e).\n" , ekin+epot , ekin , epot ); fflush(stdout);
     printf( "engine_step: total momentum is [ %e , %e , %e ].\n" , mom[0] , mom[1] , mom[2] ); fflush(stdout);
+    printf( "engine_step: updated %i parts (dt_step=%.3e).\n" , count , dt_step ); fflush(stdout);
         
+    /* Increase the step counter. */
+    e->step += 1;
+    
     /* Does the time step need adjusting? */
-    if ( dt_min < e->dt ) {
+    while ( dt_min < e->dt ) {
         e->dt *= 0.5;
+        e->step *= 2;
         printf( "engine_step: dt_min dropped below time step, adjusting to dt=%e.\n" , e->dt );
         }
-    else if ( dt_min > 2*e->dt ) {
+    while ( dt_min > 2*e->dt && (e->step & 1) == 0 ) {
         e->dt *= 2.0;
+        e->step /= 2;
         printf( "engine_step: dt_min is larger than twice the time step, adjusting to dt=%e.\n" , e->dt );
         }
     
-    /* Clean up. */
-    free( v_bar );
-    
-    /* Increase the step counter. */
-    e->step += 1;
-    
     }
     
     
diff --git a/src/engine.h b/src/engine.h
index 74a059addc7a5ac28854d808a80a9df2e29e4a94..f872f3cef703dfc58b4ed6dddfc365cc9268cfe3 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -50,8 +50,10 @@ struct engine {
     /* The queues. */
     struct queue *queues;
     
-    /* The maximum dt to step. */
-    float dt_max;
+    /* The maximum dt to step (current). */
+    float dt_step;
+    
+    /* The minimum dt over all particles in the system. */
     float dt_min;
     
     /* The system time step. */
diff --git a/src/part.h b/src/part.h
index d4c2f2ea5f21e9245308836b098db675c4051342..bfa88567b79b83e4656ce66c3c3aa5edb6cc634e 100644
--- a/src/part.h
+++ b/src/part.h
@@ -38,6 +38,21 @@ struct cpart {
     
     } __attribute__((aligned (32)));
     
+    
+/* Extra particle data not needed during the computation. */
+struct xpart {
+
+    /* Old position, at last tree rebuild. */
+    double x_old[3];
+    
+    /* Old velocity. */
+    float v_old[3];
+    
+    /* Old entropy. */
+    float u_old;
+    
+    } __attribute__((aligned (32)));
+    
 
 /* Data of a single particle. */
 struct part {
@@ -66,6 +81,9 @@ struct part {
     /* Particle acceleration. */
     float a[3] __attribute__((aligned (16)));
     
+    /* Maximum neighbouring u. */
+    float c, v_sig;
+    
     /* Derivative of the density with respect to this particle's smoothing length. */
     float rho_dh;
     
@@ -80,8 +98,8 @@ struct part {
     /* Particle ID. */
     unsigned long long id;
     
-    /* Old position, at last tree rebuild. */
-    double x_old[3];
+    /* Pointer to extra particle data. */
+    struct xpart *xtras;
     
     /* Particle position. */
     double x[3];
diff --git a/src/runner.c b/src/runner.c
index de4d5b7db7ce9c7d60fdb202f22dd82d3eba5a6c..9b74876d6f492ba277c6e64cfd06d4beb6060571 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -329,8 +329,8 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
     struct cell *finger;
     int i, k, redo, count = c->count;
     int *pid;
-    float ihg, ihg2;
-    float dt_max = r->e->dt_max;
+    float ihg, ihg2, h_corr;
+    float dt_step = r->e->dt_step;
     TIMER_TIC
     
     /* Recurse? */
@@ -361,7 +361,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
             cp = &c->cparts[ pid[i] ];
             
             /* Is this part within the timestep? */
-            if ( cp->dt <= dt_max ) {
+            if ( cp->dt <= dt_step ) {
 
                 /* Adjust the computed rho. */
                 ihg = kernel_igamma / p->h;
@@ -370,8 +370,15 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
                 p->rho_dh *= ihg2 * ihg2;
                 p->wcount += kernel_wroot;
 
-                /* Update the smoothing length. */
-                p->h -= ( p->wcount - const_nwneigh ) / p->wcount_dh;
+                /* Compute the smoothing length update (Newton step). */
+                h_corr = ( const_nwneigh - p->wcount ) / p->wcount_dh;
+                
+                /* Truncate to the range [ -p->h/2 , p->h ]. */
+                h_corr = fminf( h_corr , p->h );
+                h_corr = fmaxf( h_corr , -p->h/2 );
+                
+                /* Apply the correction to p->h. */
+                p->h += h_corr;
                 cp->h = p->h;
 
                 /* Did we get the right number density? */
@@ -389,8 +396,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
                     }
                     
                 /* Compute this particle's time step. */
-                p->dt = const_cfl * p->h / sqrtf( const_gamma * ( const_gamma - 1.0f ) * p->u );
-                cp->dt = p->dt;
+                p->c = sqrtf( const_gamma * ( const_gamma - 1.0f ) * p->u );
 
                 /* Compute the pressure. */
                 // p->P = p->rho * p->u * ( const_gamma - 1.0f );
@@ -405,6 +411,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
                 /* Reset the time derivatives. */
                 p->u_dt = 0.0f;
                 p->h_dt = 0.0f;
+                p->v_sig = 0.0f;
 
                 }
 
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index f22f9a6ad5b5c0d0aa8866cbc3acb63f88ebe57e..8436e7c06b24f4b8b6725e4be031d96d1448bb0e 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -106,7 +106,7 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r
     struct cpart *restrict cpi, *restrict cparts_i = ci->cparts;
     double pix[3];
     float dx[3], hi, hi2, r2;
-    float dt_max = e->dt_max;
+    float dt_step = e->dt_step;
     #ifdef VECTORIZE
         int icount = 0;
         float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
@@ -118,7 +118,7 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r
     TIMER_TIC
     
     /* Anything to do here? */
-    if ( ci->dt_min > dt_max && cj->dt_min > dt_max )
+    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
         return;
     
     /* Get the relative distance between the pairs, wrapping. */
@@ -215,7 +215,7 @@ void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) {
     struct cpart *restrict cpi,  *restrict cpj,*restrict cparts = c->cparts;
     double pix[3];
     float dx[3], hi, hi2, r2;
-    float dt_max = r->e->dt_max;
+    float dt_step = r->e->dt_step;
     #ifdef VECTORIZE
         int icount = 0;
         float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
@@ -227,7 +227,7 @@ void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) {
     TIMER_TIC
     
     /* Anything to do here? */
-    if ( c->dt_min > dt_max )
+    if ( c->dt_min > dt_step )
         return;
     
     /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
@@ -638,7 +638,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
     double hi_max, hj_max;
     double di_max, dj_min;
     int count_i, count_j;
-    float dt_max = e->dt_max;
+    float dt_step = e->dt_step;
     #ifdef VECTORIZE
         int icount = 0;
         float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
@@ -650,7 +650,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
     TIMER_TIC
     
     /* Anything to do here? */
-    if ( ci->dt_min > dt_max && cj->dt_min > dt_max )
+    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
         return;
         
     /* Get the sort ID. */
@@ -722,7 +722,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
         /* Get a hold of the ith part in ci. */
         pi = &parts_i[ sort_i[ pid ].i ];
         cpi = &cparts_i[ sort_i[ pid ].i ];
-        if ( cpi->dt > dt_max )
+        if ( cpi->dt > dt_step )
             continue;
         hi = cpi->h;
         di = sort_i[pid].d + hi - rshift;
@@ -789,7 +789,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
         /* Get a hold of the jth part in cj. */
         pj = &parts_j[ sort_j[ pjd ].i ];
         cpj = &cparts_j[ sort_j[ pjd ].i ];
-        if ( cpj->dt > dt_max )
+        if ( cpj->dt > dt_step )
             continue;
         hj = cpj->h;
         dj = sort_j[pjd].d - hj - rshift;
@@ -879,19 +879,25 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
     double hi_max, hj_max;
     double di_max, dj_min;
     int count_i, count_j;
-    float dt_max = e->dt_max;
+    float dt_step = e->dt_step;
     #ifdef VECTORIZE
-        int icount = 0;
-        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+        int icount1 = 0;
+        float r2q1[VEC_SIZE] __attribute__ ((aligned (16)));
+        float hiq1[VEC_SIZE] __attribute__ ((aligned (16)));
+        float hjq1[VEC_SIZE] __attribute__ ((aligned (16)));
+        float dxq1[3*VEC_SIZE] __attribute__ ((aligned (16)));
+        struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE];
+        int icount2 = 0;
+        float r2q2[VEC_SIZE] __attribute__ ((aligned (16)));
+        float hiq2[VEC_SIZE] __attribute__ ((aligned (16)));
+        float hjq2[VEC_SIZE] __attribute__ ((aligned (16)));
+        float dxq2[3*VEC_SIZE] __attribute__ ((aligned (16)));
+        struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
     #endif
     TIMER_TIC
     
     /* Anything to do here? */
-    if ( ci->dt_min > dt_max && cj->dt_min > dt_max )
+    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
         return;
         
     /* Get the shift ID. */
@@ -931,28 +937,28 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
     dj_min = sort_j[0].d;
     
     /* Collect the number of parts left and right below dt. */
-    if ( cj->dt_min > dt_max ) {
+    if ( ci->dt_max <= dt_step ) {
         sortdt_i = sort_i;
         countdt_i = count_i;
         }
-    else if ( cj->dt_max > dt_max ) {
+    else if ( ci->dt_min <= dt_step ) {
         if ( ( sortdt_i = (struct entry *)alloca( sizeof(struct entry) * count_i ) ) == NULL )
             error( "Failed to allocate dt sortlists." );
         for ( k = 0 ; k < count_i ; k++ )
-            if ( cparts_i[ sort_i[ k ].i ].dt <= dt_max ) {
+            if ( cparts_i[ sort_i[ k ].i ].dt <= dt_step ) {
                 sortdt_i[ countdt_i ] = sort_i[k];
                 countdt_i += 1;
                 }
         }
-    if ( ci->dt_min > dt_max ) {
+    if ( cj->dt_max <= dt_step ) {
         sortdt_j = sort_j;
         countdt_j = count_j;
         }
-    else if ( ci->dt_max > dt_max ) {
+    else if ( cj->dt_min <= dt_step ) {
         if ( ( sortdt_j = (struct entry *)alloca( sizeof(struct entry) * count_j ) ) == NULL )
             error( "Failed to allocate dt sortlists." );
         for ( k = 0 ; k < count_j ; k++ )
-            if ( cparts_j[ sort_j[ k ].i ].dt <= dt_max ) {
+            if ( cparts_j[ sort_j[ k ].i ].dt <= dt_step ) {
                 sortdt_j[ countdt_j ] = sort_j[k];
                 countdt_j += 1;
                 }
@@ -977,7 +983,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
             pix[k] = cpi->x[k] - shift[k];
             
         /* Look at valid dt parts only? */
-        if ( cpi->dt > dt_max ) {
+        if ( cpi->dt > dt_step ) {
         
             /* Loop over the parts in cj within dt. */
             for ( pjd = 0 ; pjd < countdt_j && sortdt_j[pjd].d < di ; pjd++ ) {
@@ -988,7 +994,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
                 /* Compute the pairwise distance. */
                 r2 = 0.0f;
                 for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = pix[k] - cpj->x[k];
+                    dx[k] = cpj->x[k] - pix[k];
                     r2 += dx[k]*dx[k];
                     }
 
@@ -997,25 +1003,25 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
 
                     #ifndef VECTORIZE
 
-                        IACT( r2 , dx , hi , cpj->h , pi , &parts_j[ sortdt_j[pjd].i ] );
+                        IACT_NONSYM( r2 , dx , cpj->h , hi , &parts_j[ sortdt_j[pjd].i ] , pi );
 
                     #else
 
                         /* Add this interaction to the queue. */
-                        r2q[icount] = r2;
-                        dxq[3*icount+0] = dx[0];
-                        dxq[3*icount+1] = dx[1];
-                        dxq[3*icount+2] = dx[2];
-                        hiq[icount] = hi;
-                        hjq[icount] = cpj->h;
-                        piq[icount] = pi;
-                        pjq[icount] = &parts_j[ sortdt_j[ pjd ].i ];
-                        icount += 1;
+                        r2q1[icount1] = r2;
+                        dxq1[3*icount1+0] = dx[0];
+                        dxq1[3*icount1+1] = dx[1];
+                        dxq1[3*icount1+2] = dx[2];
+                        hiq1[icount1] = cpj->h;
+                        hjq1[icount1] = hi;
+                        piq1[icount1] = &parts_j[ sortdt_j[ pjd ].i ];
+                        pjq1[icount1] = pi;
+                        icount1 += 1;
 
                         /* Flush? */
-                        if ( icount == VEC_SIZE ) {
-                            IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                            icount = 0;
+                        if ( icount1 == VEC_SIZE ) {
+                            IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
+                            icount1 = 0;
                             }
 
                     #endif
@@ -1047,25 +1053,55 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
 
                     #ifndef VECTORIZE
 
-                        IACT( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[pjd].i ] );
+                        /* Does pj need to be updated too? */
+                        if ( cpj->dt <= dt_step )
+                            IACT( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[pjd].i ] );
+                        else
+                            IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[pjd].i ] );
 
                     #else
 
-                        /* Add this interaction to the queue. */
-                        r2q[icount] = r2;
-                        dxq[3*icount+0] = dx[0];
-                        dxq[3*icount+1] = dx[1];
-                        dxq[3*icount+2] = dx[2];
-                        hiq[icount] = hi;
-                        hjq[icount] = cpj->h;
-                        piq[icount] = pi;
-                        pjq[icount] = &parts_j[ sort_j[ pjd ].i ];
-                        icount += 1;
-
-                        /* Flush? */
-                        if ( icount == VEC_SIZE ) {
-                            IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                            icount = 0;
+                        /* Does pj need to be updated too? */
+                        if ( cpj->dt <= dt_step ) {
+                        
+                            /* Add this interaction to the symmetric queue. */
+                            r2q2[icount2] = r2;
+                            dxq2[3*icount2+0] = dx[0];
+                            dxq2[3*icount2+1] = dx[1];
+                            dxq2[3*icount2+2] = dx[2];
+                            hiq2[icount2] = hi;
+                            hjq2[icount2] = cpj->h;
+                            piq2[icount2] = pi;
+                            pjq2[icount2] = &parts_j[ sort_j[ pjd ].i ];
+                            icount2 += 1;
+
+                            /* Flush? */
+                            if ( icount2 == VEC_SIZE ) {
+                                IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 );
+                                icount2 = 0;
+                                }
+                                
+                            }
+                            
+                        else {
+                        
+                            /* Add this interaction to the non-symmetric queue. */
+                            r2q1[icount1] = r2;
+                            dxq1[3*icount1+0] = dx[0];
+                            dxq1[3*icount1+1] = dx[1];
+                            dxq1[3*icount1+2] = dx[2];
+                            hiq1[icount1] = hi;
+                            hjq1[icount1] = cpj->h;
+                            piq1[icount1] = pi;
+                            pjq1[icount1] = &parts_j[ sort_j[ pjd ].i ];
+                            icount1 += 1;
+
+                            /* Flush? */
+                            if ( icount1 == VEC_SIZE ) {
+                                IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
+                                icount1 = 0;
+                                }
+                                
                             }
 
                     #endif
@@ -1097,7 +1133,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
         hj2 = hj * hj;
         
         /* Is this particle outside the dt? */
-        if ( cpj->dt > dt_max ) {
+        if ( cpj->dt > dt_step ) {
         
             /* Loop over the parts in ci. */
             for ( pid = countdt_i-1 ; pid >= 0 && sortdt_i[pid].d > dj ; pid-- ) {
@@ -1108,7 +1144,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
                 /* Compute the pairwise distance. */
                 r2 = 0.0f;
                 for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = cpi->x[k] - pjx[k];
+                    dx[k] = pjx[k] - cpi->x[k];
                     r2 += dx[k]*dx[k];
                     }
 
@@ -1117,25 +1153,25 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
 
                     #ifndef VECTORIZE
 
-                        IACT( r2 , dx , hj , cpi->h , pj , &parts_i[ sortdt_i[pid].i ] );
+                        IACT_NONSYM( r2 , dx , cpi->h , hj , &parts_i[ sortdt_i[pid].i ] , pj );
 
                     #else
 
                         /* Add this interaction to the queue. */
-                        r2q[icount] = r2;
-                        dxq[3*icount+0] = dx[0];
-                        dxq[3*icount+1] = dx[1];
-                        dxq[3*icount+2] = dx[2];
-                        hiq[icount] = hj;
-                        hjq[icount] = cpi->h;
-                        piq[icount] = pj;
-                        pjq[icount] = &parts_i[ sortdt_i[ pid ].i ];
-                        icount += 1;
+                        r2q1[icount1] = r2;
+                        dxq1[3*icount1+0] = dx[0];
+                        dxq1[3*icount1+1] = dx[1];
+                        dxq1[3*icount1+2] = dx[2];
+                        hiq1[icount1] = cpi->h;
+                        hjq1[icount1] = hj;
+                        piq1[icount1] = &parts_i[ sortdt_i[ pid ].i ];
+                        pjq1[icount1] = pj;
+                        icount1 += 1;
 
                         /* Flush? */
-                        if ( icount == VEC_SIZE ) {
-                            IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                            icount = 0;
+                        if ( icount1 == VEC_SIZE ) {
+                            IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
+                            icount1 = 0;
                             }
 
                     #endif
@@ -1166,25 +1202,55 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
 
                     #ifndef VECTORIZE
 
-                        IACT( r2 , dx , hj , cpi->h , pj , &parts_i[ sort_i[pid].i ] );
+                        /* Does pi need to be updated too? */
+                        if ( cpi->dt <= dt_step )
+                            IACT( r2 , dx , hj , cpi->h , pj , &parts_i[ sort_i[pid].i ] );
+                        else
+                            IACT_NONSYM( r2 , dx , hj , cpi->h , pj , &parts_i[ sort_i[pid].i ] );
 
                     #else
 
-                        /* Add this interaction to the queue. */
-                        r2q[icount] = r2;
-                        dxq[3*icount+0] = dx[0];
-                        dxq[3*icount+1] = dx[1];
-                        dxq[3*icount+2] = dx[2];
-                        hiq[icount] = hj;
-                        hjq[icount] = cpi->h;
-                        piq[icount] = pj;
-                        pjq[icount] = &parts_i[ sort_i[ pid ].i ];
-                        icount += 1;
-
-                        /* Flush? */
-                        if ( icount == VEC_SIZE ) {
-                            IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                            icount = 0;
+                        /* Does pi need to be updated too? */
+                        if ( cpi->dt <= dt_step ) {
+                        
+                            /* Add this interaction to the symmetric queue. */
+                            r2q2[icount2] = r2;
+                            dxq2[3*icount2+0] = dx[0];
+                            dxq2[3*icount2+1] = dx[1];
+                            dxq2[3*icount2+2] = dx[2];
+                            hiq2[icount2] = hj;
+                            hjq2[icount2] = cpi->h;
+                            piq2[icount2] = pj;
+                            pjq2[icount2] = &parts_i[ sort_i[ pid ].i ];
+                            icount2 += 1;
+
+                            /* Flush? */
+                            if ( icount2 == VEC_SIZE ) {
+                                IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 );
+                                icount2 = 0;
+                                }
+                                
+                            }
+                            
+                        else {
+                        
+                            /* Add this interaction to the non-summetric queue. */
+                            r2q1[icount1] = r2;
+                            dxq1[3*icount1+0] = dx[0];
+                            dxq1[3*icount1+1] = dx[1];
+                            dxq1[3*icount1+2] = dx[2];
+                            hiq1[icount1] = hj;
+                            hjq1[icount1] = cpi->h;
+                            piq1[icount1] = pj;
+                            pjq1[icount1] = &parts_i[ sort_i[ pid ].i ];
+                            icount1 += 1;
+
+                            /* Flush? */
+                            if ( icount1 == VEC_SIZE ) {
+                                IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
+                                icount1 = 0;
+                                }
+                                
                             }
 
                     #endif
@@ -1199,9 +1265,12 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
 
     #ifdef VECTORIZE
     /* Pick up any leftovers. */
-    if ( icount > 0 )
-        for ( k = 0 ; k < icount ; k++ )
-            IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
+    if ( icount1 > 0 )
+        for ( k = 0 ; k < icount1 ; k++ )
+            IACT_NONSYM( r2q1[k] , &dxq1[3*k] , hiq1[k] , hjq1[k] , piq1[k] , pjq1[k] );
+    if ( icount2 > 0 )
+        for ( k = 0 ; k < icount2 ; k++ )
+            IACT( r2q2[k] , &dxq2[3*k] , hiq2[k] , hjq2[k] , piq2[k] , pjq2[k] );
     #endif
     
     #ifdef TIMER_VERBOSE
@@ -1227,7 +1296,7 @@ void DOSELF1 ( struct runner *r , struct cell *restrict c ) {
     float dx[3], hi, hi2, r2;
     struct part *restrict parts = c->parts, *restrict pi;
     struct cpart *restrict cpi, *restrict cpj, *restrict cparts = c->cparts;
-    float dt_max = r->e->dt_max;
+    float dt_step = r->e->dt_step;
     #ifdef VECTORIZE
         int icount = 0;
         float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
@@ -1239,7 +1308,7 @@ void DOSELF1 ( struct runner *r , struct cell *restrict c ) {
     TIMER_TIC
     
     /* Any work here? */
-    if ( c->dt_min > dt_max )
+    if ( c->dt_min > dt_step )
         return;
     
     /* Loop over the particles in the cell. */
@@ -1248,7 +1317,7 @@ void DOSELF1 ( struct runner *r , struct cell *restrict c ) {
         /* Get a pointer to the ith particle. */
         cpi = &cparts[pid];
         pi = &parts[pid];
-        if ( cpi->dt > dt_max )
+        if ( cpi->dt > dt_step )
             continue;
             
         /* Get the particle position and radius. */
@@ -1331,26 +1400,32 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) {
     float dx[3], hi, hi2, r2;
     struct part *restrict parts = c->parts, *restrict pi;
     struct cpart *restrict cpi, *restrict cpj, *restrict cparts = c->cparts;
-    float dt_max = r->e->dt_max;
+    float dt_step = r->e->dt_step;
     int firstdt = 0, countdt = 0, *indt = NULL;
     #ifdef VECTORIZE
-        int icount = 0;
-        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
-        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
-        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+        int icount1 = 0;
+        float r2q1[VEC_SIZE] __attribute__ ((aligned (16)));
+        float hiq1[VEC_SIZE] __attribute__ ((aligned (16)));
+        float hjq1[VEC_SIZE] __attribute__ ((aligned (16)));
+        float dxq1[3*VEC_SIZE] __attribute__ ((aligned (16)));
+        struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE];
+        int icount2 = 0;
+        float r2q2[VEC_SIZE] __attribute__ ((aligned (16)));
+        float hiq2[VEC_SIZE] __attribute__ ((aligned (16)));
+        float hjq2[VEC_SIZE] __attribute__ ((aligned (16)));
+        float dxq2[3*VEC_SIZE] __attribute__ ((aligned (16)));
+        struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
     #endif
     TIMER_TIC
     
     /* Set up indt if needed. */
-    if ( c->dt_min > dt_max )
+    if ( c->dt_min > dt_step )
         return;
-    else if ( c->dt_max > dt_max ) {
+    else if ( c->dt_max > dt_step ) {
         if ( ( indt = (int *)alloca( sizeof(int) * count ) ) == NULL )
             error( "Failed to allocate indt." );
         for ( k = 0 ; k < count ; k++ )
-            if ( cparts[k].dt <= dt_max ) {
+            if ( cparts[k].dt <= dt_step ) {
                 indt[ countdt ] = k;
                 countdt += 1;
                 }
@@ -1370,7 +1445,7 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) {
         hi2 = hi * hi;
         
         /* Is the ith particle not active? */
-        if ( cpi->dt > dt_max ) {
+        if ( cpi->dt > dt_step ) {
         
             /* Loop over the other particles .*/
             for ( pjd = firstdt ; pjd < countdt ; pjd++ ) {
@@ -1381,7 +1456,7 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) {
                 /* Compute the pairwise distance. */
                 r2 = 0.0f;
                 for ( k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = pix[k] - cpj->x[k];
+                    dx[k] = cpj->x[k] - pix[k];
                     r2 += dx[k]*dx[k];
                     }
 
@@ -1390,25 +1465,25 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) {
 
                     #ifndef VECTORIZE
 
-                        IACT( r2 , dx , hi , cpj->h , pi , &parts[indt[pjd]] );
+                        IACT_NONSYM( r2 , dx , cpj->h , hi , &parts[indt[pjd]] , pi );
 
                     #else
 
                         /* Add this interaction to the queue. */
-                        r2q[icount] = r2;
-                        dxq[3*icount+0] = dx[0];
-                        dxq[3*icount+1] = dx[1];
-                        dxq[3*icount+2] = dx[2];
-                        hiq[icount] = hi;
-                        hjq[icount] = cpj->h;
-                        piq[icount] = pi;
-                        pjq[icount] = &parts[indt[pjd]];
-                        icount += 1;
+                        r2q1[icount1] = r2;
+                        dxq1[3*icount1+0] = dx[0];
+                        dxq1[3*icount1+1] = dx[1];
+                        dxq1[3*icount1+2] = dx[2];
+                        hiq1[icount1] = cpj->h;
+                        hjq1[icount1] = hi;
+                        piq1[icount1] = &parts[indt[pjd]];
+                        pjq1[icount1] = pi;
+                        icount1 += 1;
 
                         /* Flush? */
-                        if ( icount == VEC_SIZE ) {
-                            IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                            icount = 0;
+                        if ( icount1 == VEC_SIZE ) {
+                            IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
+                            icount1 = 0;
                             }
 
                     #endif
@@ -1443,25 +1518,55 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) {
 
                     #ifndef VECTORIZE
 
-                        IACT( r2 , dx , hi , cpj->h , pi , &parts[pjd] );
+                        /* Does pj need to be updated too? */
+                        if ( cpj->dt <= dt_step )
+                            IACT( r2 , dx , hi , cpj->h , pi , &parts[pjd] );
+                        else
+                            IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts[pjd] );
 
                     #else
 
-                        /* Add this interaction to the queue. */
-                        r2q[icount] = r2;
-                        dxq[3*icount+0] = dx[0];
-                        dxq[3*icount+1] = dx[1];
-                        dxq[3*icount+2] = dx[2];
-                        hiq[icount] = hi;
-                        hjq[icount] = cpj->h;
-                        piq[icount] = pi;
-                        pjq[icount] = &parts[pjd];
-                        icount += 1;
-
-                        /* Flush? */
-                        if ( icount == VEC_SIZE ) {
-                            IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
-                            icount = 0;
+                        /* Does pj need to be updated too? */
+                        if ( cpj->dt <= dt_step ) {
+                        
+                            /* Add this interaction to the symmetric queue. */
+                            r2q2[icount2] = r2;
+                            dxq2[3*icount2+0] = dx[0];
+                            dxq2[3*icount2+1] = dx[1];
+                            dxq2[3*icount2+2] = dx[2];
+                            hiq2[icount2] = hi;
+                            hjq2[icount2] = cpj->h;
+                            piq2[icount2] = pi;
+                            pjq2[icount2] = &parts[pjd];
+                            icount2 += 1;
+
+                            /* Flush? */
+                            if ( icount2 == VEC_SIZE ) {
+                                IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 );
+                                icount2 = 0;
+                                }
+                                
+                            }
+                            
+                        else {
+                        
+                            /* Add this interaction to the non-symmetric queue. */
+                            r2q1[icount1] = r2;
+                            dxq1[3*icount1+0] = dx[0];
+                            dxq1[3*icount1+1] = dx[1];
+                            dxq1[3*icount1+2] = dx[2];
+                            hiq1[icount1] = hi;
+                            hjq1[icount1] = cpj->h;
+                            piq1[icount1] = pi;
+                            pjq1[icount1] = &parts[pjd];
+                            icount1 += 1;
+
+                            /* Flush? */
+                            if ( icount1 == VEC_SIZE ) {
+                                IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
+                                icount1 = 0;
+                                }
+                                
                             }
 
                     #endif
@@ -1476,9 +1581,12 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) {
 
     #ifdef VECTORIZE
     /* Pick up any leftovers. */
-    if ( icount > 0 )
-        for ( k = 0 ; k < icount ; k++ )
-            IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
+    if ( icount1 > 0 )
+        for ( k = 0 ; k < icount1 ; k++ )
+            IACT_NONSYM( r2q1[k] , &dxq1[3*k] , hiq1[k] , hjq1[k] , piq1[k] , pjq1[k] );
+    if ( icount2 > 0 )
+        for ( k = 0 ; k < icount2 ; k++ )
+            IACT( r2q2[k] , &dxq2[3*k] , hiq2[k] , hjq2[k] , piq2[k] , pjq2[k] );
     #endif
     
     #ifdef TIMER_VERBOSE
@@ -1506,7 +1614,7 @@ void DOSUB1 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) {
     double shift[3];
     float h;
     struct space *s = r->e->s;
-    float dt_max = r->e->dt_max;
+    float dt_step = r->e->dt_step;
 
     TIMER_TIC
     
@@ -1514,7 +1622,7 @@ void DOSUB1 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) {
     if ( cj == NULL ) {
 
         /* Should we even bother? */    
-        if ( ci->dt_min > dt_max )
+        if ( ci->dt_min > dt_step )
             return;
         
         /* Recurse? */
@@ -1541,7 +1649,7 @@ void DOSUB1 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) {
     else {
     
         /* Should we even bother? */    
-        if ( ci->dt_min > dt_max && cj->dt_min > dt_max )
+        if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
             return;
         
         /* Get the cell dimensions. */
@@ -1776,7 +1884,7 @@ void DOSUB2 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) {
     double shift[3];
     float h;
     struct space *s = r->e->s;
-    float dt_max = r->e->dt_max;
+    float dt_step = r->e->dt_step;
 
     TIMER_TIC
     
@@ -1784,7 +1892,7 @@ void DOSUB2 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) {
     if ( cj == NULL ) {
     
         /* Should we even bother? */    
-        if ( ci->dt_min > dt_max )
+        if ( ci->dt_min > dt_step )
             return;
         
         /* Recurse? */
@@ -1811,7 +1919,7 @@ void DOSUB2 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) {
     else {
     
         /* Should we even bother? */    
-        if ( ci->dt_min > dt_max && cj->dt_min > dt_max )
+        if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
             return;
         
         /* Get the cell dimensions. */
diff --git a/src/runner_iact.h b/src/runner_iact.h
index f4b117ffa68fb65ec788c990e3ef626503a4f3b1..70d07a6b658554e1583b5ac89dc5d30b19863f3e 100644
--- a/src/runner_iact.h
+++ b/src/runner_iact.h
@@ -329,26 +329,26 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2
 
     float r = sqrtf( r2 ), ri = 1.0f / r;
     float xi, xj;
-    float hig_inv, hig2_inv;
-    float hjg_inv, hjg2_inv;
+    float hi_inv, hi2_inv;
+    float hj_inv, hj2_inv;
     float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr;
     float f;
     int k;
     
     /* Get the kernel for hi. */
-    hig_inv = kernel_igamma / hi;
-    hig2_inv = hig_inv * hig_inv;
-    xi = r * hig_inv;
+    hi_inv = 1.0f / hi;
+    hi2_inv = hi_inv * hi_inv;
+    xi = r * hi_inv * kernel_igamma;
     kernel_deval( xi , &wi , &wi_dx );
-    wi_dr = hig2_inv * hig2_inv * wi_dx;
+    wi_dr = hi2_inv * hi2_inv * wi_dx;
         
     /* Get the kernel for hj. */
-    hjg_inv = kernel_igamma / hj;
-    hjg2_inv = hjg_inv * hjg_inv;
-    xj = r * hjg_inv;
+    hj_inv = 1.0f / hj;
+    hj2_inv = hj_inv * hj_inv;
+    xj = r * hj_inv * kernel_igamma;
     kernel_deval( xj , &wj , &wj_dx );
-    wj_dr = hjg2_inv * hjg2_inv * wj_dx;
-    
+    wj_dr = hj2_inv * hj2_inv * wj_dx;
+        
     /* Get the common factor out. */
     w = ri * ( pi->POrho2 * wi_dr + pj->POrho2 * wj_dr );
         
@@ -370,7 +370,11 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2
     /* Get the time derivative for h. */
     pi->h_dt -= pj->mass / pj->rho * dvdr * wi_dr;
     pj->h_dt -= pi->mass / pi->rho * dvdr * wj_dr;
-        
+    
+    /* Update the signal velocity. */
+    pi->v_sig = fmaxf( pi->v_sig , pj->c - 3*dvdr );
+    pj->v_sig = fmaxf( pj->v_sig , pi->c - 3*dvdr );
+    
     #ifdef HIST
     if ( hi > hj )
         runner_hist_hit( hi / hj );
@@ -388,8 +392,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
     vector r, r2, ri;
     vector xi, xj;
     vector hi, hj, hi_inv, hj_inv;
-    vector hig_inv, hig2_inv;
-    vector hjg_inv, hjg2_inv;
+    vector hig_inv, hi2_inv;
+    vector hjg_inv, hj2_inv;
     vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
     vector w;
     vector piPOrho2, pjPOrho2, pirho, pjrho;
@@ -444,20 +448,20 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
     hi_inv.v = vec_rcp( hi.v );
     hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) );
     hig_inv.v = vec_set1( kernel_igamma ) * hi_inv.v;
-    hig2_inv.v = hig_inv.v * hig_inv.v;
+    hi2_inv.v = hi_inv.v * hi_inv.v;
     xi.v = r.v * hig_inv.v;
     kernel_deval_vec( &xi , &wi , &wi_dx );
-    wi_dr.v = hig2_inv.v * hig2_inv.v * wi_dx.v;
+    wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
         
     /* Get the kernel for hj. */
     hj.v = vec_load( Hj );
     hj_inv.v = vec_rcp( hj.v );
     hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) );
     hjg_inv.v = vec_set1( kernel_igamma ) * hj_inv.v;
-    hjg2_inv.v = hjg_inv.v * hjg_inv.v;
+    hj2_inv.v = hj_inv.v * hj_inv.v;
     xj.v = r.v * hjg_inv.v;
     kernel_deval_vec( &xj , &wj , &wj_dx );
-    wj_dr.v = hjg2_inv.v * hjg2_inv.v * wj_dx.v;
+    wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
         
     /* Get the common factor out. */
     w.v = ri.v * ( piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v );
@@ -508,25 +512,25 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( fl
 
     float r = sqrtf( r2 ), ri = 1.0f / r;
     float xi, xj;
-    float hig_inv, hig2_inv;
-    float hjg_inv, hjg2_inv;
+    float hi_inv, hi2_inv;
+    float hj_inv, hj2_inv;
     float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr;
     float f;
     int k;
     
     /* Get the kernel for hi. */
-    hig_inv = kernel_igamma / hi;
-    hig2_inv = hig_inv * hig_inv;
-    xi = r * hig_inv;
+    hi_inv = 1.0f / hi;
+    hi2_inv = hi_inv * hi_inv;
+    xi = r * hi_inv * kernel_igamma;
     kernel_deval( xi , &wi , &wi_dx );
-    wi_dr = hig2_inv * hig2_inv * wi_dx;
+    wi_dr = hi2_inv * hi2_inv * wi_dx;
         
     /* Get the kernel for hj. */
-    hjg_inv = kernel_igamma / hj;
-    hjg2_inv = hjg_inv * hjg_inv;
-    xj = r * hjg_inv;
+    hj_inv = 1.0f / hj;
+    hj2_inv = hj_inv * hj_inv;
+    xj = r * hj_inv * kernel_igamma;
     kernel_deval( xj , &wj , &wj_dx );
-    wj_dr = hjg2_inv * hjg2_inv * wj_dx;
+    wj_dr = hj2_inv * hj2_inv * wj_dx;
     
     /* Get the common factor out. */
     w = ri * ( pi->POrho2 * wi_dr + pj->POrho2 * wj_dr );
@@ -547,6 +551,10 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( fl
     /* Get the time derivative for h. */
     pi->h_dt -= pj->mass / pj->rho * dvdr * wi_dr;
         
+    /* Update the signal velocity (this is always symmetrical). */
+    pi->v_sig = fmaxf( pi->v_sig , pj->c - 3*dvdr );
+    pj->v_sig = fmaxf( pj->v_sig , pi->c - 3*dvdr );
+    
     }
     
 
@@ -557,8 +565,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
     vector r, r2, ri;
     vector xi, xj;
     vector hi, hj, hi_inv, hj_inv;
-    vector hig_inv, hig2_inv;
-    vector hjg_inv, hjg2_inv;
+    vector hig_inv, hi2_inv;
+    vector hjg_inv, hj2_inv;
     vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
     vector w;
     vector piPOrho2, pjPOrho2, pjrho;
@@ -609,20 +617,20 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
     hi_inv.v = vec_rcp( hi.v );
     hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) );
     hig_inv.v = vec_set1( kernel_igamma ) * hi_inv.v;
-    hig2_inv.v = hig_inv.v * hig_inv.v;
+    hi2_inv.v = hi_inv.v * hi_inv.v;
     xi.v = r.v * hig_inv.v;
     kernel_deval_vec( &xi , &wi , &wi_dx );
-    wi_dr.v = hig2_inv.v * hig2_inv.v * wi_dx.v;
+    wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
         
     /* Get the kernel for hj. */
     hj.v = vec_load( Hj );
     hj_inv.v = vec_rcp( hj.v );
     hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) );
     hjg_inv.v = vec_set1( kernel_igamma ) * hj_inv.v;
-    hjg2_inv.v = hjg_inv.v * hjg_inv.v;
+    hj2_inv.v = hj_inv.v * hj_inv.v;
     xj.v = r.v * hjg_inv.v;
     kernel_deval_vec( &xj , &wj , &wj_dx );
-    wj_dr.v = hjg2_inv.v * hjg2_inv.v * wj_dx.v;
+    wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
         
     /* Get the common factor out. */
     w.v = ri.v * ( piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v );
diff --git a/src/space.c b/src/space.c
index ba81b3b551a71f6126c4b95f59afe617e6caf5f1..ca43394f3923704b65a3af59ec6650b6f847f236 100644
--- a/src/space.c
+++ b/src/space.c
@@ -86,28 +86,31 @@ void space_map_prepare ( struct cell *c , void *data ) {
     int k;
     float dt_min, dt_max, h_max, dx_max;
     struct part *p;
+    struct xpart *xp;
 
     /* No children? */
     if ( !c->split ) {
     
         /* Init with first part. */
         p = &c->parts[0];
+        xp = p->xtras;
         dt_min = p->dt;
         dt_max = p->dt;
         h_max = p->h;
-        dx_max = sqrtf( (p->x[0] - p->x_old[0])*(p->x[0] - p->x_old[0]) +
-                        (p->x[1] - p->x_old[1])*(p->x[1] - p->x_old[1]) +
-                        (p->x[2] - p->x_old[2])*(p->x[2] - p->x_old[2]) )*2 + p->h;
+        dx_max = sqrtf( (p->x[0] - xp->x_old[0])*(p->x[0] - xp->x_old[0]) +
+                        (p->x[1] - xp->x_old[1])*(p->x[1] - xp->x_old[1]) +
+                        (p->x[2] - xp->x_old[2])*(p->x[2] - xp->x_old[2]) )*2 + p->h;
     
         /* Loop over parts. */
         for ( k = 1 ; k < c->count ; k++ ) {
             p = &c->parts[k];
+            xp = p->xtras;
             dt_min = fminf( dt_min , p->dt );
             dt_max = fmaxf( dt_max , p->dt );
             h_max = fmaxf( h_max , p->h );
-            dx_max = fmaxf( dx_max , sqrtf( (p->x[0] - p->x_old[0])*(p->x[0] - p->x_old[0]) +
-                                            (p->x[1] - p->x_old[1])*(p->x[1] - p->x_old[1]) +
-                                            (p->x[2] - p->x_old[2])*(p->x[2] - p->x_old[2]) )*2 + p->h );
+            dx_max = fmaxf( dx_max , sqrtf( (p->x[0] - xp->x_old[0])*(p->x[0] - xp->x_old[0]) +
+                                            (p->x[1] - xp->x_old[1])*(p->x[1] - xp->x_old[1]) +
+                                            (p->x[2] - xp->x_old[2])*(p->x[2] - xp->x_old[2]) )*2 + p->h );
             }
             
         }
@@ -161,10 +164,10 @@ void space_prepare ( struct space *s ) {
 
     int k;
     struct task *t;
-    float dt_max = s->dt_max, dx_max = 0.0f;
+    float dt_step = s->dt_step, dx_max = 0.0f;
     int counts[ task_type_count + 1 ];
     
-    /* Traverse the cells and set their dt_min and dt_max. */
+    /* Traverse the cells and set their dt_min and dx_max. */
     space_map_cells_post( s , 1 , &space_map_prepare , NULL );
     
     /* Get the maximum displacement in the whole system. */
@@ -179,9 +182,9 @@ void space_prepare ( struct space *s ) {
              t->type == task_type_self ||
              t->type == task_type_ghost ||
              ( t->type == task_type_sub && t->cj == NULL ) )
-            t->skip = ( t->ci->dt_min > dt_max );
+            t->skip = ( t->ci->dt_min > dt_step );
         else if ( t->type == task_type_pair || ( t->type == task_type_sub && t->cj != NULL ) ) {
-            t->skip = ( t->ci->dt_min > dt_max && t->cj->dt_min > dt_max );
+            t->skip = ( t->ci->dt_min > dt_step && t->cj->dt_min > dt_step );
             if ( !t->skip && t->tight &&
                  ( t->ci->dx_max > t->ci->dmin || t->cj->dx_max > t->cj->dmin ) )
                 break;
@@ -194,7 +197,7 @@ void space_prepare ( struct space *s ) {
         /* Re-build the space. */
         space_rebuild( s , 0.0 );
     
-        /* Traverse the cells and set their dt_min and dt_max. */
+        /* Traverse the cells and set their dt_min and dx_max. */
         space_map_cells_post( s , 1 , &space_map_prepare , NULL );
     
         }
@@ -576,9 +579,9 @@ void space_rebuild ( struct space *s , double cell_max ) {
     // tic = getticks();
     #pragma omp parallel for schedule(static)
     for ( k = 0 ; k < s->nr_parts ; k++ ) {
-        s->parts[k].x_old[0] = s->parts[k].x[0];
-        s->parts[k].x_old[1] = s->parts[k].x[1];
-        s->parts[k].x_old[2] = s->parts[k].x[2];
+        s->parts[k].xtras->x_old[0] = s->parts[k].x[0];
+        s->parts[k].xtras->x_old[1] = s->parts[k].x[1];
+        s->parts[k].xtras->x_old[2] = s->parts[k].x[2];
         }
     // printf( "space_rebuild: storing old positions took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
 
@@ -877,7 +880,7 @@ void space_splittasks ( struct space *s ) {
     struct cell *ci, *cj;
     double hi, hj, shift[3];
     struct task *t;
-    float dt_max = s->dt_max;
+    float dt_step = s->dt_step;
     int pts[7][8] = { { -1 , 12 , 10 ,  9 ,  4 ,  3 ,  1 ,  0 } ,
                       { -1 , -1 , 11 , 10 ,  5 ,  4 ,  2 ,  1 } ,
                       { -1 , -1 , -1 , 12 ,  7 ,  6 ,  4 ,  3 } , 
@@ -899,7 +902,7 @@ void space_splittasks ( struct space *s ) {
             ci = t->ci;
             
             /* Ingore this task? */
-            if ( ci->dt_min > dt_max ) {
+            if ( ci->dt_min > dt_step ) {
                 t->skip = 1;
                 continue;
                 }
@@ -953,7 +956,7 @@ void space_splittasks ( struct space *s ) {
             hj = fmin( cj->h[0] , fmin( cj->h[1] , cj->h[2] ) );
 
             /* Ingore this task? */
-            if ( ci->dt_min > dt_max && cj->dt_min > dt_max ) {
+            if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) {
                 t->skip = 1;
                 continue;
                 }
@@ -1338,7 +1341,7 @@ void space_maketasks ( struct space *s , int do_sort ) {
     int *cdim = s->cdim;
     struct task *t, *t2;
     struct cell *ci, *cj;
-    // float dt_max = s->dt_max;
+    float dt_step = s->dt_step;
 
     /* Allocate the task-list, if needed. */
     if ( s->tasks == NULL || s->tasks_size < s->tot_cells * space_maxtaskspercell ) {
@@ -1364,7 +1367,7 @@ void space_maketasks ( struct space *s , int do_sort ) {
                 ci = &s->cells[cid];
                 if ( ci->count == 0 )
                     continue;
-                // if ( ci->dt_min <= dt_max )
+                if ( ci->dt_min <= dt_step )
                     space_addtask( s , task_type_self , task_subtype_density , 0 , 0 , ci , NULL , 0 );
                 for ( ii = -1 ; ii < 2 ; ii++ ) {
                     iii = i + ii;
@@ -1383,8 +1386,8 @@ void space_maketasks ( struct space *s , int do_sort ) {
                             kkk = ( kkk + cdim[2] ) % cdim[2];
                             cjd = cell_getid( cdim , iii , jjj , kkk );
                             cj = &s->cells[cjd];
-                            if ( cid >= cjd || cj->count == 0 /* ||
-                                 ( ci->dt_min > dt_max && cj->dt_min > dt_max ) */ )
+                            if ( cid >= cjd || cj->count == 0 ||
+                                 ( ci->dt_min > dt_step && cj->dt_min > dt_step ) )
                                 continue;
                             sid = sortlistID[ (kk+1) + 3*( (jj+1) + 3*(ii+1) ) ];
                             t = space_addtask( s , task_type_pair , task_subtype_density , sid , 0 , ci , cj , 1 );
@@ -1688,6 +1691,8 @@ struct cell *space_getcell ( struct space *s ) {
 
 void space_init ( struct space *s , double dim[3] , struct part *parts , int N , int periodic , double h_max ) {
 
+    int k;
+
     /* Store eveything in the space. */
     s->dim[0] = dim[0]; s->dim[1] = dim[1]; s->dim[2] = dim[2];
     s->periodic = periodic;
@@ -1696,9 +1701,15 @@ void space_init ( struct space *s , double dim[3] , struct part *parts , int N ,
     s->cell_min = h_max;
     
     /* Allocate the cparts array. */
-    if ( posix_memalign( (void *)&s->cparts , 32 ,  N * sizeof(struct cpart) ) != 0 )
+    if ( posix_memalign( (void *)&s->cparts , 32 , N * sizeof(struct cpart) ) != 0 )
         error( "Failed to allocate cparts." );
         
+    /* Allocate and link the xtra parts array. */
+    if ( posix_memalign( (void *)&s->xparts , 32 , N * sizeof(struct xpart) ) != 0 )
+        error( "Failed to allocate xparts." );
+    for ( k = 0 ; k < N ; k++ )
+        s->parts[k].xtras = &s->xparts[k];
+        
     /* Init the space lock. */
     if ( lock_init( &s->lock ) != 0 )
         error( "Failed to create space spin-lock." );
diff --git a/src/space.h b/src/space.h
index a93ac3a5db939c73ff8379cee81914489cece75e..13aa6c300dce0bf5e521840dfac0f0b8573bdcd0 100644
--- a/src/space.h
+++ b/src/space.h
@@ -62,7 +62,7 @@ struct space {
     double h_min, h_max, cell_min;
     
     /* Current time step for particles. */
-    float dt_max;
+    float dt_step;
     
     /* Number of cells. */
     int nr_cells, tot_cells;
@@ -79,6 +79,7 @@ struct space {
     /* The particle data (cells have pointers to this). */
     struct part *parts;
     struct cpart *cparts;
+    struct xpart *xparts;
     
     /* The total number of parts in the space. */
     int nr_parts;