diff --git a/src/engine.c b/src/engine.c
index 4872df41e0549c76c325edab69756c1d3eb09884..898ac2f1f32bb79b2242cd9cbb97bfc33d1def2f 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -176,7 +176,7 @@ void engine_map_kick_first ( struct cell *c , void *data ) {
     struct engine *e = (struct engine *)data;
     float pdt, dt_step = e->dt_step, dt = e->dt, hdt = 0.5f*dt;
     float dt_min, dt_max, h_max, dx, h2dx_max, dx_max;
-    float a[3], v[3], u, u_dt, h, h_dt, rho, v_old[3];
+    float a[3], v[3], u, u_dt, h, h_dt, rho, v_old[3], w;
     double x[3], x_old[3];
     struct part *restrict p;
     struct xpart *restrict xp;
@@ -235,12 +235,16 @@ void engine_map_kick_first ( struct cell *c , void *data ) {
             p->v[0] = v[0] += dt * a[0];
             p->v[1] = v[1] += dt * a[1];
             p->v[2] = v[2] += dt * a[2];
-            p->u = u += u_dt * dt;
-            // p->h = h += h_dt * dt;
-            {
-                float w = h_dt / h *dt;
-                p->h = h *= 1.0f + w*( -1.0f + w*( 0.5f + w*(-1.0f/6.0f + 1.0f/24.0*w ) ) );
-            }
+            w = u_dt / u * dt;
+            if ( w > 0.9f && w < 1.1f )
+                p->u = u *= 1.0f + w*( -1.0f + w*( 0.5f - w*1.0f/6.0f ) );
+            else
+                p->u = u *= expf( w );
+            w = h_dt / h * dt;
+            if ( w > 0.9f && w < 1.1f )
+                p->h = h *= 1.0f + w*( -1.0f + w*( 0.5f - w*1.0f/6.0f ) );
+            else
+                p->h = h *= expf( w );
             h_max = fmaxf( h_max , h );
 
         
@@ -421,7 +425,6 @@ void engine_step ( struct engine *e , int sort_queues ) {
     float dt = e->dt, dt_step, dt_max = 0.0f, dt_min = FLT_MAX;
     float epot = 0.0, ekin = 0.0, mom[3] = { 0.0 , 0.0 , 0.0 };
     float ang[3] = { 0.0 , 0.0 , 0.0 };
-    // double lent, ent = 0.0;
     int count = 0;
     struct cell *c;
     
@@ -518,6 +521,7 @@ void engine_step ( struct engine *e , int sort_queues ) {
         }
     else {
         if ( e->dt == 0 ) {
+            // e->nullstep += 1;
             e->dt = e->dt_orig;
             while ( dt_min < e->dt )
                 e->dt *= 0.5;
@@ -540,7 +544,7 @@ void engine_step ( struct engine *e , int sort_queues ) {
         } 
     
     /* Set the system time. */
-    e->time = e->dt * e->step;
+    e->time = e->dt * (e->step - e->nullstep);
         
     TIMER_TOC2(timer_step);
     
@@ -573,6 +577,8 @@ void engine_init ( struct engine *e , struct space *s , float dt , int nr_thread
     e->nr_queues = nr_queues;
     e->policy = policy;
     e->step = 0;
+    e->nullstep = 0;
+    e->time = 0.0;
     
     /* First of all, init the barrier and lock it. */
     if ( pthread_mutex_init( &e->barrier_mutex , NULL ) != 0 )
diff --git a/src/engine.h b/src/engine.h
index 5d332ad88adf21f6cecae250a11f8bd221b73694..4e29a7b670790d094fb7616777c39ea9ef7da464 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -61,7 +61,7 @@ struct engine {
     float dt, dt_orig;
     
     /* The current step number. */
-    int step;
+    int step, nullstep;
     
     /* The current system time. */
     float time;
diff --git a/src/runner.c b/src/runner.c
index 8f639b89e84680aa2e0ff09d0b3f24a0c424491a..51af88c1513190bb18ce20dd3a549678d15a6cc2 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -432,7 +432,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
 
 		        /* Balsara switch */
 		        p->force.balsara = normCurl_v / ( normDiv_v + normCurl_v + 0.0001f * fc * ihg );
-
+                
                 /* Reset the acceleration. */
                 for ( k = 0 ; k < 3 ; k++ )
                     p->a[k] = 0.0f;
diff --git a/src/runner_iact.h b/src/runner_iact.h
index 79708a311724959a9202ccd009a1b02f5cfc5659..c1a156edbc23e7ed5b52071dd4d35e4fe4e2c17b 100644
--- a/src/runner_iact.h
+++ b/src/runner_iact.h
@@ -362,16 +362,16 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2
     int k;
     
     /* Get the kernel for hi. */
-    hi_inv = 1.0f / hi;
+    hi_inv = kernel_igamma / hi;
     hi2_inv = hi_inv * hi_inv;
-    xi = r * hi_inv * kernel_igamma;
+    xi = r * hi_inv;
     kernel_deval( xi , &wi , &wi_dx );
     wi_dr = hi2_inv * hi2_inv * wi_dx;
         
     /* Get the kernel for hj. */
-    hj_inv = 1.0f / hj;
+    hj_inv = kernel_igamma / hj;
     hj2_inv = hj_inv * hj_inv;
-    xj = r * hj_inv * kernel_igamma;
+    xj = r * hj_inv;
     kernel_deval( xj , &wj , &wj_dx );
     wj_dr = hj2_inv * hj2_inv * wj_dx;
                 
@@ -389,7 +389,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2
     Pi_ij = -const_viscosity_alpha * v_sig * omega_ij / (pi->rho + pj->rho);
 
     /* Apply balsara switch */
-    Pi_ij *= (pi->force.balsara + pj->force.balsara);
+    Pi_ij *= ( pi->force.balsara + pj->force.balsara );
 
     /* Get the common factor out. */
     w = ri * ( ( pi->force.POrho2 * wi_dr + pj->force.POrho2 * wj_dr ) + 0.25f * Pi_ij * ( wi_dr + wj_dr ) );
@@ -434,8 +434,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
     vector r, r2, ri;
     vector xi, xj;
     vector hi, hj, hi_inv, hj_inv;
-    vector hig_inv, hi2_inv;
-    vector hjg_inv, hj2_inv;
+    vector hi2_inv, hj2_inv;
     vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
     vector w;
     vector piPOrho2, pjPOrho2, pirho, pjrho;
@@ -500,22 +499,20 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
     r.v = r2.v * ri.v;
     
     /* Get the kernel for hi. */
-    hi.v = vec_load( Hi );
+    hi.v = vec_set1( kernel_gamma ) * vec_load( Hi );
     hi_inv.v = vec_rcp( hi.v );
     hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) );
-    hig_inv.v = vec_set1( kernel_igamma ) * hi_inv.v;
     hi2_inv.v = hi_inv.v * hi_inv.v;
-    xi.v = r.v * hig_inv.v;
+    xi.v = r.v * hi_inv.v;
     kernel_deval_vec( &xi , &wi , &wi_dx );
     wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
         
     /* Get the kernel for hj. */
-    hj.v = vec_load( Hj );
+    hj.v = vec_set1( kernel_gamma ) * vec_load( Hj );
     hj_inv.v = vec_rcp( hj.v );
     hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) );
-    hjg_inv.v = vec_set1( kernel_igamma ) * hj_inv.v;
     hj2_inv.v = hj_inv.v * hj_inv.v;
-    xj.v = r.v * hjg_inv.v;
+    xj.v = r.v * hj_inv.v;
     kernel_deval_vec( &xj , &wj , &wj_dx );
     wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
         
@@ -595,16 +592,16 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( fl
     int k;
     
     /* Get the kernel for hi. */
-    hi_inv = 1.0f / hi;
+    hi_inv = kernel_igamma / hi;
     hi2_inv = hi_inv * hi_inv;
-    xi = r * hi_inv * kernel_igamma;
+    xi = r * hi_inv;
     kernel_deval( xi , &wi , &wi_dx );
     wi_dr = hi2_inv * hi2_inv * wi_dx;
         
     /* Get the kernel for hj. */
-    hj_inv = 1.0f / hj;
+    hj_inv = kernel_igamma / hj;
     hj2_inv = hj_inv * hj_inv;
-    xj = r * hj_inv * kernel_igamma;
+    xj = r * hj_inv;
     kernel_deval( xj , &wj , &wj_dx );
     wj_dr = hj2_inv * hj2_inv * wj_dx;
                 
@@ -664,8 +661,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
     vector r, r2, ri;
     vector xi, xj;
     vector hi, hj, hi_inv, hj_inv;
-    vector hig_inv, hi2_inv;
-    vector hjg_inv, hj2_inv;
+    vector hi2_inv, hj2_inv;
     vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
     vector w;
     vector piPOrho2, pjPOrho2, pirho, pjrho;
@@ -728,22 +724,20 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
     r.v = r2.v * ri.v;
     
     /* Get the kernel for hi. */
-    hi.v = vec_load( Hi );
+    hi.v = vec_set1( kernel_gamma ) * vec_load( Hi );
     hi_inv.v = vec_rcp( hi.v );
     hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) );
-    hig_inv.v = vec_set1( kernel_igamma ) * hi_inv.v;
     hi2_inv.v = hi_inv.v * hi_inv.v;
-    xi.v = r.v * hig_inv.v;
+    xi.v = r.v * hi_inv.v;
     kernel_deval_vec( &xi , &wi , &wi_dx );
     wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
         
     /* Get the kernel for hj. */
-    hj.v = vec_load( Hj );
+    hj.v = vec_set1( kernel_gamma ) * vec_load( Hj );
     hj_inv.v = vec_rcp( hj.v );
     hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) );
-    hjg_inv.v = vec_set1( kernel_igamma ) * hj_inv.v;
     hj2_inv.v = hj_inv.v * hj_inv.v;
-    xj.v = r.v * hjg_inv.v;
+    xj.v = r.v * hj_inv.v;
     kernel_deval_vec( &xj , &wj , &wj_dx );
     wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
         
diff --git a/src/vector.h b/src/vector.h
index 65f0d30c67601e8d17d8347e508aacdd30fffcc2..39f1385e57b5fec3cfc30911c9ed900d6ba78524 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -27,7 +27,7 @@
     #define VEC_MACRO(elcount, type)  __attribute__((vector_size((elcount)*sizeof(type)))) type
 
     /* So what will the vector size be? */
-    #ifdef NO__AVX__
+    #ifdef __AVX__
         #define VECTORIZE
         #define VEC_SIZE 8
         #define VEC_FLOAT __m256
@@ -54,7 +54,7 @@
         #define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
         #define vec_dbl_fmin(a,b) _mm256_min_pd(a,b)
         #define vec_dbl_fmax(a,b) _mm256_max_pd(a,b)
-    #elif defined( NO__SSE2__ )
+    #elif defined( __SSE2__ )
         #define VECTORIZE
         #define VEC_SIZE 4
         #define VEC_FLOAT __m128