diff --git a/src/gadgetsmp.h b/src/gadgetsmp.h
index a14888a70ea2cd972b0e152c70ff7f3781fb77d3..866413474718a9bc9c5bfb1d1c050d1e2f4cb3a9 100644
--- a/src/gadgetsmp.h
+++ b/src/gadgetsmp.h
@@ -22,6 +22,7 @@
 
 /* Local headers. */
 #include "cycle.h"
+#include "const.h"
 #include "lock.h"
 #include "task.h"
 #include "part.h"
diff --git a/src/part.h b/src/part.h
index 07e639a1aa59166f7ddf6fd25e84738d38ada638..80a93d628bb07e8d5f8923ec8047bcf6ac6c1078 100644
--- a/src/part.h
+++ b/src/part.h
@@ -21,6 +21,7 @@
 /* Some constants. */
 #define part_maxwait                    3
 #define part_maxunlock                  39
+#define part_dtmax                      10
 
 
 /* Data of a single particle. */
@@ -30,14 +31,17 @@ struct part {
     float h;
     
     /* Particle time-step. */
-    float dt;
+    int dt;
     
-    /* Particle ID. */
-    int id;
+    /* Particle mass. */
+    float mass;
     
     /* Particle density. */
     float rho;
     
+    /* Particle ID. */
+    int id;
+    
     /* Particle position. */
     double x[3];
     
@@ -50,8 +54,8 @@ struct part {
     /* Particle pressure. */
     float P;
     
-    /* Particle mass. */
-    float m;
+    /* Aggregate quantities. */
+    float POrho2;
     
     /* Particle internal energy. */
     float u;
@@ -59,11 +63,15 @@ struct part {
     /* Change in particle energy over time. */
     float u_dt;
     
+    /* Change in smoothing length over time. */
+    float h_dt;
+    
     /* Derivative of the density with respect to this particle's smoothing length. */
     float rho_dh;
     
     /* Particle number density. */
     int icount;
+    float wcount;
     
     } __attribute__((aligned (32)));
     
diff --git a/src/queue.c b/src/queue.c
index fca97701093dc04a7f99f1e1b5e41e044dc76566..367b9423fb1872cfecbb7ab7f2eb6b3b19202777 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -306,7 +306,7 @@ struct task *queue_gettask_new ( struct queue *q , int rid , int blocking , int
                 continue;
                 
             /* Get the score for this task. */
-            if ( res->type == task_type_self || res->type == task_type_sort || ( res->type == task_type_sub && res->cj == NULL ) )
+            if ( res->type == task_type_self || res->type == task_type_ghost || res->type == task_type_sort || ( res->type == task_type_sub && res->cj == NULL ) )
                 score = ( res->ci->owner == rid );
             else
                 score = ( res->ci->owner == rid ) + ( res->cj->owner == rid );
diff --git a/src/runner.c b/src/runner.c
index b7f10d76e73f853a7e027e4f5cee09b56b033958..dd38394b7c57dcfae922bc510e6be2fee15dfb23 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -33,6 +33,7 @@
 
 /* Local headers. */
 #include "cycle.h"
+#include "const.h"
 #include "lock.h"
 #include "task.h"
 #include "part.h"
@@ -485,6 +486,56 @@ void runner_dosort ( struct runner_thread *rt , struct cell *c , int flags ) {
     #endif
 
     }
+    
+    
+/**
+ * @brief Intermediate task between density and force
+ *
+ * @param rt The runner thread.
+ * @param ci THe cell.
+ */
+ 
+void runner_doghost ( struct runner_thread *r , struct cell *c ) {
+
+    struct part *p;
+    int i, k;
+    TIMER_TIC
+    
+    /* If this cell has progeny, don't bother. */
+    if ( c->split )
+        return;
+    
+    /* Loop over the parts in this cell. */
+    for ( i = 0 ; i < c->count ; i++ ) {
+    
+        /* Get a direct pointer on the part. */
+        p = &c->parts[i];
+    
+        /* Reset the acceleration. */
+        for ( k = 0 ; k < 3 ; k++ )
+            p->a[k] = 0.0f;
+            
+        /* Reset the time derivatives. */
+        p->u_dt = 0.0f;
+        p->h_dt = 0.0f;
+            
+        /* Compute the pressure. */
+        p->P = p->rho * p->u * ( const_gamma - 1.0f );
+        
+        /* Compute the P/Omega/rho2. */
+        p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f );
+        
+        }
+
+    #ifdef TIMER_VERBOSE
+        printf( "runner_doghost[%02i]: %i parts at depth %i took %.3f ms.\n" ,
+            rt->id , c->count , c->depth ,
+            ((double)TIMER_TOC(runner_timer_doghost)) / CPU_TPS * 1000 ); fflush(stdout);
+    #else
+        TIMER_TOC(runner_timer_doghost);
+    #endif
+    
+    }
 
 
 /**
@@ -677,6 +728,7 @@ void *runner_main ( void *data ) {
                         cell_unlocktree( cj );
                     break;
                 case task_type_ghost:
+                    runner_doghost( rt , ci );
                     break;
                 default:
                     error( "Unknown task type." );
diff --git a/src/runner.h b/src/runner.h
index 535c416b47f7f4140dcba16f4e00e4a4f17af8d5..505e7561a6c7159f9db22b748f67c5f1fd873c7d 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -33,9 +33,13 @@
 enum {
     runner_timer_none = 0,
     runner_timer_dosort,
-    runner_timer_doself,
-    runner_timer_dopair,
-    runner_timer_dosub,
+    runner_timer_doself_density,
+    runner_timer_doself_force,
+    runner_timer_dopair_density,
+    runner_timer_dopair_force,
+    runner_timer_dosub_density,
+    runner_timer_dosub_force,
+    runner_timer_doghost,
     runner_timer_getpair,
     runner_timer_steal,
     runner_timer_stalled,
@@ -155,6 +159,7 @@ struct runner {
 
 /* Function prototypes. */
 void runner_run ( struct runner *r , int sort_queues );
+void runner_doghost ( struct runner_thread *rt , struct cell *c );
 void runner_dopair_density ( struct runner_thread *rt , struct cell *ci , struct cell *cj );
 void runner_doself_density ( struct runner_thread *rt , struct cell *c );
 void runner_dosub_density ( struct runner_thread *rt , struct cell *ci , struct cell *cj , int flags );
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index a034e9d8641791b26fd2a444bf4204732faf8cd2..52039b151a43d7f7d5ca1c8ca5243e280bd98be1 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -41,6 +41,15 @@
 #define IACT2(f) PASTE(runner_iact,f)
 #define IACT IACT2(FUNCTION)
 
+#define TIMER_DOSELF2(f) PASTE(runner_timer_doself,f)
+#define TIMER_DOSELF TIMER_DOSELF2(FUNCTION)
+
+#define TIMER_DOPAIR2(f) PASTE(runner_timer_dopair,f)
+#define TIMER_DOPAIR TIMER_DOPAIR2(FUNCTION)
+
+#define TIMER_DOSUB2(f) PASTE(runner_timer_dosub,f)
+#define TIMER_DOSUB TIMER_DOSUB2(FUNCTION)
+
 
 /**
  * @brief Compute the interactions between a cell pair.
@@ -56,7 +65,8 @@ void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj
     int pid, pjd, k, count_i = ci->count, count_j = cj->count;
     double shift[3] = { 0.0 , 0.0 , 0.0 };
     struct part *pi, *pj, *parts_i = ci->parts, *parts_j = cj->parts;
-    double dx[3], pix[3], hi, hi2, r2;
+    double pix[3];
+    float dx[3], hi, hi2, r2;
     TIMER_TIC
     
     /* Get the relative distance between the pairs, wrapping. */
@@ -98,7 +108,7 @@ void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj
             /* Hit or miss? */
             if ( r2 < hi2 || r2 < pj->h*pj->h ) {
             
-                IACT( r2 , hi , pj->h , pi , pj );
+                IACT( r2 , dx , hi , pj->h , pi , pj );
             
                 }
         
@@ -107,9 +117,9 @@ void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj
         } /* loop over the parts in ci. */
         
     #ifdef TIMER_VERBOSE
-        printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , ((double)TIMER_TOC(runner_timer_dopair)) / CPU_TPS * 1000 );
+        printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
     #else
-        TIMER_TOC(runner_timer_dopair);
+        TIMER_TOC(TIMER_DOPAIR);
     #endif
 
 
@@ -132,7 +142,8 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) {
     struct cell *temp;
     struct entry *sort_i, *sort_j;
     struct part *pi, *pj, *parts_i, *parts_j;
-    double dx[3], pix[3], pjx[3], hi, hi2, hj, hj2, r2, di, dj;
+    double pix[3], pjx[3], di, dj;
+    float dx[3], hi, hi2, hj, hj2, r2;
     double hi_max, hj_max, di_max, dj_min;
     int count_i, count_j;
     TIMER_TIC
@@ -214,7 +225,7 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) {
             /* Hit or miss? */
             if ( r2 < hi2 ) {
             
-                IACT( r2 , hi , pj->h , pi , pj );
+                IACT( r2 , dx , hi , pj->h , pi , pj );
             
                 }
         
@@ -255,7 +266,7 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) {
             /* Hit or miss? */
             if ( r2 < hj2 && r2 > pi->h*pi->h ) {
             
-                IACT( r2 , pi->h , hj , pi , pj );
+                IACT( r2 , dx , pi->h , hj , pi , pj );
             
                 }
         
@@ -264,9 +275,9 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) {
         } /* loop over the parts in ci. */
 
     #ifdef TIMER_VERBOSE
-        printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(runner_timer_dopair))) / CPU_TPS * 1000 );
+        printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 );
     #else
-        TIMER_TOC(runner_timer_dopair);
+        TIMER_TOC(TIMER_DOPAIR);
     #endif
 
     }
@@ -282,7 +293,8 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) {
 void DOSELF ( struct runner_thread *rt , struct cell *c ) {
 
     int k, pid, pjd, count = c->count;
-    double pix[3], dx[3], hi, hi2, r2;
+    double pix[3];
+    float dx[3], hi, hi2, r2;
     struct part *pi, *pj, *parts = c->parts;
     TIMER_TIC
     
@@ -317,7 +329,7 @@ void DOSELF ( struct runner_thread *rt , struct cell *c ) {
             /* Hit or miss? */
             if ( r2 < hi2 || r2 < pj->h*pj->h ) {
             
-                IACT( r2 , hi , pj->h , pi , pj );
+                IACT( r2 , dx , hi , pj->h , pi , pj );
             
                 }
         
@@ -326,9 +338,9 @@ void DOSELF ( struct runner_thread *rt , struct cell *c ) {
         } /* loop over all particles. */
 
     #ifdef TIMER_VERBOSE
-        printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , rt->id , count , c->depth , ((double)TIMER_TOC(runner_timer_doself)) / CPU_TPS * 1000 );
+        printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , rt->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 );
     #else
-        TIMER_TOC(runner_timer_doself);
+        TIMER_TOC(TIMER_DOSELF);
     #endif
 
     }
@@ -533,9 +545,9 @@ void DOSUB ( struct runner_thread *rt , struct cell *ci , struct cell *cj , int
     
 
     #ifdef TIMER_VERBOSE
-        printf( "runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n" , rt->id , flags , ci->depth , ((double)TIMER_TOC(runner_timer_dosub)) / CPU_TPS * 1000 );
+        printf( "runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n" , rt->id , flags , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 );
     #else
-        TIMER_TOC(runner_timer_dosub);
+        TIMER_TOC(TIMER_DOSUB);
     #endif
 
     }
diff --git a/src/runner_iact.h b/src/runner_iact.h
index d34296a5d1c79e9a0bea8f0cf3bdb14f320343c5..0fb85d8d1e1c39aa4675a0a7916372a2a68240e9 100644
--- a/src/runner_iact.h
+++ b/src/runner_iact.h
@@ -16,59 +16,88 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * 
  ******************************************************************************/
+ 
 
+/* Coefficients for the kernel. */
+ 
+#define kernel_degree 3
+#define kernel_ivals 2
+#define kernel_gamma 0.5f
+#define kernel_igamma 2.0f
+#define kernel_igamma3 8.0f
+#define kernel_igamma4 16.0f
+static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] =
+    { 3.0/4.0*M_1_PI , -3.0/2.0*M_1_PI , 0.0 , M_1_PI , 
+      -0.25*M_1_PI , 3.0/2.0*M_1_PI , -3.0*M_1_PI , M_2_PI , 
+      0.0 , 0.0 , 0.0 , 0.0 };
+#define kernel_root(h) ( 1.0f/((h)*(h)*(h))*kernel_igamma3*kernel_coeffs[ kernel_degree ] )
+      
+      
+/**
+ * @brief Helper function to evaluate the kernel at a given x.
+ */
+
+__attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , float *W , float *dW_dx ) {
+    int ind = fmin( x , kernel_ivals );
+    float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ];
+    float w = coeffs[0]*x + coeffs[1];
+    float dw_dx = coeffs[0];
+    for ( int k = 2 ; k <= kernel_degree ; k++ ) {
+        dw_dx = dw_dx*x + w;
+        w = x*w + coeffs[k];
+        }
+    *W = w;
+    *dW_dx = dw_dx;
+    }
+
+
+__attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float *W ) {
+    int ind = fmin( x , kernel_ivals );
+    float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ];
+    float w = coeffs[0]*x + coeffs[1];
+    for ( int k = 2 ; k <= kernel_degree ; k++ )
+        w = x*w + coeffs[k];
+    *W = w;
+    }
 
 
-__attribute__ ((always_inline)) INLINE void runner_iact_density ( float r2 , float hi , float hj , struct part *pi , struct part *pj ) {
+/**
+ * @brief Density kernel
+ */
 
-    #define  KERNEL_COEFF_1  2.546479089470f
-    #define  KERNEL_COEFF_2  15.278874536822f
-    #define  KERNEL_COEFF_3  45.836623610466f
-    #define  KERNEL_COEFF_4  30.557749073644f
-    #define  KERNEL_COEFF_5  5.092958178941f
-    #define  KERNEL_COEFF_6  (-15.278874536822f)
-    #define  NORM_COEFF      4.188790204786f
+__attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) {
 
     float r = sqrtf( r2 );
-    float ui, uj, wi, wj;
-    float ui_dh, uj_dh, wi_dh, wj_dh;
+    float xi, xj;
+    float hg_inv, hg2_inv;
+    float wi, wj, wi_dx, wj_dx;
     
     if ( r2 < hi*hi && pi != NULL ) {
         
-        ui = r / hi;
-        ui_dh = -r / hi / hi;
-        if ( ui < 0.5f ) {
-            wi = KERNEL_COEFF_1 + KERNEL_COEFF_2 * (ui - 1.0f) * ui * ui;
-            wi_dh = KERNEL_COEFF_2 * ui_dh * ui * ui
-                  + 2 * KERNEL_COEFF_2 * (ui - 1.0f) * ui_dh * ui;
-            }
-        else {
-            wi = KERNEL_COEFF_5 * (1.0f - ui) * (1.0f - ui) * (1.0f - ui);
-            wi_dh = -3 * KERNEL_COEFF_5 * ui_dh * (1.0f - ui) * (1.0f - ui);
-            }
-        pi->rho += NORM_COEFF * wi;
-        pi->rho_dh += NORM_COEFF * wi_dh;
+        hg_inv = kernel_igamma / hi;
+        hg2_inv = hg_inv * hg_inv;
+        xi = r * hg_inv;
+        kernel_deval( xi , &wi , &wi_dx );
+        
+        pi->rho += pj->mass * hg_inv * hg2_inv * wi;
+        pi->rho_dh += -pj->mass * hg2_inv * hg2_inv * ( 3.0*wi + xi*wi_dx );
+        pi->wcount += wi * 4.0f * M_PI / 3.0f * kernel_igamma3;
         pi->icount += 1;
         
         }
 
     if ( r2 < hj*hj && pj != NULL ) {
         
-        uj = r / hj;
-        uj_dh = -r / hj / hj;
-        if ( uj < 0.5f ) {
-            wj = KERNEL_COEFF_1 + KERNEL_COEFF_2 * (uj - 1.0f) * uj * uj;
-            wj_dh = KERNEL_COEFF_2 * uj_dh * uj * uj
-                  + 2 * KERNEL_COEFF_2 * (uj - 1.0f) * uj_dh * uj;
-            }
-        else {
-            wj = KERNEL_COEFF_5 * (1.0f - uj) * (1.0f - uj) * (1.0f - uj);
-            wj_dh = -3 * KERNEL_COEFF_5 * uj_dh * (1.0f - uj) * (1.0f - uj);
-            }
-        pj->rho += NORM_COEFF * wj;
-        pj->rho_dh += NORM_COEFF * wj_dh;
+        hg_inv = kernel_igamma / hj;
+        hg2_inv = hg_inv * hg_inv;
+        xj = r * hg_inv;
+        kernel_deval( xj , &wj , &wj_dx );
+        
+        pj->rho += pi->mass * hg_inv * hg2_inv * wj;
+        pj->rho_dh += -pi->mass * hg2_inv * hg2_inv * ( 3.0*wj + xj*wj_dx );
+        pj->wcount += wj * 4.0f * M_PI / 3.0f * kernel_igamma3;
         pj->icount += 1;
-            
+        
         }
         
     #ifdef HIST
@@ -82,15 +111,57 @@ __attribute__ ((always_inline)) INLINE void runner_iact_density ( float r2 , flo
     
 
 
-__attribute__ ((always_inline)) INLINE void runner_iact_force ( float r2 , float hi , float hj , struct part *pi , struct part *pj ) {
+__attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) {
 
-    #define  KERNEL_COEFF_1  2.546479089470f
-    #define  KERNEL_COEFF_2  15.278874536822f
-    #define  KERNEL_COEFF_3  45.836623610466f
-    #define  KERNEL_COEFF_4  30.557749073644f
-    #define  KERNEL_COEFF_5  5.092958178941f
-    #define  KERNEL_COEFF_6  (-15.278874536822f)
-    #define  NORM_COEFF      4.188790204786f
+    float r = sqrtf( r2 ), ri = 1.0f / r;
+    float xi, xj;
+    float hig_inv, hig2_inv;
+    float hjg_inv, hjg2_inv;
+    float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr;
+    float f;
+    int k;
+    
+    /* Get the kernel for hi. */
+    hig_inv = kernel_igamma / hi;
+    hig2_inv = hig_inv * hig_inv;
+    xi = r * hig_inv;
+    kernel_deval( xi , &wi , &wi_dx );
+    wi_dr = hig2_inv * hig2_inv * wi_dx;
+        
+    /* Get the kernel for hj. */
+    hjg_inv = kernel_igamma / hj;
+    hjg2_inv = hjg_inv * hjg_inv;
+    xj = r * hjg_inv;
+    kernel_deval( xj , &wj , &wj_dx );
+    wj_dr = hjg2_inv *hjg2_inv * wj_dx;
+    
+    /* Get the common factor out. */
+    w = ri * ( pi->POrho2 * wi_dr + pj->POrho2 * wj_dr );
+        
+    /* Use the force, Luke! */
+    for ( k = 0 ; k < 3 ; k++ ) {
+        f = dx[k] * w;
+        pi->a[k] += pj->mass * f;
+        pj->a[k] += -pi->mass * f;
+        }
+        
+    /* Compute dv dot r. */
+    dvdr = ( pi->v[0] - pj->v[0] ) * dx[0] + ( pi->v[1] - pj->v[1] ) * dx[1] + ( pi->v[2] - pj->v[2] ) * dx[2];
+        
+    /* Get the time derivative for u. */
+    pi->u_dt += pj->mass * dvdr * wi_dr;
+    pj->u_dt += pi->mass * dvdr * wj_dr;
+    
+    /* Get the time derivative for h. */
+    pi->h_dt += pj->mass / pj->rho * dvdr * wi_dr;
+    pj->h_dt += pi->mass / pi->rho * dvdr * wj_dr;
+        
+    #ifdef HIST
+    if ( hi > hj )
+        runner_hist_hit( hi / hj );
+    else
+        runner_hist_hit( hj / hi );
+    #endif
     
     }
     
diff --git a/src/space.c b/src/space.c
index 54037106a8880dd8f6d7916fd12711df634f6628..6a3203ee0601aac09cb109b91890d9911866367d 100644
--- a/src/space.c
+++ b/src/space.c
@@ -960,6 +960,17 @@ void space_maketasks ( struct space *s , int do_sort ) {
             task_addunlock( t->cj->ghost , t2 );
             }
 
+        /* Otherwise, sub interaction? */
+        else if ( t->type == task_type_sub && t->subtype == task_subtype_density ) {
+            task_addunlock( t , t->ci->ghost );
+            if ( t->cj != NULL )
+                task_addunlock( t , t->cj->ghost );
+            t2 = space_addtask( s , task_type_sub , task_subtype_force , t->flags , 0 , t->ci , t->cj , NULL , 0 , NULL , 0 );
+            task_addunlock( t->ci->ghost , t2 );
+            if ( t->cj != NULL )
+                task_addunlock( t->cj->ghost , t2 );
+            }
+
         }
         
     /* Did we already create indices? */