diff --git a/src/atomic.h b/src/atomic.h
index 882f4288dfc262d7f62d2ad8b25f7a6625ed90b2..5deb82c6d77522ad2d05a24fd937c5e239b3b91f 100644
--- a/src/atomic.h
+++ b/src/atomic.h
@@ -22,3 +22,4 @@
     
 #define atomic_add(v,i) __sync_fetch_and_add( v , i )
 #define atomic_inc(v) atomic_add( v , 1 )
+#define atomic_dec(v) atomic_add( v , -1 )
diff --git a/src/cell.h b/src/cell.h
index a560fcb1a54000188b454913b2776308c0fce24b..265c7ef42efa07e8b7c93f97afc143ff70b58747 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -88,7 +88,7 @@ struct cell {
     int nr_density;
     
     /* The ghost task to link density to interactions. */
-    struct task *ghost;
+    struct task *ghost, *kick2;
     
     /* Number of tasks that are associated with this cell. */
     int nr_tasks;
@@ -105,6 +105,15 @@ struct cell {
     /* ID of the previous owner, e.g. runner. */
     int owner;
     
+    /* Momentum of particles in cell. */
+    float mom[3], ang[3];
+    
+    /* Potential and kinetic energy of particles in this cell. */
+    float epot, ekin;
+    
+    /* Number of particles updated in this cell. */
+    int updated;
+    
     /* Linking pointer for "memory management". */
     struct cell *next;
 
diff --git a/src/engine.c b/src/engine.c
index c649d00c7ab591cc5fee84d5c10aa7101c8fc579..43e3c21b4b6c362cf0b7b42168f9640e6fc16759 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -33,6 +33,7 @@
 
 /* Local headers. */
 #include "cycle.h"
+#include "atomic.h"
 #include "timers.h"
 #include "const.h"
 #include "vector.h"
@@ -94,7 +95,7 @@ void engine_prepare ( struct engine *e ) {
         if ( s->tasks[k].skip )
             continue;
         for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ )
-            __sync_add_and_fetch( &s->tasks[k].unlock_tasks[j]->wait , 1 );
+            atomic_inc( &s->tasks[k].unlock_tasks[j]->wait );
         }
     // printf( "engine_prepare: preparing task dependencies took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
     
@@ -288,6 +289,51 @@ void engine_map_kick_first ( struct cell *c , void *data ) {
     }
 
 
+/**
+ * @brief Mapping function to collect the data from the second kick.
+ */
+
+void engine_collect_kick2 ( struct cell *c ) {
+
+    int k, updated = 0;
+    float dt_min = FLT_MAX, dt_max = 0.0f, ekin = 0.0f, epot = 0.0f;
+    float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f };
+    struct cell *cp;
+    
+    /* If I am a super-cell, return immediately. */
+    if ( c->kick2 != NULL )
+        return;
+        
+    /* If this cell is not split, I'm in trouble. */
+    if ( !c->split )
+        error( "Cell has no super-cell." );
+        
+    /* Collect the values from the progeny. */
+    for ( k = 0 ; k < 8 ; k++ )
+        if ( c->progeny[k] != NULL ) {
+            cp = c->progeny[k];
+            engine_collect_kick2( cp );
+            dt_min = fminf( dt_min , cp->dt_min );
+            dt_max = fmaxf( dt_max , cp->dt_max );
+            updated += cp->updated;
+            ekin += cp->ekin;
+            epot += cp->epot;
+            mom[0] += cp->mom[0]; mom[1] += cp->mom[1]; mom[2] += cp->mom[2];
+            ang[0] += cp->ang[0]; ang[1] += cp->ang[1]; ang[2] += cp->ang[2];
+            }
+    
+    /* Store the collected values in the cell. */
+    c->dt_min = dt_min;
+    c->dt_max = dt_max;
+    c->updated = updated;
+    c->ekin = ekin;
+    c->epot = epot;
+    c->mom[0] = mom[0]; c->mom[1] = mom[1]; c->mom[2] = mom[2];
+    c->ang[0] = ang[0]; c->ang[1] = ang[1]; c->ang[2] = ang[2];
+        
+    }
+
+
 /**
  * @brief Compute the force on a single particle brute-force.
  */
@@ -354,16 +400,13 @@ void engine_single ( double *dim , long long int pid , struct part *__restrict__
  
 void engine_step ( struct engine *e , int sort_queues ) {
 
-    int k, nr_parts = e->s->nr_parts;
-    struct part *restrict parts = e->s->parts, *restrict p;
-    struct xpart *restrict xp;
-    float dt = e->dt, hdt = 0.5*dt, dt_step, dt_max, dt_min, ldt_min, ldt_max;
-    float pdt, h, m, v[3], u;
-    double x[3];
-    double epot = 0.0, ekin = 0.0, lepot, lekin, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 };
-    double lang[3], ang[3] = { 0.0 , 0.0 , 0.0 };
+    int k;
+    float dt = e->dt, dt_step, dt_max = 0.0f, dt_min = FLT_MAX;
+    float epot = 0.0, ekin = 0.0, mom[3] = { 0.0 , 0.0 , 0.0 };
+    float ang[3] = { 0.0 , 0.0 , 0.0 };
     // double lent, ent = 0.0;
-    int threadID, nthreads, count = 0, lcount;
+    int count = 0;
+    struct cell *c;
     
     TIMER_TIC2
 
@@ -424,83 +467,19 @@ void engine_step ( struct engine *e , int sort_queues ) {
     // printParticle( parts , 432626 );
     // printParticle( parts , 432628 );
 
-    /* Second kick. */
-    TIMER_TIC_ND
-    dt_min = FLT_MAX; dt_max = 0.0f;
-    #pragma omp parallel private(p,xp,k,ldt_min,lcount,ldt_max,lmom,lang,lekin,lepot,threadID,nthreads,pdt,v,h,m,x,u)
-    {
-        threadID = omp_get_thread_num();
-        nthreads = omp_get_num_threads();
-        ldt_min = FLT_MAX; ldt_max = 0.0f;
-        lmom[0] = 0.0; lmom[1] = 0.0; lmom[2] = 0.0;
-        lang[0] = 0.0; lang[1] = 0.0; lang[2] = 0.0;
-        lekin = 0.0; lepot = 0.0; /* lent=0.0; */ lcount = 0;
-        for ( k = nr_parts * threadID / nthreads ; k < nr_parts * (threadID + 1) / nthreads ; k++ ) {
-
-            /* Get a handle on the part. */
-            p = &parts[k];
-            xp = p->xtras;
-            
-            /* Get local copies of particle data. */
-            pdt = p->dt;
-            h = p->h;
-            m = p->mass;
-            x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
-
-            /* Scale the derivatives if they're freshly computed. */
-            if ( pdt <= dt_step ) {
-                p->force.h_dt *= h * 0.333333333f;
-                lcount += 1;
-                }
-                
-            /* Update the particle's time step. */
-            p->dt = pdt = const_cfl * h / ( p->force.v_sig );
-
-            /* Update positions and energies at the half-step. */
-            p->v[0] = v[0] = xp->v_old[0] + hdt * p->a[0];
-            p->v[1] = v[1] = xp->v_old[1] + hdt * p->a[1];
-            p->v[2] = v[2] = xp->v_old[2] + hdt * p->a[2];
-            p->u = u = xp->u_old + hdt * p->force.u_dt;
-            
-            /* Get the smallest/largest dt. */
-            ldt_min = fminf( ldt_min , pdt );
-            ldt_max = fmaxf( ldt_max , pdt );
-            
-            /* Collect total energy. */
-            lekin += 0.5 * m * ( v[0]*v[0] + v[1]*v[1] + v[2]*v[2] );
-            lepot += m * u;
-
-            /* Collect momentum */
-            lmom[0] += m * p->v[0];
-            lmom[1] += m * p->v[1];
-            lmom[2] += m * p->v[2];
-	    
-	        /* Collect angular momentum */
-	        lang[0] += m * ( x[1]*v[2] - v[2]*x[1] );
-	        lang[1] += m * ( x[2]*v[0] - v[0]*x[2] );
-	        lang[2] += m * ( x[0]*v[1] - v[1]*x[0] );
-
-	        /* Collect entropic function */
-	        // lent += u * pow( p->rho, 1.f-const_gamma );
-
-            }
-        #pragma omp critical
-        {
-            dt_min = fminf( dt_min , ldt_min );
-            dt_max = fmaxf( dt_max , ldt_max );
-            mom[0] += lmom[0];
-            mom[1] += lmom[1];
-            mom[2] += lmom[2];
-            ang[0] += lang[0];
-            ang[1] += lang[1];
-            ang[2] += lang[2];
-	        // ent += (const_gamma -1.) * lent;
-            epot += lepot;
-            ekin += lekin;
-            count += lcount;
+    /* Collect the cell data from the second kick. */
+    for ( k = 0 ; k < e->s->nr_cells ; k++ ) {
+        c = &e->s->cells[k];
+        engine_collect_kick2( c );
+        dt_min = fminf( dt_min , c->dt_min );
+        dt_max = fmaxf( dt_max , c->dt_max );
+        ekin += c->ekin;
+        epot += c->epot;
+        count += c->updated;
+        mom[0] += c->mom[0]; mom[1] += c->mom[1]; mom[2] += c->mom[2];
+        ang[0] += c->ang[0]; ang[1] += c->ang[1]; ang[2] += c->ang[2];
         }
-    }
-    TIMER_TOC( timer_kick2 );
+    
     e->dt_min = dt_min;
     // printParticle( parts , 432626 );
     printf( "engine_step: dt_min/dt_max is %e/%e.\n" , dt_min , dt_max ); fflush(stdout);
diff --git a/src/queue.c b/src/queue.c
index d2c47b4d17833109bed04bfb083d7f5d8db9ea7f..82bf86c16943a7da8368c5fdb83f2409c6272059 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -309,8 +309,8 @@ struct task *queue_gettask ( struct queue *q , int rid , int blocking , int keep
                 continue;
                 
             /* Get the score for this task. */
-            if ( res->type == task_type_self || res->type == task_type_ghost || res->type == task_type_sort || ( res->type == task_type_sub && res->cj == NULL ) )
-                score = ( res->ci->owner == rid );
+            if ( res->cj == NULL )
+                score = 2 * ( res->ci->owner == rid );
             else
                 score = ( res->ci->owner == rid ) + ( res->cj->owner == rid );
             if ( score <= score_best )
@@ -352,8 +352,7 @@ struct task *queue_gettask ( struct queue *q , int rid , int blocking , int keep
             hits += 1;
             
             /* Should we bother looking any farther? */
-            if ( ( qtasks[ qtid[ ind_best ] ].cj == NULL && score_best == 1 ) ||
-                score_best == 2 );
+            if ( score_best == 2 );
                 break;
         
             } /* loop over the task IDs. */
diff --git a/src/runner.c b/src/runner.c
index adbf48ddd2d5de1b094492abfc7e51edbaa25500..deba874d50ab98f8b7b944d0fc61f5560cc467ba 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -33,6 +33,7 @@
 
 /* Local headers. */
 #include "cycle.h"
+#include "atomic.h"
 #include "timers.h"
 #include "const.h"
 #include "lock.h"
@@ -331,7 +332,7 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock )
  * @brief Intermediate task between density and force
  *
  * @param r The runner thread.
- * @param c THe cell.
+ * @param c The cell.
  */
  
 void runner_doghost ( struct runner *r , struct cell *c ) {
@@ -506,6 +507,96 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
     #endif
     
     }
+    
+    
+/**
+ * @brief Compute the second kick of the given cell.
+ *
+ * @param r The runner thread.
+ * @param c The cell.
+ */
+ 
+void runner_dokick2 ( struct runner *r , struct cell *c ) {
+
+    int k, count = 0, nr_parts = c->count;
+    float dt_min = FLT_MAX, dt_max = 0.0f, ekin = 0.0f, epot = 0.0f;
+    float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f };
+    float x[3], v[3], u, h, pdt, m;
+    float dt_step = r->e->dt_step, dt = r->e->dt, hdt = 0.5f*dt;
+    struct part *p, *parts = c->parts;
+    struct xpart *xp;
+    
+    TIMER_TIC
+    
+    /* Loop over the particles and kick them. */
+    for ( k = 0 ; k < nr_parts ; k++ ) {
+
+        /* Get a handle on the part. */
+        p = &parts[k];
+        xp = p->xtras;
+
+        /* Get local copies of particle data. */
+        pdt = p->dt;
+        h = p->h;
+        m = p->mass;
+        x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
+
+        /* Scale the derivatives if they're freshly computed. */
+        if ( pdt <= dt_step ) {
+            p->force.h_dt *= h * 0.333333333f;
+            count += 1;
+            }
+
+        /* Update the particle's time step. */
+        p->dt = pdt = const_cfl * h / ( p->force.v_sig );
+
+        /* Update positions and energies at the half-step. */
+        p->v[0] = v[0] = xp->v_old[0] + hdt * p->a[0];
+        p->v[1] = v[1] = xp->v_old[1] + hdt * p->a[1];
+        p->v[2] = v[2] = xp->v_old[2] + hdt * p->a[2];
+        p->u = u = xp->u_old + hdt * p->force.u_dt;
+
+        /* Get the smallest/largest dt. */
+        dt_min = fminf( dt_min , pdt );
+        dt_max = fmaxf( dt_max , pdt );
+
+        /* Collect total energy. */
+        ekin += 0.5 * m * ( v[0]*v[0] + v[1]*v[1] + v[2]*v[2] );
+        epot += m * u;
+
+        /* Collect momentum */
+        mom[0] += m * p->v[0];
+        mom[1] += m * p->v[1];
+        mom[2] += m * p->v[2];
+
+	    /* Collect angular momentum */
+	    ang[0] += m * ( x[1]*v[2] - v[2]*x[1] );
+	    ang[1] += m * ( x[2]*v[0] - v[0]*x[2] );
+	    ang[2] += m * ( x[0]*v[1] - v[1]*x[0] );
+
+	    /* Collect entropic function */
+	    // lent += u * pow( p->rho, 1.f-const_gamma );
+
+        }
+
+    #ifdef TIMER_VERBOSE
+        printf( "runner_dokick2[%02i]: %i parts at depth %i took %.3f ms.\n" ,
+            r->id , c->count , c->depth ,
+            ((double)TIMER_TOC(timer_kick2)) / CPU_TPS * 1000 ); fflush(stdout);
+    #else
+        TIMER_TOC(timer_kick2);
+    #endif
+        
+    /* Store the computed values in the cell. */
+    c->dt_min = dt_min;
+    c->dt_max = dt_max;
+    c->updated = count;
+    c->ekin = ekin;
+    c->epot = epot;
+    c->mom[0] = mom[0]; c->mom[1] = mom[1]; c->mom[2] = mom[2];
+    c->ang[0] = ang[0]; c->ang[1] = ang[1]; c->ang[2] = ang[2];
+        
+    }
 
 
 /**
@@ -660,14 +751,17 @@ void *runner_main ( void *data ) {
                     if ( ci->super == ci )
                         runner_doghost( r , ci );
                     break;
+                case task_type_kick2:
+                    runner_dokick2( r , ci );
+                    break;
                 default:
                     error( "Unknown task type." );
                 }
                 
             /* Resolve any dependencies. */
             for ( k = 0 ; k < t->nr_unlock_tasks ; k++ )
-                if ( __sync_fetch_and_sub( &t->unlock_tasks[k]->wait , 1 ) == 0 )
-                    abort();
+                if ( atomic_dec( &t->unlock_tasks[k]->wait ) == 0 )
+                    error( "Task negative wait." );
         
             } /* main loop. */
             
diff --git a/src/space.c b/src/space.c
index 08ac0b81e24fb34b0873f0dfe4b7e3a1f6e17d5b..20cd589cb4d0ff9e5997f1b4721dc7069542df5f 100644
--- a/src/space.c
+++ b/src/space.c
@@ -106,8 +106,8 @@ int space_marktasks ( struct space *s ) {
         
         /* Single-cell task? */
         else if ( t->type == task_type_self ||
-             t->type == task_type_ghost ||
-             ( t->type == task_type_sub && t->cj == NULL ) ) {
+                  t->type == task_type_ghost ||
+                ( t->type == task_type_sub && t->cj == NULL ) ) {
              
             /* Set this task's skip. */
             t->skip = ( t->ci->dt_min > dt_step );
@@ -736,6 +736,8 @@ void space_map_clearsort ( struct cell *c , void *data ) {
  * Looks for the super cell, e.g. the highest-level cell above each
  * cell for which a pair is defined. All ghosts below this cell will
  * depend on the ghost of their parents (sounds spooky, but it isn't).
+ *
+ * A kick2-task is appended to each super cell.
  */
 
 void space_map_mkghosts ( struct cell *c , void *data ) {
@@ -754,11 +756,15 @@ void space_map_mkghosts ( struct cell *c , void *data ) {
     if ( c->super != c || c->nr_tasks > 0 )
         c->ghost = space_addtask( s , task_type_ghost , task_subtype_none , 0 , 0 , c , NULL , 0 );
 
+    /* Append a kick task if we are the active super cell. */
+    if ( c->super == c && c->nr_tasks > 0 )
+        c->kick2 = space_addtask( s , task_type_kick2 , task_subtype_none , 0 , 0 , c , NULL , 0 );
+    
     /* If we are not the super cell ourselves, make our ghost depend
        on our parent cell. */
     if ( c->super != c )
         task_addunlock( c->parent->ghost , c->ghost );
-    
+        
     }
 
 
@@ -1601,7 +1607,9 @@ void space_maketasks ( struct space *s , int do_sort ) {
     /* Append a ghost task to each cell. */
     space_map_cells_pre( s , 1 , &space_map_mkghosts , s );
     
-    /* Run through the tasks and make force tasks for each density task. */
+    /* Run through the tasks and make force tasks for each density task.
+       Each force task depends on the cell ghosts and unlocks the kick2 task
+       of its super-cell. */
     kk = s->nr_tasks;
     #pragma omp parallel for private(t,t2)
     for ( k = 0 ; k < kk ; k++ ) {
@@ -1618,6 +1626,7 @@ void space_maketasks ( struct space *s , int do_sort ) {
             task_addunlock( t , t->ci->super->ghost );
             t2 = space_addtask( s , task_type_self , task_subtype_force , 0 , 0 , t->ci , NULL , 0 );
             task_addunlock( t->ci->ghost , t2 );
+            task_addunlock( t2 , t->ci->super->kick2 );
             }
             
         /* Otherwise, pair interaction? */
@@ -1628,17 +1637,23 @@ void space_maketasks ( struct space *s , int do_sort ) {
             t2 = space_addtask( s , task_type_pair , task_subtype_force , 0 , 0 , t->ci , t->cj , 0 );
             task_addunlock( t->ci->ghost , t2 );
             task_addunlock( t->cj->ghost , t2 );
+            task_addunlock( t2 , t->ci->super->kick2 );
+            if ( t->ci->super != t->cj->super )
+                task_addunlock( t2 , t->cj->super->kick2 );
             }
     
         /* Otherwise, sub interaction? */
         else if ( t->type == task_type_sub && t->subtype == task_subtype_density ) {
             task_addunlock( t , t->ci->super->ghost );
-            if ( t->cj != NULL )
+            if ( t->cj != NULL && t->ci->super != t->cj->super )
                 task_addunlock( t , t->cj->super->ghost );
             t2 = space_addtask( s , task_type_sub , task_subtype_force , t->flags , 0 , t->ci , t->cj , 0 );
             task_addunlock( t->ci->ghost , t2 );
             if ( t->cj != NULL )
                 task_addunlock( t->cj->ghost , t2 );
+            task_addunlock( t2 , t->ci->super->kick2 );
+            if ( t->cj != NULL && t->ci->super != t->cj->super )
+                task_addunlock( t2 , t->cj->super->kick2 );
             }
             
         }
diff --git a/src/space.h b/src/space.h
index fed17faec562675b86d82c043914af88b240fe4e..7d886d98e4321eb7195aef37cef056437282f3b1 100644
--- a/src/space.h
+++ b/src/space.h
@@ -28,7 +28,7 @@
 #define space_subsize_default           1000
 #define space_dosub                     0
 #define space_stretch                   1.05
-#define space_maxtaskspercell           30
+#define space_maxtaskspercell           31
 
 
 /* Convert cell location to ID. */
diff --git a/src/task.h b/src/task.h
index 696fe889a1f3919c059424106216796d4b8d580a..26068aa4000133505f11ff194357dc6cac898e42 100644
--- a/src/task.h
+++ b/src/task.h
@@ -31,6 +31,7 @@ enum task_types {
     task_type_pair,
     task_type_sub,
     task_type_ghost,
+    task_type_kick2,
     task_type_count
     };