diff --git a/src/cell.h b/src/cell.h
index 211f16ab3078b082395ebee0babc594ec2b7882d..6005929f35cb61294247d593f05680e4ce3b3d7b 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -88,7 +88,7 @@ struct cell {
     int nr_density;
     
     /* The ghost task to link density to interactions. */
-    struct task *ghost, *kick2;
+    struct task *ghost, *kick1, *kick2;
     
     /* Number of tasks that are associated with this cell. */
     int nr_tasks;
@@ -117,10 +117,6 @@ struct cell {
     /* Linking pointer for "memory management". */
     struct cell *next;
     
-    /* Timing stuff. */
-    ticks tic, toc;
-    int tid;
-    
     } __attribute__((aligned (64)));
 
 
diff --git a/src/engine.c b/src/engine.c
index 9fde2d8bb681a0bb0e4bf7a64a5b56945eaa571c..8637ee07f44b8f88aa00d9dab9f83e1ed4318cc6 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -70,7 +70,7 @@ void engine_maketasks ( struct engine *e ) {
     struct cell *ci, *cj;
 
     /* Re-set the scheduler. */
-    scheduler_reset( sched , s->tot_cells * space_maxtaskspercell );
+    scheduler_reset( sched , s->tot_cells * engine_maxtaskspercell );
     
     /* Run through the highest level of cells and add pairs. */
     for ( i = 0 ; i < cdim[0] ; i++ )
@@ -351,7 +351,12 @@ void engine_prepare ( struct engine *e ) {
         }
 
     /* Start the scheduler. */
-    scheduler_start( &e->sched );
+    scheduler_start( &e->sched , (1 << task_type_sort) | 
+                                 (1 << task_type_self) |
+                                 (1 << task_type_pair) | 
+                                 (1 << task_type_sub) |
+                                 (1 << task_type_ghost) | 
+                                 (1 << task_type_kick2) );
     
     TIMER_TOC( timer_prepare );
     
@@ -403,151 +408,6 @@ void engine_barrier( struct engine *e ) {
     }
     
     
-/**
- * @brief Mapping function to set dt_min and dt_max, do the first
- * kick.
- */
-
-void engine_map_kick_first ( struct cell *c , void *data ) {
-
-    int j, k;
-    struct engine *e = (struct engine *)data;
-    float pdt, dt_step = e->dt_step, dt = e->dt, hdt = 0.5f*dt;
-    float dt_min, dt_max, h_max, dx, dx_max;
-    float a[3], v[3], u, u_dt, h, h_dt, v_old[3], w, rho;
-    double x[3], x_old[3];
-    struct part *restrict p;
-    struct xpart *restrict xp;
-
-    /* No children? */
-    if ( !c->split ) {
-    
-        /* Store the timer. */
-        c->tic = getticks();
-        c->tid = omp_get_thread_num();
-    
-        /* Init the min/max counters. */
-        dt_min = FLT_MAX;
-        dt_max = 0.0f;
-        h_max = 0.0f;
-        dx_max = 0.0f;
-    
-        /* Loop over parts. */
-        for ( k = 0 ; k < c->count ; k++ ) {
-            
-            /* Get a handle on the kth particle. */
-            p = &c->parts[k];
-            xp = p->xtras;
-            
-            /* Load the data locally. */
-            a[0] = p->a[0]; a[1] = p->a[1]; a[2] = p->a[2];
-            v[0] = p->v[0]; v[1] = p->v[1]; v[2] = p->v[2];
-            x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
-            x_old[0] = xp->x_old[0]; x_old[1] = xp->x_old[1]; x_old[2] = xp->x_old[2];
-            h = p->h;
-            u = p->u;
-            h_dt = p->force.h_dt;
-            u_dt = p->force.u_dt;
-            pdt = p->dt;
-            
-            /* Store the min/max dt. */
-            dt_min = fminf( dt_min , pdt );
-            dt_max = fmaxf( dt_max , pdt );
-            
-            /* Step and store the velocity and internal energy. */
-            xp->v_old[0] = v_old[0] = v[0] + hdt * a[0];
-            xp->v_old[1] = v_old[1] = v[1] + hdt * a[1];
-            xp->v_old[2] = v_old[2] = v[2] + hdt * a[2];
-            xp->u_old = p->u + hdt * p->force.u_dt;
-
-            /* Move the particles with the velocitie at the half-step. */
-            p->x[0] = x[0] += dt * v_old[0];
-            p->x[1] = x[1] += dt * v_old[1];
-            p->x[2] = x[2] += dt * v_old[2];
-            dx = sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) +
-                        (x[1] - x_old[1])*(x[1] - x_old[1]) +
-                        (x[2] - x_old[2])*(x[2] - x_old[2]) );
-            dx_max = fmaxf( dx_max , dx );
-
-            /* Update positions and energies at the half-step. */
-            p->v[0] = v[0] += dt * a[0];
-            p->v[1] = v[1] += dt * a[1];
-            p->v[2] = v[2] += dt * a[2];
-            w = u_dt / u * dt;
-            if ( fabsf( w ) < 0.01f )
-                p->u = u *= 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) );
-            else
-                p->u = u *= expf( w );
-            w = h_dt / h * dt;
-            if ( fabsf( w ) < 0.01f )
-                p->h = h *= 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) );
-            else
-                p->h = h *= expf( w );
-            h_max = fmaxf( h_max , h );
-
-        
-            /* Integrate other values if this particle will not be updated. */
-            /* Init fields for density calculation. */
-            if ( pdt > dt_step ) {
-                float w = -3.0f * h_dt / h * dt;
-                if ( fabsf( w ) < 0.1f )
-                    rho = p->rho *= 1.0f + w*( 1.0f + w*( 0.5f + w*(1.0f/6.0f + 1.0f/24.0f*w ) ) );
-                else
-                    rho = p->rho *= expf( w );
-                p->force.POrho2 = u * ( const_hydro_gamma - 1.0f ) / ( rho * xp->omega );
-                }
-            else {
-                p->density.wcount = 0.0f;
-                p->density.wcount_dh = 0.0f;
-                p->rho = 0.0f;
-                p->rho_dh = 0.0f;
-	            p->density.div_v = 0.0f;
-	            for ( j = 0 ; j < 3 ; ++j)
-	                p->density.curl_v[j] = 0.0f;
-                }
-                
-            }
-            
-        /* Store the timer. */
-        c->toc = getticks();
-    
-        }
-        
-    /* Otherwise, agregate data from children. */
-    else {
-    
-        /* Init with the first non-null child. */
-        dt_min = FLT_MAX;
-        dt_max = 0.0f;
-        h_max = 0.0f;
-        dx_max = 0.0f;
-        
-        /* Loop over the remaining progeny. */
-        for ( k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL ) {
-                #pragma omp task
-                engine_map_kick_first( c->progeny[k] , e );
-                }
-        #pragma omp taskwait
-        for ( k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k] != NULL ) {
-                dt_min = fminf( dt_min , c->progeny[k]->dt_min );
-                dt_max = fmaxf( dt_max , c->progeny[k]->dt_max );
-                h_max = fmaxf( h_max , c->progeny[k]->h_max );
-                dx_max = fmaxf( dx_max , c->progeny[k]->dx_max );
-                }
-    
-        }
-
-    /* Store the values. */
-    c->dt_min = dt_min;
-    c->dt_max = dt_max;
-    c->h_max = h_max;
-    c->dx_max = dx_max;
-    
-    }
-
-
 /**
  * @brief Mapping function to collect the data from the second kick.
  */
@@ -736,36 +596,20 @@ void engine_step ( struct engine *e ) {
     
     /* First kick. */
     TIMER_TIC
-    // space_map_cells_post( e->s , 1 , &engine_map_kick_first , e );
-    /* k = 0;
-    #pragma omp parallel shared(k,e)
-    {
-        int myk;
-        while ( 1 ) {
-            #pragma omp critical
-            myk = k++;
-            if ( myk < e->s->nr_cells ) {
-                e->s->cells[myk].tic = getticks();
-                e->s->cells[myk].tid = omp_get_thread_num();
-                engine_map_kick_first( &e->s->cells[myk] , e );
-                e->s->cells[myk].toc = getticks();
-                }
-            else
-                break;
-            }
-        } */
-    #pragma omp parallel private(k)
-    {
-        #pragma omp single
-        {
-            for ( k = 0 ; k < e->s->nr_cells ; k++ ) {
-                #pragma omp task
-                engine_map_kick_first( &e->s->cells[k] , e );
-                }
-            }
-        #pragma omp taskwait
-        }
+    scheduler_start( &e->sched , (1 << task_type_kick1) );
+    e->barrier_count = -e->barrier_count;
+    if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 )
+        error( "Failed to broadcast barrier open condition." );
+    while ( e->barrier_count < e->nr_threads )
+        if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 )
+            error( "Error while waiting for barrier." );
     TIMER_TOC( timer_kick1 );
+    
+    /* Check if all the kick1 threads have executed. */
+    for ( k = 0 ; k < e->sched.nr_tasks ; k++ )
+        if ( e->sched.tasks[k].type == task_type_kick1 &&
+             e->sched.tasks[k].tic == 0 )
+            error( "Not all kick1 tasks completed." );
         
     // for(k=0; k<10; ++k)
     //   printParticle(parts, k);
@@ -919,6 +763,10 @@ void engine_init ( struct engine *e , struct space *s , float dt , int nr_thread
     /* Init the scheduler. */
     scheduler_init( &e->sched , e->s , nr_queues , scheduler_flag_steal );
         
+    /* Append a kick1 task to each cell. */
+    scheduler_reset( &e->sched , s->tot_cells );
+    space_map_cells_pre( e->s , 1 , &scheduler_map_mkkick1 , &e->sched );
+    
     /* Allocate and init the threads. */
     if ( ( e->runners = (struct runner *)malloc( sizeof(struct runner) * nr_threads ) ) == NULL )
         error( "Failed to allocate threads array." );
diff --git a/src/engine.h b/src/engine.h
index 8efae350763a2e5133aa2199ec52207ca2e6b981..ade67006c7c1b1ebdd6f28dd2b083493acb4bae1 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -29,6 +29,7 @@
 #define engine_policy_multistep     32
 
 #define engine_queue_scale          1.2
+#define engine_maxtaskspercell      32
 
 
 /* Data structure for the engine. */
diff --git a/src/runner.c b/src/runner.c
index 90c34007d66183a77793157fa6bfbef52c6baa8d..6272236984af45c1930616f09c13ce08d72b7873 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -607,6 +607,138 @@ void runner_dokick2 ( struct runner *r , struct cell *c ) {
     }
 
 
+/**
+ * @brief Mapping function to set dt_min and dt_max, do the first
+ * kick.
+ */
+
+void runner_dokick1 ( struct runner *r , struct cell *c ) {
+
+    int j, k;
+    struct engine *e = r->e;
+    float pdt, dt_step = e->dt_step, dt = e->dt, hdt = 0.5f*dt;
+    float dt_min, dt_max, h_max, dx, dx_max;
+    float a[3], v[3], u, u_dt, h, h_dt, v_old[3], w, rho;
+    double x[3], x_old[3];
+    struct part *restrict p;
+    struct xpart *restrict xp;
+
+    /* No children? */
+    if ( !c->split ) {
+    
+        /* Init the min/max counters. */
+        dt_min = FLT_MAX;
+        dt_max = 0.0f;
+        h_max = 0.0f;
+        dx_max = 0.0f;
+    
+        /* Loop over parts. */
+        for ( k = 0 ; k < c->count ; k++ ) {
+            
+            /* Get a handle on the kth particle. */
+            p = &c->parts[k];
+            xp = p->xtras;
+            
+            /* Load the data locally. */
+            a[0] = p->a[0]; a[1] = p->a[1]; a[2] = p->a[2];
+            v[0] = p->v[0]; v[1] = p->v[1]; v[2] = p->v[2];
+            x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
+            x_old[0] = xp->x_old[0]; x_old[1] = xp->x_old[1]; x_old[2] = xp->x_old[2];
+            h = p->h;
+            u = p->u;
+            h_dt = p->force.h_dt;
+            u_dt = p->force.u_dt;
+            pdt = p->dt;
+            
+            /* Store the min/max dt. */
+            dt_min = fminf( dt_min , pdt );
+            dt_max = fmaxf( dt_max , pdt );
+            
+            /* Step and store the velocity and internal energy. */
+            xp->v_old[0] = v_old[0] = v[0] + hdt * a[0];
+            xp->v_old[1] = v_old[1] = v[1] + hdt * a[1];
+            xp->v_old[2] = v_old[2] = v[2] + hdt * a[2];
+            xp->u_old = p->u + hdt * p->force.u_dt;
+
+            /* Move the particles with the velocitie at the half-step. */
+            p->x[0] = x[0] += dt * v_old[0];
+            p->x[1] = x[1] += dt * v_old[1];
+            p->x[2] = x[2] += dt * v_old[2];
+            dx = sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) +
+                        (x[1] - x_old[1])*(x[1] - x_old[1]) +
+                        (x[2] - x_old[2])*(x[2] - x_old[2]) );
+            dx_max = fmaxf( dx_max , dx );
+
+            /* Update positions and energies at the half-step. */
+            p->v[0] = v[0] += dt * a[0];
+            p->v[1] = v[1] += dt * a[1];
+            p->v[2] = v[2] += dt * a[2];
+            w = u_dt / u * dt;
+            if ( fabsf( w ) < 0.01f )
+                p->u = u *= 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) );
+            else
+                p->u = u *= expf( w );
+            w = h_dt / h * dt;
+            if ( fabsf( w ) < 0.01f )
+                p->h = h *= 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) );
+            else
+                p->h = h *= expf( w );
+            h_max = fmaxf( h_max , h );
+
+        
+            /* Integrate other values if this particle will not be updated. */
+            /* Init fields for density calculation. */
+            if ( pdt > dt_step ) {
+                float w = -3.0f * h_dt / h * dt;
+                if ( fabsf( w ) < 0.1f )
+                    rho = p->rho *= 1.0f + w*( 1.0f + w*( 0.5f + w*(1.0f/6.0f + 1.0f/24.0f*w ) ) );
+                else
+                    rho = p->rho *= expf( w );
+                p->force.POrho2 = u * ( const_hydro_gamma - 1.0f ) / ( rho * xp->omega );
+                }
+            else {
+                p->density.wcount = 0.0f;
+                p->density.wcount_dh = 0.0f;
+                p->rho = 0.0f;
+                p->rho_dh = 0.0f;
+	            p->density.div_v = 0.0f;
+	            for ( j = 0 ; j < 3 ; ++j)
+	                p->density.curl_v[j] = 0.0f;
+                }
+                
+            }
+            
+        }
+        
+    /* Otherwise, agregate data from children. */
+    else {
+    
+        /* Init with the first non-null child. */
+        dt_min = FLT_MAX;
+        dt_max = 0.0f;
+        h_max = 0.0f;
+        dx_max = 0.0f;
+        
+        /* Loop over the progeny. */
+        for ( k = 0 ; k < 8 ; k++ )
+            if ( c->progeny[k] != NULL ) {
+                dt_min = fminf( dt_min , c->progeny[k]->dt_min );
+                dt_max = fmaxf( dt_max , c->progeny[k]->dt_max );
+                h_max = fmaxf( h_max , c->progeny[k]->h_max );
+                dx_max = fmaxf( dx_max , c->progeny[k]->dx_max );
+                }
+    
+        }
+
+    /* Store the values. */
+    c->dt_min = dt_min;
+    c->dt_max = dt_max;
+    c->h_max = h_max;
+    c->dx_max = dx_max;
+    
+    }
+
+
 /**
  * @brief The #runner main thread routine.
  *
@@ -679,6 +811,9 @@ void *runner_main ( void *data ) {
                     if ( ci->super == ci )
                         runner_doghost( r , ci );
                     break;
+                case task_type_kick1:
+                    runner_dokick1( r , ci );
+                    break;
                 case task_type_kick2:
                     runner_dokick2( r , ci );
                     break;
diff --git a/src/scheduler.c b/src/scheduler.c
index 9864191fa419f8ed166585441e718d08dcb9df1d..76c495c7ee6e16830515cb9adc900e76dab9fb8f 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -71,7 +71,12 @@ void scheduler_map_mkghosts ( struct cell *c , void *data ) {
     if ( c->super != c || c->nr_tasks > 0 )
         c->ghost = scheduler_addtask( s , task_type_ghost , task_subtype_none , 0 , 0 , c , NULL , 0 );
 
-    /* Append a kick task if we are the active super cell. */
+    /* Append a kick1 task and make sure the parent depends on it. */
+    c->kick1 = scheduler_addtask( s , task_type_kick1 , task_subtype_none , 0 , 0 , c , NULL , 0 );
+    if ( c->parent != NULL )
+        task_addunlock( c->kick1 , c->parent->kick1 );
+    
+    /* Append a kick2 task if we are the active super cell. */
     if ( c->super == c && c->nr_tasks > 0 )
         c->kick2 = scheduler_addtask( s , task_type_kick2 , task_subtype_none , 0 , 0 , c , NULL , 0 );
     
@@ -83,6 +88,29 @@ void scheduler_map_mkghosts ( struct cell *c , void *data ) {
     }
 
 
+/**
+ * @brief Mapping function to append a ghost task to each cell.
+ *
+ * A kick1-task is appended to each cell.
+ */
+
+void scheduler_map_mkkick1 ( struct cell *c , void *data ) {
+
+    struct scheduler *s = (struct scheduler *)data;
+    struct cell *finger;
+
+    /* Append a kick1 task and make sure the parent depends on it. */
+    c->kick1 = scheduler_addtask( s , task_type_kick1 , task_subtype_none , 0 , 0 , c , NULL , 0 );
+    if ( c->parent != NULL )
+        task_addunlock( c->kick1 , c->parent->kick1 );
+        
+    /* Set a bogus super cell. */
+    for ( finger = c ; finger->parent != NULL ; finger = finger->parent );
+    c->super = finger;
+    
+    }
+
+
 /**
  * @brief Split tasks that may be too large.
  *
@@ -525,7 +553,7 @@ void scheduler_reset ( struct scheduler *s , int size ) {
  * @param s The #scheduler.
  */
  
-void scheduler_start ( struct scheduler *s ) {
+void scheduler_start ( struct scheduler *s , unsigned int mask ) {
 
     int k, j, *tid = s->tasks_ind;
     struct task *t, *tasks = s->tasks;
@@ -535,7 +563,7 @@ void scheduler_start ( struct scheduler *s ) {
     // #pragma omp parallel for schedule(static) private(t,j)
     for ( k = s->nr_tasks-1 ; k >= 0 ; k-- ) {
         t = &tasks[ tid[k] ];
-        if ( !t->skip ) {
+        if ( ( (1 << t->type) & mask ) && !t->skip ) {
             for ( j = 0 ; j < t->nr_unlock_tasks ; j++ )
                 atomic_inc( &t->unlock_tasks[j]->wait );
             t->maxdepth = 0;
@@ -570,6 +598,7 @@ void scheduler_start ( struct scheduler *s ) {
                         if ( t->ci == t->ci->super )
                             t->weight += t->ci->count;
                         break;
+                    case task_type_kick1:
                     case task_type_kick2:
                         t->weight += t->ci->count;
                         break;
@@ -580,7 +609,7 @@ void scheduler_start ( struct scheduler *s ) {
     /* Loop over the tasks and enqueue whoever is ready. */
     for ( k = 0 ; k < s->nr_tasks ; k++ ) {
         t = &s->tasks[k];
-        if ( !t->skip && t->wait == 0 )
+        if ( ( (1 << t->type) & mask ) && !t->skip && t->wait == 0 )
             scheduler_enqueue( s , t );
         }
         
@@ -607,6 +636,7 @@ void scheduler_enqueue ( struct scheduler *s , struct task *t ) {
         case task_type_self:
         case task_type_sort:
         case task_type_ghost:
+        case task_type_kick1:
         case task_type_kick2:
             qid = t->ci->super->owner;
             break;
@@ -617,6 +647,8 @@ void scheduler_enqueue ( struct scheduler *s , struct task *t ) {
                  ( qid < 0 || s->queues[qid].count > s->queues[t->cj->super->owner].count ) )
                 qid = t->cj->super->owner;
             break;
+        default:
+            qid = -1;
         }
         
     /* If no previous owner, find the shortest queue. */
diff --git a/src/scheduler.h b/src/scheduler.h
index b4c7b871e0081994addbc547fcf4544019ef8dfb..b20c7ecaf793bc0dceb6d29ad4f74d0423ba7067 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -71,10 +71,11 @@ struct scheduler {
 void scheduler_init ( struct scheduler *s , struct space *space , int nr_queues , unsigned int flags );
 struct task *scheduler_gettask ( struct scheduler *s , int qid );
 void scheduler_enqueue ( struct scheduler *s , struct task *t );
-void scheduler_start ( struct scheduler *s );
+void scheduler_start ( struct scheduler *s , unsigned int mask );
 void scheduler_reset ( struct scheduler *s , int nr_tasks );
 void scheduler_ranktasks ( struct scheduler *s );
 struct task *scheduler_addtask ( struct scheduler *s , int type , int subtype , int flags , int wait , struct cell *ci , struct cell *cj , int tight );
 void scheduler_splittasks ( struct scheduler *s );
 void scheduler_map_mkghosts ( struct cell *c , void *data );
+void scheduler_map_mkkick1 ( struct cell *c , void *data );
 void scheduler_done ( struct scheduler *s , struct task *t );
diff --git a/src/space.h b/src/space.h
index 6e31cd4397da07c01100e3e1766e33161b76bc4b..6995c7f34d207a597b573885b3b6055bc081a763 100644
--- a/src/space.h
+++ b/src/space.h
@@ -27,7 +27,6 @@
 #define space_splitsize_default         400
 #define space_subsize_default           5000
 #define space_stretch                   1.05f
-#define space_maxtaskspercell           31
 #define space_maxreldx                  0.2f
 
 
diff --git a/src/task.c b/src/task.c
index f75f96312e6b9fc2508bec63df4059b76131632d..1f27bc08401e609208ff85cef4c0afba2ce32ae8 100644
--- a/src/task.c
+++ b/src/task.c
@@ -39,7 +39,7 @@
 #include "error.h"
 
 /* Task type names. */
-const char *taskID_names[task_type_count] = { "none" , "sort" , "self" , "pair" , "sub" , "ghost" , "kick2" };
+const char *taskID_names[task_type_count] = { "none" , "sort" , "self" , "pair" , "sub" , "ghost" , "kick1" , "kick2" };
 
 
 /**
diff --git a/src/task.h b/src/task.h
index fb7192744c8f2f010f51483bf15af8bbbea5a5e5..1eb7fad118bcd3d5f1216f37554f226605f10459 100644
--- a/src/task.h
+++ b/src/task.h
@@ -31,6 +31,7 @@ enum task_types {
     task_type_pair,
     task_type_sub,
     task_type_ghost,
+    task_type_kick1,
     task_type_kick2,
     task_type_count
     };