diff --git a/src/engine.c b/src/engine.c
index bb0d886fb65d026fa4c4f277b569516eed76a96a..10aa28cba15df61232373b4c1c12ea5a4633fa4a 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -122,14 +122,11 @@ void engine_maketasks ( struct engine *e ) {
         if ( t->skip )
             continue;
         if ( t->type == task_type_sort && t->ci->split )
-            for ( j = 0 ; j < 8 ; j++ ) {
-                if ( t->ci->progeny[j] != NULL ) {
-                    if ( t->ci->progeny[j]->sorts == NULL )
-                        t->ci->progeny[j]->sorts = scheduler_addtask( sched , task_type_sort , task_subtype_none , t->flags , 0 , t->ci->progeny[j] , NULL , 0 );
+            for ( j = 0 ; j < 8 ; j++ )
+                if ( t->ci->progeny[j] != NULL && t->ci->progeny[j]->sorts != NULL ) {
                     t->ci->progeny[j]->sorts->skip = 0;
                     task_addunlock( t->ci->progeny[j]->sorts , t );
                     }
-                }
         if ( t->type == task_type_self ) {
             atomic_inc( &t->ci->nr_tasks );
             if ( t->subtype == task_subtype_density ) {
@@ -219,20 +216,6 @@ void engine_maketasks ( struct engine *e ) {
     /* Weight the tasks. */
     scheduler_reweight( sched );
             
-    /* Count the number of each task type. */
-    int counts[ task_type_count+1 ];
-    for ( k = 0 ; k <= task_type_count ; k++ )
-        counts[k] = 0;
-    for ( k = 0 ; k < sched->nr_tasks ; k++ )
-        if ( !sched->tasks[k].skip )
-            counts[ (int)sched->tasks[k].type ] += 1;
-        else
-            counts[ task_type_count ] += 1;
-    printf( "engine_maketasks: task counts are [ %s=%i" , taskID_names[0] , counts[0] );
-    for ( k = 1 ; k < task_type_count ; k++ )
-        printf( " %s=%i" , taskID_names[k] , counts[k] );
-    printf( " skipped=%i ]\n" , counts[ task_type_count ] ); fflush(stdout); 
-    
     }
     
     
@@ -375,7 +358,8 @@ int engine_marktasks ( struct engine *e ) {
  
 void engine_prepare ( struct engine *e ) {
     
-    int rebuild;
+    int k, rebuild;
+    struct scheduler *sched = &e->sched;
     
     TIMER_TIC
 
@@ -403,16 +387,30 @@ void engine_prepare ( struct engine *e ) {
             error( "engine_marktasks failed after space_rebuild." );
         // printf( "engine_prepare: engine_marktasks took %.3f ms.\n" , (double)(getticks() - tic)/CPU_TPS*1000 );
         
+        /* Count the number of each task type. */
+        int counts[ task_type_count+1 ];
+        for ( k = 0 ; k <= task_type_count ; k++ )
+            counts[k] = 0;
+        for ( k = 0 ; k < sched->nr_tasks ; k++ )
+            if ( !sched->tasks[k].skip )
+                counts[ (int)sched->tasks[k].type ] += 1;
+            else
+                counts[ task_type_count ] += 1;
+        printf( "engine_prepare: task counts are [ %s=%i" , taskID_names[0] , counts[0] );
+        for ( k = 1 ; k < task_type_count ; k++ )
+            printf( " %s=%i" , taskID_names[k] , counts[k] );
+        printf( " skipped=%i ]\n" , counts[ task_type_count ] ); fflush(stdout); 
+    
         }
 
     /* Start the scheduler. */
     // ticks tic2 = getticks();
-    scheduler_start( &e->sched , (1 << task_type_sort) | 
-                                 (1 << task_type_self) |
-                                 (1 << task_type_pair) | 
-                                 (1 << task_type_sub) |
-                                 (1 << task_type_ghost) | 
-                                 (1 << task_type_kick2) );
+    scheduler_start( sched , (1 << task_type_sort) | 
+                             (1 << task_type_self) |
+                             (1 << task_type_pair) | 
+                             (1 << task_type_sub) |
+                             (1 << task_type_ghost) | 
+                             (1 << task_type_kick2) );
     // printf( "engine_prepare: scheduler_start took %.3f ms.\n" , (double)(getticks() - tic2)/CPU_TPS*1000 );
     
     TIMER_TOC( timer_prepare );
@@ -697,7 +695,7 @@ void engine_step ( struct engine *e ) {
     TIMER_TIC
     engine_launch( e , e->nr_threads );
     TIMER_TOC(timer_runners);
-            
+    
     // engine_single_force( e->s->dim , 8328423931905 , e->s->parts , e->s->nr_parts , e->s->periodic );
     
     // for(k=0; k<10; ++k)
@@ -729,7 +727,6 @@ void engine_step ( struct engine *e ) {
     // printf( "engine_step: etot is %e (ekin=%e, epot=%e).\n" , ekin+epot , ekin , epot ); fflush(stdout);
     // printf( "engine_step: total momentum is [ %e , %e , %e ].\n" , mom[0] , mom[1] , mom[2] ); fflush(stdout);
     // printf( "engine_step: total angular momentum is [ %e , %e , %e ].\n" , ang[0] , ang[1] , ang[2] ); fflush(stdout);
-    // printf( "engine_step: total entropic function is %e .\n", ent ); fflush(stdout);
     // printf( "engine_step: updated %i parts (dt_step=%.3e).\n" , count , dt_step ); fflush(stdout);
         
     /* Increase the step. */
@@ -737,36 +734,51 @@ void engine_step ( struct engine *e ) {
 
     /* Does the time step need adjusting? */
     if ( e->policy & engine_policy_fixdt ) {
-        e->dt = e->dt_orig;
+        dt = e->dt_orig;
         }
     else {
-        if ( e->dt == 0 ) {
+        if ( dt == 0 ) {
             e->nullstep += 1;
-            e->dt = e->dt_orig;
-            while ( dt_min < e->dt )
-                e->dt *= 0.5;
-            while ( dt_min > 2*e->dt )
-                e->dt *= 2.0;
+            if ( e->dt_orig > 0.0 ) {
+                dt = e->dt_orig;
+                while ( dt_min < dt )
+                    dt *= 0.5;
+                while ( dt_min > 2*dt )
+                    dt *= 2.0;
+                }
+            else
+                dt = dt_min;
+            for ( k = 0 ; k < s->nr_parts ; k++ ) {
+                /* struct part *p = &s->parts[k];
+                struct xpart *xp = &s->xparts[k];
+                float dt_curr = dt;
+                for ( int j = (int)( p->dt / dt ) ; j > 1 ; j >>= 1 )
+                    dt_curr *= 2.0f; 
+                xp->dt_curr = dt_curr; */
+                s->parts[k].dt = dt;
+                s->xparts[k].dt_curr = dt;
+                }
             // printf( "engine_step: dt_min=%.3e, adjusting time step to dt=%e.\n" , dt_min , e->dt );
             }
         else {
-            while ( dt_min < e->dt ) {
-                e->dt *= 0.5;
+            while ( dt_min < dt ) {
+                dt *= 0.5;
                 e->step *= 2;
                 e->nullstep *= 2;
                 // printf( "engine_step: dt_min dropped below time step, adjusting to dt=%e.\n" , e->dt );
                 }
-            while ( dt_min > 2*e->dt && (e->step & 1) == 0 ) {
-                e->dt *= 2.0;
+            while ( dt_min > 2*dt && (e->step & 1) == 0 ) {
+                dt *= 2.0;
                 e->step /= 2;
                 e->nullstep /= 2;
                 // printf( "engine_step: dt_min is larger than twice the time step, adjusting to dt=%e.\n" , e->dt );
                 }
             }
         } 
+    e->dt = dt;
     
     /* Set the system time. */
-    e->time = e->dt * (e->step - e->nullstep);
+    e->time = dt * (e->step - e->nullstep);
         
     TIMER_TOC2(timer_step);
     
@@ -848,6 +860,7 @@ void engine_init ( struct engine *e , struct space *s , float dt , int nr_thread
     /* Append a kick1 task to each cell. */
     scheduler_reset( &e->sched , s->tot_cells );
     space_map_cells_pre( e->s , 1 , &scheduler_map_mkkick1 , &e->sched );
+    scheduler_ranktasks( &e->sched );
     
     /* Allocate and init the threads. */
     if ( ( e->runners = (struct runner *)malloc( sizeof(struct runner) * nr_threads ) ) == NULL )
diff --git a/src/part.h b/src/part.h
index d5e5bb54e22e52ff44e161dce34fcffa08f9a0c2..7c503486c1069ff723d218dd705a7cc82a42943a 100644
--- a/src/part.h
+++ b/src/part.h
@@ -31,11 +31,11 @@ struct xpart {
     /* Old position, at last tree rebuild. */
     double x_old[3];
     
-    /* Old velocity. */
-    float v_old[3];
+    /* Velocity at the half-step. */
+    float v_hdt[3];
     
-    /* Old entropy. */
-    float u_old;
+    /* Entropy at the half-step. */
+    float u_hdt;
     
     /* Old density. */
     float omega;
diff --git a/src/queue.c b/src/queue.c
index 918b4c108252a7a9954277e9f46a592581a423f7..7b66ccbeeb67b2f75e63ab4cd5bcf3bdb2fc5b1f 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -59,34 +59,45 @@ int queue_counter[ queue_counter_count ];
  
 void queue_insert ( struct queue *q , struct task *t ) {
 
+    int k, *tid;
+    struct task *tasks;
+
     /* Lock the queue. */
     if ( lock_lock( &q->lock ) != 0 )
         error( "Failed to get queue lock." );
         
+    tid = q->tid;
+    tasks = q->tasks;
+
     /* Does the queue need to be grown? */
     if ( q->count == q->size ) {
         int *temp;
         q->size *= queue_sizegrow;
         if ( ( temp = (int *)malloc( sizeof(int) * q->size ) ) == NULL )
             error( "Failed to allocate new indices." );
-        memcpy( temp , q->tid , sizeof(int) * q->count );
-        free( q->tid );
-        q->tid = temp;
+        memcpy( temp , tid , sizeof(int) * q->count );
+        free( tid );
+        q->tid = tid = temp;
         }
         
     /* Drop the task at the end of the queue. */
-    q->tid[ q->count ] = ( t - q->tasks );
+    tid[ q->count ] = ( t - tasks );
     q->count += 1;
     
     /* Shuffle up. */
-    for ( int k = q->count - 1 ; k > 0 ; k = (k-1)/2 )
-        if ( q->tasks[ q->tid[k] ].weight > q->tasks[ q->tid[(k-1)/2] ].weight ) {
-            int temp = q->tid[k];
-            q->tid[k] = q->tid[(k-1)/2];
-            q->tid[(k-1)/2] = temp;
+    for ( k = q->count - 1 ; k > 0 ; k = (k-1)/2 )
+        if ( tasks[ tid[k] ].weight > tasks[ tid[(k-1)/2] ].weight ) {
+            int temp = tid[k];
+            tid[k] = tid[(k-1)/2];
+            tid[(k-1)/2] = temp;
             }
         else
             break;
+            
+    /* Check the queue's consistency. */
+    /* for ( k = 1 ; k < q->count ; k++ )
+        if ( tasks[ tid[(k-1)/2] ].weight < tasks[ tid[k] ].weight )
+            error( "Queue heap is disordered." ); */
     
     /* Unlock the queue. */
     if ( lock_unlock( &q->lock ) != 0 )
@@ -221,6 +232,11 @@ struct task *queue_gettask ( struct queue *q , struct cell *super , int blocking
     else
         res = NULL;
 
+    /* Check the queue's consistency. */
+    /* for ( k = 1 ; k < q->count ; k++ )
+        if ( qtasks[ qtid[(k-1)/2] ].weight < qtasks[ qtid[k] ].weight )
+            error( "Queue heap is disordered." ); */
+    
     /* Release the task lock. */
     if ( lock_unlock( qlock ) != 0 )
         error( "Unlocking the qlock failed.\n" );
diff --git a/src/runner.c b/src/runner.c
index b9c82b19b539d7fed3fbbbbad0c26b4e41cf040e..665ca5262769c4ac5adae5851c72eba6878e7743 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -518,12 +518,12 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
  
 void runner_dokick2 ( struct runner *r , struct cell *c ) {
 
-    int k, count = 0, nr_parts = c->count;
-    float dt_min = FLT_MAX, dt_max = 0.0f;
+    int j, k, count = 0, nr_parts = c->count;
+    float dt_curr, hdt_curr, dt_min = FLT_MAX, dt_max = 0.0f;
     double ekin = 0.0, epot = 0.0;
     float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f };
-    float x[3], v[3], u, h, pdt, m;
-    float dt_step = r->e->dt_step, dt = r->e->dt, hdt = 0.5f*dt;
+    float x[3], v_hdt[3], u_hdt, h, pdt, m;
+    float dt_step = r->e->dt_step, dt = r->e->dt, idt;
     float dt_cfl, dt_h_change, dt_u_change, dt_new;
     float h_dt, u_dt;
     struct part *restrict p, *restrict parts = c->parts;
@@ -531,6 +531,9 @@ void runner_dokick2 ( struct runner *r , struct cell *c ) {
     
     TIMER_TIC
     
+    /* Init idt to avoid compiler stupidity. */
+    idt = ( dt > 0 ) ? 1.0f / dt : 0.0f;
+    
     /* Loop over the particles and kick them. */
     __builtin_prefetch( &parts[0] , 0 , 1 );
     __builtin_prefetch( &parts[0].rho_dh , 0 , 1 );
@@ -552,55 +555,77 @@ void runner_dokick2 ( struct runner *r , struct cell *c ) {
 
         /* Get local copies of particle data. */
         pdt = p->dt;
-        u_dt = p->force.u_dt;
-        h = p->h;
         m = p->mass;
         x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
+        v_hdt[0] = xp->v_hdt[0]; v_hdt[1] = xp->v_hdt[1]; v_hdt[2] = xp->v_hdt[2];
+        u_hdt = xp->u_hdt;
 
-        /* Scale the derivatives if they're freshly computed. */
+        /* Update the particle's data (if active). */
         if ( pdt <= dt_step ) {
-            h_dt = p->force.h_dt *= h * 0.333333333f;
+            
+            /* Increase the number of particles updated. */
             count += 1;
+            
+            /* Scale the derivatives as they're freshly computed. */
+            h = p->h;
+            h_dt = p->force.h_dt *= h * 0.333333333f;
             xp->omega = 1.0f + h * p->rho_dh / p->rho * 0.3333333333f;
+            
+            /* Compute the new time step. */
+            u_dt = p->force.u_dt;
+            dt_cfl = const_cfl * h / p->force.v_sig;
+            dt_h_change = ( h_dt != 0.0f ) ? fabsf( const_ln_max_h_change * h / h_dt ) : FLT_MAX;
+            dt_u_change = ( u_dt != 0.0f ) ? fabsf( const_max_u_change * p->u / u_dt ) : FLT_MAX;
+            dt_new = fminf( dt_cfl , fminf( dt_h_change , dt_u_change ) );
+            if ( pdt == 0.0f )
+                p->dt = pdt = dt_new;
+            else
+                p->dt = pdt = fminf( dt_new , 2.0f*pdt );
+                
+            /* Get the particle-specific time step. */
+            dt_curr = xp->dt_curr;
+            hdt_curr = 0.5f * dt_curr;
+            
+            /* Update positions and energies at the full step. */
+            p->v[0] = v_hdt[0] + hdt_curr * p->a[0];
+            p->v[1] = v_hdt[1] + hdt_curr * p->a[1];
+            p->v[2] = v_hdt[2] + hdt_curr * p->a[2];
+            p->u = u_hdt + hdt_curr * u_dt;
+            xp->v_hdt[0] = ( v_hdt[0] += dt_curr * p->a[0] );
+            xp->v_hdt[1] = ( v_hdt[1] += dt_curr * p->a[1] );
+            xp->v_hdt[2] = ( v_hdt[2] += dt_curr * p->a[2] );
+            xp->u_hdt = ( u_hdt += dt_curr * u_dt );
+            
+            /* Set the new particle-specific time step. */
+            if ( dt > 0.0f ) {
+                dt_curr = dt;
+                j = (int)( pdt * idt );
+                while ( j > 1 ) {
+                    dt_curr *= 2.0f;
+                    j >>= 1;
+                    }
+                xp->dt_curr = dt_curr;
+                }
+            
             }
-        else
-            h_dt = p->force.h_dt;
-
-        /* Update the particle's time step. */
-        dt_cfl = const_cfl * h / p->force.v_sig;
-        dt_h_change = ( h_dt != 0.0f ) ? fabsf( const_ln_max_h_change * h / h_dt ) : FLT_MAX;
-        dt_u_change = ( u_dt != 0.0f ) ? fabsf( const_max_u_change * p->u / u_dt ) : FLT_MAX;
-        dt_new = fminf( dt_cfl , fminf( dt_h_change , dt_u_change ) );
-        if ( pdt == 0.0f )
-            p->dt = pdt = dt_new;
-        else if ( pdt <= dt_step )
-            p->dt = pdt = fminf( dt_new , 2.0f*pdt );
-        else
-            p->dt = pdt = fminf( dt_new , pdt );
-
-        /* Update positions and energies at the half-step. */
-        p->v[0] = ( v[0] = xp->v_old[0] + hdt * p->a[0] );
-        p->v[1] = ( v[1] = xp->v_old[1] + hdt * p->a[1] );
-        p->v[2] = ( v[2] = xp->v_old[2] + hdt * p->a[2] );
-        p->u = ( u = xp->u_old + hdt * u_dt );
 
         /* Get the smallest/largest dt. */
         dt_min = fminf( dt_min , pdt );
         dt_max = fmaxf( dt_max , pdt );
 
         /* Collect total energy. */
-        ekin += 0.5 * m * ( v[0]*v[0] + v[1]*v[1] + v[2]*v[2] );
-        epot += m * u;
+        ekin += 0.5 * m * ( v_hdt[0]*v_hdt[0] + v_hdt[1]*v_hdt[1] + v_hdt[2]*v_hdt[2] );
+        epot += m * u_hdt;
 
         /* Collect momentum */
-        mom[0] += m * v[0];
-        mom[1] += m * v[1];
-        mom[2] += m * v[2];
+        mom[0] += m * v_hdt[0];
+        mom[1] += m * v_hdt[1];
+        mom[2] += m * v_hdt[2];
 
 	    /* Collect angular momentum */
-	    ang[0] += m * ( x[1]*v[2] - x[2]*v[1] );
-	    ang[1] += m * ( x[2]*v[0] - x[0]*v[2] );
-	    ang[2] += m * ( x[0]*v[1] - x[1]*v[0] );
+	    ang[0] += m * ( x[1]*v_hdt[2] - x[2]*v_hdt[1] );
+	    ang[1] += m * ( x[2]*v_hdt[0] - x[0]*v_hdt[2] );
+	    ang[2] += m * ( x[0]*v_hdt[1] - x[1]*v_hdt[0] );
 
 	    /* Collect entropic function */
 	    // lent += u * pow( p->rho, 1.f-const_gamma );
@@ -636,9 +661,9 @@ void runner_dokick1 ( struct runner *r , struct cell *c ) {
 
     int j, k;
     struct engine *e = r->e;
-    float pdt, dt_step = e->dt_step, dt = e->dt, hdt = 0.5f*dt;
+    float pdt, dt_step = e->dt_step, dt = e->dt;
     float dt_min, dt_max, h_max, dx, dx_max;
-    float a[3], v[3], u, u_dt, h, h_dt, v_old[3], w, rho;
+    float a[3], v[3], u, u_dt, h, h_dt, w, rho;
     double x[3], x_old[3];
     struct part *restrict p, *restrict parts = c->parts;
     struct xpart *restrict xp, *restrict xparts = c->xparts;
@@ -686,16 +711,10 @@ void runner_dokick1 ( struct runner *r , struct cell *c ) {
             dt_min = fminf( dt_min , pdt );
             dt_max = fmaxf( dt_max , pdt );
             
-            /* Step and store the velocity and internal energy. */
-            xp->v_old[0] = v_old[0] = v[0] + hdt * a[0];
-            xp->v_old[1] = v_old[1] = v[1] + hdt * a[1];
-            xp->v_old[2] = v_old[2] = v[2] + hdt * a[2];
-            xp->u_old = p->u + hdt * p->force.u_dt;
-
-            /* Move the particles with the velocitie at the half-step. */
-            p->x[0] = x[0] += dt * v_old[0];
-            p->x[1] = x[1] += dt * v_old[1];
-            p->x[2] = x[2] += dt * v_old[2];
+            /* Move the particles with the velocities at the half-step. */
+            p->x[0] = x[0] += dt * xp->v_hdt[0];
+            p->x[1] = x[1] += dt * xp->v_hdt[1];
+            p->x[2] = x[2] += dt * xp->v_hdt[2];
             dx = sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) +
                         (x[1] - x_old[1])*(x[1] - x_old[1]) +
                         (x[2] - x_old[2])*(x[2] - x_old[2]) );
@@ -787,7 +806,7 @@ void runner_dokick ( struct runner *r , struct cell *c , int timer ) {
     float h_max, dx, dx_max;
     double ekin = 0.0, epot = 0.0;
     float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f };
-    float x[3], x_old[3], v[3], v_old[3], a[3], u, u_old, h, pdt, m, w;
+    float x[3], x_old[3], v_hdt[3], a[3], u, u_hdt, h, pdt, m, w;
     float dt = r->e->dt, hdt = 0.5f*dt;
     float dt_cfl, dt_h_change, dt_u_change, dt_new;
     float h_dt, u_dt;
@@ -832,8 +851,8 @@ void runner_dokick ( struct runner *r , struct cell *c , int timer ) {
             x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
             a[0] = p->a[0]; a[1] = p->a[1]; a[2] = p->a[2];
             x_old[0] = xp->x_old[0]; x_old[1] = xp->x_old[1]; x_old[2] = xp->x_old[2];
-            v_old[0] = xp->v_old[0]; v_old[1] = xp->v_old[1]; v_old[2] = xp->v_old[2];
-            u_old = xp->u_old;
+            v_hdt[0] = xp->v_hdt[0]; v_hdt[1] = xp->v_hdt[1]; v_hdt[2] = xp->v_hdt[2];
+            u_hdt = xp->u_hdt;
 
             /* Scale the derivatives if they're freshly computed. */
             h_dt = p->force.h_dt *= h * 0.333333333f;
@@ -854,57 +873,51 @@ void runner_dokick ( struct runner *r , struct cell *c , int timer ) {
             dt_min = fminf( dt_min , pdt );
             dt_max = fmaxf( dt_max , pdt );
 
-            /* Get the instentaneous velocity and internal energy. */
-            v[0] = v_old[0] + hdt * a[0];
-            v[1] = v_old[1] + hdt * a[1];
-            v[2] = v_old[2] + hdt * a[2];
-            u = u_old + hdt * u_dt;
-            
-            /* Collect momentum */
-            mom[0] += m * v[0];
-            mom[1] += m * v[1];
-            mom[2] += m * v[2];
-
-	        /* Collect angular momentum */
-	        ang[0] += m * ( x[1]*v[2] - x[2]*v[1] );
-	        ang[1] += m * ( x[2]*v[0] - x[0]*v[2] );
-	        ang[2] += m * ( x[0]*v[1] - x[1]*v[0] );
-
-            /* Collect total energy. */
-            ekin += 0.5 * m * ( v[0]*v[0] + v[1]*v[1] + v[2]*v[2] );
-            epot += m * u;
-
             /* Step and store the velocity and internal energy. */
-            xp->v_old[0] = ( v_old[0] += dt * a[0] );
-            xp->v_old[1] = ( v_old[1] += dt * a[1] );
-            xp->v_old[2] = ( v_old[2] += dt * a[2] );
-            xp->u_old = u_old + hdt * u_dt;
+            xp->v_hdt[0] = ( v_hdt[0] += dt * a[0] );
+            xp->v_hdt[1] = ( v_hdt[1] += dt * a[1] );
+            xp->v_hdt[2] = ( v_hdt[2] += dt * a[2] );
+            xp->u_hdt = ( u_hdt += dt * u_dt );
 
             /* Move the particles with the velocitie at the half-step. */
-            p->x[0] = x[0] += dt * v_old[0];
-            p->x[1] = x[1] += dt * v_old[1];
-            p->x[2] = x[2] += dt * v_old[2];
+            p->x[0] = x[0] += dt * v_hdt[0];
+            p->x[1] = x[1] += dt * v_hdt[1];
+            p->x[2] = x[2] += dt * v_hdt[2];
             dx = sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) +
                         (x[1] - x_old[1])*(x[1] - x_old[1]) +
                         (x[2] - x_old[2])*(x[2] - x_old[2]) );
             dx_max = fmaxf( dx_max , dx );
 
-            /* Update positions and energies at the half-step. */
-            p->v[0] = v[0] + dt * a[0];
-            p->v[1] = v[1] + dt * a[1];
-            p->v[2] = v[2] + dt * a[2];
-            w = u_dt / u * dt;
+            /* Update positions and energies at the next full step. */
+            p->v[0] = v_hdt[0] + hdt * a[0];
+            p->v[1] = v_hdt[1] + hdt * a[1];
+            p->v[2] = v_hdt[2] + hdt * a[2];
+            w = u_dt / u_hdt * hdt;
             if ( fabsf( w ) < 0.01f )
-                p->u = u * ( 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) ) );
+                p->u = u = u_hdt * ( 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) ) );
             else
-                p->u = u * expf( w );
+                p->u = u = u_hdt * expf( w );
             w = h_dt / h * dt;
             if ( fabsf( w ) < 0.01f )
-                p->h = h * ( 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) ) );
+                p->h = h *= ( 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) ) );
             else
-                p->h = h * expf( w );
+                p->h = h *= expf( w );
             h_max = fmaxf( h_max , h );
 
+            /* Collect momentum */
+            mom[0] += m * v_hdt[0];
+            mom[1] += m * v_hdt[1];
+            mom[2] += m * v_hdt[2];
+
+	        /* Collect angular momentum */
+	        ang[0] += m * ( x[1]*v_hdt[2] - x[2]*v_hdt[1] );
+	        ang[1] += m * ( x[2]*v_hdt[0] - x[0]*v_hdt[2] );
+	        ang[2] += m * ( x[0]*v_hdt[1] - x[1]*v_hdt[0] );
+
+            /* Collect total energy. */
+            ekin += 0.5 * m * ( v_hdt[0]*v_hdt[0] + v_hdt[1]*v_hdt[1] + v_hdt[2]*v_hdt[2] );
+            epot += m * u_hdt;
+
             /* Init fields for density calculation. */
             p->density.wcount = 0.0f;
             p->density.wcount_dh = 0.0f;
diff --git a/src/scheduler.c b/src/scheduler.c
index b281f1db00747cd42ab1da5247deb6fb87ddcd25..a3f711fedf9948183671e80fa8173da9f5106b97 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -541,6 +541,11 @@ void scheduler_ranktasks ( struct scheduler *s ) {
             
         }
         
+    /* Verify that the tasks were ranked correctly. */
+    /* for ( k = 1 ; k < s->nr_tasks ; k++ )
+        if ( tasks[ tid[k-1] ].rank > tasks[ tid[k-1] ].rank )
+            error( "Task ranking failed." ); */
+        
     }
 
 
@@ -656,11 +661,12 @@ void scheduler_start ( struct scheduler *s , unsigned int mask ) {
     struct task *t, *tasks = s->tasks;
     // ticks tic;
     
-    /* Run throught the tasks backwards and set their waits. */
+    /* Run throught the tasks and set their waits. */
     // tic = getticks();
     // #pragma omp parallel for schedule(static) private(t,j)
-    for ( k = nr_tasks-1 ; k >= 0 ; k-- ) {
+    for ( k = nr_tasks - 1 ; k >= 0 ; k-- ) {
         t = &tasks[ tid[k] ];
+        t->wait = 0;
         if ( !( (1 << t->type) & mask ) || t->skip )
             continue;
         for ( j = 0 ; j < t->nr_unlock_tasks ; j++ )
@@ -673,10 +679,13 @@ void scheduler_start ( struct scheduler *s , unsigned int mask ) {
     // #pragma omp parallel for schedule(static) private(t)
     for ( k = 0 ; k < nr_tasks ; k++ ) {
         t = &tasks[ tid[k] ];
-        if ( t->rank > 0 )
-            break;
-        if ( ( (1 << t->type) & mask ) && !t->skip && t->wait == 0 )
-            scheduler_enqueue( s , t );
+        if ( ( (1 << t->type) & mask ) && !t->skip && t->wait == 0 ) {
+            if ( t->implicit )
+                for ( j = 0 ; j < t->nr_unlock_tasks ; j++ )
+                    atomic_dec( &t->unlock_tasks[j]->wait );
+            else
+                scheduler_enqueue( s , t );
+            }
         }
     // printf( "scheduler_start: enqueueing tasks took %.3f ms.\n" , (double)( getticks() - tic ) / CPU_TPS * 1000 );
         
@@ -744,55 +753,6 @@ void scheduler_enqueue ( struct scheduler *s , struct task *t ) {
     }
 
 
-/**
- * @brief Take care of a tasks dependencies.
- *
- * @param s The #scheduler.
- * @param t The finished #task.
- */
- 
-void scheduler_done_old ( struct scheduler *s , struct task *t ) {
-
-    int k, res;
-    struct task *t2;
-    
-    /* Release whatever locks this task held. */
-    if ( !t->implicit )
-        switch ( t->type ) {
-            case task_type_self:
-            case task_type_sort:
-                cell_unlocktree( t->ci );
-                break;
-            case task_type_pair:
-            case task_type_sub:
-                cell_unlocktree( t->ci );
-                if ( t->cj != NULL )
-                    cell_unlocktree( t->cj );
-                break;
-            }
-        
-    /* Loop through the dependencies and add them to a queue if
-       they are ready. */
-    for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) {
-        t2 = t->unlock_tasks[k];
-        if ( ( res = atomic_dec( &t2->wait ) ) < 1 )
-            error( "Negative wait!" );
-        if ( res == 1 && !t2->skip )
-            scheduler_enqueue( s , t2 );
-        }
-        
-    /* Task definitely done. */
-    if ( !t->implicit ) {
-        t->toc = getticks();
-        pthread_mutex_lock( &s->sleep_mutex );
-        atomic_dec( &s->waiting );
-        pthread_cond_broadcast( &s->sleep_cond );
-        pthread_mutex_unlock( &s->sleep_mutex );
-        }
-
-    }
-
-
 /**
  * @brief Take care of a tasks dependencies.
  *
@@ -819,7 +779,7 @@ struct task *scheduler_done ( struct scheduler *s , struct task *t ) {
         t2 = t->unlock_tasks[k];
         if ( ( res = atomic_dec( &t2->wait ) ) < 1 )
             error( "Negative wait!" );
-        if (res == 1 && !t2->skip ) {
+        if ( res == 1 && !t2->skip ) {
             if ( 0 && !t2->implicit &&
                  t2->ci->super == super &&
                  ( next == NULL || t2->weight > next->weight ) &&
diff --git a/src/space.c b/src/space.c
index dcf8a0422fab886b4fc891ffbba540092ba26754..a1a933fe21852704652c4b3370a047623b5cea2e 100644
--- a/src/space.c
+++ b/src/space.c
@@ -856,6 +856,17 @@ void space_init ( struct space *s , double dim[3] , struct part *parts , int N ,
     /* Allocate the xtra parts array. */
     if ( posix_memalign( (void *)&s->xparts , 32 , N * sizeof(struct xpart) ) != 0 )
         error( "Failed to allocate xparts." );
+    bzero( s->xparts , N * sizeof(struct xpart) );
+    
+    /* Initialize the velocities and internal energies. */
+    for ( int k = 0 ; k < N ; k++ ) {
+        struct part *p = &parts[k];
+        struct xpart *xp = &s->xparts[k];
+        xp->v_hdt[0] = p->v[0];
+        xp->v_hdt[1] = p->v[1];
+        xp->v_hdt[2] = p->v[2];
+        xp->u_hdt = p->u;
+        }
         
     /* Init the space lock. */
     if ( lock_init( &s->lock ) != 0 )
diff --git a/src/space.h b/src/space.h
index 7ae8fefce07e09c5bc27faa8b2a0d23bf60f4e57..12f0ad97c96a00142b8254841dfd7335b678b343 100644
--- a/src/space.h
+++ b/src/space.h
@@ -28,7 +28,7 @@
 #define space_subsize_default           5000
 #define space_stretch                   1.10f
 #define space_maxreldx                  0.25f
-#define space_qstack                    1000
+#define space_qstack                    1024
 
 
 /* Convert cell location to ID. */