diff --git a/src/Makefile.am b/src/Makefile.am
index 6dc507a5bfb846fc19cd6d879e1c7721ca23fda9..6f1e27f4d59e81d3755ce27ab4d2cb6a90eee9b3 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -26,8 +26,9 @@ AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) -version-info 0:0:0
 
 # Build the libgadgetsmp library
 lib_LTLIBRARIES = libgadgetsmp.la
-libgadgetsmp_la_SOURCES = space.c runner.c queue.c task.c cell.c
+libgadgetsmp_la_SOURCES = space.c runner.c queue.c task.c cell.c engine.c
 
 # List required headers
-include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h gadgetsmp.h
+include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
+    engine.h gadgetsmp.h
 
diff --git a/src/engine.c b/src/engine.c
new file mode 100644
index 0000000000000000000000000000000000000000..22fd0562c3fd0876288c9a29e4b852f32114c472
--- /dev/null
+++ b/src/engine.c
@@ -0,0 +1,293 @@
+/*******************************************************************************
+ * This file is part of GadgetSMP.
+ * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ * 
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * 
+ ******************************************************************************/
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <omp.h>
+#include <sched.h>
+
+/* Local headers. */
+#include "cycle.h"
+#include "const.h"
+#include "lock.h"
+#include "task.h"
+#include "part.h"
+#include "cell.h"
+#include "space.h"
+#include "queue.h"
+#include "engine.h"
+#include "runner.h"
+#include "runner_iact.h"
+
+/* Error macro. */
+#define error(s) { printf( "%s:%s:%i: %s\n" , __FILE__ , __FUNCTION__ , __LINE__ , s ); abort(); }
+
+/* Convert cell location to ID. */
+#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) )
+
+
+/** 
+ * @brief Sort the tasks in topological order over all queues.
+ *
+ * @param e The #engine.
+ */
+ 
+void engine_ranktasks ( struct engine *e ) {
+
+    int i, j = 0, k, temp, left = 0, rank;
+    struct task *t;
+    struct space *s = e->s;
+    int *tid;
+
+    /* Run throught the tasks and get all the waits right. */
+    for ( k = 0 ; k < s->nr_tasks ; k++ ) {
+        for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ )
+            s->tasks[k].unlock_tasks[j]->wait += 1;
+        }
+        
+    /* Allocate and init the task-ID array. */
+    if ( ( tid = (int *)malloc( sizeof(int) * s->nr_tasks ) ) == NULL )
+        error( "Failed to allocate temporary tid array." );
+    for ( k = 0 ; k < s->nr_tasks ; k++ )
+        tid[k] = k;
+        
+    /* Main loop. */
+    for ( rank = 0 ; left < s->nr_tasks ; rank++ ) {
+        
+        /* Load the tids of tasks with no waits. */
+        for ( k = left ; k < s->nr_tasks ; k++ )
+            if ( s->tasks[ tid[k] ].wait == 0 ) {
+                temp = tid[j]; tid[j] = tid[k]; tid[k] = temp;
+                j += 1;
+                }
+
+        /* Traverse the task tree and add tasks with no weight. */
+        for ( i = left ; i < j ; i++ ) {
+            t = &s->tasks[ tid[i] ];
+            t->rank = rank;
+            s->tasks_ind[i] = t - s->tasks;
+            /* printf( "engine_ranktasks: task %i of type %s has rank %i.\n" , i , 
+                (t->type == task_type_self) ? "self" : (t->type == task_type_pair) ? "pair" : "sort" , rank ); */
+            for ( k = 0 ; k < t->nr_unlock_tasks ; k++ )
+                t->unlock_tasks[k]->wait -= 1;
+            }
+            
+        /* The new left (no, not tony). */
+        left = j;
+            
+        }
+        
+    /* Release the temporary array. */
+    free(tid);
+    
+    }
+
+
+/**
+ * @brief Implements a barrier for the #runner threads.
+ *
+ * @param e The #engine.
+ */
+ 
+void engine_barrier( struct engine *e ) {
+
+    /* First, get the barrier mutex. */
+    if ( pthread_mutex_lock( &e->barrier_mutex ) != 0 )
+        error( "Failed to get barrier mutex." );
+        
+    /* Wait for the barrier to close. */
+    while ( e->barrier_count < 0 )
+        if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 )
+            error( "Eror waiting for barrier to close." );
+        
+    /* Once I'm in, increase the barrier count. */
+    e->barrier_count += 1;
+    
+    /* If all threads are in, send a signal... */
+    if ( e->barrier_count == e->nr_threads )
+        if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 )
+            error( "Failed to broadcast barrier full condition." );
+        
+    /* Wait for barrier to be released. */
+    while ( e->barrier_count > 0 )
+        if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 )
+            error( "Error waiting for barrier to be released." );
+            
+    /* Decrease the counter before leaving... */
+    e->barrier_count += 1;
+    
+    /* If I'm the last one out, signal the condition again. */
+    if ( e->barrier_count == 0 )
+        if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 )
+            error( "Failed to broadcast empty barrier condition." );
+            
+    /* Last but not least, release the mutex. */
+    if ( pthread_mutex_unlock( &e->barrier_mutex ) != 0 )
+        error( "Failed to get unlock the barrier mutex." );
+
+    }
+    
+    
+/**
+ * @brief Let the #engine loose to compute the forces.
+ *
+ * @param e The #engine.
+ * @param sort_queues Flag to try to sort the queues topologically.
+ */
+ 
+void engine_run ( struct engine *e , int sort_queues ) {
+
+    int j, k;
+    struct space *s = e->s;
+    
+    /* Run throught the tasks and get all the waits right. */
+    for ( k = 0 ; k < s->nr_tasks ; k++ ) {
+        s->tasks[k].done = 0;
+        for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ )
+            s->tasks[k].unlock_tasks[j]->wait += 1;
+        for ( j = 0 ; j < s->tasks[k].nr_unlock_cells ; j++ )
+            s->tasks[k].unlock_cells[j]->wait += 1;
+        }
+    
+    /* Re-set the queues.*/
+    if ( sort_queues ) {
+        #pragma omp parallel for default(none), shared(e)
+        for ( k = 0 ; k < e->nr_queues ; k++ ) {
+            queue_sort( &e->queues[k] );
+            e->queues[k].next = 0;
+            }
+        }
+    else
+        for ( k = 0 ; k < e->nr_queues ; k++ )
+            e->queues[k].next = 0;
+    
+    /* Cry havoc and let loose the dogs of war. */
+    e->barrier_count = -e->barrier_count;
+    if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 )
+        error( "Failed to broadcast barrier open condition." );
+        
+    /* Sit back and wait for the runners to come home. */
+    while ( e->barrier_count < e->nr_threads )
+        if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 )
+            error( "Error while waiting for barrier." );
+    
+    }
+    
+    
+/**
+ * @brief init an engine with the given number of threads, queues, and
+ *      the given policy.
+ *
+ * @param e The #engine.
+ * @param s The #space in which this #runner will run.
+ * @param nr_threads The number of threads to spawn.
+ * @param nr_queues The number of task queues to create.
+ * @param policy The queueing policy to use.
+ */
+ 
+void engine_init ( struct engine *e , struct space *s , int nr_threads , int nr_queues , int policy ) {
+
+    #if defined(HAVE_SETAFFINITY)
+        cpu_set_t cpuset;
+    #endif
+    int k, qid, nrq;
+    
+    /* Store the values. */
+    e->s = s;
+    e->nr_threads = nr_threads;
+    e->nr_queues = nr_queues;
+    e->policy = policy;
+    
+    /* First of all, init the barrier and lock it. */
+    if ( pthread_mutex_init( &e->barrier_mutex , NULL ) != 0 )
+        error( "Failed to initialize barrier mutex." );
+    if ( pthread_cond_init( &e->barrier_cond , NULL ) != 0 )
+        error( "Failed to initialize barrier condition variable." );
+    if ( pthread_mutex_lock( &e->barrier_mutex ) != 0 )
+        error( "Failed to lock barrier mutex." );
+    e->barrier_count = 0;
+    
+    /* Allocate the queues. */
+    if ( posix_memalign( (void *)(&e->queues) , 64 , nr_queues * sizeof(struct queue) ) != 0 )
+        error( "Failed to allocate queues." );
+    bzero( e->queues , nr_queues * sizeof(struct queue) );
+        
+    /* Init the queues. */
+    for ( k = 0 ; k < nr_queues ; k++ )
+        queue_init( &e->queues[k] , s->nr_tasks , s->tasks );
+        
+    /* Rank the tasks in topological order. */
+    engine_ranktasks( e );
+    
+    /* How many queues to fill initially? */
+    for ( nrq = 0 , k = nr_queues ; k > 0 ; k = k / 2 )
+        nrq += 1;
+        
+    /* Fill the queues (round-robin). */
+    for ( k = 0 ; k < s->nr_tasks ; k++ ) {
+        if ( s->tasks[ s->tasks_ind[k] ].type == task_type_none )
+            continue;
+        // qid = 0;
+        // qid = k % nrq;
+        qid = k % nr_queues;
+        e->queues[qid].tid[ e->queues[qid].count ] = s->tasks_ind[k];
+        e->queues[qid].count += 1;
+        }
+        
+    /* Sort the queues topologically. */
+    for ( k = 0 ; k < nr_queues ; k++ )
+        queue_sort( &e->queues[k] );
+        
+    /* Allocate and init the threads. */
+    if ( ( e->runners = (struct runner *)malloc( sizeof(struct runner) * nr_threads ) ) == NULL )
+        error( "Failed to allocate threads array." );
+    for ( k = 0 ; k < nr_threads ; k++ ) {
+        e->runners[k].id = k;
+        e->runners[k].e = e;
+        if ( pthread_create( &e->runners[k].thread , NULL , &runner_main , &e->runners[k] ) != 0 )
+            error( "Failed to create runner thread." );
+        #if defined(HAVE_SETAFFINITY)
+            /* Set the cpu mask to zero | e->id. */
+            CPU_ZERO( &cpuset );
+            CPU_SET( e->runners[k].id , &cpuset );
+
+            /* Apply this mask to the runner's pthread. */
+            if ( pthread_setaffinity_np( e->runners[k].thread , sizeof(cpu_set_t) , &cpuset ) != 0 )
+                error( "Failed to set thread affinity." );
+        #endif
+        }
+        
+    /* Wait for the runner threads to be in place. */
+    while ( e->barrier_count != e->nr_threads )
+        if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 )
+            error( "Error while waiting for runner threads to get in place." );
+    
+    }
+    
+    
+    
diff --git a/src/engine.h b/src/engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2ca98abfe3ee5d687accfaaa51f78e4d593b611
--- /dev/null
+++ b/src/engine.h
@@ -0,0 +1,65 @@
+/*******************************************************************************
+ * This file is part of GadgetSMP.
+ * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ * 
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * 
+ ******************************************************************************/
+
+
+
+/* Some constants. */
+#define engine_policy_none          0
+#define engine_policy_rand          1
+#define engine_policy_steal         2
+#define engine_policy_keep          4
+#define engine_policy_block         8
+
+#define engine_queue_scale          1.2
+
+
+/* Data structure for the engine. */
+struct engine {
+
+    /* Number of threads on which to run. */
+    int nr_threads;
+    
+    /* The space with which the runner is associated. */
+    struct space *s;
+    
+    /* The runner's threads. */
+    struct runner *runners;
+    
+    /* The running policy. */
+    int policy;
+    
+    /* The number of queues. */
+    int nr_queues;
+    
+    /* The queues. */
+    struct queue *queues;
+    
+    /* Data for the threads' barrier. */
+    pthread_mutex_t barrier_mutex;
+    pthread_cond_t barrier_cond;
+    int barrier_count;
+    
+    };
+
+
+/* Function prototypes. */
+void engine_barrier( struct engine *e );
+void engine_init ( struct engine *e , struct space *s , int nr_threads , int nr_queues , int policy );
+void engine_ranktasks ( struct engine *e );
+void engine_run ( struct engine *e , int sort_queues );
diff --git a/src/gadgetsmp.h b/src/gadgetsmp.h
index 866413474718a9bc9c5bfb1d1c050d1e2f4cb3a9..3193cc8200cab4906a027faa775659e128cf3d62 100644
--- a/src/gadgetsmp.h
+++ b/src/gadgetsmp.h
@@ -30,5 +30,6 @@
 #include "space.h"
 #include "queue.h"
 #include "runner_iact.h"
+#include "engine.h"
 #include "runner.h"
 
diff --git a/src/runner.c b/src/runner.c
index dd38394b7c57dcfae922bc510e6be2fee15dfb23..5d945986a4a2aedfb1b29a30e2bfdd92dff25667 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -40,6 +40,7 @@
 #include "cell.h"
 #include "space.h"
 #include "queue.h"
+#include "engine.h"
 #include "runner.h"
 #include "runner_iact.h"
 
@@ -85,63 +86,6 @@ const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1
 #include "runner_doiact.h"
 
 
-/** 
- * @brief Sort the tasks in topological order over all queues.
- *
- * @param r The #runner.
- */
- 
-void runner_ranktasks ( struct runner *r ) {
-
-    int i, j = 0, k, temp, left = 0, rank;
-    struct task *t;
-    struct space *s = r->s;
-    int *tid;
-
-    /* Run throught the tasks and get all the waits right. */
-    for ( k = 0 ; k < s->nr_tasks ; k++ ) {
-        for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ )
-            s->tasks[k].unlock_tasks[j]->wait += 1;
-        }
-        
-    /* Allocate and init the task-ID array. */
-    if ( ( tid = (int *)malloc( sizeof(int) * s->nr_tasks ) ) == NULL )
-        error( "Failed to allocate temporary tid array." );
-    for ( k = 0 ; k < s->nr_tasks ; k++ )
-        tid[k] = k;
-        
-    /* Main loop. */
-    for ( rank = 0 ; left < s->nr_tasks ; rank++ ) {
-        
-        /* Load the tids of tasks with no waits. */
-        for ( k = left ; k < s->nr_tasks ; k++ )
-            if ( s->tasks[ tid[k] ].wait == 0 ) {
-                temp = tid[j]; tid[j] = tid[k]; tid[k] = temp;
-                j += 1;
-                }
-
-        /* Traverse the task tree and add tasks with no weight. */
-        for ( i = left ; i < j ; i++ ) {
-            t = &s->tasks[ tid[i] ];
-            t->rank = rank;
-            s->tasks_ind[i] = t - s->tasks;
-            /* printf( "runner_ranktasks: task %i of type %s has rank %i.\n" , i , 
-                (t->type == task_type_self) ? "self" : (t->type == task_type_pair) ? "pair" : "sort" , rank ); */
-            for ( k = 0 ; k < t->nr_unlock_tasks ; k++ )
-                t->unlock_tasks[k]->wait -= 1;
-            }
-            
-        /* The new left (no, not tony). */
-        left = j;
-            
-        }
-        
-    /* Release the temporary array. */
-    free(tid);
-    
-    }
-
-
 /**
  * @brief Sort the entries in ascending order using QuickSort.
  *
@@ -282,7 +226,7 @@ inline void merge_backward ( struct entry *__restrict__ one , int none , struct
  * @param c The #cell.
  */
  
-void runner_dosort ( struct runner_thread *rt , struct cell *c , int flags ) {
+void runner_dosort ( struct runner *r , struct cell *c , int flags ) {
 
     struct entry *finger;
     struct entry *fingers[8];
@@ -478,7 +422,7 @@ void runner_dosort ( struct runner_thread *rt , struct cell *c , int flags ) {
 
     #ifdef TIMER_VERBOSE
         printf( "runner_dosort[%02i]: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms.\n" ,
-            rt->id , c->count , c->depth ,
+            r->id , c->count , c->depth ,
             (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , 
             ((double)TIMER_TOC(runner_timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout);
     #else
@@ -491,20 +435,16 @@ void runner_dosort ( struct runner_thread *rt , struct cell *c , int flags ) {
 /**
  * @brief Intermediate task between density and force
  *
- * @param rt The runner thread.
+ * @param r The runner thread.
  * @param ci THe cell.
  */
  
-void runner_doghost ( struct runner_thread *r , struct cell *c ) {
+void runner_doghost ( struct runner *r , struct cell *c ) {
 
     struct part *p;
     int i, k;
     TIMER_TIC
     
-    /* If this cell has progeny, don't bother. */
-    if ( c->split )
-        return;
-    
     /* Loop over the parts in this cell. */
     for ( i = 0 ; i < c->count ; i++ ) {
     
@@ -529,7 +469,7 @@ void runner_doghost ( struct runner_thread *r , struct cell *c ) {
 
     #ifdef TIMER_VERBOSE
         printf( "runner_doghost[%02i]: %i parts at depth %i took %.3f ms.\n" ,
-            rt->id , c->count , c->depth ,
+            r->id , c->count , c->depth ,
             ((double)TIMER_TOC(runner_timer_doghost)) / CPU_TPS * 1000 ); fflush(stdout);
     #else
         TIMER_TOC(runner_timer_doghost);
@@ -538,50 +478,6 @@ void runner_doghost ( struct runner_thread *r , struct cell *c ) {
     }
 
 
-/**
- * @brief Implements a barrier for the #runner threads.
- *
- */
- 
-void runner_barrier( struct runner *r ) {
-
-    /* First, get the barrier mutex. */
-    if ( pthread_mutex_lock( &r->barrier_mutex ) != 0 )
-        error( "Failed to get barrier mutex." );
-        
-    /* Wait for the barrier to close. */
-    while ( r->barrier_count < 0 )
-        if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 )
-            error( "Eror waiting for barrier to close." );
-        
-    /* Once I'm in, increase the barrier count. */
-    r->barrier_count += 1;
-    
-    /* If all threads are in, send a signal... */
-    if ( r->barrier_count == r->nr_threads )
-        if ( pthread_cond_broadcast( &r->barrier_cond ) != 0 )
-            error( "Failed to broadcast barrier full condition." );
-        
-    /* Wait for barrier to be released. */
-    while ( r->barrier_count > 0 )
-        if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 )
-            error( "Error waiting for barrier to be released." );
-            
-    /* Decrease the counter before leaving... */
-    r->barrier_count += 1;
-    
-    /* If I'm the last one out, signal the condition again. */
-    if ( r->barrier_count == 0 )
-        if ( pthread_cond_broadcast( &r->barrier_cond ) != 0 )
-            error( "Failed to broadcast empty barrier condition." );
-            
-    /* Last but not least, release the mutex. */
-    if ( pthread_mutex_unlock( &r->barrier_mutex ) != 0 )
-        error( "Failed to get unlock the barrier mutex." );
-
-    }
-    
-    
 /**
  * @brief The #runner main thread routine.
  *
@@ -590,14 +486,14 @@ void runner_barrier( struct runner *r ) {
  
 void *runner_main ( void *data ) {
 
-    struct runner_thread *rt = (struct runner_thread *)data;
-    struct runner *r = rt->r;
-    int threadID = rt->id;
+    struct runner *r = (struct runner *)data;
+    struct engine *e = r->e;
+    int threadID = r->id;
     int k, qid, naq, keep, tpq;
-    struct queue *queues[ r->nr_queues ], *myq;
+    struct queue *queues[ e->nr_queues ], *myq;
     struct task *t;
     struct cell *ci, *cj;
-    unsigned int myseed = rand() + rt->id;
+    unsigned int myseed = rand() + r->id;
     #ifdef TIMER
         ticks stalled;
     #endif
@@ -606,23 +502,23 @@ void *runner_main ( void *data ) {
     while ( 1 ) {
     
         /* Wait at the barrier. */
-        runner_barrier( r );
+        engine_barrier( e );
         
         /* Set some convenient local data. */
-        keep = r->policy & runner_policy_keep;
-        myq = &r->queues[ threadID % r->nr_queues ];
-        tpq = ceil( ((double)r->nr_threads) / r->nr_queues );
+        keep = e->policy & engine_policy_keep;
+        myq = &e->queues[ threadID % e->nr_queues ];
+        tpq = ceil( ((double)e->nr_threads) / e->nr_queues );
         stalled = 0;
         
         /* Set up the local list of active queues. */
-        naq = r->nr_queues;
+        naq = e->nr_queues;
         for ( k = 0 ; k < naq ; k++ )
-            queues[k] = &r->queues[k];
+            queues[k] = &e->queues[k];
     
         /* Set up the local list of active queues. */
-        naq = r->nr_queues;
+        naq = e->nr_queues;
         for ( k = 0 ; k < naq ; k++ )
-            queues[k] = &r->queues[k];
+            queues[k] = &e->queues[k];
     
         /* Loop while there are tasks... */
         while ( 1 ) {
@@ -640,32 +536,32 @@ void *runner_main ( void *data ) {
             /* Get a task, how and from where depends on the policy. */
             TIMER_TIC
             t = NULL;
-            if ( r->nr_queues == 1 ) {
-                t = queue_gettask( &r->queues[0] , 1 , 0 );
+            if ( e->nr_queues == 1 ) {
+                t = queue_gettask( &e->queues[0] , 1 , 0 );
                 }
-            else if ( r->policy & runner_policy_steal ) {
+            else if ( e->policy & engine_policy_steal ) {
                 if ( ( myq->next == myq->count ) ||
-                     ( t = queue_gettask_new( myq , rt->id , 0 , 0 ) ) == NULL ) {
+                     ( t = queue_gettask_new( myq , r->id , 0 , 0 ) ) == NULL ) {
                     TIMER_TIC2
                     qid = rand_r( &myseed ) % naq;
-                    keep = ( r->policy & runner_policy_keep ) &&
+                    keep = ( e->policy & engine_policy_keep ) &&
                            ( myq->count <= myq->size-tpq );
                     if ( myq->next == myq->count )
                         COUNT(runner_counter_steal_empty);
                     else
                         COUNT(runner_counter_steal_stall);
-                    t = queue_gettask_new( queues[qid] , rt->id , 0 , keep );
+                    t = queue_gettask_new( queues[qid] , r->id , 0 , keep );
                     if ( t != NULL && keep )
                         queue_insert( myq , t );
                     TIMER_TOC2(runner_timer_steal);
                     }
                 }
-            else if ( r->policy & runner_policy_rand ) {
+            else if ( e->policy & engine_policy_rand ) {
                 qid = rand_r( &myseed ) % naq;
-                t = queue_gettask( queues[qid] , r->policy & runner_policy_block , 0 );
+                t = queue_gettask( queues[qid] , e->policy & engine_policy_block , 0 );
                 }
             else {
-                t = queue_gettask( &r->queues[threadID] , r->policy & runner_policy_block , 0 );
+                t = queue_gettask( &e->queues[threadID] , e->policy & engine_policy_block , 0 );
                 }
             TIMER_TOC(runner_timer_getpair);
             
@@ -681,7 +577,7 @@ void *runner_main ( void *data ) {
                 stalled = getticks() - stalled;
                 __sync_add_and_fetch( &runner_timer[runner_timer_stalled] , stalled );
                 #ifdef TIMER_VERBOSE
-                    printf( "runner_main[%02i]: stalled %.3f ms\n" , rt->id , ((double)stalled) / CPU_TPS * 1000 );
+                    printf( "runner_main[%02i]: stalled %.3f ms\n" , r->id , ((double)stalled) / CPU_TPS * 1000 );
                     fflush(stdout);
                 #endif
                 stalled = 0;
@@ -696,31 +592,31 @@ void *runner_main ( void *data ) {
             switch ( t->type ) {
                 case task_type_self:
                     if ( t->subtype == task_subtype_density )
-                        runner_doself_density( rt , ci );
+                        runner_doself_density( r , ci );
                     else if ( t->subtype == task_subtype_force )
-                        runner_doself_force( rt , ci );
+                        runner_doself_force( r , ci );
                     else
                         error( "Unknown task subtype." );
                     cell_unlocktree( ci );
                     break;
                 case task_type_pair:
                     if ( t->subtype == task_subtype_density )
-                        runner_dopair_density( rt , ci , cj );
+                        runner_dopair_density( r , ci , cj );
                     else if ( t->subtype == task_subtype_force )
-                        runner_dopair_force( rt , ci , cj );
+                        runner_dopair_force( r , ci , cj );
                     else
                         error( "Unknown task subtype." );
                     cell_unlocktree( ci );
                     cell_unlocktree( cj );
                     break;
                 case task_type_sort:
-                    runner_dosort( rt , ci , t->flags );
+                    runner_dosort( r , ci , t->flags );
                     break;
                 case task_type_sub:
                     if ( t->subtype == task_subtype_density )
-                        runner_dosub_density( rt , ci , cj , t->flags );
+                        runner_dosub_density( r , ci , cj , t->flags );
                     else if ( t->subtype == task_subtype_force )
-                        runner_dosub_force( rt , ci , cj , t->flags );
+                        runner_dosub_force( r , ci , cj , t->flags );
                     else
                         error( "Unknown task subtype." );
                     cell_unlocktree( ci );
@@ -728,7 +624,8 @@ void *runner_main ( void *data ) {
                         cell_unlocktree( cj );
                     break;
                 case task_type_ghost:
-                    runner_doghost( rt , ci );
+                    if ( t->flags )
+                        runner_doghost( r , ci );
                     break;
                 default:
                     error( "Unknown task type." );
@@ -751,7 +648,7 @@ void *runner_main ( void *data ) {
             stalled = getticks() - stalled;
             __sync_add_and_fetch( &runner_timer[runner_timer_stalled] , stalled );
             #ifdef TIMER_VERBOSE
-                printf( "runner_main[%02i]: stalled %.3f ms\n" , rt->id , ((double)stalled) / CPU_TPS * 1000 );
+                printf( "runner_main[%02i]: stalled %.3f ms\n" , r->id , ((double)stalled) / CPU_TPS * 1000 );
                 fflush(stdout);
             #endif
             stalled = 0;
@@ -766,141 +663,3 @@ void *runner_main ( void *data ) {
     }
     
 
-/**
- * @brief Let the #runner loose on the given #space.
- *
- * @param r The #runner.
- * @param s The #space.
- */
- 
-void runner_run ( struct runner *r , int sort_queues ) {
-
-    int j, k;
-    struct space *s = r->s;
-    
-    /* Run throught the tasks and get all the waits right. */
-    for ( k = 0 ; k < s->nr_tasks ; k++ ) {
-        s->tasks[k].done = 0;
-        for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ )
-            s->tasks[k].unlock_tasks[j]->wait += 1;
-        for ( j = 0 ; j < s->tasks[k].nr_unlock_cells ; j++ )
-            s->tasks[k].unlock_cells[j]->wait += 1;
-        }
-    
-    /* Re-set the queues.*/
-    if ( sort_queues ) {
-        #pragma omp parallel for default(none), shared(r)
-        for ( k = 0 ; k < r->nr_queues ; k++ ) {
-            queue_sort( &r->queues[k] );
-            r->queues[k].next = 0;
-            }
-        }
-    else
-        for ( k = 0 ; k < r->nr_queues ; k++ )
-            r->queues[k].next = 0;
-    
-    /* Cry havoc and let loose the dogs of war. */
-    r->barrier_count = -r->barrier_count;
-    if ( pthread_cond_broadcast( &r->barrier_cond ) != 0 )
-        error( "Failed to broadcast barrier open condition." );
-        
-    /* Sit back and wait for the runner_threads to come home. */
-    while ( r->barrier_count < r->nr_threads )
-        if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 )
-            error( "Error while waiting for barrier." );
-    
-    }
-    
-    
-/**
- * @brief init a runner with the given number of threads, queues, and
- *      the given policy.
- *
- * @param r The #runner.
- * @param s The #space in which this #runner will run.
- * @param nr_threads The number of threads to spawn.
- * @param nr_queues The number of task queues to create.
- * @param policy The queueing policy to use.
- */
- 
-void runner_init ( struct runner *r , struct space *s , int nr_threads , int nr_queues , int policy ) {
-
-    #if defined(HAVE_SETAFFINITY)
-        cpu_set_t cpuset;
-    #endif
-    int k, qid, nrq;
-    
-    /* Store the values. */
-    r->s = s;
-    r->nr_threads = nr_threads;
-    r->nr_queues = nr_queues;
-    r->policy = policy;
-    
-    /* First of all, init the barrier and lock it. */
-    if ( pthread_mutex_init( &r->barrier_mutex , NULL ) != 0 )
-        error( "Failed to initialize barrier mutex." );
-    if ( pthread_cond_init( &r->barrier_cond , NULL ) != 0 )
-        error( "Failed to initialize barrier condition variable." );
-    if ( pthread_mutex_lock( &r->barrier_mutex ) != 0 )
-        error( "Failed to lock barrier mutex." );
-    r->barrier_count = 0;
-    
-    /* Allocate the queues. */
-    if ( posix_memalign( (void *)(&r->queues) , 64 , nr_queues * sizeof(struct queue) ) != 0 )
-        error( "Failed to allocate queues." );
-    bzero( r->queues , nr_queues * sizeof(struct queue) );
-        
-    /* Init the queues. */
-    for ( k = 0 ; k < nr_queues ; k++ )
-        queue_init( &r->queues[k] , s->nr_tasks , s->tasks );
-        
-    /* Rank the tasks in topological order. */
-    runner_ranktasks( r );
-    
-    /* How many queues to fill initially? */
-    for ( nrq = 0 , k = nr_queues ; k > 0 ; k = k / 2 )
-        nrq += 1;
-        
-    /* Fill the queues (round-robin). */
-    for ( k = 0 ; k < s->nr_tasks ; k++ ) {
-        if ( s->tasks[ s->tasks_ind[k] ].type == task_type_none )
-            continue;
-        // qid = 0;
-        // qid = k % nrq;
-        qid = k % nr_queues;
-        r->queues[qid].tid[ r->queues[qid].count ] = s->tasks_ind[k];
-        r->queues[qid].count += 1;
-        }
-        
-    /* Sort the queues topologically. */
-    for ( k = 0 ; k < nr_queues ; k++ )
-        queue_sort( &r->queues[k] );
-        
-    /* Allocate and init the threads. */
-    if ( ( r->threads = (struct runner_thread *)malloc( sizeof(struct runner_thread) * nr_threads ) ) == NULL )
-        error( "Failed to allocate threads array." );
-    for ( k = 0 ; k < nr_threads ; k++ ) {
-        r->threads[k].id = k;
-        r->threads[k].r = r;
-        if ( pthread_create( &r->threads[k].thread , NULL , &runner_main , &r->threads[k] ) != 0 )
-            error( "Failed to create runner thread." );
-        #if defined(HAVE_SETAFFINITY)
-            /* Set the cpu mask to zero | r->id. */
-            CPU_ZERO( &cpuset );
-            CPU_SET( r->threads[k].id , &cpuset );
-
-            /* Apply this mask to the runner's pthread. */
-            if ( pthread_setaffinity_np( r->threads[k].thread , sizeof(cpu_set_t) , &cpuset ) != 0 )
-                error( "Failed to set thread affinity." );
-        #endif
-        }
-        
-    /* Wait for the runner threads to be in place. */
-    while ( r->barrier_count != r->nr_threads )
-        if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 )
-            error( "Error while waiting for runner threads to get in place." );
-    
-    }
-    
-    
-    
diff --git a/src/runner.h b/src/runner.h
index 505e7561a6c7159f9db22b748f67c5f1fd873c7d..4d1a6ec64c6bf77276c5eacf078d54454596bff6 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -19,16 +19,6 @@
 
 
 
-/* Some constants. */
-#define runner_policy_none          0
-#define runner_policy_rand          1
-#define runner_policy_steal         2
-#define runner_policy_keep          4
-#define runner_policy_block         8
-
-#define runner_queue_scale          1.2
-
-
 /* The timers themselves. */
 enum {
     runner_timer_none = 0,
@@ -114,7 +104,7 @@ long long int runner_hist_bins[ runner_hist_N ];
 
 
 /* A struct representing a runner's thread and its data. */
-struct runner_thread {
+struct runner {
 
     /* The id of this thread. */
     int id;
@@ -123,45 +113,15 @@ struct runner_thread {
     pthread_t thread;
     
     /* The underlying runner. */
-    struct runner *r;
-    
-    };
-
-
-/* Data structure for the runner. */
-struct runner {
-
-    /* Number of threads on which to run. */
-    int nr_threads;
-    
-    /* The space with which the runner is associated. */
-    struct space *s;
-    
-    /* The runner's threads. */
-    struct runner_thread *threads;
-    
-    /* The running policy. */
-    int policy;
-    
-    /* The number of queues. */
-    int nr_queues;
-    
-    /* The queues. */
-    struct queue *queues;
-    
-    /* Data for the threads' barrier. */
-    pthread_mutex_t barrier_mutex;
-    pthread_cond_t barrier_cond;
-    int barrier_count;
+    struct engine *e;
     
     };
 
 
 /* Function prototypes. */
-void runner_run ( struct runner *r , int sort_queues );
-void runner_doghost ( struct runner_thread *rt , struct cell *c );
-void runner_dopair_density ( struct runner_thread *rt , struct cell *ci , struct cell *cj );
-void runner_doself_density ( struct runner_thread *rt , struct cell *c );
-void runner_dosub_density ( struct runner_thread *rt , struct cell *ci , struct cell *cj , int flags );
-void runner_dosort ( struct runner_thread *rt , struct cell *c , int flag );
-void runner_init ( struct runner *r , struct space *s , int nr_threads , int nr_queues , int policy );
+void runner_doghost ( struct runner *r , struct cell *c );
+void runner_dopair_density ( struct runner *r , struct cell *ci , struct cell *cj );
+void runner_doself_density ( struct runner *r , struct cell *c );
+void runner_dosub_density ( struct runner *r , struct cell *ci , struct cell *cj , int flags );
+void runner_dosort ( struct runner *r , struct cell *c , int flag );
+void *runner_main ( void *data );
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index 52039b151a43d7f7d5ca1c8ca5243e280bd98be1..20e5798b3525b1551214e9573627f91143525a20 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -59,9 +59,9 @@
  * @param cj The second #cell.
  */
  
-void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) {
+void DOPAIR_NAIVE ( struct runner *r , struct cell *ci , struct cell *cj ) {
 
-    struct runner *r = rt->r;
+    struct engine *e = r->e;
     int pid, pjd, k, count_i = ci->count, count_j = cj->count;
     double shift[3] = { 0.0 , 0.0 , 0.0 };
     struct part *pi, *pj, *parts_i = ci->parts, *parts_j = cj->parts;
@@ -71,10 +71,10 @@ void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj
     
     /* Get the relative distance between the pairs, wrapping. */
     for ( k = 0 ; k < 3 ; k++ ) {
-        if ( cj->loc[k] - ci->loc[k] < -r->s->dim[k]/2 )
-            shift[k] = r->s->dim[k];
-        else if ( cj->loc[k] - ci->loc[k] > r->s->dim[k]/2 )
-            shift[k] = -r->s->dim[k];
+        if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 )
+            shift[k] = e->s->dim[k];
+        else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 )
+            shift[k] = -e->s->dim[k];
         }
         
     /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
@@ -117,7 +117,7 @@ void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj
         } /* loop over the parts in ci. */
         
     #ifdef TIMER_VERBOSE
-        printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
+        printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
     #else
         TIMER_TOC(TIMER_DOPAIR);
     #endif
@@ -134,9 +134,9 @@ void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj
  * @param cj The second #cell.
  */
  
-void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) {
+void DOPAIR ( struct runner *r , struct cell *ci , struct cell *cj ) {
 
-    struct runner *r = rt->r;
+    struct engine *e = r->e;
     int pid, pjd, k, sid;
     double rshift, shift[3] = { 0.0 , 0.0 , 0.0 };
     struct cell *temp;
@@ -150,10 +150,10 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) {
     
     /* Get the relative distance between the pairs, wrapping. */
     for ( k = 0 ; k < 3 ; k++ ) {
-        if ( cj->loc[k] - ci->loc[k] < -r->s->dim[k]/2 )
-            shift[k] = r->s->dim[k];
-        else if ( cj->loc[k] - ci->loc[k] > r->s->dim[k]/2 )
-            shift[k] = -r->s->dim[k];
+        if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 )
+            shift[k] = e->s->dim[k];
+        else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 )
+            shift[k] = -e->s->dim[k];
         }
         
     /* Get the sorting index. */
@@ -275,7 +275,7 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) {
         } /* loop over the parts in ci. */
 
     #ifdef TIMER_VERBOSE
-        printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 );
+        printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 );
     #else
         TIMER_TOC(TIMER_DOPAIR);
     #endif
@@ -290,7 +290,7 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) {
  * @param c The #cell.
  */
 
-void DOSELF ( struct runner_thread *rt , struct cell *c ) {
+void DOSELF ( struct runner *r , struct cell *c ) {
 
     int k, pid, pjd, count = c->count;
     double pix[3];
@@ -338,7 +338,7 @@ void DOSELF ( struct runner_thread *rt , struct cell *c ) {
         } /* loop over all particles. */
 
     #ifdef TIMER_VERBOSE
-        printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , rt->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 );
+        printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 );
     #else
         TIMER_TOC(TIMER_DOSELF);
     #endif
@@ -353,7 +353,7 @@ void DOSELF ( struct runner_thread *rt , struct cell *c ) {
  * @param c The #cell.
  */
 
-void DOSUB ( struct runner_thread *rt , struct cell *ci , struct cell *cj , int flags ) {
+void DOSUB ( struct runner *r , struct cell *ci , struct cell *cj , int flags ) {
 
     int j, k;
 
@@ -367,185 +367,185 @@ void DOSUB ( struct runner_thread *rt , struct cell *ci , struct cell *cj , int
             for ( j = 0 ; j < 7 ; j++ )
                 for ( k = j + 1 ; k < 8 ; k++ )
                     if ( ci->progeny[j] != NULL && ci->progeny[k] != NULL )
-                        DOPAIR( rt , ci->progeny[j] , ci->progeny[k] );
+                        DOPAIR( r , ci->progeny[j] , ci->progeny[k] );
             break;
             
         case 1: /* (  1 ,  1 ,  0 ) */
             if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[0] );
             if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[1] );
             if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[0] );
             if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[1] );
             break;
     
         case 3: /* (  1 ,  0 ,  1 ) */
             if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[5] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[5] , cj->progeny[0] );
             if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                DOPAIR( rt , ci->progeny[5] , cj->progeny[2] );
+                DOPAIR( r , ci->progeny[5] , cj->progeny[2] );
             if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[0] );
             if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[2] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[2] );
             break;
                     
         case 4: /* (  1 ,  0 ,  0 ) */
             if ( ci->progeny[4] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[4] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[4] , cj->progeny[0] );
             if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[4] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[4] , cj->progeny[1] );
             if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL )
-                DOPAIR( rt , ci->progeny[4] , cj->progeny[2] );
+                DOPAIR( r , ci->progeny[4] , cj->progeny[2] );
             if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL )
-                DOPAIR( rt , ci->progeny[4] , cj->progeny[3] );
+                DOPAIR( r , ci->progeny[4] , cj->progeny[3] );
             if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[5] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[5] , cj->progeny[0] );
             if ( ci->progeny[5] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[5] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[5] , cj->progeny[1] );
             if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                DOPAIR( rt , ci->progeny[5] , cj->progeny[2] );
+                DOPAIR( r , ci->progeny[5] , cj->progeny[2] );
             if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL )
-                DOPAIR( rt , ci->progeny[5] , cj->progeny[3] );
+                DOPAIR( r , ci->progeny[5] , cj->progeny[3] );
             if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[0] );
             if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[1] );
             if ( ci->progeny[6] != NULL && cj->progeny[2] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[2] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[2] );
             if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[3] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[3] );
             if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[0] );
             if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[1] );
             if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[2] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[2] );
             if ( ci->progeny[7] != NULL && cj->progeny[3] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[3] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[3] );
             break;
             
         case 5: /* (  1 ,  0 , -1 ) */
             if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[4] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[4] , cj->progeny[1] );
             if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL )
-                DOPAIR( rt , ci->progeny[4] , cj->progeny[3] );
+                DOPAIR( r , ci->progeny[4] , cj->progeny[3] );
             if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[1] );
             if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[3] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[3] );
             break;
                     
         case 7: /* (  1 , -1 ,  0 ) */
             if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL )
-                DOPAIR( rt , ci->progeny[4] , cj->progeny[2] );
+                DOPAIR( r , ci->progeny[4] , cj->progeny[2] );
             if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL )
-                DOPAIR( rt , ci->progeny[4] , cj->progeny[3] );
+                DOPAIR( r , ci->progeny[4] , cj->progeny[3] );
             if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                DOPAIR( rt , ci->progeny[5] , cj->progeny[2] );
+                DOPAIR( r , ci->progeny[5] , cj->progeny[2] );
             if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL )
-                DOPAIR( rt , ci->progeny[5] , cj->progeny[3] );
+                DOPAIR( r , ci->progeny[5] , cj->progeny[3] );
             break;
                     
         case 9: /* (  0 ,  1 ,  1 ) */
             if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[3] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[3] , cj->progeny[0] );
             if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL )
-                DOPAIR( rt , ci->progeny[3] , cj->progeny[4] );
+                DOPAIR( r , ci->progeny[3] , cj->progeny[4] );
             if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[0] );
             if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[4] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[4] );
             break;
                     
         case 10: /* (  0 ,  1 ,  0 ) */
             if ( ci->progeny[2] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[2] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[2] , cj->progeny[0] );
             if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[2] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[2] , cj->progeny[1] );
             if ( ci->progeny[2] != NULL && cj->progeny[4] != NULL )
-                DOPAIR( rt , ci->progeny[2] , cj->progeny[4] );
+                DOPAIR( r , ci->progeny[2] , cj->progeny[4] );
             if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL )
-                DOPAIR( rt , ci->progeny[2] , cj->progeny[5] );
+                DOPAIR( r , ci->progeny[2] , cj->progeny[5] );
             if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[3] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[3] , cj->progeny[0] );
             if ( ci->progeny[3] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[3] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[3] , cj->progeny[1] );
             if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL )
-                DOPAIR( rt , ci->progeny[3] , cj->progeny[4] );
+                DOPAIR( r , ci->progeny[3] , cj->progeny[4] );
             if ( ci->progeny[3] != NULL && cj->progeny[5] != NULL )
-                DOPAIR( rt , ci->progeny[3] , cj->progeny[5] );
+                DOPAIR( r , ci->progeny[3] , cj->progeny[5] );
             if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[0] );
             if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[1] );
             if ( ci->progeny[6] != NULL && cj->progeny[4] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[4] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[4] );
             if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[5] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[5] );
             if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[0] );
             if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[1] );
             if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[4] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[4] );
             if ( ci->progeny[7] != NULL && cj->progeny[5] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[5] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[5] );
             break;
                     
         case 11: /* (  0 ,  1 , -1 ) */
             if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[2] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[2] , cj->progeny[1] );
             if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL )
-                DOPAIR( rt , ci->progeny[2] , cj->progeny[5] );
+                DOPAIR( r , ci->progeny[2] , cj->progeny[5] );
             if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[1] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[1] );
             if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL )
-                DOPAIR( rt , ci->progeny[6] , cj->progeny[5] );
+                DOPAIR( r , ci->progeny[6] , cj->progeny[5] );
             break;
                     
         case 12: /* (  0 ,  0 ,  1 ) */
             if ( ci->progeny[1] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[1] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[1] , cj->progeny[0] );
             if ( ci->progeny[1] != NULL && cj->progeny[2] != NULL )
-                DOPAIR( rt , ci->progeny[1] , cj->progeny[2] );
+                DOPAIR( r , ci->progeny[1] , cj->progeny[2] );
             if ( ci->progeny[1] != NULL && cj->progeny[4] != NULL )
-                DOPAIR( rt , ci->progeny[1] , cj->progeny[4] );
+                DOPAIR( r , ci->progeny[1] , cj->progeny[4] );
             if ( ci->progeny[1] != NULL && cj->progeny[6] != NULL )
-                DOPAIR( rt , ci->progeny[1] , cj->progeny[6] );
+                DOPAIR( r , ci->progeny[1] , cj->progeny[6] );
             if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[3] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[3] , cj->progeny[0] );
             if ( ci->progeny[3] != NULL && cj->progeny[2] != NULL )
-                DOPAIR( rt , ci->progeny[3] , cj->progeny[2] );
+                DOPAIR( r , ci->progeny[3] , cj->progeny[2] );
             if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL )
-                DOPAIR( rt , ci->progeny[3] , cj->progeny[4] );
+                DOPAIR( r , ci->progeny[3] , cj->progeny[4] );
             if ( ci->progeny[3] != NULL && cj->progeny[6] != NULL )
-                DOPAIR( rt , ci->progeny[3] , cj->progeny[6] );
+                DOPAIR( r , ci->progeny[3] , cj->progeny[6] );
             if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[5] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[5] , cj->progeny[0] );
             if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL )
-                DOPAIR( rt , ci->progeny[5] , cj->progeny[2] );
+                DOPAIR( r , ci->progeny[5] , cj->progeny[2] );
             if ( ci->progeny[5] != NULL && cj->progeny[4] != NULL )
-                DOPAIR( rt , ci->progeny[5] , cj->progeny[4] );
+                DOPAIR( r , ci->progeny[5] , cj->progeny[4] );
             if ( ci->progeny[5] != NULL && cj->progeny[6] != NULL )
-                DOPAIR( rt , ci->progeny[5] , cj->progeny[6] );
+                DOPAIR( r , ci->progeny[5] , cj->progeny[6] );
             if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[0] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[0] );
             if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[2] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[2] );
             if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[4] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[4] );
             if ( ci->progeny[7] != NULL && cj->progeny[6] != NULL )
-                DOPAIR( rt , ci->progeny[7] , cj->progeny[6] );
+                DOPAIR( r , ci->progeny[7] , cj->progeny[6] );
             break;
                 
         }
     
 
     #ifdef TIMER_VERBOSE
-        printf( "runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n" , rt->id , flags , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 );
+        printf( "runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n" , r->id , flags , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 );
     #else
         TIMER_TOC(TIMER_DOSUB);
     #endif
diff --git a/src/runner_iact.h b/src/runner_iact.h
index 0fb85d8d1e1c39aa4675a0a7916372a2a68240e9..b578faef6a5c938bd7dc898b5b3f60af101953d7 100644
--- a/src/runner_iact.h
+++ b/src/runner_iact.h
@@ -147,6 +147,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2
         
     /* Compute dv dot r. */
     dvdr = ( pi->v[0] - pj->v[0] ) * dx[0] + ( pi->v[1] - pj->v[1] ) * dx[1] + ( pi->v[2] - pj->v[2] ) * dx[2];
+    dvdr *= ri;
         
     /* Get the time derivative for u. */
     pi->u_dt += pj->mass * dvdr * wi_dr;
diff --git a/src/space.c b/src/space.c
index 6a3203ee0601aac09cb109b91890d9911866367d..a8e42fbefa2467ca520ab39f18cc5ee4ec60f411 100644
--- a/src/space.c
+++ b/src/space.c
@@ -970,7 +970,7 @@ void space_maketasks ( struct space *s , int do_sort ) {
             if ( t->cj != NULL )
                 task_addunlock( t->cj->ghost , t2 );
             }
-
+            
         }
         
     /* Did we already create indices? */
@@ -987,13 +987,28 @@ void space_maketasks ( struct space *s , int do_sort ) {
     for ( k = 0 ; k < s->nr_tasks ; k++ ) {
         t = &s->tasks[k];
         if ( ( t->type == task_type_sort || t->type == task_type_ghost ) && t->nr_unlock_tasks == 0 ) {
+            if ( t->type == task_type_sort && t->ci->split )
+                for ( i = 0 ; i < 13 ; i++ )
+                    if ( t->flags & ( 1 << i ) ) {
+                        for ( j = 0 ; j < 8 ; j++ )
+                            if ( t->ci->progeny[j] != NULL )
+                                task_rmunlock( t->ci->progeny[j]->sorts[i] , t );
+                        t->ci->sorts[i] = NULL;
+                        }
             t->type = task_type_none;
-            if ( t->ci->split )
-                for ( j = 0 ; j < 8 ; j++ )
-                    if ( t->ci->progeny[j] != NULL && t->flags & ( 1 << j ) )
-                        task_rmunlock( t->ci->progeny[j]->sorts[j] , t );
             }
         }
+        
+    /* Make each remaining ghost task unlock the ghosts of its progeny. */
+    for ( k = 0 ; k < s->nr_tasks ; k++ ) {
+        t = &s->tasks[k];
+        if ( t->type == task_type_ghost && t->ci->split )
+            for ( j = 0 ; j < 8 ; j++ )
+                if ( t->ci->progeny[j] != NULL )
+                    task_addunlock( t->ci->ghost , t->ci->progeny[j]->ghost );
+        if ( t->type == task_type_ghost && ( t->ci->parent == NULL || t->ci->parent->ghost->type == task_type_none ) )
+            t->flags = 1;
+        }
             
     /* Count the number of each task type. */
     for ( k = 0 ; k < task_type_count ; k++ )