diff --git a/src/engine.c b/src/engine.c
index 1f93b1ab9884fd6834cacb60f00fcdce04f5a422..62b9273db671e667eb46e4fb66261d1a09fff491 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -60,55 +60,60 @@
  
 void engine_prepare ( struct engine *e , int force ) {
 
-    int j, k, qid, changes;
+    int j, k, qid, changes, count;
     struct space *s = e->s;
+    // ticks tic;
 
     /* Rebuild the space. */
+    // tic = getticks();
     changes = space_rebuild( e->s , force , 0 );
-    // printf( "engine_prepare: space_rebuild with %i changes.\n" , changes );
+    // printf( "engine_prepare: space_rebuild with %i changes took %.3f ms.\n" , changes , (double)(getticks() - tic) / CPU_TPS * 1000 );
     
     /* Has anything changed? */
+    // tic = getticks();
     if ( changes ) {
     
         /* Rank the tasks in topological order. */
         engine_ranktasks( e );
     
-        /* Clear the queues. */
-        for ( k = 0 ; k < e->nr_queues ; k++ )
-            e->queues[k].count = 0;
-            
-        /* Re-allocate the queue buffers? */
-        for ( k = 0 ; k < e->nr_queues ; k++ )
-            queue_init( &e->queues[k] , s->nr_tasks , s->tasks );
-        
         /* Fill the queues (round-robin). */
-        for ( k = 0 ; k < s->nr_tasks ; k++ ) {
-            if ( s->tasks[ s->tasks_ind[k] ].type == task_type_none )
-                continue;
-            qid = k % e->nr_queues;
-            e->queues[qid].tid[ e->queues[qid].count ] = s->tasks_ind[k];
-            e->queues[qid].count += 1;
+        #pragma omp parallel for schedule(static) private(count,k)
+        for ( qid = 0 ; qid < e->nr_queues ; qid++ ) {
+            queue_init( &e->queues[qid] , s->nr_tasks , s->tasks );
+            for ( count = 0 , k = qid ; k < s->nr_tasks ; k += e->nr_queues ) {
+                if ( s->tasks[ s->tasks_ind[k] ].type == task_type_none )
+                    continue;
+                e->queues[qid].tid[ count ] = s->tasks_ind[k];
+                count += 1;
+                }
+            e->queues[qid].count = count;
+            e->queues[qid].next = 0;
             }
             
         }
+    // printf( "engine_prepare: re-filling queues took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
 
     /* Re-set the particle data. */
-    #pragma omp parallel for
+    // tic = getticks();
+    #pragma omp parallel for schedule(static) 
     for ( k = 0 ; k < s->nr_parts ; k++ ) {
         s->parts[k].wcount = 0.0f;
         s->parts[k].wcount_dh = 0.0f;
         s->parts[k].rho = 0.0f;
         s->parts[k].rho_dh = 0.0f;
         }
+    // printf( "engine_prepare: re-setting particle data took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
     
     /* Run throught the tasks and get all the waits right. */
-    #pragma omp parallel for private(j)
+    // tic = getticks();
+    #pragma omp parallel for schedule(static) private(j)
     for ( k = 0 ; k < s->nr_tasks ; k++ ) {
         for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ )
             __sync_add_and_fetch( &s->tasks[k].unlock_tasks[j]->wait , 1 );
         for ( j = 0 ; j < s->tasks[k].nr_unlock_cells ; j++ )
             __sync_add_and_fetch( &s->tasks[k].unlock_cells[j]->wait , 1 );
         }
+    // printf( "engine_prepare: preparing task dependencies took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
     
     /* Re-set the queues.*/
     for ( k = 0 ; k < e->nr_queues ; k++ )
diff --git a/src/space.c b/src/space.c
index 858fc7313f4ce15189b6f49390b61949d4e261ac..0b5f5bec01b2a022203efbe9cb2465435d107871 100644
--- a/src/space.c
+++ b/src/space.c
@@ -269,8 +269,10 @@ int space_rebuild ( struct space *s , int force , double cell_max ) {
     struct part *finger;
     struct cpart *cfinger;
     int *ind, changes = 0;
+    // ticks tic;
     
     /* Run through the parts and get the current h_max. */
+    // tic = getticks();
     for ( k = 0 ; k < s->nr_parts ; k++ ) {
         if ( s->parts[k].h > h_max )
             h_max = s->parts[k].h;
@@ -279,12 +281,14 @@ int space_rebuild ( struct space *s , int force , double cell_max ) {
         }
     s->h_min = h_min;
     s->h_max = h_max;
+    // printf( "space_rebuild: getting h_min and h_max took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
     
     /* Get the new putative cell dimensions. */
     for ( k = 0 ; k < 3 ; k++ )
         cdim[k] = floor( s->dim[k] / fmax( h_max*space_stretch , cell_max ) );
         
     /* Do we need to re-build the upper-level cells? */
+    // tic = getticks();
     if ( force || cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || cdim[2] < s->cdim[2] ) {
     
         /* Free the old cells, if they were allocated. */
@@ -328,9 +332,11 @@ int space_rebuild ( struct space *s , int force , double cell_max ) {
         changes = 1;
         
         } /* re-build upper-level cells? */
+    // printf( "space_rebuild: rebuilding upper-level cells took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
         
         
     /* Run through the particles and get their cell index. */
+    // tic = getticks();
     if ( ( ind = (int *)malloc( sizeof(int) * s->nr_parts ) ) == NULL )
         error( "Failed to allocate temporary particle indices." );
     for ( k = 0 ; k < s->nr_cells ; k++ )
@@ -339,14 +345,19 @@ int space_rebuild ( struct space *s , int force , double cell_max ) {
         ind[k] = cell_getid( s->cdim , s->parts[k].x[0]*s->ih[0] , s->parts[k].x[1]*s->ih[1] , s->parts[k].x[2]*s->ih[2] );
         s->cells[ ind[k] ].count += 1;
         }
+    // printf( "space_rebuild: getting particle indices took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
 
     /* Sort the parts according to their cells. */
+    // tic = getticks();
     parts_sort( s->parts , ind , s->nr_parts , 0 , s->nr_cells );    
+    // printf( "space_rebuild: parts_sort took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
     
     /* We no longer need the indices as of here. */
     free( ind );    
 
     /* Update the condensed particle data. */         
+    // tic = getticks();
+    #pragma omp parallel for schedule(static)
     for ( k = 0 ; k < s->nr_parts ; k++ ) {
         s->cparts[k].x[0] = s->parts[k].x[0];
         s->cparts[k].x[1] = s->parts[k].x[1];
@@ -354,8 +365,10 @@ int space_rebuild ( struct space *s , int force , double cell_max ) {
         s->cparts[k].h = s->parts[k].h;
         s->cparts[k].dt = s->parts[k].dt;
         }
+    // printf( "space_rebuild: creating condensed parts took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
 
     /* Hook the cells up to the parts. */
+    // tic = getticks();
     finger = s->parts;
     cfinger = s->cparts;
     for ( k = 0 ; k < s->nr_cells ; k++ ) {
@@ -365,17 +378,22 @@ int space_rebuild ( struct space *s , int force , double cell_max ) {
         finger = &finger[ c->count ];
         cfinger = &cfinger[ c->count ];
         }
+    // printf( "space_rebuild: hooking up cells took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
         
         
     /* At this point, we have the upper-level cells, old or new. Now make
        sure that the parts in each cell are ok. */
-    #pragma omp parallel for shared(s) reduction(+:changes)
+    // tic = getticks();
+    #pragma omp parallel for schedule(dynamic) shared(s) reduction(+:changes)
     for ( k = 0 ; k < s->nr_cells ; k++ )
         changes += space_rebuild_recurse( s , &s->cells[k] );
+    // printf( "space_rebuild: space_rebuild_recurse took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
         
     /* Now that we have the cell structre, re-build the tasks. */
+    // tic = getticks();
     if ( changes )
         space_maketasks( s , 1 );
+    // printf( "space_rebuild: maketasks took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
     
     /* Return the number of changes. */
     return changes;
@@ -460,16 +478,16 @@ void parts_sort ( struct part *parts , int *ind , int N , int min , int max ) {
             }
 
         else
-        #pragma omp parallel sections
+        // #pragma omp parallel sections
         {
 
             /* Recurse on the left? */
-            #pragma omp section
+            // #pragma omp section
             if ( j > 0 && pivot > min )
                 parts_sort( parts , ind , j+1 , min , pivot );
 
             /* Recurse on the right? */
-            #pragma omp section
+            // #pragma omp section
             if ( i < N && pivot+1 < max )
                 parts_sort( &parts[i], &ind[i], N-i , pivot+1 , max );