diff --git a/src/Makefile.am b/src/Makefile.am
index f1376a6258180bd0a72306a60af0beece6e097d8..820ca314772f406ed900260a79824a7c842f7597 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -22,7 +22,7 @@ AUTOMAKE_OPTIONS=gnu
 # Add the debug flag to the whole thing
 AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
     -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
-    -DTIMER -DCOUNTER -DCPU_TPS=2.40e9
+    -DTIMER -DCOUNTER -DCPU_TPS=2.40e9 -mfma4
 # AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \
 #     -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
 
diff --git a/src/engine.c b/src/engine.c
index 8637ee07f44b8f88aa00d9dab9f83e1ed4318cc6..e1e1e6c6c500d6280ad91dafc701f76dcd36c88c 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -326,7 +326,7 @@ void engine_prepare ( struct engine *e ) {
 
     /* Run through the tasks and mark as skip or not. */
     // tic = getticks();
-    rebuild = ( e->step == 0 || engine_marktasks( e ) );
+    rebuild = 1 || ( e->step == 0 || engine_marktasks( e ) );
     // printf( "space_prepare: space_marktasks took %.3f ms.\n" , (double)(getticks() - tic)/CPU_TPS*1000 );
         
     /* Did this not go through? */
diff --git a/src/scheduler.c b/src/scheduler.c
index 66708fb14ca4ac6014502771bc1029659b889b9e..aef13a10bacc5147fb5ed17a5d71b3eb846319ee 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -625,7 +625,7 @@ void scheduler_start ( struct scheduler *s , unsigned int mask ) {
  
 void scheduler_enqueue ( struct scheduler *s , struct task *t ) {
 
-    int k, qid = -1;
+    int qid = -1;
     
     /* Ignore skipped tasks. */
     if ( t->skip )
@@ -652,9 +652,10 @@ void scheduler_enqueue ( struct scheduler *s , struct task *t ) {
         
     /* If no previous owner, find the shortest queue. */
     if ( qid < 0 )
-        for ( qid = 0 , k = 1 ; k < s->nr_queues ; k++ )
+        qid = rand() % s->nr_queues;
+        /* for ( qid = 0 , int k = 1 ; k < s->nr_queues ; k++ )
             if ( s->queues[k].count < s->queues[qid].count )
-                qid = k;
+                qid = k; */
                 
     /* Increase the waiting counter. */
     atomic_inc( &s->waiting );
@@ -674,7 +675,7 @@ void scheduler_enqueue ( struct scheduler *s , struct task *t ) {
  
 void scheduler_done ( struct scheduler *s , struct task *t ) {
 
-    int k;
+    int k, res;
     struct task *t2;
 
     /* Release whatever locks this task held. */
@@ -695,15 +696,17 @@ void scheduler_done ( struct scheduler *s , struct task *t ) {
        they are ready. */
     for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) {
         t2 = t->unlock_tasks[k];
-        if ( atomic_dec( &t2->wait ) == 1 && !t2->skip )
+        if ( ( res = atomic_dec( &t2->wait ) ) < 1 )
+            error( "Negative wait!" );
+        if ( res == 1 && !t2->skip )
             scheduler_enqueue( s , t2 );
         }
         
     /* Task definitely done. */
-    pthread_mutex_lock( &s->sleep_mutex );
+    // pthread_mutex_lock( &s->sleep_mutex );
     atomic_dec( &s->waiting );
-    pthread_cond_broadcast( &s->sleep_cond );
-    pthread_mutex_unlock( &s->sleep_mutex );
+    // pthread_cond_broadcast( &s->sleep_cond );
+    // pthread_mutex_unlock( &s->sleep_mutex );
 
     }
 
@@ -751,10 +754,10 @@ struct task *scheduler_gettask ( struct scheduler *s , int qid ) {
             }
             
         /* If we failed, take a short nap. */
-        pthread_mutex_lock( &s->sleep_mutex );
+        /* pthread_mutex_lock( &s->sleep_mutex );
         if ( s->waiting > 0 )
             pthread_cond_wait( &s->sleep_cond , &s->sleep_mutex );
-        pthread_mutex_unlock( &s->sleep_mutex );
+        pthread_mutex_unlock( &s->sleep_mutex ); */
         
         }
         
diff --git a/src/scheduler.h b/src/scheduler.h
index b20c7ecaf793bc0dceb6d29ad4f74d0423ba7067..b1d6d8a274a09f63205446027cc23a465e30c09e 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -27,7 +27,7 @@
 /* Flags . */
 #define scheduler_flag_none                  0
 #define scheduler_flag_steal                 1
-#define scheduler_flag_maxsteal              2
+#define scheduler_flag_maxsteal              1
 
 
 /* Data of a scheduler. */
diff --git a/src/space.c b/src/space.c
index 97e1e45109a5836019c7db11342d8132be206b36..40ed7412efb66180c1749e5c332c774554619aa2 100644
--- a/src/space.c
+++ b/src/space.c
@@ -161,7 +161,7 @@ void space_rebuild ( struct space *s , double cell_max ) {
     struct part *restrict finger, *restrict p, *parts = s->parts;
     int *ind;
     double ih[3], dim[3];
-    // ticks tic;
+    ticks tic;
     
     /* Be verbose about this. */
     printf( "space_rebuild: (re)building space...\n" ); fflush(stdout);
@@ -252,6 +252,7 @@ void space_rebuild ( struct space *s , double cell_max ) {
             s->cells[k].dx_max = 0.0f;
             s->cells[k].sorted = 0;
             s->cells[k].count = 0;
+            s->cells[k].kick1 = NULL;
             s->cells[k].kick2 = NULL;
             }
         s->maxdepth = 0;
@@ -259,7 +260,7 @@ void space_rebuild ( struct space *s , double cell_max ) {
         }
         
     /* Run through the particles and get their cell index. */
-    // tic = getticks();
+    tic = getticks();
     if ( ( ind = (int *)malloc( sizeof(int) * s->nr_parts ) ) == NULL )
         error( "Failed to allocate temporary particle indices." );
     ih[0] = s->ih[0]; ih[1] = s->ih[1]; ih[2] = s->ih[2];
@@ -276,12 +277,12 @@ void space_rebuild ( struct space *s , double cell_max ) {
         ind[k] = cell_getid( cdim , p->x[0]*ih[0] , p->x[1]*ih[1] , p->x[2]*ih[2] );
         atomic_inc( &s->cells[ ind[k] ].count );
         }
-    // printf( "space_rebuild: getting particle indices took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
+    printf( "space_rebuild: getting particle indices took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
 
     /* Sort the parts according to their cells. */
-    // tic = getticks();
+    tic = getticks();
     parts_sort( parts , ind , s->nr_parts , 0 , s->nr_cells-1 );
-    // printf( "space_rebuild: parts_sort took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
+    printf( "space_rebuild: parts_sort took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
     
     /* Verify sort. */
     /* for ( k = 1 ; k < nr_parts ; k++ ) {
@@ -307,21 +308,19 @@ void space_rebuild ( struct space *s , double cell_max ) {
         
     /* At this point, we have the upper-level cells, old or new. Now make
        sure that the parts in each cell are ok. */
-    // tic = getticks();
+    tic = getticks();
     k = 0;
-    #pragma omp parallel shared(s,k)
+    #pragma omp parallel num_threads(8) shared(s,k)
     {
         while ( 1 ) {
-            int myk;
-            #pragma omp critical
-            myk = k++;
+            int myk = atomic_inc( &k );
             if ( myk < s->nr_cells )
                 space_split( s , &s->cells[myk] );
             else
                 break;
             }
         }
-    // printf( "space_rebuild: space_rebuild_recurse took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
+    printf( "space_rebuild: space_rebuild_recurse took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
         
     }
 
@@ -358,7 +357,7 @@ void parts_sort ( struct part *parts , int *ind , int N , int min , int max ) {
     first = 0; last = 1; waiting = 1;
     
     /* Parallel bit. */
-    #pragma omp parallel default(shared) private(pivot,i,ii,j,jj,min,max,temp_i,qid,temp_p)
+    #pragma omp parallel num_threads(8) default(shared) private(pivot,i,ii,j,jj,min,max,temp_i,qid,temp_p)
     {
     
         /* Main loop. */
@@ -565,11 +564,11 @@ void space_map_cells_post ( struct space *s , int full , void (*fun)( struct cel
         }
         
     /* Call the recursive function on all higher-level cells. */
-    #pragma omp parallel shared(s,cid)
+    // #pragma omp parallel shared(s,cid)
     {
         int mycid;
         while ( 1 ) {
-            #pragma omp critical
+            // #pragma omp critical
             mycid = cid++;
             if ( mycid < s->nr_cells )
                 rec_map( &s->cells[mycid] );
@@ -602,11 +601,11 @@ void space_map_cells_pre ( struct space *s , int full , void (*fun)( struct cell
         }
         
     /* Call the recursive function on all higher-level cells. */
-    #pragma omp parallel shared(s,cid)
+    // #pragma omp parallel shared(s,cid)
     {
         int mycid;
         while ( 1 ) {
-            #pragma omp critical
+            // #pragma omp critical
             mycid = cid++;
             if ( mycid < s->nr_cells )
                 rec_map( &s->cells[mycid] );
@@ -790,15 +789,22 @@ struct cell *space_getcell ( struct space *s ) {
     s->cells_new = c->next;
     s->tot_cells += 1;
     
+    /* Unlock the space. */
+    lock_unlock_blind( &s->lock );
+    
     /* Init some things in the cell. */
-    bzero( c , sizeof(struct cell) );
+    c->sorts = NULL;
+    c->nr_tasks = 0;
+    c->nr_density = 0;
+    c->dx_max = 0.0f;
+    c->sorted = 0;
+    c->count = 0;
+    c->kick1 = NULL;
+    c->kick2 = NULL;
     if ( lock_init( &c->lock ) != 0 )
         error( "Failed to initialize cell spinlock." );
     c->owner = -1;
         
-    /* Unlock the space. */
-    lock_unlock_blind( &s->lock );
-    
     return c;
 
     }
diff --git a/src/space.h b/src/space.h
index 88b012a2279c1dcc679f482f46d039e53251af54..40788bd24f70e768e83724447818da9a185c9643 100644
--- a/src/space.h
+++ b/src/space.h
@@ -26,8 +26,8 @@
 #define space_splitratio                0.875f
 #define space_splitsize_default         400
 #define space_subsize_default           5000
-#define space_stretch                   1.05f
-#define space_maxreldx                  0.2f
+#define space_stretch                   1.10f
+#define space_maxreldx                  0.25f
 #define space_qstack                    1000