diff --git a/src/scheduler.c b/src/scheduler.c
index b7d27a87b5410b9a4507e2f0b467573eb66d7659..2b08a0f1d8d8f4b65235a8b2013e972fdd681b1e 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -26,6 +26,8 @@
 #include <string.h>
 #include <math.h>
 #include <pthread.h>
+#include <limits.h>
+#include <omp.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
@@ -190,7 +192,7 @@ void scheduler_splittasks ( struct scheduler *s ) {
             if ( ci->split ) {
             
                 /* Make a sub? */
-                if ( scheduler_dosub && ci->count < space_subsize ) {
+                if ( scheduler_dosub && ci->count < space_subsize/ci->count ) {
 
                     /* convert to a self-subtask. */
                     t->type = task_type_sub;
@@ -248,7 +250,7 @@ void scheduler_splittasks ( struct scheduler *s ) {
                  
                 /* Replace by a single sub-task? */
                 if ( scheduler_dosub &&
-                     ( ci->count + cj->count ) * sid_scale[sid] < space_subsize &&
+                     ci->count * sid_scale[sid] < space_subsize/cj->count &&
                      sid != 0 && sid != 2 && sid != 6 && sid != 8 ) {
                 
                     /* Make this task a sub task. */
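
Both guards above rewrite a product test as a division so the product itself is never formed: ci->count < space_subsize/ci->count stands in for ci->count * ci->count < space_subsize, and likewise for the pair case with the sid_scale factor, which avoids signed-integer overflow for very populous cells. A minimal sketch of the idiom (product_below is an illustrative name, not a SWIFT identifier):

    /* Overflow-safe stand-in for "a * b < limit": the product is never formed.
       Slightly conservative because of the integer truncation in limit / b. */
    static int product_below ( int a , int b , int limit ) {
        if ( b == 0 )
            return limit > 0;   /* a * 0 == 0, below any positive limit */
        return a < limit / b;
        }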
@@ -625,23 +627,23 @@ void scheduler_reweight ( struct scheduler *s ) {
                     t->weight += wscale * __builtin_popcount( t->flags ) * t->ci->count * ( sizeof(int)*8 - __builtin_clz( t->ci->count ) );
                     break;
                 case task_type_self:
-                    t->weight += 10 * t->ci->count * t->ci->count;
+                    t->weight += 1 * wscale * t->ci->count * t->ci->count;
                     break;
                 case task_type_pair:
                     if ( t->ci->nodeID != nodeID || t->cj->nodeID != nodeID )
-                        t->weight += 30 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
+                        t->weight += 3 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
                     else
-                        t->weight += 20 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
+                        t->weight += 2 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
                     break;
                 case task_type_sub:
                     if ( t->cj != NULL ) {
                         if ( t->ci->nodeID != nodeID || t->cj->nodeID != nodeID )
-                            t->weight += 30 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
+                            t->weight += 3 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
                         else
-                            t->weight += 20 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
+                            t->weight += 2 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
                         }
                     else
-                        t->weight += 10 * wscale * t->ci->count * t->ci->count;
+                        t->weight += 1 * wscale * t->ci->count * t->ci->count;
                     break;
                 case task_type_ghost:
                     if ( t->ci == t->ci->super )
@@ -651,14 +653,21 @@ void scheduler_reweight ( struct scheduler *s ) {
                 case task_type_kick2:
                     t->weight += wscale * t->ci->count;
                     break;
-                    break;
                 }
         if ( t->type == task_type_send_xv || t->type == task_type_send_rho )
-            t->weight *= 10;
+            t->weight  = INT_MAX / 8;
         if ( t->type == task_type_recv_xv || t->type == task_type_recv_rho )
-            t->weight *= 2;
+            t->weight *= 1.41;
         }
     // message( "weighting tasks took %.3f ms." , (double)( getticks() - tic ) / CPU_TPS * 1000 );
+
+    /* int min = tasks[0].weight, max = tasks[0].weight;
+    for ( k = 1 ; k < nr_tasks ; k++ )
+        if ( tasks[k].weight < min )
+            min = tasks[k].weight;
+        else if ( tasks[k].weight > max )
+            max = tasks[k].weight;
+    message( "task weights are in [ %i , %i ]." , min , max ); */
         
     }
 
@@ -682,6 +691,7 @@ void scheduler_start ( struct scheduler *s , unsigned int mask ) {
     for ( k = nr_tasks - 1 ; k >= 0 ; k-- ) {
         t = &tasks[ tid[k] ];
         t->wait = 0;
+        t->rid = -1;
         if ( !( (1 << t->type) & mask ) || t->skip )
             continue;
         for ( j = 0 ; j < t->nr_unlock_tasks ; j++ )
@@ -691,15 +701,15 @@ void scheduler_start ( struct scheduler *s , unsigned int mask ) {
         
     /* Loop over the tasks and enqueue whoever is ready. */
     // tic = getticks();
-    // #pragma omp parallel for schedule(static) private(t)
     for ( k = 0 ; k < nr_tasks ; k++ ) {
         t = &tasks[ tid[k] ];
-        if ( ( (1 << t->type) & mask ) && !t->skip && t->wait == 0 ) {
-            if ( t->implicit )
-                for ( j = 0 ; j < t->nr_unlock_tasks ; j++ )
-                    atomic_dec( &t->unlock_tasks[j]->wait );
-            else
-                scheduler_enqueue( s , t );
+        if ( ( (1 << t->type) & mask ) && !t->skip ) {
+            if ( t->wait == 0 ) {
+                scheduler_enqueue( s , t );
+                pthread_cond_broadcast( &s->sleep_cond );
+                }
+            else
+                break;
             }
         }
     // message( "enqueueing tasks took %.3f ms." , (double)( getticks() - tic ) / CPU_TPS * 1000 );
@@ -722,7 +732,7 @@ void scheduler_enqueue ( struct scheduler *s , struct task *t ) {
     #endif
     
     /* Ignore skipped tasks. */
-    if ( t->skip )
+    if ( t->skip || atomic_cas( &t->rid , -1 , 0 ) != -1 )
         return;
         
     /* If this is an implicit task, just pretend it's done. */
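
Together with the t->rid = -1 reset in scheduler_start(), the compare-and-swap turns enqueueing into a claim: only the caller that flips rid from -1 to 0 goes on to push the task, so a task cannot enter a queue twice even if several threads resolve its last dependency concurrently. A sketch of the idiom using the GCC __sync builtin directly (SWIFT's atomic_cas is assumed to wrap something equivalent):

    /* Claim-by-CAS: the field starts at -1 ("not enqueued"); the first caller
       to swing it to 0 wins, everyone else backs off immediately. */
    static int claim_task ( volatile int *rid ) {
        return __sync_val_compare_and_swap( rid , -1 , 0 ) == -1;
        }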
@@ -764,7 +774,7 @@ void scheduler_enqueue ( struct scheduler *s , struct task *t ) {
                         }
                     // message( "recieving %i parts with tag=%i from %i to %i." ,
                     //     t->ci->count , t->flags , t->ci->nodeID , s->nodeID ); fflush(stdout);
-                    qid = 0;
+                    qid = 1 % s->nr_queues;
                 #else
                     error( "SWIFT was not compiled with MPI support." );
                 #endif
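
Receive tasks are now pinned to queue 1 instead of queue 0, presumably keeping them apart from the send tasks that appear to stay on queue 0 (cf. the qid > 1 test in the last hunk); the modulo just folds the index back to 0 when the scheduler runs with a single queue. A sketch (recv_queue is an illustrative name):

    /* Dedicated queue index for receive tasks: queue 1 when it exists,
       queue 0 otherwise. */
    static int recv_queue ( int nr_queues ) {
        return 1 % nr_queues;
        }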
@@ -971,15 +981,17 @@ struct task *scheduler_gettask ( struct scheduler *s , int qid , struct cell *su
             }
 
         /* If we failed, take a short nap. */
-        #ifndef WITH_MPI
+        #ifdef WITH_MPI
+            if ( res == NULL && qid > 1 ) {
+        #else
             if ( res == NULL ) {
+        #endif
                 pthread_mutex_lock( &s->sleep_mutex );
                 if ( s->waiting > 0 )
                     pthread_cond_wait( &s->sleep_cond , &s->sleep_mutex );
                 pthread_mutex_unlock( &s->sleep_mutex );
                 }
-        #endif
-        
+
         }
         
     /* Start the timer on this task, if we got one. */
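
Finally, the idle nap is no longer compiled out under MPI; instead only runners on queues above 1 may sleep, so the two queues holding communication tasks keep polling and MPI progress is never stalled by a blocked runner. A sketch of the consumer side of the sleep/wake pairing (nap_if_idle and have_mpi are illustrative; the broadcast that ends the nap is sketched after the scheduler_start hunk above):

    #include <pthread.h>

    /* Nap until new work is broadcast, but never block the two queues that
       poll for MPI communication tasks. */
    static void nap_if_idle ( pthread_mutex_t *sleep_mutex , pthread_cond_t *sleep_cond ,
                              volatile int *waiting , int qid , int have_mpi ) {
        if ( have_mpi && qid <= 1 )
            return;
        pthread_mutex_lock( sleep_mutex );
        if ( *waiting > 0 )
            pthread_cond_wait( sleep_cond , sleep_mutex );
        pthread_mutex_unlock( sleep_mutex );
        }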