diff --git a/src/Makefile.am b/src/Makefile.am
index 33907eb666c96b038a9cd7a1e89abfc7df847167..21a6369eab003cd82dd47c4d5c200a341d34c6a6 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,7 +19,9 @@
 AUTOMAKE_OPTIONS=gnu
 
 # Add the debug flag to the whole thing
-AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing $(SIMD_FLAGS) $(CFLAGS) $(OPENMP_CFLAGS) -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
+AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
+    -funroll-loops $(SIMD_FLAGS) $(CFLAGS) $(OPENMP_CFLAGS) \
+    -DTIMER -DCOUNTER -DCPU_TPS=2.67e9 -DHIST
 
 # Assign a "safe" version number
 AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0
diff --git a/src/cell.h b/src/cell.h
index 920ae422e5eb5fca6baa4c78d90e5559ef34b54a..8d54374f9b7a9de711e8dd0f2f04c9ae92d93ee2 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -17,7 +17,8 @@
  * 
  ******************************************************************************/
 
-
+/* Some constants. */
+#define cell_sid_dt                 13
 
 /* The queue timers. */
 enum {
diff --git a/src/gadgetsmp.h b/src/gadgetsmp.h
index 67a01137a8b0f61b7bdfad57465a26e1890eaba5..3ead9719458a06bd1069bbdf37f134f7c391c461 100644
--- a/src/gadgetsmp.h
+++ b/src/gadgetsmp.h
@@ -29,7 +29,7 @@
 #include "cell.h"
 #include "space.h"
 #include "queue.h"
+#include "runner.h"
 #include "runner_iact.h"
 #include "engine.h"
-#include "runner.h"
 #include "ic.h"
diff --git a/src/ic.c b/src/ic.c
index 11025277e2443d0cfdf0d289df098b45f0f55b2b..c9c7d8559fbe50340ad0d78af9c6bbca81e8cbc1 100644
--- a/src/ic.c
+++ b/src/ic.c
@@ -1,6 +1,7 @@
 /*******************************************************************************
  * This file is part of GadgetSMP.
- * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
  * 
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -292,7 +293,7 @@ void read_ic ( char* fileName, double dim[3], struct part **parts,  int* N, int*
   readArray(h_grp, "Mass", FLOAT, *N, 1, *parts, mass);
   readArray(h_grp, "SmoothingLength", FLOAT, *N, 1, *parts, h);
   readArray(h_grp, "InternalEnergy", FLOAT, *N, 1, *parts, u);
-  readArray(h_grp, "ParticleIDs", ULONG, *N, 1, *parts, id);
+  readArray(h_grp, "ParticleIDs", ULONGLONG, *N, 1, *parts, id);
 
   /* Close particle group */
   H5Gclose(h_grp);
diff --git a/src/ic.h b/src/ic.h
index 10216bfb8c410695b6aadc14792e0f072be8d5a5..bcacfddad0ca72fa8951c30203d6cff46ead5f48 100644
--- a/src/ic.h
+++ b/src/ic.h
@@ -1,6 +1,6 @@
 /*******************************************************************************
  * This file is part of GadgetSMP.
- * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
  * 
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
diff --git a/src/part.h b/src/part.h
index 79324ba0f92477fa731d8ba0dbda2845a0dcb23a..095e85c2e198a1fcaf3938384053cbee80321b46 100644
--- a/src/part.h
+++ b/src/part.h
@@ -31,13 +31,13 @@ struct part {
     float h;
     
     /* Particle time-step. */
-    int dt;
+    float dt;
     
     /* Particle mass. */
     float mass;
     
     /* Particle ID. */
-    unsigned long id;
+    unsigned long long id;
     
     /* Particle position. */
     double x[3];
diff --git a/src/runner.c b/src/runner.c
index 672c8786bc8d2899e585461e43fa96af88d38f07..7c87642f1da8aa4e98dbcc55f1007590bcfea5f6 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -161,64 +161,6 @@ void runner_dosort_ascending ( struct entry *sort , int N ) {
     }
     
     
-/**
- * @brief inline helper fuction to merge two entry arrays (forward).
- *
- * @param one the first array
- * @param none the length of the first array
- * @param two the second array
- * @param ntwo the length of the second array
- * @param dest the destination array.
- */
- 
-inline void merge_forward ( struct entry *__restrict__ one , int none , struct entry *__restrict__ two , int ntwo , struct entry *__restrict__ dest ) {
-
-    int i = 0, j = 0, k = 0;
-    
-    while ( j < none && k < ntwo )
-        if ( one[j].d < two[k].d )
-            dest[i++] = one[j++];
-        else
-            dest[i++] = two[k++];
-    if ( j == none )
-        for ( ; k < ntwo ; k++ )
-            dest[i++] = two[k];
-    else
-        for ( ; j < none ; j++ )
-            dest[i++] = one[j];
-
-    }
-    
-
-/**
- * @brief inline helper fuction to merge two entry arrays (forward).
- *
- * @param one the first array
- * @param none the length of the first array
- * @param two the second array
- * @param ntwo the length of the second array
- * @param dest the destination array.
- */
- 
-inline void merge_backward ( struct entry *__restrict__ one , int none , struct entry *__restrict__ two , int ntwo , struct entry *__restrict__ dest ) {
-
-    int i = none + ntwo - 1, j = none - 1, k = ntwo - 1;
-    
-    while ( j >= 0 && k >= 0 )
-        if ( one[j].d > two[k].d )
-            dest[i--] = one[j--];
-        else
-            dest[i--] = two[k--];
-    if ( j < 0 )
-        for ( ; k >= 0 ; k-- )
-            dest[i--] = two[k];
-    else
-        for ( ; j >= 0 ; j-- )
-            dest[i--] = one[j];
-
-    }
-    
-
 /**
  * @brief Sort the particles in the given cell along all cardinal directions.
  *
@@ -232,7 +174,6 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags ) {
     struct entry *fingers[8];
     struct part *parts = c->parts;
     int j, k, count = c->count;
-    int cone, ctwo;
     int i, ind, off[8], inds[8], temp_i;
     // float shift[3];
     float buff[8], px[3];
@@ -248,7 +189,7 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags ) {
     if ( lock_lock( &c->lock ) != 0 )
         error( "Failed to lock cell." );
     if ( c->sort == NULL )
-        if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->count + 1) * 13 ) ) == NULL )
+        if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->count + 1) * 14 ) ) == NULL )
             error( "Failed to allocate sort memory." );
     if ( lock_unlock( &c->lock ) != 0 )
         error( "Failed to unlock cell." );
@@ -256,90 +197,57 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags ) {
     /* Does this cell have any progeny? */
     if ( c->split ) {
     
-        /* Loop over the 13 different sort arrays. */
-        for ( j = 0 ; j < 13 ; j++ ) {
+        /* Loop over the 13+1 different sort arrays. */
+        for ( j = 0 ; j < 14 ; j++ ) {
         
             /* Has this sort array been flagged? */
             if ( !( flags & (1 << j) ) )
                 continue;
                 
-            if ( 0 ) {
-            
-                /* Get a finger on the sorting array. */
-                finger = &c->sort[ j*(count + 1) ];
-                
-                /* Merge the two first sub-cells forward into this cell. */
-                cone = c->progeny[0]->count;
-                ctwo = c->progeny[1]->count;
-                merge_forward( &c->progeny[0]->sort[ j*(cone + 1) ] , cone ,
-                               &c->progeny[1]->sort[ j*(ctwo + 1) ] , ctwo ,
-                               finger );
-                               
-                /* Merge-in the remaining arrays, alternating forward and
-                   backward merges. */
-                for ( k = 2 ; k < 8 ; k++ ) {
-                    cone = cone + ctwo;
-                    ctwo = c->progeny[k]->count;
-                    if ( k & 1 )
-                        merge_forward( &finger[ count - cone ] , cone ,
-                                       &c->progeny[k]->sort[ j*(ctwo + 1) ] , ctwo ,
-                                       finger );
-                    else
-                        merge_backward( finger , cone ,
-                                        &c->progeny[k]->sort[ j*(ctwo + 1) ] , ctwo ,
-                                        &finger[ count - cone - ctwo ] );
+            /* Init the particle index offsets. */
+            for ( off[0] = 0 , k = 1 ; k < 8 ; k++ )
+                if ( c->progeny[k-1] != NULL )
+                    off[k] = off[k-1] + c->progeny[k-1]->count;
+                else
+                    off[k] = off[k-1];
+
+            /* Init the entries and indices. */
+            for ( k = 0 ; k < 8 ; k++ ) {
+                inds[k] = k;
+                if ( c->progeny[k] != NULL && c->progeny[k]->count > 0 ) {
+                    fingers[k] = &c->progeny[k]->sort[ j*(c->progeny[k]->count + 1) ];
+                    buff[k] = fingers[k]->d;
+                    off[k] = off[k];
                     }
-                
+                else
+                    buff[k] = FLT_MAX;
                 }
-                
-            else {
-            
-                /* Init the particle index offsets. */
-                for ( off[0] = 0 , k = 1 ; k < 8 ; k++ )
-                    if ( c->progeny[k-1] != NULL )
-                        off[k] = off[k-1] + c->progeny[k-1]->count;
-                    else
-                        off[k] = off[k-1];
-
-                /* Init the entries and indices. */
-                for ( k = 0 ; k < 8 ; k++ ) {
-                    inds[k] = k;
-                    if ( c->progeny[k] != NULL && c->progeny[k]->count > 0 ) {
-                        fingers[k] = &c->progeny[k]->sort[ j*(c->progeny[k]->count + 1) ];
-                        buff[k] = fingers[k]->d;
-                        off[k] = off[k];
-                        }
-                    else
-                        buff[k] = FLT_MAX;
-                    }
 
-                /* Sort the buffer. */
-                for ( i = 0 ; i < 7 ; i++ )
-                    for ( k = i+1 ; k < 8 ; k++ )
-                        if ( buff[ inds[k] ] < buff[ inds[i] ] ) {
-                            temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i;
-                            }
+            /* Sort the buffer. */
+            for ( i = 0 ; i < 7 ; i++ )
+                for ( k = i+1 ; k < 8 ; k++ )
+                    if ( buff[ inds[k] ] < buff[ inds[i] ] ) {
+                        temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i;
+                        }
 
-                /* For each entry in the new sort list. */
-                finger = &c->sort[ j*(count + 1) ];
-                for ( ind = 0 ; ind < count ; ind++ ) {
+            /* For each entry in the new sort list. */
+            finger = &c->sort[ j*(count + 1) ];
+            for ( ind = 0 ; ind < count ; ind++ ) {
 
-                    /* Copy the minimum into the new sort array. */
-                    finger[ind].d = buff[inds[0]];
-                    finger[ind].i = fingers[inds[0]]->i + off[inds[0]];
+                /* Copy the minimum into the new sort array. */
+                finger[ind].d = buff[inds[0]];
+                finger[ind].i = fingers[inds[0]]->i + off[inds[0]];
 
-                    /* Update the buffer. */
-                    fingers[inds[0]] += 1;
-                    buff[inds[0]] = fingers[inds[0]]->d;
+                /* Update the buffer. */
+                fingers[inds[0]] += 1;
+                buff[inds[0]] = fingers[inds[0]]->d;
 
-                    /* Find the smallest entry. */
-                    for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) {
-                        temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i;
-                        }
+                /* Find the smallest entry. */
+                for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) {
+                    temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i;
+                    }
 
-                    } /* Merge. */
-                    
-                }
+                } /* Merge. */
             
             /* Add a sentinel. */
             c->sort[ j*(c->count + 1) + c->count ].d = FLT_MAX;
@@ -349,39 +257,6 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags ) {
     
         } /* progeny? */
         
-    /* Otherwise, just sort. */
-    // else {
-    // 
-    //     /* Loop over the different cell axes. */
-    //     for ( j = 0 ; j < 13 ; j++ ) {
-    //     
-    //         /* Has this sort array been flagged? */
-    //         if ( !( flags & (1 << j) ) )
-    //             continue;
-    //     
-    //         /* Get the shift vector. */
-    //         shift[0] = runner_shift[ 3*j + 0 ];
-    //         shift[1] = runner_shift[ 3*j + 1 ];
-    //         shift[2] = runner_shift[ 3*j + 2 ];
-    //         
-    //         /* Fill the sort array. */
-    //         finger = &c->sort[ j*(count + 1) ];
-    //         for ( k = 0 ; k < count ; k++ ) {
-    //             finger[k].i = k;
-    //             finger[k].d = parts[k].x[0]*shift[0] + parts[k].x[1]*shift[1] + parts[k].x[2]*shift[2];
-    //             }
-    //             
-    //         /* Add the sentinel. */
-    //         finger[ c->count ].d = FLT_MAX;
-    //         finger[ c->count ].i = 0;
-    //             
-    //         /* Sort descending. */
-    //         runner_dosort_ascending( finger , c->count );
-    //     
-    //         }
-    //         
-    //     }
-        
     /* Otherwise, just sort. */
     else {
     
@@ -395,10 +270,14 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags ) {
                     c->sort[ j*(count + 1) + k].i = k;
                     c->sort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ];
                     }
+            if ( flags & (1 << 14) ) {
+                c->sort[ 14*(count + 1) + k ].i = k;
+                c->sort[ 14*(count + 1) + k ].d = parts[k].dt;
+                }
             }
 
         /* Add the sentinel and sort. */
-        for ( j = 0 ; j < 13 ; j++ )
+        for ( j = 0 ; j < 14 ; j++ )
             if ( flags & (1 << j) ) {
                 c->sort[ j*(count + 1) + c->count ].d = FLT_MAX;
                 c->sort[ j*(count + 1) + c->count ].i = 0;
@@ -505,7 +384,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
             p->h_dt = 0.0f;
 
             /* Compute this particle's time step. */
-            p->dt = const_cfl * p->h / ( const_gamma * ( const_gamma - 1.0f ) * p->u );
+            p->dt = const_cfl * p->h / sqrtf( const_gamma * ( const_gamma - 1.0f ) * p->u );
 
             /* Compute the pressure. */
             // p->P = p->rho * p->u * ( const_gamma - 1.0f );
diff --git a/src/runner.h b/src/runner.h
index 548165edf8a8913065c4629e1de30de04ff128eb..4b21648853fe9af962c4fde325abccf260c33690 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -88,8 +88,8 @@ extern int runner_counter[ runner_counter_count ];
 
 /* Histogram functions. */
 #define runner_hist_a 1.0
-#define runner_hist_b 10.0
-#define runner_hist_N 99
+#define runner_hist_b 1000.0
+#define runner_hist_N 100
 long long int runner_hist_bins[ runner_hist_N ];
 #define runner_hist_hit( x ) __sync_add_and_fetch( &runner_hist_bins[ (int)fmax( 0.0 , fmin( runner_hist_N-1 , ((x) - runner_hist_a) / (runner_hist_b - runner_hist_a) * runner_hist_N ) ) ] , 1 )
 
diff --git a/src/space.c b/src/space.c
index 52c582057086715cec098bf237e94d48008519a2..b43203c97dbe05728b076c34f9e06a00f1fab207 100644
--- a/src/space.c
+++ b/src/space.c
@@ -76,6 +76,60 @@ const int sortlistID[27] = {
     };
     
     
+/**
+ * @brief Sort the particles according to the given indices.
+ *
+ * @param parts The list of #part
+ * @param ind The indices with respect to which the parts are sorted.
+ * @param N The number of parts
+ * @param min Lowest index.
+ * @param max highest index.
+ *
+ * This function calls itself recursively.
+ */
+ 
+void parts_sort ( struct part *parts , int *ind , int N , int min , int max ) {
+
+    int pivot = (min + max) / 2;
+    int i = 0, j = N-1;
+    int temp_i;
+    struct part temp_p;
+    
+    /* One pass of quicksort. */
+    while ( i < j ) {
+        while ( i < N && ind[i] <= pivot )
+            i++;
+        while ( j >= 0 && ind[j] > pivot )
+            j--;
+        if ( i < j ) {
+            temp_i = ind[i]; ind[i] = ind[j]; ind[j] = temp_i;
+            temp_p = parts[i]; parts[i] = parts[j]; parts[j] = temp_p;
+            }
+        }
+        
+    /* Verify sort. */
+    for ( int k = 0 ; k <= j ; k++ )
+        if ( ind[k] > pivot ) {
+            printf( "parts_sort: sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i.\n" , k , ind[k] , pivot , i , j , N );
+            error( "Sorting failed (<=pivot)." );
+            }
+    for ( int k = j+1 ; k < N ; k++ )
+        if ( ind[k] <= pivot ) {
+            printf( "parts_sort: sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i.\n" , k , ind[k] , pivot , i , j , N );
+            error( "Sorting failed (>pivot)." );
+            }
+        
+    /* Recurse on the left? */
+    if ( j > 0 && pivot > min )
+        parts_sort( parts , ind , j+1 , min , pivot );
+        
+    /* Recurse on the right? */
+    if ( i < N && pivot+1 < max )
+        parts_sort( &parts[i], &ind[i], N-i , pivot+1 , max );
+
+    }
+
+
 /**
  * @brief Mapping function to free the sorted indices buffers.
  */
@@ -361,7 +415,7 @@ void space_splittasks ( struct space *s ) {
         hj = fmax( cj->h[0] , fmax( cj->h[1] , cj->h[2] ) );
             
         /* Should this task be split-up? */
-        if ( ci->split && cj->split && ci->h_max < hi/2 && cj->h_max < hj/2 ) {
+        if ( ci->split && cj->split && ci->h_max*space_stretch < hi/2 && cj->h_max*space_stretch < hj/2 ) {
         
             /* Get the relative distance between the pairs, wrapping. */
             for ( k = 0 ; k < 3 ; k++ ) {
@@ -816,7 +870,7 @@ void space_maketasks ( struct space *s , int do_sort ) {
     void maketasks_rec ( struct cell *c , struct task *sort_up[] , int nr_sort_up , struct cell *parent ) {
 
         int j, k, nr_sort = 0;
-        struct task *sort[14], *t;
+        struct task *sort[7], *t;
 
         /* Clear the waits on this cell. */
         c->wait = 0;
@@ -827,38 +881,29 @@ void space_maketasks ( struct space *s , int do_sort ) {
         
             if ( do_sort ) {
                 if ( c->count < 1000 ) {
-                    sort[0] = space_addtask( s , task_type_sort , task_subtype_none , 0x1fff , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    sort[0] = space_addtask( s , task_type_sort , task_subtype_none , 0x3fff , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
                     for ( k = 0 ; k < 13 ; k++ )
                         c->sorts[k] = sort[0];
                     nr_sort = 1;
                     }
                 else if ( c->count < 5000 ) {
-                    sort[0] = space_addtask( s , task_type_sort , task_subtype_none , 0xf , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    sort[1] = space_addtask( s , task_type_sort , task_subtype_none , 0xf0 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    sort[2] = space_addtask( s , task_type_sort , task_subtype_none , 0x1f00 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    for ( k = 0 ; k < 4 ; k++ )
+                    sort[0] = space_addtask( s , task_type_sort , task_subtype_none , 0x7f , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    sort[1] = space_addtask( s , task_type_sort , task_subtype_none , 0x3f80 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    for ( k = 0 ; k < 7 ; k++ )
                         c->sorts[k] = sort[0];
-                    for ( k = 4 ; k < 8 ; k++ )
+                    for ( k = 7 ; k < 14 ; k++ )
                         c->sorts[k] = sort[1];
-                    for ( k = 8 ; k < 13 ; k++ )
-                        c->sorts[k] = sort[2];
-                    nr_sort = 3;
+                    nr_sort = 2;
                     }
                 else {
-                    c->sorts[0] = sort[0] = space_addtask( s , task_type_sort , task_subtype_none , 0x1 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    c->sorts[1] = sort[1] = space_addtask( s , task_type_sort , task_subtype_none , 0x2 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    c->sorts[2] = sort[2] = space_addtask( s , task_type_sort , task_subtype_none , 0x4 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    c->sorts[3] = sort[3] = space_addtask( s , task_type_sort , task_subtype_none , 0x8 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    c->sorts[4] = sort[4] = space_addtask( s , task_type_sort , task_subtype_none , 0x10 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    c->sorts[5] = sort[5] = space_addtask( s , task_type_sort , task_subtype_none , 0x20 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    c->sorts[6] = sort[6] = space_addtask( s , task_type_sort , task_subtype_none , 0x40 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    c->sorts[7] = sort[7] = space_addtask( s , task_type_sort , task_subtype_none , 0x80 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    c->sorts[8] = sort[8] = space_addtask( s , task_type_sort , task_subtype_none , 0x100 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    c->sorts[9] = sort[9] = space_addtask( s , task_type_sort , task_subtype_none , 0x200 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    c->sorts[10] = sort[10] = space_addtask( s , task_type_sort , task_subtype_none , 0x400 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    c->sorts[11] = sort[11] = space_addtask( s , task_type_sort , task_subtype_none , 0x800 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    c->sorts[12] = sort[12] = space_addtask( s , task_type_sort , task_subtype_none , 0x1000 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
-                    nr_sort = 13;
+                    c->sorts[0] = c->sorts[1] = sort[0] = space_addtask( s , task_type_sort , task_subtype_none , 0x1 + 0x2 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[2] = c->sorts[3] = sort[1] = space_addtask( s , task_type_sort , task_subtype_none , 0x4 + 0x8 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[4] = c->sorts[5] = sort[2] = space_addtask( s , task_type_sort , task_subtype_none , 0x10 + 0x20 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[6] = c->sorts[7] = sort[3] = space_addtask( s , task_type_sort , task_subtype_none , 0x40 + 0x80 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[8] = c->sorts[9] = sort[4] = space_addtask( s , task_type_sort , task_subtype_none , 0x100 + 0x200 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[10] = c->sorts[11] = sort[5] = space_addtask( s , task_type_sort , task_subtype_none , 0x400 + 0x800 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[12] = c->sorts[13] = sort[6] = space_addtask( s , task_type_sort , task_subtype_none , 0x1000 + 0x2000 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    nr_sort = 7;
                     }
                 }
 
@@ -1242,7 +1287,8 @@ void space_init ( struct space *s , double dim[3] , struct part *parts , int N ,
     int nr_cells, cdim[3];
     double h_min, h_max, h[3], ih[3];
     struct cell *c, *cells;
-    struct part *parts_new, *finger;
+    struct part *finger;
+    int *ind;
     
     
     /* Get the minimum and maximum cutoff radii. */
@@ -1253,6 +1299,9 @@ void space_init ( struct space *s , double dim[3] , struct part *parts , int N ,
         else if ( parts[k].h > h_max )
             h_max = parts[k].h;
             
+    /* Stretch the maximum smoothing length. */
+    h_max *= space_stretch;
+            
     /* Get the cell width. */
     if ( h_cells < h_max )
         h_cells = h_max;
@@ -1280,31 +1329,29 @@ void space_init ( struct space *s , double dim[3] , struct part *parts , int N ,
                 c->h[0] = h[0]; c->h[1] = h[1]; c->h[2] = h[2];
                 }
         
-    /* Run through the particles and get the counts for each cell. */
-    for ( k = 0 ; k < N ; k++ )
-        cells[ cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) ].count += 1;
+    /* Run through the particles and get their cell index. */
+    ind = (int *)alloca( sizeof(int) * N );
+    for ( k = 0 ; k < N ; k++ )  {
+        ind[k] = cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] );
+        cells[ ind[k] ].count += 1;
+        }
         
-    /* Allocate the new part buffer and set the part pointers in each cell. */
-    if ( posix_memalign( (void *)&parts_new , 64 , N * sizeof(struct part) ) != 0 )
-        error( "Failed to allocate parts." );
-    for ( finger = parts_new , k = 0 ; k < nr_cells ; k++ ) {
+    /* Sort the parts according to their cells. */
+    parts_sort( parts , ind , N , 0 , nr_cells );
+        
+    /* Hook the cells up to the parts. */
+    for ( finger = parts , k = 0 ; k < nr_cells ; k++ ) {
         c = &cells[ k ];
         c->parts = finger;
         finger = &finger[ c->count ];
-        c->count = 0;
-        }
-    for ( k = 0 ; k < N ; k++ ) {
-        c = &cells[ cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) ];
-        c->parts[ c->count ] = parts[k];
-        c->count += 1;
         }
         
     /* Store eveything in the space. */
     s->h_min = h_min; s->h_max = h_max;
     s->dim[0] = dim[0]; s->dim[1] = dim[1]; s->dim[2] = dim[2];
     s->periodic = periodic;
-    s->parts = parts_new;
     s->nr_parts = N;
+    s->parts = parts;
     s->h[0] = h[0]; s->h[1] = h[1]; s->h[2] = h[2];
     s->ih[0] = ih[0]; s->ih[1] = ih[1]; s->ih[2] = ih[2];
     s->cdim[0] = cdim[0]; s->cdim[1] = cdim[1]; s->cdim[2] = cdim[2];
diff --git a/src/space.h b/src/space.h
index e358744474a9650b07abbc6ef7ca5be0b873dd40..32219c8a4ff89d1d7591e02f905a46a29c7a63b7 100644
--- a/src/space.h
+++ b/src/space.h
@@ -26,6 +26,7 @@
 #define space_splitratio                0.875
 #define space_splitsize_default         400
 #define space_dosub                     1
+#define space_stretch                   1.0
 
 
 /* Convert cell location to ID. */
@@ -57,6 +58,9 @@ struct space {
     /* The minimum and maximum cutoff radii. */
     double h_min, h_max;
     
+    /* Current time step for particles. */
+    float dt;
+    
     /* Number of cells. */
     int nr_cells, tot_cells;