From c969365b55de835660b29f371af01080f70b332d Mon Sep 17 00:00:00 2001
From: Pedro Gonnet <pedro.gonnet@durham.ac.uk>
Date: Thu, 7 Feb 2013 18:22:15 +0000
Subject: [PATCH] several bug fixes using the new test cases.

Former-commit-id: 36567220a6b658f9dbff8091561e53ed45c4bf7a
---
 examples/PertubedBox/makeIC.py |   2 +-
 examples/UniformBox/makeIC.py  |   2 +-
 examples/test.c                |  54 ++++++++++---
 src/Makefile.am                |   8 +-
 src/debug.c                    |  15 +++-
 src/debug.h                    |   2 +-
 src/engine.c                   | 135 ++++++++++++++++++++++-----------
 src/runner.c                   |  22 +++---
 src/runner_iact.h              |  10 +--
 src/space.c                    |  37 +++++----
 src/vector.h                   |  18 ++++-
 11 files changed, 207 insertions(+), 98 deletions(-)

diff --git a/examples/PertubedBox/makeIC.py b/examples/PertubedBox/makeIC.py
index 03c97798bd..8301b0f630 100644
--- a/examples/PertubedBox/makeIC.py
+++ b/examples/PertubedBox/makeIC.py
@@ -32,7 +32,7 @@ L = 50           # Number of particles along one axis
 rho = 1.         # Density
 P = 1.           # Pressure
 gamma = 5./3.    # Gas adiabatic index
-pert = 0.1      # Perturbation scale (in units of the interparticle separation)
+pert = 0.01      # Perturbation scale (in units of the interparticle separation)
 fileName = "perturbedBox.hdf5" 
 
 
diff --git a/examples/UniformBox/makeIC.py b/examples/UniformBox/makeIC.py
index b68badbe80..e61106d30c 100644
--- a/examples/UniformBox/makeIC.py
+++ b/examples/UniformBox/makeIC.py
@@ -28,7 +28,7 @@ from numpy import *
 periodic= 1      # 1 For periodic box
 boxSize = 1.
 L = 50           # Number of particles along one axis
-rho = 1.         # Density
+rho = 2.         # Density
 P = 1.           # Pressure
 gamma = 5./3.    # Gas adiabatic index
 fileName = "uniformBox.hdf5" 
diff --git a/examples/test.c b/examples/test.c
index 7a98ab26ec..57669065b0 100644
--- a/examples/test.c
+++ b/examples/test.c
@@ -28,6 +28,7 @@
 #include <string.h>
 #include <pthread.h>
 #include <math.h>
+#include <fenv.h>
 #include <omp.h>
 
 /* Conditional headers. */
@@ -193,6 +194,24 @@ void map_wcount_max ( struct part *p , struct cell *c , void *data ) {
 
     }
 
+void map_h_min ( struct part *p , struct cell *c , void *data ) {
+
+    struct part **p2 = (struct part **)data;
+    
+    if ( p->h < (*p2)->h )
+        *p2 = p;
+
+    }
+
+void map_h_max ( struct part *p , struct cell *c , void *data ) {
+
+    struct part **p2 = (struct part **)data;
+    
+    if ( p->h > (*p2)->h )
+        *p2 = p;
+
+    }
+
 
 /**
  * @brief Mapping function for neighbour count.
@@ -689,6 +708,9 @@ int main ( int argc , char *argv[] ) {
     float dt_max = 0.0f;
     ticks tic;
     
+    /* Choke on FP-exceptions. */
+    feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );
+    
     /* Init the space. */
     bzero( &s , sizeof(struct space) );
 
@@ -862,6 +884,7 @@ int main ( int argc , char *argv[] ) {
 	    parts[k].x[2] += shift[2];
       }
 
+    /* Dump the first few particles. */
     for(k=0; k<10; ++k)
       printParticle(parts, k);
 
@@ -910,12 +933,6 @@ int main ( int argc , char *argv[] ) {
     /* Dump the particle positions. */
     // space_map_parts( &s , &map_dump , shift );
     
-    /* Dump the acceleration of the first particle. */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        printf( "main: parts[%lli].a is [ %.16e %.16e %.16e ].\n" , s.parts[k].id , s.parts[k].a[0] , s.parts[k].a[1] , s.parts[k].a[2] );
-        printf( "main: parts[%lli].a has h=%e, rho=%e, wcount=%.3f.\n" , s.parts[k].id , s.parts[k].h , s.parts[k].rho , s.parts[k].wcount + 32.0/3 );
-        }
-    
     /* Initialize the runner with this space. */
     tic = getticks();
     engine_init( &e , &s , nr_threads , nr_queues , engine_policy_steal | engine_policy_keep );
@@ -943,6 +960,23 @@ int main ( int argc , char *argv[] ) {
         /* Take a step. */
         engine_step( &e , 0 );
         
+        /* Dump the first few particles. */
+        for(k=0; k<10; ++k)
+          printParticle(parts, k);
+        printParticle( parts , 113531 );
+    
+        /* Get the particle with the lowest h. */
+        p = &s.parts[0];
+        space_map_parts( &s , &map_h_min , &p );
+        printf( "main: particle %lli/%i at [ %e %e %e ] has minimum h=%.3e (h_dt=%.3e).\n" ,
+	        p->id , (int)(p - s.parts) , p->x[0] , p->x[1] , p->x[2] , p->h , p->h_dt );
+
+        /* Get the particle with the highest h. */
+        p = &s.parts[0];
+        space_map_parts( &s , &map_h_max , &p );
+        printf( "main: particle %lli/%i at [ %e %e %e ] has maximum h=%.3e (h_dt=%.3e).\n" ,
+	        p->id , (int)(p - s.parts) , p->x[0] , p->x[1] , p->x[2] , p->h , p->h_dt );
+    
         /* Output. */
         #ifdef TIMER
             printf( "main: runner timers are [ %.3f" , timers[0]/CPU_TPS*1000 );
@@ -1010,11 +1044,9 @@ int main ( int argc , char *argv[] ) {
     // space_map_parts( &s , &map_icount , &icount );
     // printf( "main: average neighbours per particle is %.3f.\n" , (double)icount / s.nr_parts );
     
-    /* Dump the acceleration of the first particle. */
-    for ( k = 0 ; k < 3 ; k++ ) {
-        printf( "main: parts[%lli].a is [ %.16e %.16e %.16e ].\n" , s.parts[k].id , s.parts[k].a[0] , s.parts[k].a[1] , s.parts[k].a[2] );
-        printf( "main: parts[%lli].a has h=%e, rho=%e, wcount=%.3f.\n" , s.parts[k].id , s.parts[k].h , s.parts[k].rho , s.parts[k].wcount );
-        }
+    /* Dump the first few particles. */
+    for(k=0; k<10; ++k)
+      printParticle(parts, k);
     
     /* Get all the cells of a certain depth. */
     // icount = 1;
diff --git a/src/Makefile.am b/src/Makefile.am
index 53fc189f38..6acf9e47dd 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -20,11 +20,11 @@
 AUTOMAKE_OPTIONS=gnu
 
 # Add the debug flag to the whole thing
-AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
-    -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
-    -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
-# AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \
+# AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
+#     -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
 #     -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
+AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \
+    -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
 
 # Assign a "safe" version number
 AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0
diff --git a/src/debug.c b/src/debug.c
index 194666f9a2..4d885521bb 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -22,17 +22,26 @@
 
 #include "part.h"
 
-void printParticle(struct part *parts, int i)
-{
-  printf("## Particle[%d]: id= %lld x=( %f, %f, %f) v=( %f, %f, %f) h= %f m= %f rho= %f u= %f dt= %f\n",
+void printParticle ( struct part *parts , long long int id ) {
+
+    int i;
+
+    /* Look for the particle. */
+    for ( i = 0 ; parts[i].id != id ; i++ );
+
+  printf("## Particle[%d]: id=%lld, x=(%f,%f,%f), v=(%f,%f,%f), a=(%f,%f,%f), h=%f, h_dt=%f, wcount=%f, m=%f, rho=%f, u=%f, dudt=%f, dt=%.3e\n",
 	 i,
 	 parts[i].id,
 	 parts[i].x[0], parts[i].x[1], parts[i].x[2],
 	 parts[i].v[0], parts[i].v[1], parts[i].v[2],
+	 parts[i].a[0], parts[i].a[1], parts[i].a[2],
 	 parts[i].h,
+	 parts[i].h_dt,
+	 parts[i].wcount,
 	 parts[i].mass,
 	 parts[i].rho,
 	 parts[i].u,
+     parts[i].u_dt,
 	 parts[i].dt
 	 );
 }
diff --git a/src/debug.h b/src/debug.h
index a75c61a1d5..693c50f6b0 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -20,4 +20,4 @@
 
 
 
-void printParticle(struct part *parts, int i);
+void printParticle(struct part *parts, long long int i);
diff --git a/src/engine.c b/src/engine.c
index 70f7ca3f4f..7513914b11 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -39,6 +39,7 @@
 #include "lock.h"
 #include "task.h"
 #include "part.h"
+#include "debug.h"
 #include "cell.h"
 #include "space.h"
 #include "queue.h"
@@ -65,6 +66,7 @@ void engine_prepare ( struct engine *e ) {
     int j, k, qid;
     struct space *s = e->s;
     struct queue *q;
+    float dt_max = e->dt_max;
     
     TIMER_TIC
 
@@ -92,12 +94,13 @@ void engine_prepare ( struct engine *e ) {
     /* Re-set the particle data. */
     // tic = getticks();
     #pragma omp parallel for schedule(static) 
-    for ( k = 0 ; k < s->nr_parts ; k++ ) {
-        s->parts[k].wcount = 0.0f;
-        s->parts[k].wcount_dh = 0.0f;
-        s->parts[k].rho = 0.0f;
-        s->parts[k].rho_dh = 0.0f;
-        }
+    for ( k = 0 ; k < s->nr_parts ; k++ )
+        if ( s->parts[k].dt <= dt_max ) {
+            s->parts[k].wcount = 0.0f;
+            s->parts[k].wcount_dh = 0.0f;
+            s->parts[k].rho = 0.0f;
+            s->parts[k].rho_dh = 0.0f;
+            }
     // printf( "engine_prepare: re-setting particle data took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
     
     /* Run throught the tasks and get all the waits right. */
@@ -177,15 +180,18 @@ void engine_step ( struct engine *e , int sort_queues ) {
     int k, nr_parts = e->s->nr_parts;
     struct part *restrict parts = e->s->parts, *restrict p;
     float *restrict v_bar;
-    float dt = e->dt, hdt = 0.5*dt, dt_max;
-    #ifdef __SSE2__
-        VEC_MACRO(4,float) hdtv = _mm_set1_ps( hdt );
-    #endif
+    float dt = e->dt, hdt = 0.5*dt, dt_max, dt_min, ldt_min, ldt_max;
+    double etot = 0.0, letot, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 };
+    int threadID, nthreads;
+    // #ifdef __SSE2__
+    //     VEC_MACRO(4,float) hdtv = _mm_set1_ps( hdt );
+    // #endif
 
     /* Get the maximum dt. */
-    dt_max = dt;
+    dt_max = 2.0f*dt;
     for ( k = 0 ; k < 32 && (e->step & (1 << k)) == 0 ; k++ )
         dt_max *= 2;
+    dt_max = 1;
         
     /* Set the maximum dt. */
     e->dt_max = dt_max;
@@ -198,43 +204,46 @@ void engine_step ( struct engine *e , int sort_queues ) {
         
     /* First kick. */
     TIMER_TIC
-    #pragma omp parallel for schedule(static) private(p)
+    // #pragma omp parallel for schedule(static) private(p)
     for ( k = 0 ; k < nr_parts ; k++ ) {
         
         /* Get a handle on the part. */
         p = &parts[k];
         
         /* Step and store the velocity and internal energy. */
-        #ifdef __SSE__
-            _mm_store_ps( &v_bar[4*k] , _mm_add_ps( _mm_load_ps( &p->v[0] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) );
-        #else
+        // #ifdef __SSE__
+        //     _mm_store_ps( &v_bar[4*k] , _mm_add_ps( _mm_load_ps( &p->v[0] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) );
+        // #else
             v_bar[4*k+0] = p->v[0] + hdt * p->a[0];
             v_bar[4*k+1] = p->v[1] + hdt * p->a[1];
             v_bar[4*k+2] = p->v[2] + hdt * p->a[2];
-        #endif
+        // #endif
         v_bar[4*k+3] = p->u + hdt * p->u_dt;
         
         /* Move the particles with the velocitie at the half-step. */
-        // p->x[0] += dt * v_bar[3*k+0];
-        // p->x[1] += dt * v_bar[3*k+1];
-        // p->x[2] += dt * v_bar[3*k+2];
+        p->x[0] += dt * v_bar[4*k+0];
+        p->x[1] += dt * v_bar[4*k+1];
+        p->x[2] += dt * v_bar[4*k+2];
         
         /* Update positions and energies at the half-step. */
-        // p->v[0] += dt * p->a[0];
-        // p->v[1] += dt * p->a[1];
-        // p->v[2] += dt * p->a[2];
-        // p->u *= expf( p->u_dt / p->u * dt );
-        // p->h *= expf( -1.0f * p->h_dt / p->h * dt );
+        p->v[0] += dt * p->a[0];
+        p->v[1] += dt * p->a[1];
+        p->v[2] += dt * p->a[2];
+        p->u *= expf( p->u_dt / p->u * dt );
+        // p->h *= expf( p->h_dt / p->h * dt );
         
         /* Integrate other values if this particle will not be updated. */
-        if ( p->dt > dt_max ) {
-            p->rho *= expf( -3.0f * p->h_dt / p->h * dt );
-            p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f );
-            }
+        // if ( p->dt > dt_max ) {
+        //     p->rho *= expf( -3.0f * p->h_dt / p->h * dt );
+        //     p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f );
+        //     }
         
         }
     TIMER_TOC( timer_kick1 );
         
+    // for(k=0; k<10; ++k)
+    //   printParticle(parts, k);
+
     /* Prepare the space. */
     engine_prepare( e );
     
@@ -263,43 +272,79 @@ void engine_step ( struct engine *e , int sort_queues ) {
     /* Stop the clock. */
     TIMER_TOC(timer_step);
     
+    // for(k=0; k<10; ++k)
+    //   printParticle(parts, k);
+
     /* Second kick. */
     TIMER_TIC_ND
-    e->dt_min = FLT_MAX;
-    #pragma omp parallel private(p,k)
+    dt_min = FLT_MAX; dt_max = 0.0f;
+    #pragma omp parallel private(p,k,ldt_min,ldt_max,lmom,letot,threadID,nthreads)
     {
-        int threadID = omp_get_thread_num();
-        int nthreads = omp_get_num_threads();
-        float dt_min = FLT_MAX;
+        threadID = omp_get_thread_num();
+        nthreads = omp_get_num_threads();
+        ldt_min = FLT_MAX; ldt_max = 0.0f;
+        lmom[0] = 0.0; lmom[1] = 0.0; lmom[2] = 0.0;
+        letot = 0.0;
         for ( k = nr_parts * threadID / nthreads ; k < nr_parts * (threadID + 1) / nthreads ; k++ ) {
 
             /* Get a handle on the part. */
             p = &parts[k];
 
-            /* Scale the derivatives. */
-            p->u_dt *= p->POrho2;
-            p->h_dt *= p->h * 0.333333333f;
+            /* Scale the derivatives if they're freshly computed. */
+            if ( p->dt <= dt_max ) {
+                p->u_dt *= p->POrho2;
+                p->h_dt *= p->h * 0.333333333f;
+                }
 
             /* Update positions and energies at the half-step. */
-            #ifdef __SSE__
-                _mm_store_ps( &p->v[0] , _mm_add_ps( _mm_load_ps( &v_bar[4*k] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) );
-            #else
+            // #ifdef __SSE__
+            //     _mm_store_ps( &p->v[0] , _mm_add_ps( _mm_load_ps( &v_bar[4*k] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) );
+            // #else
                 p->v[0] = v_bar[4*k+0] + hdt * p->a[0];
                 p->v[1] = v_bar[4*k+1] + hdt * p->a[1];
                 p->v[2] = v_bar[4*k+2] + hdt * p->a[2];
-            #endif
-            // p->u = v_bar[4*k+3] + hdt * p->u_dt;
+            // #endif
+            p->u = v_bar[4*k+3] + hdt * p->u_dt;
 
-            /* Get the smallest dt. */
-            dt_min = fminf( dt_min , p->dt );
+            /* Get the smallest/largest dt. */
+            ldt_min = fminf( ldt_min , p->dt );
+            ldt_max = fmaxf( ldt_max , p->dt );
+            
+            /* Collect total energy. */
+            letot += 0.5 * p->mass * ( p->v[0]*p->v[0] + p->v[1]*p->v[1] + p->v[2]*p->v[2] ) + p->mass * p->u;
+
+            /* Collect momentum */
+            lmom[0] += p->mass * p->v[0];
+            lmom[1] += p->mass * p->v[1];
+            lmom[2] += p->mass * p->v[2];
 
             }
         #pragma omp critical
-        e->dt_min = fminf( e->dt_min , dt_min );
+        {
+            dt_min = fminf( dt_min , ldt_min );
+            dt_max = fmaxf( dt_max , ldt_max );
+            mom[0] += lmom[0];
+            mom[1] += lmom[1];
+            mom[2] += lmom[2];
+            etot += letot;
+        }
     }
     TIMER_TOC( timer_kick2 );
-    printf( "engine_step: dt_min is %e.\n" , e->dt_min ); fflush(stdout);
+    e->dt_min = dt_min;
+    printf( "engine_step: dt_min/dt_max is %e/%e.\n" , dt_min , dt_max ); fflush(stdout);
+    printf( "engine_step: etot is %e.\n" , etot ); fflush(stdout);
+    printf( "engine_step: total momentum is [ %e , %e , %e ].\n" , mom[0] , mom[1] , mom[2] ); fflush(stdout);
         
+    /* Does the time step need adjusting? */
+    if ( dt_min < e->dt ) {
+        e->dt *= 0.5;
+        printf( "engine_step: dt_min dropped below time step, adjusting to dt=%e.\n" , e->dt );
+        }
+    else if ( dt_min > 2*e->dt ) {
+        e->dt *= 2.0;
+        printf( "engine_step: dt_min is larger than twice the time step, adjusting to dt=%e.\n" , e->dt );
+        }
+    
     /* Clean up. */
     free( v_bar );
     
diff --git a/src/runner.c b/src/runner.c
index 52e6be949e..de4d5b7db7 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -325,6 +325,7 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags ) {
 void runner_doghost ( struct runner *r , struct cell *c ) {
 
     struct part *p;
+    struct cpart *cp;
     struct cell *finger;
     int i, k, redo, count = c->count;
     int *pid;
@@ -357,9 +358,10 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
 
             /* Get a direct pointer on the part. */
             p = &c->parts[ pid[i] ];
+            cp = &c->cparts[ pid[i] ];
             
             /* Is this part within the timestep? */
-            if ( p->dt <= dt_max ) {
+            if ( cp->dt <= dt_max ) {
 
                 /* Adjust the computed rho. */
                 ihg = kernel_igamma / p->h;
@@ -370,11 +372,12 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
 
                 /* Update the smoothing length. */
                 p->h -= ( p->wcount - const_nwneigh ) / p->wcount_dh;
+                cp->h = p->h;
 
                 /* Did we get the right number density? */
                 if ( p->wcount > const_nwneigh + 1 ||
                      p->wcount < const_nwneigh - 1 ) {
-                    printf( "runner_doghost: particle %lli (h=%e,depth=%i) has bad wcount=%f.\n" , p->id , p->h , c->depth , p->wcount ); fflush(stdout);
+                    // printf( "runner_doghost: particle %lli (h=%e,h_dt=%e,depth=%i) has bad wcount=%.3f.\n" , p->id , p->h , p->h_dt , c->depth , p->wcount ); fflush(stdout);
                     // p->h += ( p->wcount + kernel_root - const_nwneigh ) / p->wcount_dh;
                     pid[redo] = pid[i];
                     redo += 1;
@@ -387,6 +390,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
                     
                 /* Compute this particle's time step. */
                 p->dt = const_cfl * p->h / sqrtf( const_gamma * ( const_gamma - 1.0f ) * p->u );
+                cp->dt = p->dt;
 
                 /* Compute the pressure. */
                 // p->P = p->rho * p->u * ( const_gamma - 1.0f );
@@ -394,15 +398,15 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
                 /* Compute the P/Omega/rho2. */
                 p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f );
 
-                }
+                /* Reset the acceleration. */
+                for ( k = 0 ; k < 3 ; k++ )
+                    p->a[k] = 0.0f;
 
-            /* Reset the acceleration. */
-            for ( k = 0 ; k < 3 ; k++ )
-                p->a[k] = 0.0f;
+                /* Reset the time derivatives. */
+                p->u_dt = 0.0f;
+                p->h_dt = 0.0f;
 
-            /* Reset the time derivatives. */
-            p->u_dt = 0.0f;
-            p->h_dt = 0.0f;
+                }
 
             }
             
diff --git a/src/runner_iact.h b/src/runner_iact.h
index 0b1c2f93d9..f4b117ffa6 100644
--- a/src/runner_iact.h
+++ b/src/runner_iact.h
@@ -368,7 +368,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2
     pj->u_dt += pi->mass * dvdr * wj_dr;
     
     /* Get the time derivative for h. */
-    pi->h_dt += pj->mass / pj->rho * dvdr * wi_dr;
+    pi->h_dt -= pj->mass / pj->rho * dvdr * wi_dr;
     pj->h_dt -= pi->mass / pi->rho * dvdr * wj_dr;
         
     #ifdef HIST
@@ -485,8 +485,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
     for ( k = 0 ; k < VEC_SIZE ; k++ ) {
         pi[k]->u_dt += piu_dt.f[k];
         pj[k]->u_dt += pju_dt.f[k];
-        pi[k]->h_dt += pih_dt.f[k];
-        pj[k]->h_dt += pjh_dt.f[k];
+        pi[k]->h_dt -= pih_dt.f[k];
+        pj[k]->h_dt -= pjh_dt.f[k];
         for ( j = 0 ; j < 3 ; j++ ) {
             pi[k]->a[j] -= pia[j].f[k];
             pj[k]->a[j] += pja[j].f[k];
@@ -545,7 +545,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( fl
     pi->u_dt += pj->mass * dvdr * wi_dr;
     
     /* Get the time derivative for h. */
-    pi->h_dt += pj->mass / pj->rho * dvdr * wi_dr;
+    pi->h_dt -= pj->mass / pj->rho * dvdr * wi_dr;
         
     }
     
@@ -646,7 +646,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
     /* Store the forces back on the particles. */
     for ( k = 0 ; k < VEC_SIZE ; k++ ) {
         pi[k]->u_dt += piu_dt.f[k];
-        pi[k]->h_dt += pih_dt.f[k];
+        pi[k]->h_dt -= pih_dt.f[k];
         for ( j = 0 ; j < 3 ; j++ )
             pi[k]->a[j] -= pia[j].f[k];
         }
diff --git a/src/space.c b/src/space.c
index cf15bb969f..ba81b3b551 100644
--- a/src/space.c
+++ b/src/space.c
@@ -161,12 +161,17 @@ void space_prepare ( struct space *s ) {
 
     int k;
     struct task *t;
-    float dt_max = s->dt_max;
+    float dt_max = s->dt_max, dx_max = 0.0f;
     int counts[ task_type_count + 1 ];
     
     /* Traverse the cells and set their dt_min and dt_max. */
     space_map_cells_post( s , 1 , &space_map_prepare , NULL );
     
+    /* Get the maximum displacement in the whole system. */
+    for ( k = 0 ; k < s->nr_cells ; k++ )
+        dx_max = fmaxf( dx_max , s->cells[k].dx_max );
+    printf( "space_prepare: dx_max is %e.\n" , dx_max );
+    
     /* Run through the tasks and mark as skip or not. */
     for ( k = 0 ; k < s->nr_tasks ; k++ ) {
         t = &s->tasks[k];
@@ -192,15 +197,6 @@ void space_prepare ( struct space *s ) {
         /* Traverse the cells and set their dt_min and dt_max. */
         space_map_cells_post( s , 1 , &space_map_prepare , NULL );
     
-        /* Run through the tasks and mark as skip or not. */
-        for ( k = 0 ; k < s->nr_tasks ; k++ ) {
-            t = &s->tasks[k];
-            if ( t->type == task_type_sort || t->type == task_type_self || t->type == task_type_ghost )
-                t->skip = ( t->ci->dt_min > dt_max );
-            else if ( t->type == task_type_pair )
-                t->skip = ( t->ci->dt_min > dt_max && t->cj->dt_min > dt_max );
-            }
-            
         }
 
     /* Store the condensed particle data. */         
@@ -262,6 +258,10 @@ void space_ranktasks ( struct space *s ) {
                 temp = tid[j]; tid[j] = tid[k]; tid[k] = temp;
                 j += 1;
                 }
+                
+        /* Did we get anything? */
+        if ( j == left )
+            error( "Unsatisfiable task dependencies detected." );
 
         /* Traverse the task tree and add tasks with no weight. */
         for ( i = left ; i < j ; i++ ) {
@@ -488,6 +488,7 @@ void space_rebuild ( struct space *s , double cell_max ) {
         }
     s->h_min = h_min;
     s->h_max = h_max;
+    printf( "space_rebuild: h_min/h_max is %.3e/%.3e.\n" , h_min , h_max );
     // printf( "space_rebuild: getting h_min and h_max took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
     
     /* Get the new putative cell dimensions. */
@@ -537,6 +538,9 @@ void space_rebuild ( struct space *s , double cell_max ) {
                     c->dmin = dmin;
                     c->depth = 0;
                     }
+           
+        /* Be verbose about the change. */         
+        printf( "space_rebuild: set cell dimensions to [ %i %i %i ].\n" , cdim[0] , cdim[1] , cdim[2] ); fflush(stdout);
                     
         } /* re-build upper-level cells? */
     // printf( "space_rebuild: rebuilding upper-level cells took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
@@ -844,7 +848,7 @@ struct task *space_addtask ( struct space *s , int type , int subtype , int flag
     t->ci = ci;
     t->cj = cj;
     t->skip = 0;
-    t->tight = 0;
+    t->tight = tight;
     t->nr_unlock_tasks = 0;
     t->nr_unlock_cells = 0;
     
@@ -1398,9 +1402,14 @@ void space_maketasks ( struct space *s , int do_sort ) {
         if ( t->skip )
             continue;
         if ( t->type == task_type_sort && t->ci->split )
-            for ( j = 0 ; j < 8 ; j++ )
-                if ( t->ci->progeny[j] != NULL && t->ci->progeny[j]->sorts[0] != NULL )
-                    task_addunlock( t->ci->progeny[j]->sorts[0] , t );
+            for ( j = 0 ; j < 8 ; j++ ) {
+                if ( t->ci->progeny[j] == NULL )
+                    continue;
+                if ( t->ci->progeny[j]->sorts[0] == NULL )
+                    t->ci->progeny[j]->sorts[0] = space_addtask( s , task_type_sort , task_subtype_none , 0 /* t->flags? */ , 0 , t->ci , NULL , 0 );
+                t->ci->progeny[j]->sorts[0]->skip = 0;
+                task_addunlock( t->ci->progeny[j]->sorts[0] , t );
+                }
         }
     
     /* Count the number of tasks associated with each cell and
diff --git a/src/vector.h b/src/vector.h
index 3461134754..96534c6866 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -27,8 +27,11 @@
     #define VEC_MACRO(elcount, type)  __attribute__((vector_size((elcount)*sizeof(type)))) type
 
     /* So what will the vector size be? */
-    #ifdef __AVX__
+    #ifdef NO__AVX__
+        #define VECTORIZE
         #define VEC_SIZE 8
+        #define VEC_FLOAT __m256
+        #define VEC_INT __m256i
         #define vec_load(a) _mm256_load_ps(a)
         #define vec_set1(a) _mm256_set1_ps(a)
         #define vec_sqrt(a) _mm256_sqrt_ps(a)
@@ -36,8 +39,11 @@
         #define vec_rsqrt(a) _mm256_rsqrt_ps(a)
         #define vec_ftoi(a) _mm256_cvttps_epi32(a)
         #define vec_fmin(a,b) _mm256_min_ps(a,b)
-    #else
+    #elif defined( NO__SSE2__ )
+        #define VECTORIZE
         #define VEC_SIZE 4
+        #define VEC_FLOAT __m128
+        #define VEC_INT __m128i
         #define vec_load(a) _mm_load_ps(a)
         #define vec_set1(a) _mm_set1_ps(a)
         #define vec_sqrt(a) _mm_sqrt_ps(a)
@@ -45,6 +51,8 @@
         #define vec_rsqrt(a) _mm_rsqrt_ps(a)
         #define vec_ftoi(a) _mm_cvttps_epi32(a)
         #define vec_fmin(a,b) _mm_min_ps(a,b)
+    #else
+        #define VEC_SIZE 4
     #endif
     // #ifdef __AVX__
     //     #define VEC_SIZE 8
@@ -61,8 +69,10 @@
     /* Define the composite types for element access. */
     #ifdef VECTORIZE
     typedef union {
-        VEC_MACRO(VEC_SIZE,float) v;
-        VEC_MACRO(VEC_SIZE,int) m;
+        // VEC_MACRO(VEC_SIZE,float) v;
+        // VEC_MACRO(VEC_SIZE,int) m;
+        VEC_FLOAT v;
+        VEC_INT m;
         float f[VEC_SIZE];
         int i[VEC_SIZE];
         } vector;
-- 
GitLab