diff --git a/src/Makefile.am b/src/Makefile.am
index 8ea80c5b54e466e32b7fc5943222644e7fe6450c..2b5137582eb21cacd5888ad2b06886f5498c224d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -22,12 +22,13 @@ AUTOMAKE_OPTIONS=gnu
 # Add the debug flag to the whole thing
 AM_CFLAGS = -g -O3 -std=gnu99 -Wall -Werror -ffast-math -fstrict-aliasing \
     -ftree-vectorize -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
-    -DTIMER -DCOUNTER -DCPU_TPS=2.30e9 
+    -DTIMER -DCOUNTER -DCPU_TPS=2.30e9 \
+    # -fsanitize=address -fno-omit-frame-pointer
 # AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \
 #     -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
 
 # Assign a "safe" version number
-AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0
+AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0 # -fsanitize=address
 
 # Build the libswiftsim library
 lib_LTLIBRARIES = libswiftsim.la
@@ -38,11 +39,13 @@ endif
 
 # List required headers
 include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
-    engine.h swift.h serial_io.h timers.h debug.h scheduler.h proxy.h parallel_io.h common_io.h
+    engine.h swift.h serial_io.h timers.h debug.h scheduler.h proxy.h parallel_io.h \
+    common_io.h multipole.h
 
 # Common source files
 AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
-    serial_io.c timers.c debug.c scheduler.c proxy.c parallel_io.c units.c common_io.c
+    serial_io.c timers.c debug.c scheduler.c proxy.c parallel_io.c \
+    units.c common_io.c multipole.c
     
 # Sources and flags for regular library
 libswiftsim_la_SOURCES = $(AM_SOURCES)
diff --git a/src/cell.c b/src/cell.c
index 5e23afed5280534650847be30a6e16d35e04142d..3a0e99fd5e3a489734e05998d1cb32155e620544 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -47,6 +47,7 @@
 #include "timers.h"
 #include "part.h"
 #include "space.h"
+#include "multipole.h"
 #include "cell.h"
 #include "error.h"
 #include "inline.h"
@@ -265,6 +266,72 @@ int cell_locktree( struct cell *c ) {
     }
     
     
+int cell_glocktree( struct cell *c ) {
+
+    struct cell *finger, *finger2;
+    TIMER_TIC
+
+    /* First of all, try to lock this cell. */
+    if ( c->ghold || lock_trylock( &c->glock ) != 0 ) {
+        TIMER_TOC(timer_locktree);
+        return 1;
+        }
+        
+    /* Did somebody hold this cell in the meantime? */
+    if ( c->ghold ) {
+        
+        /* Unlock this cell. */
+        if ( lock_unlock( &c->glock ) != 0 )
+            error( "Failed to unlock cell." );
+            
+        /* Admit defeat. */
+        TIMER_TOC(timer_locktree);
+        return 1;
+    
+        }
+        
+    /* Climb up the tree and lock/hold/unlock. */
+    for ( finger = c->parent ; finger != NULL ; finger = finger->parent ) {
+    
+        /* Lock this cell. */
+        if ( lock_trylock( &finger->glock ) != 0 )
+            break;
+            
+        /* Increment the hold. */
+        __sync_fetch_and_add( &finger->ghold , 1 );
+        
+        /* Unlock the cell. */
+        if ( lock_unlock( &finger->glock ) != 0 )
+            error( "Failed to unlock cell." );
+    
+        }
+        
+    /* If we reached the top of the tree, we're done. */
+    if ( finger == NULL ) {
+        TIMER_TOC(timer_locktree);
+        return 0;
+        }
+        
+    /* Otherwise, we hit a snag. */
+    else {
+    
+        /* Undo the holds up to finger. */
+        for ( finger2 = c->parent ; finger2 != finger ; finger2 = finger2->parent )
+            __sync_fetch_and_sub( &finger2->ghold , 1 );
+            
+        /* Unlock this cell. */
+        if ( lock_unlock( &c->glock ) != 0 )
+            error( "Failed to unlock cell." );
+            
+        /* Admit defeat. */
+        TIMER_TOC(timer_locktree);
+        return 1;
+    
+        }
+
+    }
+    
+    
 /**
  * @brief Unock a cell's parents.
  *
@@ -289,6 +356,24 @@ void cell_unlocktree( struct cell *c ) {
     }
     
     
+void cell_gunlocktree( struct cell *c ) {
+
+    struct cell *finger;
+    TIMER_TIC
+
+    /* First of all, try to unlock this cell. */
+    if ( lock_unlock( &c->glock ) != 0 )
+        error( "Failed to unlock cell." );
+        
+    /* Climb up the tree and unhold the parents. */
+    for ( finger = c->parent ; finger != NULL ; finger = finger->parent )
+        __sync_fetch_and_sub( &finger->ghold , 1 );
+        
+    TIMER_TOC(timer_locktree);
+        
+    }
+    
+    
 /**
  * @brief Sort the parts into eight bins along the given pivots.
  *
@@ -297,20 +382,21 @@ void cell_unlocktree( struct cell *c ) {
  
 void cell_split ( struct cell *c  ) {
 
-    int i, j, k;
+    int i, j, k, count = c->count, gcount = c->gcount;
     struct part temp, *parts = c->parts;
     struct xpart xtemp, *xparts = c->xparts;
+    struct gpart gtemp, *gparts = c->gparts;
     int left[8], right[8];
     double pivot[3];
     
-    /* Init the pivot. */
+    /* Init the pivots. */
     for ( k = 0 ; k < 3 ; k++ )
         pivot[k] = c->loc[k] + c->h[k]/2;
     
     /* Split along the x-axis. */
-    i = 0; j = c->count - 1;
+    i = 0; j = count - 1;
     while ( i <= j ) {
-        while ( i <= c->count-1 && parts[i].x[0] <= pivot[0] )
+        while ( i <= count-1 && parts[i].x[0] <= pivot[0] )
             i += 1;
         while ( j >= 0 && parts[j].x[0] > pivot[0] )
             j -= 1;
@@ -322,10 +408,10 @@ void cell_split ( struct cell *c  ) {
     /* for ( k = 0 ; k <= j ; k++ )
         if ( parts[k].x[0] > pivot[0] )
             error( "cell_split: sorting failed." );
-    for ( k = i ; k < c->count ; k++ )
+    for ( k = i ; k < count ; k++ )
         if ( parts[k].x[0] < pivot[0] )
             error( "cell_split: sorting failed." ); */
-    left[1] = i; right[1] = c->count - 1;
+    left[1] = i; right[1] = count - 1;
     left[0] = 0; right[0] = j;
     
     /* Split along the y axis, twice. */
@@ -387,13 +473,17 @@ void cell_split ( struct cell *c  ) {
         c->progeny[k]->xparts = &c->xparts[ left[k] ];
         }
         
+    /* Re-link the gparts. */
+    for ( k = 0 ; k < count ; k++ )
+        parts[k].gpart->part = &parts[k];
+        
     /* Verify that _all_ the parts have been assigned to a cell. */
     /* for ( k = 1 ; k < 8 ; k++ )
         if ( &c->progeny[k-1]->parts[ c->progeny[k-1]->count ] != c->progeny[k]->parts )
             error( "Particle sorting failed (internal consistency)." );
     if ( c->progeny[0]->parts != c->parts )
         error( "Particle sorting failed (left edge)." );
-    if ( &c->progeny[7]->parts[ c->progeny[7]->count ] != &c->parts[ c->count ] )
+    if ( &c->progeny[7]->parts[ c->progeny[7]->count ] != &c->parts[ count ] )
         error( "Particle sorting failed (right edge)." ); */
         
     /* Verify a few sub-cells. */
@@ -413,6 +503,65 @@ void cell_split ( struct cell *c  ) {
              c->progeny[2]->parts[k].x[2] > pivot[2] )
             error( "Sorting failed (progeny=2)." ); */
 
+    /* Now do the same song and dance for the gparts. */
+
+    /* Split along the x-axis. */
+    i = 0; j = gcount - 1;
+    while ( i <= j ) {
+        while ( i <= gcount-1 && gparts[i].x[0] <= pivot[0] )
+            i += 1;
+        while ( j >= 0 && gparts[j].x[0] > pivot[0] )
+            j -= 1;
+        if ( i < j ) {
+            gtemp = gparts[i]; gparts[i] = gparts[j]; gparts[j] = gtemp;
+            }
+        }
+    left[1] = i; right[1] = gcount - 1;
+    left[0] = 0; right[0] = j;
+    
+    /* Split along the y axis, twice. */
+    for ( k = 1 ; k >= 0 ; k-- ) {
+        i = left[k]; j = right[k];
+        while ( i <= j ) {
+            while ( i <= right[k] && gparts[i].x[1] <= pivot[1] )
+                i += 1;
+            while ( j >= left[k] && gparts[j].x[1] > pivot[1] )
+                j -= 1;
+            if ( i < j ) {
+                gtemp = gparts[i]; gparts[i] = gparts[j]; gparts[j] = gtemp;
+                }
+            }
+        left[2*k+1] = i; right[2*k+1] = right[k];
+        left[2*k] = left[k]; right[2*k] = j;
+        }
+
+    /* Split along the z axis, four times. */
+    for ( k = 3 ; k >= 0 ; k-- ) {
+        i = left[k]; j = right[k];
+        while ( i <= j ) {
+            while ( i <= right[k] && gparts[i].x[2] <= pivot[2] )
+                i += 1;
+            while ( j >= left[k] && gparts[j].x[2] > pivot[2] )
+                j -= 1;
+            if ( i < j ) {
+                gtemp = gparts[i]; gparts[i] = gparts[j]; gparts[j] = gtemp;
+                }
+            }
+        left[2*k+1] = i; right[2*k+1] = right[k];
+        left[2*k] = left[k]; right[2*k] = j;
+        }
+        
+    /* Store the counts and offsets. */
+    for ( k = 0 ; k < 8 ; k++ ) {
+        c->progeny[k]->gcount = right[k] - left[k] + 1;
+        c->progeny[k]->gparts = &c->gparts[ left[k] ];
+        }
+        
+    /* Re-link the parts. */
+    for ( k = 0 ; k < gcount ; k++ )
+        if ( gparts[k].id > 0 )
+            gparts[k].part->gpart = &gparts[k];
+        
     }
 
 
diff --git a/src/cell.h b/src/cell.h
index 530eb5ae5b09a2bccd72736e88110387205deae3..0b4828efbef4e297cb765e98c193bec901e370ce 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -67,7 +67,7 @@ struct cell {
     int depth, split, maxdepth;
     
     /* Nr of parts. */
-    int count;
+    int count, gcount;
     
     /* Pointers to the particle data. */
     struct part *parts;
@@ -75,12 +75,12 @@ struct cell {
     /* Pointers to the extra particle data. */
     struct xpart *xparts;
     
-    /* Pointers for the sorted indices. */
-    struct entry *sort;
-    unsigned int sorted;
+    /* Pointers to the gravity particle data. */
+    struct gpart *gparts;
     
-    /* Number of pairs associated with this cell. */
-    // int nr_pairs;
+    /* Pointers for the sorted indices. */
+    struct entry *sort, *gsort;
+    unsigned int sorted, gsorted;
     
     /* Pointers to the next level of cells. */
     struct cell *progeny[8];
@@ -91,13 +91,13 @@ struct cell {
     /* Super cell, i.e. the highest-level supercell that has interactions. */
     struct cell *super;
     
-    /* The tasks computing this cell's sorts. */
-    struct task *sorts;
-    int sortsize;
+    /* The task computing this cell's sorts. */
+    struct task *sorts, *gsorts;
+    int sortsize, gsortsize;
     
     /* The tasks computing this cell's density. */
-    struct link *density, *force;
-    int nr_density, nr_force;
+    struct link *density, *force, *grav;
+    int nr_density, nr_force, nr_grav;
     
     /* The ghost task to link density to interactions. */
     struct task *ghost, *kick1, *kick2;
@@ -105,17 +105,17 @@ struct cell {
     /* Task receiving data. */
     struct task *recv_xv, *recv_rho;
     
+    /* Tasks for gravity tree. */
+    struct task *grav_up, *grav_down;
+    
     /* Number of tasks that are associated with this cell. */
     int nr_tasks;
     
-    /* Number of tasks this cell is waiting for and whether it is in use. */
-    int wait;
-    
     /* Is the data of this cell being used in a sub-cell? */
-    int hold;
+    int hold, ghold;
     
     /* Spin lock for various uses. */
-    lock_type lock;
+    lock_type lock, glock;
     
     /* ID of the previous owner, e.g. runner. */
     int owner;
@@ -143,6 +143,9 @@ struct cell {
     int pcell_size;
     int tag;
     
+    /* This cell's multipole. */
+    struct multipole multipole;
+    
     } __attribute__((aligned (64)));
 
 
@@ -150,6 +153,8 @@ struct cell {
 void cell_split ( struct cell *c  );
 int cell_locktree( struct cell *c );
 void cell_unlocktree( struct cell *c );
+int cell_glocktree( struct cell *c );
+void cell_gunlocktree( struct cell *c );
 int cell_pack ( struct cell *c , struct pcell *pc );
 int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s );
 int cell_getsize ( struct cell *c );
diff --git a/src/const.h b/src/const.h
index 70271a4775fb2c6fcb86fceeb6c27a2ae0797a51..7d6f5c12e67abc4b5fe3719aea496088e97ddd58 100644
--- a/src/const.h
+++ b/src/const.h
@@ -41,6 +41,18 @@
 #define const_delta_nwneigh     1.f
 #define CUBIC_SPLINE_KERNEL
 
+/* Gravity stuff. */
+#define const_theta_max         0.57735f        /* Opening criteria, which is the ratio of the
+                                                   cell distance over the cell width. */
+#define const_G                 6.67384e-8f     /* Gravitational constant. */
+#define const_epsilon           0.0014f         /* Gravity blending distance. */
+#define const_iepsilon          714.285714286f  /* Inverse gravity blending distance. */
+#define const_iepsilon2         (const_iepsilon*const_iepsilon)
+#define const_iepsilon3         (const_iepsilon2*const_iepsilon)
+#define const_iepsilon4         (const_iepsilon2*const_iepsilon2)
+#define const_iepsilon5         (const_iepsilon3*const_iepsilon2)
+#define const_iepsilon6         (const_iepsilon3*const_iepsilon3)
+
 /* SPH variant to use */
 #define LEGACY_GADGET2_SPH
 
diff --git a/src/debug.c b/src/debug.c
index 4cef5bfcdeba7308c09fe49a8339d0cc4878a865..75c726bfced9f9b5f1b09aa276b92fbc06ae3882 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -34,44 +34,74 @@
  *
  * (Should be used for debugging only as it runs in O(N).)
  */
+ 
 void printParticle ( struct part *parts , long long int id, int N ) {
 
-    int i;
+    int i, found = 0;
 
     /* Look for the particle. */
-    for ( i = 0 ; i < N && parts[i].id != id; i++ );
+    for ( i = 0 ; i < N ; i++ )
+        if ( parts[i].id == id ) {
+            printf("## Particle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n",
+                i,
+                parts[i].id,
+                parts[i].x[0], parts[i].x[1], parts[i].x[2],
+                parts[i].v[0], parts[i].v[1], parts[i].v[2],
+                parts[i].a[0], parts[i].a[1], parts[i].a[2],
+                parts[i].h,
+                parts[i].force.h_dt,
+                parts[i].density.wcount,
+                parts[i].mass,
+                parts[i].rho, parts[i].rho_dh,
+                parts[i].density.div_v,
+                parts[i].u,
+                parts[i].force.u_dt,
+                parts[i].force.balsara,
+                parts[i].force.POrho2,
+                parts[i].force.v_sig,
+                parts[i].dt
+                );
+            found = 1;
+            }
+        
+    if ( !found )
+        printf("## Particles[???] id=%lld not found\n", id);
+    
+    }
+
 
-    if ( i < N )
-        printf("## Particle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n",
-            i,
-            parts[i].id,
-            parts[i].x[0], parts[i].x[1], parts[i].x[2],
-            parts[i].v[0], parts[i].v[1], parts[i].v[2],
-            parts[i].a[0], parts[i].a[1], parts[i].a[2],
-            parts[i].h,
-            parts[i].force.h_dt,
-            parts[i].density.wcount,
-            parts[i].mass,
-            parts[i].rho, parts[i].rho_dh,
-            parts[i].density.div_v,
-            parts[i].u,
-            parts[i].force.u_dt,
-            parts[i].force.balsara,
-            parts[i].force.POrho2,
-            parts[i].force.v_sig,
-            parts[i].dt
-            );
-    else
+void printgParticle ( struct gpart *parts , long long int id, int N ) {
+
+    int i, found = 0;
+
+    /* Look for the particle. */
+    for ( i = 0 ; i < N ; i++ )
+        if ( parts[i].id == -id || ( parts[i].id > 0 && parts[i].part->id == id ) ) {
+            printf("## gParticle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], m=%.3e, dt=%.3e\n",
+                i,
+                (parts[i].id < 0) ? -parts[i].id : parts[i].part->id ,
+                parts[i].x[0], parts[i].x[1], parts[i].x[2],
+                parts[i].v[0], parts[i].v[1], parts[i].v[2],
+                parts[i].a[0], parts[i].a[1], parts[i].a[2],
+                parts[i].mass,
+                parts[i].dt
+                );
+            found = 1;
+            }
+        
+    if ( !found )
         printf("## Particles[???] id=%lld not found\n", id);
     
     }
 
+
 /**
  * @brief Prints the details of a given particle to stdout
  * 
  * @param p The particle to print
  * 
  */
+ 
 void printParticle_single ( struct part *p ) {
 
     printf("## Particle: id=%lld, x=[%e,%e,%e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n",
diff --git a/src/debug.h b/src/debug.h
index ffd4699a816d5dce16667ed6ae917a9662caa056..5db731a857ef32792c4f0a377eb97e475ab6b782 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -21,4 +21,5 @@
 
 
 void printParticle(struct part *parts, long long int i, int N);
+void printgParticle(struct gpart *parts, long long int i, int N);
 void printParticle_single ( struct part *p );
diff --git a/src/engine.c b/src/engine.c
index 0697fa926996e5c3725113f040ff646abdf5ed6a..3a40b91b968342afac18223761c446855491313b 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -54,6 +54,7 @@
 #include "part.h"
 #include "debug.h"
 #include "space.h"
+#include "multipole.h"
 #include "cell.h"
 #include "queue.h"
 #include "scheduler.h"
@@ -551,6 +552,30 @@ void engine_repartition ( struct engine *e ) {
 #endif
 
     }
+    
+    
+/**
+ * @brief Add up/down gravity tasks to a cell hierarchy.
+ *
+ * @param e The #engine.
+ * @param c The #cell
+ * @param up The upward gravity #task.
+ * @param down The downward gravity #task.
+ */
+ 
+void engine_addtasks_grav ( struct engine *e , struct cell *c , struct task *up , struct task *down ) {
+
+    /* Link the tasks to this cell. */
+    c->grav_up = up;
+    c->grav_down = down;
+    
+    /* Recurse? */
+    if ( c->split )
+        for ( int k = 0 ; k < 8 ; k++ )
+            if ( c->progeny[k] != NULL )
+                engine_addtasks_grav( e , c->progeny[k] , up , down );
+
+    }
 
 
 /**
@@ -887,6 +912,9 @@ void engine_maketasks ( struct engine *e ) {
 
     struct space *s = e->s;
     struct scheduler *sched = &e->sched;
+    struct cell *cells = s->cells;
+    int nr_cells = s->nr_cells;
+    int nodeID = e->nodeID;
     int i, j, k, ii, jj, kk, iii, jjj, kkk, cid, cjd, sid;
     int *cdim = s->cdim;
     struct task *t, *t2;
@@ -900,12 +928,12 @@ void engine_maketasks ( struct engine *e ) {
         for ( j = 0 ; j < cdim[1] ; j++ )
             for ( k = 0 ; k < cdim[2] ; k++ ) {
                 cid = cell_getid( cdim , i , j , k );
-                if ( s->cells[cid].count == 0 )
+                if ( cells[cid].count == 0 )
                     continue;
-                ci = &s->cells[cid];
+                ci = &cells[cid];
                 if ( ci->count == 0 )
                     continue;
-                if ( ci->nodeID == e->nodeID )
+                if ( ci->nodeID == nodeID )
                     scheduler_addtask( sched , task_type_self , task_subtype_density , 0 , 0 , ci , NULL , 0 );
                 for ( ii = -1 ; ii < 2 ; ii++ ) {
                     iii = i + ii;
@@ -923,17 +951,24 @@ void engine_maketasks ( struct engine *e ) {
                                 continue;
                             kkk = ( kkk + cdim[2] ) % cdim[2];
                             cjd = cell_getid( cdim , iii , jjj , kkk );
-                            cj = &s->cells[cjd];
+                            cj = &cells[cjd];
                             if ( cid >= cjd || cj->count == 0 || 
-                                 ( ci->nodeID != e->nodeID && cj->nodeID != e->nodeID ) )
+                                 ( ci->nodeID != nodeID && cj->nodeID != nodeID ) )
                                 continue;
                             sid = sortlistID[ (kk+1) + 3*( (jj+1) + 3*(ii+1) ) ];
-                            t = scheduler_addtask( sched , task_type_pair , task_subtype_density , sid , 0 , ci , cj , 1 );
+                            scheduler_addtask( sched , task_type_pair , task_subtype_density , sid , 0 , ci , cj , 1 );
                             }
                         }
                     }
                 }
-
+                
+    /* Add the gravity mm tasks. */
+    for ( i = 0 ; i < nr_cells ; i++ ) {
+        scheduler_addtask( sched , task_type_grav_mm , task_subtype_none , -1 , 0 , &cells[i] , NULL , 0 );
+        for ( j = i+1 ; j < nr_cells ; j++ )
+            scheduler_addtask( sched , task_type_grav_mm , task_subtype_none , -1 , 0 , &cells[i] , &cells[j] , 0 );
+        }
+        
     /* Split the tasks. */
     scheduler_splittasks( sched );
     
@@ -944,27 +979,44 @@ void engine_maketasks ( struct engine *e ) {
         error( "Failed to allocate cell-task links." );
     e->nr_links = 0;
     
+    /* Add the gravity up/down tasks at the top-level cells and push them down. */
+    for ( k = 0 ; k < nr_cells ; k++ )
+        if ( cells[k].nodeID == nodeID ) {
+        
+            /* Create tasks at top level. */
+            struct task *up = scheduler_addtask( sched , task_type_grav_up , task_subtype_none , 0 , 0 , &cells[k] , NULL , 0 );
+            struct task *down = scheduler_addtask( sched , task_type_grav_down , task_subtype_none , 0 , 0 , &cells[k] , NULL , 0 );
+            
+            /* Push tasks down the cell hierarchy. */
+            engine_addtasks_grav( e , &cells[k] , up , down );
+            
+            }
+    
     /* Count the number of tasks associated with each cell and
        store the density tasks in each cell, and make each sort
        depend on the sorts of its progeny. */
     // #pragma omp parallel for private(t,j)
     for ( k = 0 ; k < sched->nr_tasks ; k++ ) {
+        
+        /* Get the current task. */
         t = &sched->tasks[k];
         if ( t->skip )
             continue;
+            
+        /* Link sort tasks together. */
         if ( t->type == task_type_sort && t->ci->split )
             for ( j = 0 ; j < 8 ; j++ )
                 if ( t->ci->progeny[j] != NULL && t->ci->progeny[j]->sorts != NULL ) {
                     t->ci->progeny[j]->sorts->skip = 0;
                     scheduler_addunlock( sched , t->ci->progeny[j]->sorts , t );
                     }
+                    
+        /* Link density tasks to cells. */
         if ( t->type == task_type_self ) {
             atomic_inc( &t->ci->nr_tasks );
             if ( t->subtype == task_subtype_density ) {
                 t->ci->density = engine_addlink( e , t->ci->density , t );
                 atomic_inc( &t->ci->nr_density );
-                if ( t->ci->nr_density > 27*8 )
-            error( "Density overflow." );
                 }
             }
         else if ( t->type == task_type_pair ) {
@@ -975,8 +1027,6 @@ void engine_maketasks ( struct engine *e ) {
                 atomic_inc( &t->ci->nr_density );
                 t->cj->density = engine_addlink( e , t->cj->density , t );
                 atomic_inc( &t->cj->nr_density );
-        if ( t->ci->nr_density > 8*27 || t->cj->nr_density > 8*27 )
-            error( "Density overflow." );
                 }
             }
         else if ( t->type == task_type_sub ) {
@@ -989,21 +1039,33 @@ void engine_maketasks ( struct engine *e ) {
                 if ( t->cj != NULL ) {
                     t->cj->density = engine_addlink( e , t->cj->density , t );
                     atomic_inc( &t->cj->nr_density );
-            if ( t->cj->nr_density > 8*27 )
-                error( "Density overflow." );
+                    }
+                }
             }
+            
+        /* Link gravity multipole tasks to the up/down tasks. */
+        if ( t->type == task_type_grav_mm ||
+             ( t->type == task_type_sub && t->subtype == task_subtype_grav ) ) {
+            atomic_inc( &t->ci->nr_tasks );
+            scheduler_addunlock( sched , t->ci->grav_up , t );
+            scheduler_addunlock( sched , t , t->ci->grav_down );
+            if ( t->cj != NULL && t->ci->grav_up != t->cj->grav_up ) {
+                scheduler_addunlock( sched , t->cj->grav_up , t );
+                scheduler_addunlock( sched , t , t->cj->grav_down );
                 }
             }
+            
         }
         
-    /* Append a ghost task to each cell. */
-    for ( k = 0 ; k < s->nr_cells ; k++ )
-        engine_mkghosts( e , &s->cells[k] , NULL );
+    /* Append a ghost task to each cell, and add kick2 tasks to the
+       super cells. */
+    for ( k = 0 ; k < nr_cells ; k++ )
+        engine_mkghosts( e , &cells[k] , NULL );
     /* if ( e->policy & engine_policy_fixdt )
         space_map_cells_pre( s , 1 , &scheduler_map_mkghosts_nokick1 , sched );
     else
         space_map_cells_pre( s , 1 , &scheduler_map_mkghosts , sched ); */
-    
+        
     /* Run through the tasks and make force tasks for each density task.
        Each force task depends on the cell ghosts and unlocks the kick2 task
        of its super-cell. */
@@ -1031,12 +1093,12 @@ void engine_maketasks ( struct engine *e ) {
         /* Otherwise, pair interaction? */
         else if ( t->type == task_type_pair && t->subtype == task_subtype_density ) {
             t2 = scheduler_addtask( sched , task_type_pair , task_subtype_force , 0 , 0 , t->ci , t->cj , 0 );
-            if ( t->ci->nodeID == e->nodeID ) {
+            if ( t->ci->nodeID == nodeID ) {
                 scheduler_addunlock( sched , t , t->ci->super->ghost );
                 scheduler_addunlock( sched , t->ci->super->ghost , t2 );
                 scheduler_addunlock( sched , t2 , t->ci->super->kick2 );
                 }
-            if ( t->cj->nodeID == e->nodeID && t->ci->super != t->cj->super ) {
+            if ( t->cj->nodeID == nodeID && t->ci->super != t->cj->super ) {
                 scheduler_addunlock( sched , t , t->cj->super->ghost );
                 scheduler_addunlock( sched , t->cj->super->ghost , t2 );
                 scheduler_addunlock( sched , t2 , t->cj->super->kick2 );
@@ -1050,12 +1112,12 @@ void engine_maketasks ( struct engine *e ) {
         /* Otherwise, sub interaction? */
         else if ( t->type == task_type_sub && t->subtype == task_subtype_density ) {
             t2 = scheduler_addtask( sched , task_type_sub , task_subtype_force , t->flags , 0 , t->ci , t->cj , 0 );
-            if ( t->ci->nodeID == e->nodeID ) {
+            if ( t->ci->nodeID == nodeID ) {
                 scheduler_addunlock( sched , t , t->ci->super->ghost );
                 scheduler_addunlock( sched , t->ci->super->ghost , t2 );
                 scheduler_addunlock( sched , t2 , t->ci->super->kick2 );
                 }
-            if ( t->cj != NULL && t->cj->nodeID == e->nodeID && t->ci->super != t->cj->super ) {
+            if ( t->cj != NULL && t->cj->nodeID == nodeID && t->ci->super != t->cj->super ) {
                 scheduler_addunlock( sched , t , t->cj->super->ghost );
                 scheduler_addunlock( sched , t->cj->super->ghost , t2 );
                 scheduler_addunlock( sched , t2 , t->cj->super->kick2 );
@@ -1068,6 +1130,10 @@ void engine_maketasks ( struct engine *e ) {
                 }
             }
             
+        /* Kick2 tasks should rely on the grav_down tasks of their cell. */
+        else if ( t->type == task_type_kick2 )
+            scheduler_addunlock( sched , t->ci->grav_down , t );
+            
         }
         
     /* Add the communication tasks if MPI is being used. */
@@ -1657,6 +1723,10 @@ void engine_step ( struct engine *e ) {
                                        (1 << task_type_kick2) |
                                        (1 << task_type_send) |
                                        (1 << task_type_recv) |
+                                       (1 << task_type_grav_pp) |
+                                       (1 << task_type_grav_mm) |
+                                       (1 << task_type_grav_up) |
+                                       (1 << task_type_grav_down) |
                                        (1 << task_type_link) );
     TIMER_TOC(timer_runners);
     
diff --git a/src/engine.h b/src/engine.h
index 308ba34476fdd408c1af53285538394329b88af6..2685fbd33f6caa529e26204f86732e6bb163ea78 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -32,7 +32,7 @@
 #define engine_policy_setaffinity   256
 
 #define engine_queue_scale          1.2
-#define engine_maxtaskspercell      32
+#define engine_maxtaskspercell      128
 #define engine_maxproxies           64
 #define engine_tasksreweight        10
 
diff --git a/src/kernel.h b/src/kernel.h
index f2ff4e329b42095f018c36b0878cb7bee54c47f9..c012739f300aeb5aeedd4b56798b00b2d7ed5cc9 100644
--- a/src/kernel.h
+++ b/src/kernel.h
@@ -23,11 +23,182 @@
 
 /**
  * @file kernel.h
- * @brief SPH kernel functions. Compute W(x,h) and the gradient of W(x,h).
+ * @brief SPH kernel functions. Compute W(x,h) and the gradient of W(x,h),
+ *        as well as the blending function used for gravity.
  */
 
 #include "vector.h"
 
+/* Gravity kernel stuff ----------------------------------------------------------------------------------------------- */
+
+/* The gravity kernel is defined as a degree 6 polynomial in the distance
+   r. The resulting value should be post-multiplied with r^-3, resulting
+   in a polynomial with terms ranging from r^-3 to r^3, which are
+   sufficient to model both the direct potential as well as the splines
+   near the origin. */
+   
+/* Coefficients for the gravity kernel. */
+#define kernel_grav_degree 6
+#define kernel_grav_ivals 2
+#define kernel_grav_scale (2*const_iepsilon)
+static float kernel_grav_coeffs[ (kernel_grav_degree+1) * (kernel_grav_ivals+1) ] =
+    { 32.0f*const_iepsilon6 , -192.0f/5.0f*const_iepsilon5 , 0.0f , 32.0f/3.0f*const_iepsilon3 , 0.0f , 0.0f , 0.0f ,
+      -32.0f/3.0f*const_iepsilon6 , 192.0f/5.0f*const_iepsilon5 , -48.0f*const_iepsilon4 , 64.0f/3.0f*const_iepsilon3 , 0.0f , 0.0f , -1.0f/15.0f ,
+      0.0f , 0.0f , 0.0f , 0.0f , 0.0f , 0.0f , 1.0f };
+
+
+/**
+ * @brief Computes the gravity cubic spline for a given distance x.
+ */
+
+__attribute__ ((always_inline)) INLINE static void kernel_grav_eval ( float x , float *W ) {
+    int ind = fmin( x*kernel_grav_scale , kernel_grav_ivals );
+    float *coeffs = &kernel_grav_coeffs[ ind*(kernel_grav_degree + 1) ];
+    float w = coeffs[0]*x + coeffs[1];
+    for ( int k = 2 ; k <= kernel_grav_degree ; k++ )
+        w = x*w + coeffs[k];
+    *W = w;
+    }
+
+
+#ifdef VECTORIZE
+
+/**
+ * @brief Computes the gravity cubic spline for a given distance x (Vectorized version).
+ */
+
+__attribute__ ((always_inline)) INLINE static void kernel_grav_eval_vec ( vector *x , vector *w ) {
+    
+    vector ind, c[kernel_grav_degree+1];
+    int j, k;
+    
+    /* Load x and get the interval id. */
+    ind.m = vec_ftoi( vec_fmin( x->v*vec_set1( kernel_grav_scale ) , vec_set1( (float)kernel_grav_ivals ) ) );
+    
+    /* load the coefficients. */
+    for ( k = 0 ; k < VEC_SIZE ; k++ )
+        for ( j = 0 ; j < kernel_grav_degree+1 ; j++ )
+            c[j].f[k] = kernel_grav_coeffs[ ind.i[k]*(kernel_grav_degree + 1) + j ];
+
+    /* Init the iteration for Horner's scheme. */
+    w->v = ( c[0].v * x->v ) + c[1].v;
+    
+    /* And we're off! */
+    for ( int k = 2 ; k <= kernel_grav_degree ; k++ )
+        w->v = ( x->v * w->v ) + c[k].v;
+        
+    }
+    
+    
+#endif
+
+
+/* Blending function stuff -------------------------------------------------------------------------------------------- */
+
+/* Coefficients for the blending function. */
+#define blender_degree 3
+#define blender_ivals 3
+#define blender_scale 4.0f
+static float blender_coeffs[ (blender_degree+1) * (blender_ivals+1) ] =
+    { 0.0f , 0.0f , 0.0f , 1.0f ,
+      -32.0f , 24.0f , -6.0f , 1.5f , 
+      -32.0f , 72.0f , -54.0f , 13.5f ,
+      0.0f , 0.0f , 0.0f , 0.0f };
+      
+      
+/**
+ * @brief Computes the cubic spline blender for a given distance x.
+ */
+
+__attribute__ ((always_inline)) INLINE static void blender_eval ( float x , float *W ) {
+    int ind = fmin( x*blender_scale , blender_ivals );
+    float *coeffs = &blender_coeffs[ ind*(blender_degree + 1) ];
+    float w = coeffs[0]*x + coeffs[1];
+    for ( int k = 2 ; k <= blender_degree ; k++ )
+        w = x*w + coeffs[k];
+    *W = w;
+    }
+
+
+/**
+ * @brief Computes the cubic spline blender and its derivative for a given distance x.
+ */
+
+__attribute__ ((always_inline)) INLINE static void blender_deval ( float x , float *W , float *dW_dx ) {
+    int ind = fminf( x*blender_scale , blender_ivals );
+    float *coeffs = &blender_coeffs[ ind*(blender_degree + 1) ];
+    float w = coeffs[0]*x + coeffs[1];
+    float dw_dx = coeffs[0];
+    for ( int k = 2 ; k <= blender_degree ; k++ ) {
+        dw_dx = dw_dx*x + w;
+        w = x*w + coeffs[k];
+        }
+    *W = w;
+    *dW_dx = dw_dx;
+    }
+
+
+#ifdef VECTORIZE
+
+/**
+ * @brief Computes the cubic spline blender and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<2.
+ */
+
+__attribute__ ((always_inline)) INLINE static void blender_eval_vec ( vector *x , vector *w ) {
+    
+    vector ind, c[blender_degree+1];
+    int j, k;
+    
+    /* Load x and get the interval id. */
+    ind.m = vec_ftoi( vec_fmin( x->v*vec_set1( blender_scale ) , vec_set1( (float)blender_ivals ) ) );
+    
+    /* load the coefficients. */
+    for ( k = 0 ; k < VEC_SIZE ; k++ )
+        for ( j = 0 ; j < blender_degree+1 ; j++ )
+            c[j].f[k] = blender_coeffs[ ind.i[k]*(blender_degree + 1) + j ];
+
+    /* Init the iteration for Horner's scheme. */
+    w->v = ( c[0].v * x->v ) + c[1].v;
+    
+    /* And we're off! */
+    for ( int k = 2 ; k <= blender_degree ; k++ )
+        w->v = ( x->v * w->v ) + c[k].v;
+        
+    }
+    
+    
+/**
+ * @brief Computes the cubic spline blender and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<2.
+ */
+
+__attribute__ ((always_inline)) INLINE static void blender_deval_vec ( vector *x , vector *w , vector *dw_dx ) {
+    
+    vector ind, c[blender_degree+1];
+    int j, k;
+    
+    /* Load x and get the interval id. */
+    ind.m = vec_ftoi( vec_fmin( x->v*vec_set1( blender_scale ) , vec_set1( (float)blender_ivals ) ) );
+    
+    /* load the coefficients. */
+    for ( k = 0 ; k < VEC_SIZE ; k++ )
+        for ( j = 0 ; j < blender_degree+1 ; j++ )
+            c[j].f[k] = blender_coeffs[ ind.i[k]*(blender_degree + 1) + j ];
+
+    /* Init the iteration for Horner's scheme. */
+    w->v = ( c[0].v * x->v ) + c[1].v;
+    dw_dx->v = c[0].v;
+    
+    /* And we're off! */
+    for ( int k = 2 ; k <= blender_degree ; k++ ) {
+        dw_dx->v = ( dw_dx->v * x->v ) + w->v;
+        w->v = ( x->v * w->v ) + c[k].v;
+        }
+        
+    }
+    
+#endif
+
+
 /* -------------------------------------------------------------------------------------------------------------------- */
 
 #if defined(CUBIC_SPLINE_KERNEL)
@@ -49,14 +220,14 @@ static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] __attribu
       0.0 , 0.0 , 0.0 , 0.0 };
 #define kernel_root ( kernel_coeffs[ kernel_degree ] )
 #define kernel_wroot ( 4.0/3.0*M_PI*kernel_coeffs[ kernel_degree ] )
-      
+
       
 /**
  * @brief Computes the cubic spline kernel and its derivative for a given distance x. Gives a sensible answer only if x<2.
  */
 
 __attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , float *W , float *dW_dx ) {
-    int ind = fmin( x , kernel_ivals );
+    int ind = fminf( x , kernel_ivals );
     float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ];
     float w = coeffs[0]*x + coeffs[1];
     float dw_dx = coeffs[0];
@@ -102,6 +273,7 @@ __attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x
     
 #endif
 
+
 /**
  * @brief Computes the cubic spline kernel for a given distance x. Gives a sensible answer only if x<2.
  */
@@ -117,9 +289,6 @@ __attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float
 
 
 
-
-
-
 /* -------------------------------------------------------------------------------------------------------------------- */
 
 #elif defined(QUARTIC_SPLINE_KERNEL)
diff --git a/src/part.h b/src/part.h
index df4040c8788dc080b247078e85a780561bb836af..ce87cfcabb3170c97723c8afeedcfd3d5fd8e37b 100644
--- a/src/part.h
+++ b/src/part.h
@@ -45,6 +45,38 @@ struct xpart {
     
     } __attribute__((aligned (32)));
     
+    
+/* Gravity particle. */
+struct gpart {
+
+    /* Particle position. */
+    double x[3];
+    
+    /* Particle velocity. */
+    float v[3];
+    
+    /* Particle acceleration. */
+    float a[3];
+    
+    /* Particle mass. */
+    float mass;
+    
+    /* Particle time step. */
+    float dt;
+    
+    /* Anonymous union for id/part. */
+    union {
+    
+        /* Particle ID. */
+        unsigned long long id;
+
+        /* Pointer to corresponding SPH part. */
+        struct part *part;
+        
+        };
+    
+    } __attribute__((aligned (part_align)));
+    
 
 /* Data of a single particle. */
 struct part {
@@ -130,6 +162,9 @@ struct part {
     /* Particle ID. */
     unsigned long long id;
     
+    /* Associated gravitas. */
+    struct gpart *gpart;
+    
     } __attribute__((aligned (part_align)));
     
 
diff --git a/src/proxy.c b/src/proxy.c
index 8aac6cf97f574c28f940b38c24a56604df0788af..6c8f4e40476a755d186dda43461c0dc93b156fd0 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -46,9 +46,10 @@
 #include "vector.h"
 #include "lock.h"
 #include "space.h"
+#include "part.h"
+#include "multipole.h"
 #include "cell.h"
 #include "task.h"
-#include "part.h"
 #include "debug.h"
 #include "proxy.h"
 #include "error.h"
diff --git a/src/queue.c b/src/queue.c
index 1519cebac61216ec7529ea299eaad7ffd3a1c7bc..ff161fbdd35c8b3dc2afee0f45184ddedc3b240d 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -37,6 +37,8 @@
 #include "task.h"
 #include "timers.h"
 #include "space.h"
+#include "part.h"
+#include "multipole.h"
 #include "cell.h"
 #include "queue.h"
 #include "error.h"
diff --git a/src/runner.c b/src/runner.c
index 32e5b5694d23430cea5eaf7f2fa260b7b27b48a2..eff08f9e15f6db02e7ac92ddf593caccdd01cadb 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -45,6 +45,7 @@
 #include "task.h"
 #include "part.h"
 #include "space.h"
+#include "multipole.h"
 #include "cell.h"
 #include "queue.h"
 #include "scheduler.h"
@@ -58,6 +59,7 @@
 #else
 #include "runner_iact.h"
 #endif
+#include "runner_iact_grav.h"
 
 /* Convert cell location to ID. */
 #define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) )
@@ -86,14 +88,18 @@ const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1
                                0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; 
 
 
-/* Import the density functions. */
+/* Import the density loop functions. */
 #define FUNCTION density
 #include "runner_doiact.h"
 
+/* Import the force loop functions. */
 #undef FUNCTION
 #define FUNCTION force
 #include "runner_doiact.h"
 
+/* Import the gravity loop functions. */
+#include "runner_doiact_grav.h"
+
 
 /**
  * @brief Send a local cell's particle data to another node.
@@ -222,6 +228,7 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock )
     struct entry *finger;
     struct entry *fingers[8];
     struct part *parts = c->parts;
+    struct entry *sort;
     int j, k, count = c->count;
     int i, ind, off[8], inds[8], temp_i, missing;
     // float shift[3];
@@ -235,13 +242,14 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock )
         return;
     
     /* start by allocating the entry arrays. */
-    if ( c->sort == NULL || c->sortsize < c->count ) {
+    if ( c->sort == NULL || c->sortsize < count ) {
         if ( c->sort != NULL )
             free( c->sort );
-        c->sortsize = c->count * 1.1;
+        c->sortsize = count * 1.1;
         if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->sortsize + 1) * 13 ) ) == NULL )
             error( "Failed to allocate sort memory." );
         }
+    sort = c->sort;
         
     /* Does this cell have any progeny? */
     if ( c->split ) {
@@ -289,7 +297,7 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock )
                         }
 
             /* For each entry in the new sort list. */
-            finger = &c->sort[ j*(count + 1) ];
+            finger = &sort[ j*(count + 1) ];
             for ( ind = 0 ; ind < count ; ind++ ) {
 
                 /* Copy the minimum into the new sort array. */
@@ -308,8 +316,8 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock )
                 } /* Merge. */
             
             /* Add a sentinel. */
-            c->sort[ j*(c->count + 1) + c->count ].d = FLT_MAX;
-            c->sort[ j*(c->count + 1) + c->count ].i = 0;
+            sort[ j*(count + 1) + count ].d = FLT_MAX;
+            sort[ j*(count + 1) + count ].i = 0;
             
             /* Mark as sorted. */
             c->sorted |= ( 1 << j );
@@ -328,17 +336,17 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock )
             px[2] = parts[k].x[2];
             for ( j = 0 ; j < 13 ; j++ )
                 if ( flags & (1 << j) ) {
-                    c->sort[ j*(count + 1) + k].i = k;
-                    c->sort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ];
+                    sort[ j*(count + 1) + k].i = k;
+                    sort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ];
                     }
             }
 
         /* Add the sentinel and sort. */
         for ( j = 0 ; j < 13 ; j++ )
             if ( flags & (1 << j) ) {
-                c->sort[ j*(count + 1) + c->count ].d = FLT_MAX;
-                c->sort[ j*(count + 1) + c->count ].i = 0;
-                runner_dosort_ascending( &c->sort[ j*(count + 1) ] , c->count );
+                sort[ j*(count + 1) + count ].d = FLT_MAX;
+                sort[ j*(count + 1) + count ].i = 0;
+                runner_dosort_ascending( &sort[ j*(count + 1) ] , count );
                 c->sorted |= ( 1 << j );
                 }
             
@@ -348,18 +356,173 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock )
     /* for ( j = 0 ; j < 13 ; j++ ) {
         if ( !( flags & (1 << j) ) )
             continue;
-        finger = &c->sort[ j*(c->count + 1) ];
-        for ( k = 1 ; k < c->count ; k++ ) {
+        finger = &sort[ j*(count + 1) ];
+        for ( k = 1 ; k < count ; k++ ) {
             if ( finger[k].d < finger[k-1].d )
                 error( "Sorting failed, ascending array." );
-            if ( finger[k].i >= c->count )
+            if ( finger[k].i >= count )
                 error( "Sorting failed, indices borked." );
             }
         } */
         
     #ifdef TIMER_VERBOSE
         message( "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms." ,
-            r->id , c->count , c->depth ,
+            r->id , count , c->depth ,
+            (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , 
+            ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout);
+    #else
+        if ( clock )
+            TIMER_TOC(timer_dosort);
+    #endif
+
+    }
+    
+    
+void runner_dogsort ( struct runner *r , struct cell *c , int flags , int clock ) {
+
+    struct entry *finger;
+    struct entry *fingers[8];
+    struct gpart *gparts = c->gparts;
+    struct entry *gsort;
+    int j, k, count = c->gcount;
+    int i, ind, off[8], inds[8], temp_i, missing;
+    // float shift[3];
+    float buff[8], px[3];
+    
+    TIMER_TIC
+    
+    /* Clean-up the flags, i.e. filter out what's already been sorted. */
+    flags &= ~c->gsorted;
+    if ( flags == 0 )
+        return;
+    
+    /* start by allocating the entry arrays. */
+    if ( c->gsort == NULL || c->gsortsize < count ) {
+        if ( c->gsort != NULL )
+            free( c->gsort );
+        c->gsortsize = count * 1.1;
+        if ( ( c->gsort = (struct entry *)malloc( sizeof(struct entry) * (c->gsortsize + 1) * 13 ) ) == NULL )
+            error( "Failed to allocate sort memory." );
+        }
+    gsort = c->gsort;
+        
+    /* Does this cell have any progeny? */
+    if ( c->split ) {
+    
+        /* Fill in the gaps within the progeny. */
+        for ( k = 0 ; k < 8 ; k++ ) {
+            if ( c->progeny[k] == NULL )
+                continue;
+            missing = flags & ~c->progeny[k]->gsorted;
+            if ( missing )
+                runner_dogsort( r , c->progeny[k] , missing , 0 );
+            }
+    
+        /* Loop over the 13 different sort arrays. */
+        for ( j = 0 ; j < 13 ; j++ ) {
+        
+            /* Has this sort array been flagged? */
+            if ( !( flags & (1 << j) ) )
+                continue;
+                
+            /* Init the particle index offsets. */
+            for ( off[0] = 0 , k = 1 ; k < 8 ; k++ )
+                if ( c->progeny[k-1] != NULL )
+                    off[k] = off[k-1] + c->progeny[k-1]->gcount;
+                else
+                    off[k] = off[k-1];
+
+            /* Init the entries and indices. */
+            for ( k = 0 ; k < 8 ; k++ ) {
+                inds[k] = k;
+                if ( c->progeny[k] != NULL && c->progeny[k]->gcount > 0 ) {
+                    fingers[k] = &c->progeny[k]->gsort[ j*(c->progeny[k]->gcount + 1) ];
+                    buff[k] = fingers[k]->d;
+                    off[k] = off[k];
+                    }
+                else
+                    buff[k] = FLT_MAX;
+                }
+
+            /* Sort the buffer. */
+            for ( i = 0 ; i < 7 ; i++ )
+                for ( k = i+1 ; k < 8 ; k++ )
+                    if ( buff[ inds[k] ] < buff[ inds[i] ] ) {
+                        temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i;
+                        }
+
+            /* For each entry in the new sort list. */
+            finger = &gsort[ j*(count + 1) ];
+            for ( ind = 0 ; ind < count ; ind++ ) {
+
+                /* Copy the minimum into the new sort array. */
+                finger[ind].d = buff[inds[0]];
+                finger[ind].i = fingers[inds[0]]->i + off[inds[0]];
+
+                /* Update the buffer. */
+                fingers[inds[0]] += 1;
+                buff[inds[0]] = fingers[inds[0]]->d;
+
+                /* Find the smallest entry. */
+                for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) {
+                    temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i;
+                    }
+
+                } /* Merge. */
+            
+            /* Add a sentinel. */
+            gsort[ j*(count + 1) + count ].d = FLT_MAX;
+            gsort[ j*(count + 1) + count ].i = 0;
+            
+            /* Mark as sorted. */
+            c->gsorted |= ( 1 << j );
+            
+            } /* loop over sort arrays. */
+    
+        } /* progeny? */
+        
+    /* Otherwise, just sort. */
+    else {
+    
+        /* Fill the sort array. */
+        for ( k = 0 ; k < count ; k++ ) {
+            px[0] = gparts[k].x[0];
+            px[1] = gparts[k].x[1];
+            px[2] = gparts[k].x[2];
+            for ( j = 0 ; j < 13 ; j++ )
+                if ( flags & (1 << j) ) {
+                    gsort[ j*(count + 1) + k].i = k;
+                    gsort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ];
+                    }
+            }
+
+        /* Add the sentinel and sort. */
+        for ( j = 0 ; j < 13 ; j++ )
+            if ( flags & (1 << j) ) {
+                gsort[ j*(count + 1) + count ].d = FLT_MAX;
+                gsort[ j*(count + 1) + count ].i = 0;
+                runner_dosort_ascending( &gsort[ j*(count + 1) ] , count );
+                c->gsorted |= ( 1 << j );
+                }
+            
+        }
+        
+    /* Verify the sorting. */
+    /* for ( j = 0 ; j < 13 ; j++ ) {
+        if ( !( flags & (1 << j) ) )
+            continue;
+        finger = &c->gsort[ j*(count + 1) ];
+        for ( k = 1 ; k < count ; k++ ) {
+            if ( finger[k].d < finger[k-1].d )
+                error( "Sorting failed, ascending array." );
+            if ( finger[k].i < 0 || finger[k].i >= count )
+                error( "Sorting failed, indices borked." );
+            }
+        } */
+        
+    #ifdef TIMER_VERBOSE
+        message( "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms." ,
+            r->id , count , c->depth ,
             (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , 
             ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout);
     #else
@@ -1102,7 +1265,7 @@ void *runner_main ( void *data ) {
             t->rid = r->cpuid;
             
             /* Set super to the first cell that I own. */
-            if ( ci->super->owner == r->qid )
+            if ( ci->super != NULL && ci->super->owner == r->qid )
                 super = ci->super;
             else if ( cj != NULL && cj->super != NULL && cj->super->owner == r->qid )
                 super = cj->super;
@@ -1145,6 +1308,8 @@ void *runner_main ( void *data ) {
                         runner_dosub1_density( r , ci , cj , t->flags , 1 );
                     else if ( t->subtype == task_subtype_force )
                         runner_dosub2_force( r , ci , cj , t->flags , 1 );
+                    else if ( t->subtype == task_subtype_grav )
+                        runner_dosub_grav( r , ci , cj , 1 );
                     else
                         error( "Unknown task subtype." );
                     break;
@@ -1169,6 +1334,21 @@ void *runner_main ( void *data ) {
                         parts[k].dt = FLT_MAX;
                     ci->dt_min = ci->dt_max = FLT_MAX;
                     break;
+                case task_type_grav_pp:
+                    if ( t->cj == NULL )
+                        runner_doself_grav( r , t->ci );
+                    else
+                       runner_dopair_grav( r , t->ci , t->cj );
+                    break;
+                case task_type_grav_mm:
+                    runner_dograv_mm( r , t->ci , t->cj );
+                    break;
+                case task_type_grav_up:
+                    runner_dograv_up( r , t->ci );
+                    break;
+                case task_type_grav_down:
+                    runner_dograv_down( r , t->ci );
+                    break;
                 default:
                     error( "Unknown task type." );
                 }
diff --git a/src/runner.h b/src/runner.h
index 741411e5762dc1f8252bacf43bb1c625460c13a2..91ac475d7079a5a49dd194668bea567e9528a74c 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -27,7 +27,7 @@ extern const char runner_flip[];
 
 
 /* Counters. */
-enum {
+enum runner_counters {
     runner_counter_swap = 0,
     runner_counter_stall,
     runner_counter_steal_stall,
@@ -80,6 +80,7 @@ void runner_dopair_density ( struct runner *r , struct cell *ci , struct cell *c
 void runner_doself_density ( struct runner *r , struct cell *c );
 void runner_dosub_density ( struct runner *r , struct cell *ci , struct cell *cj , int flags );
 void runner_dosort ( struct runner *r , struct cell *c , int flag , int clock );
+void runner_dogsort ( struct runner *r , struct cell *c , int flag , int clock );
 void runner_dokick ( struct runner *r , struct cell *c , int timer );
 void runner_dokick1 ( struct runner *r , struct cell *c );
 void runner_dokick2 ( struct runner *r , struct cell *c );
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index e9b3e8e720c4c2198267fc5cf6052e49cd4c5790..1fd0492803ba2053a25fbb73768e1c357b11658d 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -673,9 +673,6 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
     dj_min = sort_j[0].d;
     dx_max = ( ci->dx_max + cj->dx_max );
     
-    /* if ( ci->split && cj->split && sid == 4 )
-        printf( "boing!\n" ); */
-        
 
     /* Loop over the parts in ci. */
     for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min ; pid-- ) {
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba24b6bf4a024d4ae9f6e83f325cdcd75edee145
--- /dev/null
+++ b/src/runner_doiact_grav.h
@@ -0,0 +1,626 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ * 
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * 
+ ******************************************************************************/
+
+
+
+/**
+ * @brief Compute the sorted gravity interactions between a cell pair.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
+ 
+void runner_dopair_grav_new ( struct runner *r , struct cell *ci , struct cell *cj ) {
+
+    struct engine *restrict e = r->e;
+    int pid, pjd, k, sid;
+    double rshift, shift[3] = { 0.0 , 0.0 , 0.0 }, nshift[3];
+    struct entry *restrict sort_i, *restrict sort_j;
+    struct gpart *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
+    double pix[3];
+    float dx[3], r2, h_max, di, dj;
+    int count_i, count_j, cnj, cnj_new;
+    float dt_step = e->dt_step;
+    struct multipole m;
+    #ifdef VECTORIZE
+        int icount = 0;
+        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
+        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
+        struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE];
+    #endif
+    TIMER_TIC
+    
+    /* Anything to do here? */
+    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
+        return;
+        
+    /* Get the sort ID. */
+    sid = space_getsid( e->s , &ci , &cj , shift );
+    
+    /* Make sure the cells are sorted. */
+    runner_dogsort( r , ci , (1 << sid) , 0 );
+    runner_dogsort( r , cj , (1 << sid) , 0 );
+    
+    /* Have the cells been sorted? */
+    if ( !(ci->gsorted & (1 << sid)) || !(cj->gsorted & (1 << sid) ) )
+        error( "Trying to interact unsorted cells." );
+    
+    /* Get the cutoff shift. */
+    for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ )
+        rshift += shift[k]*runner_shift[ 3*sid + k ];
+        
+    /* Pick-out the sorted lists. */
+    sort_i = &ci->gsort[ sid*(ci->count + 1) ];
+    sort_j = &cj->gsort[ sid*(cj->count + 1) ];
+    
+    /* Get some other useful values. */
+    h_max = sqrtf( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] ) * const_theta_max;
+    count_i = ci->gcount; count_j = cj->gcount;
+    parts_i = ci->gparts; parts_j = cj->gparts;
+    cnj = count_j;
+    multipole_reset( &m );
+    nshift[0] = -shift[0]; nshift[1] = -shift[1]; nshift[2] = -shift[2];
+
+    /* Loop over the parts in ci. */
+    for ( pid = count_i-1 ; pid >= 0 ; pid-- ) {
+    
+        /* Get a hold of the ith part in ci. */
+        pi = &parts_i[ sort_i[ pid ].i ];
+        if ( pi->dt > dt_step )
+            continue;
+        di = sort_i[pid].d + h_max - rshift;
+            
+        for ( k = 0 ; k < 3 ; k++ )
+            pix[k] = pi->x[k] - shift[k];
+        
+        /* Loop over the parts in cj. */
+        for ( pjd = 0 ; pjd < cnj && sort_j[pjd].d < di ; pjd++ ) {
+        
+            /* Get a pointer to the jth particle. */
+            pj = &parts_j[ sort_j[pjd].i ];
+        
+            /* Compute the pairwise distance. */
+            r2 = 0.0f;
+            for ( k = 0 ; k < 3 ; k++ ) {
+                dx[k] = pix[k] - pj->x[k];
+                r2 += dx[k]*dx[k];
+                }
+                
+            #ifndef VECTORIZE
+
+                // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 )
+                //     message( "interacting particles pi=%lli and pj=%lli with r=%.3e in cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long long int)ci) / sizeof(struct cell) , ((long long int)cj) / sizeof(struct cell) );
+
+                runner_iact_grav( r2 , dx , pi , pj );
+
+            #else
+
+                /* Add this interaction to the queue. */
+                r2q[icount] = r2;
+                dxq[3*icount+0] = dx[0];
+                dxq[3*icount+1] = dx[1];
+                dxq[3*icount+2] = dx[2];
+                piq[icount] = pi;
+                pjq[icount] = pj;
+                icount += 1;
+
+                /* Flush? */
+                if ( icount == VEC_SIZE ) {
+                    runner_iact_vec_grav( r2q , dxq , piq , pjq );
+                    icount = 0;
+                    }
+
+            #endif
+            
+            } /* loop over the parts in cj. */
+            
+        /* Set the new limit. */
+        cnj_new = pjd;
+        
+        /* Add trailing parts to the multipole. */
+        for ( pjd = cnj_new ; pjd < cnj ; pjd++ ) {
+        
+            /* Add the part to the multipole. */
+            multipole_addpart( &m , &parts_j[ sort_j[pjd].i ] );
+        
+            } /* add trailing parts to the multipole. */
+            
+        /* Set the new cnj. */
+        cnj = cnj_new;
+            
+        /* Interact the ith particle with the multipole. */
+        multipole_iact_mp( &m , pi , nshift );
+    
+        } /* loop over the parts in ci. */
+        
+    #ifdef VECTORIZE
+    /* Pick up any leftovers. */
+    if ( icount > 0 )
+        for ( k = 0 ; k < icount ; k++ )
+            runner_iact_grav( r2q[k] , &dxq[3*k] , piq[k] , pjq[k] );
+    #endif
+        
+    /* Re-set the multipole. */
+    multipole_reset( &m );
+        
+    /* Loop over the parts in cj and interact with the multipole in ci. */
+    for ( pid = count_i - 1 , pjd = 0 ; pjd < count_j ; pjd++ ) {
+    
+        /* Get the position of pj along the axis. */
+        dj = sort_j[pjd].d - h_max + rshift;
+        
+        /* Add any left-over parts in cell_i to the multipole. */
+        while ( pid >= 0 && sort_i[pid].d < dj ) {
+            
+            /* Add this particle to the multipole. */
+            multipole_addpart( &m , &parts_i[ sort_i[pid].i ] );
+            
+            /* Decrease pid. */
+            pid -= 1;
+            
+            }
+        
+        /* Interact pj with the multipole. */
+        multipole_iact_mp( &m , &parts_j[ sort_j[pjd].i ] , shift );
+    
+        } /* loop over the parts in cj and interact with the multipole. */
+        
+        
+    #ifdef TIMER_VERBOSE
+        printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 );
+    #else
+        TIMER_TOC(TIMER_DOPAIR);
+    #endif
+
+    }
+
+
+/**
+ * @brief Compute the recursive upward sweep, i.e. construct the
+ *        multipoles in a cell hierarchy.
+ *
+ * @param r The #runner.
+ * @param c The top-level #cell.
+ */
+ 
+void runner_dograv_up ( struct runner *r , struct cell *c ) {
+
+    /* Re-set this cell's multipole. */
+    multipole_reset( &c->multipole );
+
+    /* Split? */
+    if ( c->split ) {
+    
+        /* Recurse. */
+        for ( int k = 0 ; k < 8 ; k++ )
+            if ( c->progeny[k] != NULL )
+                runner_dograv_up( r , c->progeny[k] );
+                
+        /* Collect the multipoles from the progeny. */
+        multipole_reset( &c->multipole );
+        for ( int k = 0 ; k < 8 ; k++ )
+            if ( c->progeny[k] != NULL )
+                multipole_merge( &c->multipole , &c->progeny[k]->multipole );
+    
+        }
+        
+    /* No, leaf node. */
+    else
+    
+        /* Just collect the multipole. */
+        multipole_init( &c->multipole , c->gparts , c->gcount );
+
+    }
+
+
+/**
+ * @brief Compute the recursive downward sweep, i.e. apply the multipole
+ *        accelleration on all the particles.
+ *
+ * @param r The #runner.
+ * @param c The top-level #cell.
+ */
+ 
+void runner_dograv_down ( struct runner *r , struct cell *c ) {
+
+    struct multipole *m = &c->multipole;
+
+    /* Split? */
+    if ( c->split ) {
+    
+        /* Apply this cell's accelleration on the multipoles below. */
+        for ( int k = 0 ; k < 8 ; k++ )
+            if ( c->progeny[k] != NULL ) {
+                struct multipole *mp = &c->progeny[k]->multipole;
+                mp->a[0] += m->a[0];
+                mp->a[1] += m->a[1];
+                mp->a[2] += m->a[2];
+                }
+    
+        /* Recurse. */
+        for ( int k = 0 ; k < 8 ; k++ )
+            if ( c->progeny[k] != NULL )
+                runner_dograv_down( r , c->progeny[k] );
+                
+        }
+        
+    /* No, leaf node. */
+    else {
+    
+        /* Apply the multipole accelleration to all gparts. */
+        for ( int k = 0 ; k < c->gcount ; k++ ) {
+            struct gpart *p = &c->gparts[k];
+            p->a[0] += m->a[0];
+            p->a[1] += m->a[1];
+            p->a[2] += m->a[2];
+            }
+    
+        }
+
+    }
+
+
+/**
+ * @brief Compute the multipole-multipole interaction between two cells.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
+ 
+void runner_dograv_mm ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) {
+
+    struct engine *e = r->e;
+    int k;
+    double shift[3] = { 0.0 , 0.0 , 0.0 };
+    float dx[3], theta;
+
+    /* Compute the shift between the cells. */
+    for ( k = 0 ; k < 3 ; k++ ) {
+        dx[k] = cj->loc[k] - ci->loc[k];
+        if ( r->e->s->periodic ) {
+            if ( dx[k] < -e->s->dim[k]/2 )
+                shift[k] = e->s->dim[k];
+            else if ( dx[k] > e->s->dim[k]/2 )
+                shift[k] = -e->s->dim[k];
+            dx[k] += shift[k];
+            }
+        }
+    theta = sqrt( ( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ) /
+            ( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] ) );
+    
+    /* Do an MM or an MP/PM? */
+    if ( theta > const_theta_max*4 ) {
+        
+        /* Update the multipoles. */
+        multipole_iact_mm( &ci->multipole , &cj->multipole , shift );
+        
+        }
+        
+    else {
+
+        /* Interact the multipoles via their parts. */
+        for ( k = 0 ; k < ci->gcount ; k++ )
+            multipole_iact_mp( &cj->multipole , &ci->gparts[k] , shift );
+        for ( k = 0 ; k < cj->gcount ; k++ )
+            multipole_iact_mp( &ci->multipole , &cj->gparts[k] , shift );
+            
+        }
+
+    }
+
+
+/**
+ * @brief Compute the interactions between a cell pair.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
+ 
+void runner_dopair_grav ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) {
+
+    struct engine *e = r->e;
+    int pid, pjd, k, count_i = ci->gcount, count_j = cj->gcount;
+    double shift[3] = { 0.0 , 0.0 , 0.0 };
+    struct gpart *restrict parts_i = ci->gparts, *restrict parts_j = cj->gparts;
+    struct gpart *restrict pi, *restrict pj;
+    double pix[3];
+    float dx[3], r2;
+    float dt_step = e->dt_step;
+    #ifdef VECTORIZE
+        int icount = 0;
+        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
+        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
+        struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE];
+    #endif
+    TIMER_TIC
+    
+    /* Anything to do here? */
+    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
+        return;
+    
+    /* Get the relative distance between the pairs, wrapping. */
+    if ( e->s->periodic )
+        for ( k = 0 ; k < 3 ; k++ ) {
+            if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 )
+                shift[k] = e->s->dim[k];
+            else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 )
+                shift[k] = -e->s->dim[k];
+            }
+        
+    /* Loop over the parts in ci. */
+    for ( pid = 0 ; pid < count_i ; pid++ ) {
+    
+        /* Get a hold of the ith part in ci. */
+        pi = &parts_i[ pid ];
+        for ( k = 0 ; k < 3 ; k++ )
+            pix[k] = pi->x[k] - shift[k];
+        
+        /* Loop over the parts in cj. */
+        for ( pjd = 0 ; pjd < count_j ; pjd++ ) {
+        
+            /* Get a pointer to the jth particle. */
+            pj = &parts_j[ pjd ];
+        
+            /* Compute the pairwise distance. */
+            r2 = 0.0f;
+            for ( k = 0 ; k < 3 ; k++ ) {
+                dx[k] = pix[k] - pj->x[k];
+                r2 += dx[k]*dx[k];
+                }
+                
+            /* Compute the interaction. */
+            #ifndef VECTORIZE
+            
+                // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 )
+                //     message( "interacting particles pi=%lli and pj=%lli with r=%.3e in cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long long int)ci) / sizeof(struct cell) , ((long long int)cj) / sizeof(struct cell) );
+
+                runner_iact_grav( r2 , dx , pi , pj );
+
+            #else
+
+                /* Add this interaction to the queue. */
+                r2q[icount] = r2;
+                dxq[3*icount+0] = dx[0];
+                dxq[3*icount+1] = dx[1];
+                dxq[3*icount+2] = dx[2];
+                piq[icount] = pi;
+                pjq[icount] = pj;
+                icount += 1;
+
+                /* Flush? */
+                if ( icount == VEC_SIZE ) {
+                    runner_iact_vec_grav( r2q , dxq , piq , pjq );
+                    icount = 0;
+                    }
+
+            #endif
+        
+            } /* loop over the parts in cj. */
+    
+        } /* loop over the parts in ci. */
+        
+    #ifdef VECTORIZE
+    /* Pick up any leftovers. */
+    if ( icount > 0 )
+        for ( k = 0 ; k < icount ; k++ )
+            runner_iact_grav( r2q[k] , &dxq[3*k] , piq[k] , pjq[k] );
+    #endif
+        
+    #ifdef TIMER_VERBOSE
+        printf( "runner_dopair_naive_grav[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
+    #else
+        TIMER_TOC(timer_dopair_grav);
+    #endif
+
+
+    }
+
+
+/**
+ * @brief Compute the interactions within a cell.
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
+ 
+void runner_doself_grav ( struct runner *r , struct cell *restrict c ) {
+
+    struct engine *e = r->e;
+    int pid, pjd, k, count = c->gcount;
+    struct gpart *restrict parts = c->gparts;
+    struct gpart *restrict pi, *restrict pj;
+    double pix[3] = { 0.0 , 0.0 , 0.0 };
+    float dx[3], r2;
+    float dt_step = e->dt_step;
+    #ifdef VECTORIZE
+        int icount = 0;
+        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
+        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
+        struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE];
+    #endif
+    TIMER_TIC
+    
+    /* Anything to do here? */
+    if ( c->dt_min > dt_step )
+        return;
+    
+    /* Loop over every part in c. */
+    for ( pid = 0 ; pid < count ; pid++ ) {
+    
+        /* Get a hold of the ith part in ci. */
+        pi = &parts[ pid ];
+        for ( k = 0 ; k < 3 ; k++ )
+            pix[k] = pi->x[k];
+        
+        /* Loop over every other part in c. */
+        for ( pjd = pid+1 ; pjd < count ; pjd++ ) {
+        
+            /* Get a pointer to the jth particle. */
+            pj = &parts[ pjd ];
+        
+            /* Compute the pairwise distance. */
+            r2 = 0.0f;
+            for ( k = 0 ; k < 3 ; k++ ) {
+                dx[k] = pix[k] - pj->x[k];
+                r2 += dx[k]*dx[k];
+                }
+                
+            /* Compute the interaction. */
+            #ifndef VECTORIZE
+
+                // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 )
+                //     message( "interacting particles pi=%lli and pj=%lli with r=%.3e." , pi->part->id , pj->part->id , sqrtf(r2) );
+
+                runner_iact_grav( r2 , dx , pi , pj );
+
+            #else
+
+                /* Add this interaction to the queue. */
+                r2q[icount] = r2;
+                dxq[3*icount+0] = dx[0];
+                dxq[3*icount+1] = dx[1];
+                dxq[3*icount+2] = dx[2];
+                piq[icount] = pi;
+                pjq[icount] = pj;
+                icount += 1;
+
+                /* Flush? */
+                if ( icount == VEC_SIZE ) {
+                    runner_iact_vec_grav( r2q , dxq , piq , pjq );
+                    icount = 0;
+                    }
+
+            #endif
+        
+            } /* loop over the remaining parts in c. */
+    
+        } /* loop over the parts in c. */
+        
+    #ifdef VECTORIZE
+    /* Pick up any leftovers. */
+    if ( icount > 0 )
+        for ( k = 0 ; k < icount ; k++ )
+            runner_iact_grav( r2q[k] , &dxq[3*k] , piq[k] , pjq[k] );
+    #endif
+        
+    #ifdef TIMER_VERBOSE
+        printf( "runner_doself_grav[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
+    #else
+        TIMER_TOC(timer_doself_grav);
+    #endif
+
+
+    }
+
+
+/**
+ * @brief Compute a gravity sub-task.
+ * 
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ * @param gettimer Flag to record timer or not.
+ */
+ 
+void runner_dosub_grav ( struct runner *r , struct cell *ci , struct cell *cj , int gettimer ) {
+
+    int j, k, periodic = r->e->s->periodic;
+    struct space *s = r->e->s;
+
+    TIMER_TIC
+
+    /* Self-interaction? */
+    if ( cj == NULL ) {
+
+        /* If the cell is split, recurse. */
+        if ( ci->split ) {
+
+            /* Split this task into tasks on its progeny. */
+            for ( j = 0 ; j < 8 ; j++ )
+                if ( ci->progeny[j] != NULL ) {
+                    runner_dosub_grav( r , ci->progeny[j] , NULL , 0 );
+                    for ( k = j+1 ; k < 8 ; k++ )
+                        if ( ci->progeny[k] != NULL )
+                            runner_dosub_grav( r , ci->progeny[j] , ci->progeny[k] , 0 );
+                    }
+
+            }
+
+        /* Otherwise, just make a pp task out of it. */
+        else
+            runner_doself_grav( r , ci );
+
+        }
+
+    /* Nope, pair. */
+    else {
+
+        /* Get the opening angle theta. */
+        float dx[3], theta;
+        for ( k = 0 ; k < 3 ; k++ ) {
+            dx[k] = fabsf( ci->loc[k] - cj->loc[k] );
+            if ( periodic && dx[k] > 0.5*s->dim[k] )
+                dx[k] = -dx[k] + s->dim[k];
+            if ( dx[k] > 0.0f )
+                dx[k] -= ci->h[k];
+            }
+        theta = ( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ) / 
+                ( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] );
+
+        /* Split the interacton? */
+        if ( theta < const_theta_max*const_theta_max ) {
+
+            /* Are both ci and cj split? */
+            if ( ci->split && cj->split ) {
+
+                /* Split this task into tasks on its progeny. */
+                for ( j = 0 ; j < 8 ; j++ )
+                    if ( ci->progeny[j] != NULL ) {
+                        for ( k = 0 ; k < 8 ; k++ )
+                            if ( cj->progeny[k] != NULL )
+                                runner_dosub_grav( r , ci->progeny[j] , cj->progeny[k] , 0 );
+                        }
+
+                }
+
+            /* Otherwise, make a pp task out of it. */
+            else
+                runner_dopair_grav( r , ci , cj );
+
+            }
+            
+        /* Otherwise, mm interaction is fine. */
+        else
+            runner_dograv_mm( r , ci , cj );
+
+        }
+        
+    if ( gettimer )
+        #ifdef TIMER_VERBOSE
+            printf( "runner_dosub_grav[%02i]: at depth %i took %.3f ms.\n" , r->id , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 );
+        #else
+            TIMER_TOC(timer_dosub_grav);
+        #endif
+
+    }
+
+
diff --git a/src/runner_iact.h b/src/runner_iact.h
index 8c14b2d15f54532d63b13c4aa35c158e1408355f..0a6b9c4ce74b41e1eb795b3c13adc1348b63aa23 100644
--- a/src/runner_iact.h
+++ b/src/runner_iact.h
@@ -103,7 +103,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo
 
 #ifdef VECTORIZE
 
-    vector r, ri, xi, xj, hi, hj, hi_inv, hj_inv,  wi, wj, wi_dx, wj_dx;
+    vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv,  wi, wj, wi_dx, wj_dx;
     vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
     vector mi, mj;
     vector dx[3], dv[3];
@@ -112,8 +112,6 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo
     vector curlvr[3], curl_vi[3], curl_vj[3];
     int k, j;
     
-    r.v = vec_sqrt( vec_load( R2 ) );
-    ri.v = vec_rcp( r.v );
     #if VEC_SIZE==8
         mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass );
         mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
@@ -134,6 +132,12 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo
             dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
     #endif
 
+    /* Get the radius and inverse radius. */
+    r2.v = vec_load( R2 );
+    ri.v = vec_rsqrt( r2.v );
+    ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) );
+    r.v = r2.v * ri.v;
+    
     hi.v = vec_load( Hi );
     hi_inv.v = vec_rcp( hi.v );
     hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v  - vec_set1( 1.0f ) );
@@ -267,7 +271,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit
 
 #ifdef VECTORIZE
 
-    vector r, ri, xi, hi, hi_inv, wi, wi_dx;
+    vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx;
     vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi;
     vector mj;
     vector dx[3], dv[3];
@@ -276,8 +280,6 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit
     vector curlvr[3], curl_vi[3];
     int k, j;
     
-    r.v = vec_sqrt( vec_load( R2 ) );
-    ri.v = vec_rcp( r.v );
     #if VEC_SIZE==8
         mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
         for ( k = 0 ; k < 3 ; k++ ) {
@@ -296,6 +298,12 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit
             dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
     #endif
     
+    /* Get the radius and inverse radius. */
+    r2.v = vec_load( R2 );
+    ri.v = vec_rsqrt( r2.v );
+    ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) );
+    r.v = r2.v * ri.v;
+    
     hi.v = vec_load( Hi );
     hi_inv.v = vec_rcp( hi.v );
     hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v  - vec_set1( 1.0f ) );
diff --git a/src/runner_iact_grav.h b/src/runner_iact_grav.h
new file mode 100644
index 0000000000000000000000000000000000000000..da1f552ae073aab3575de03255a3919d7a14cf95
--- /dev/null
+++ b/src/runner_iact_grav.h
@@ -0,0 +1,119 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ * 
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * 
+ ******************************************************************************/
+
+#include "kernel.h"
+#include "vector.h"
+
+/**
+ * @file  runner_iact_grav.h
+ * @brief Gravity interaction functions.
+ *
+ */
+
+
+/**
+ * @brief Gravity potential
+ */
+
+__attribute__ ((always_inline)) INLINE static void runner_iact_grav ( float r2 , float *dx , struct gpart *pi , struct gpart *pj ) {
+
+    float ir, r;
+    float w, acc;
+    float mi = pi->mass, mj = pj->mass;
+    int k;
+    
+    /* Get the absolute distance. */
+    ir = 1.0f / sqrtf( r2 );
+    r = r2 * ir;
+    
+    /* Evaluate the gravity kernel. */
+    kernel_grav_eval( r , &acc );
+    
+    /* Scale the acceleration. */
+    acc *= const_G * ir * ir * ir;
+    
+    /* Aggregate the accellerations. */
+    for ( k = 0 ; k < 3 ; k++ ) {
+        w = acc * dx[k];
+        pi->a[k] -= w * mj;
+        pj->a[k] += w * mi;
+        }
+
+    }
+    
+    
+/**
+ * @brief Gravity potential (Vectorized version)
+ */
+__attribute__ ((always_inline)) INLINE static void runner_iact_vec_grav ( float *R2 , float *Dx , struct gpart **pi , struct gpart **pj ) {
+
+#ifdef VECTORIZE
+
+    vector ir, r, r2, dx[3];
+    vector w, acc, ai, aj;
+    vector mi, mj;
+    int j, k;
+    
+    #if VEC_SIZE==8
+        mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass );
+        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
+        for ( k = 0 ; k < 3 ; k++ )
+            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] );
+    #elif VEC_SIZE==4
+        mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass );
+        mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass );
+        for ( k = 0 ; k < 3 ; k++ )
+            dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
+    #endif
+
+        
+    /* Get the radius and inverse radius. */
+    r2.v = vec_load( R2 );
+    ir.v = vec_rsqrt( r2.v );
+    ir.v = ir.v - vec_set1( 0.5f ) * ir.v * ( r2.v * ir.v * ir.v - vec_set1( 1.0f ) );
+    r.v = r2.v * ir.v;
+    
+    /* Evaluate the gravity kernel. */
+    blender_eval_vec( &r , &acc );
+    
+    /* Scale the acceleration. */
+    acc.v *= vec_set1( const_G ) * ir.v * ir.v * ir.v;
+    
+    /* Aggregate the accellerations. */
+    for ( k = 0 ; k < 3 ; k++ ) {
+        w.v = acc.v * dx[k].v;
+        ai.v = w.v * mj.v;
+        aj.v = w.v * mi.v;
+        for ( j = 0 ; j < VEC_SIZE ; j++ ) {
+            pi[j]->a[k] -= ai.f[j];
+            pj[j]->a[k] += aj.f[j];
+            }
+        }
+
+#else
+
+    for ( int k = 0 ; k < VEC_SIZE ; k++ )
+        runner_iact_grav( R2[k] , &Dx[3*k] , pi[k] , pj[k] );
+        
+#endif
+    
+    }
+    
+
diff --git a/src/runner_iact_legacy.h b/src/runner_iact_legacy.h
index 2042a1652c9fb6b2bb539db2c215fcc0c01cfab5..aa50cc1fe2c09fa558a21eaf8b9079ffc08b6cbb 100644
--- a/src/runner_iact_legacy.h
+++ b/src/runner_iact_legacy.h
@@ -103,7 +103,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo
 
 #ifdef VECTORIZE
 
-    vector r, ri, xi, xj, hi, hj, hi_inv, hj_inv,  wi, wj, wi_dx, wj_dx;
+    vector r, r2, ri, xi, xj, hi, hj, hi_inv, hj_inv,  wi, wj, wi_dx, wj_dx;
     vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
     vector mi, mj;
     vector dx[3], dv[3];
@@ -112,8 +112,6 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo
     vector curlvr[3], curl_vi[3], curl_vj[3];
     int k, j;
     
-    r.v = vec_sqrt( vec_load( R2 ) );
-    ri.v = vec_rcp( r.v );
     #if VEC_SIZE==8
         mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass );
         mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
@@ -134,6 +132,12 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo
             dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
     #endif
 
+    /* Get the radius and inverse radius. */
+    r2.v = vec_load( R2 );
+    ri.v = vec_rsqrt( r2.v );
+    ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) );
+    r.v = r2.v * ri.v;
+    
     hi.v = vec_load( Hi );
     hi_inv.v = vec_rcp( hi.v );
     hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v  - vec_set1( 1.0f ) );
@@ -267,7 +271,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit
 
 #ifdef VECTORIZE
 
-    vector r, ri, xi, hi, hi_inv, wi, wi_dx;
+    vector r, r2, ri, xi, hi, hi_inv, wi, wi_dx;
     vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi;
     vector mj;
     vector dx[3], dv[3];
@@ -276,8 +280,6 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit
     vector curlvr[3], curl_vi[3];
     int k, j;
     
-    r.v = vec_sqrt( vec_load( R2 ) );
-    ri.v = vec_rcp( r.v );
     #if VEC_SIZE==8
         mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass );
         for ( k = 0 ; k < 3 ; k++ ) {
@@ -296,6 +298,12 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit
             dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] );
     #endif
     
+    /* Get the radius and inverse radius. */
+    r2.v = vec_load( R2 );
+    ri.v = vec_rsqrt( r2.v );
+    ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) );
+    r.v = r2.v * ri.v;
+    
     hi.v = vec_load( Hi );
     hi_inv.v = vec_rcp( hi.v );
     hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v  - vec_set1( 1.0f ) );
diff --git a/src/scheduler.c b/src/scheduler.c
index 49ffd14dff821f7a1e8d7a22feb648b9db8140e3..10fb22171fce6f13c5c1862807938a18a0f293bb 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -46,6 +46,7 @@
 #include "part.h"
 #include "debug.h"
 #include "space.h"
+#include "multipole.h"
 #include "cell.h"
 #include "queue.h"
 #include "kernel.h"
@@ -431,6 +432,135 @@ void scheduler_splittasks ( struct scheduler *s ) {
                 }
                 
             } /* pair interaction? */
+            
+        /* Gravity interaction? */
+        else if ( t->type == task_type_grav_mm ) {
+        
+            /* Get a handle on the cells involved. */
+            ci = t->ci;
+            cj = t->cj;
+            
+            /* Self-interaction? */
+            if ( cj == NULL ) {
+            
+                /* Ignore this task if the cell has no gparts. */
+                if ( ci->gcount == 0 )
+                    t->type = task_type_none;
+            
+                /* If the cell is split, recurse. */
+                else if ( ci->split ) {
+                
+                    /* Make a single sub-task? */
+                    if ( scheduler_dosub && ci->count < space_subsize/ci->count ) {
+                    
+                        t->type = task_type_sub;
+                        t->subtype = task_subtype_grav;
+                    
+                        }
+                        
+                    /* Otherwise, just split the task. */
+                    else {
+                
+                        /* Split this task into tasks on its progeny. */
+                        t->type = task_type_none;
+                        for ( j = 0 ; j < 8 ; j++ )
+                            if ( ci->progeny[j] != NULL ) {
+                                if ( t->type == task_type_none ) {
+                                    t->type = task_type_grav_mm;
+                                    t->ci = ci->progeny[j];
+                                    t->cj = NULL;
+                                    }
+                                else
+                                    t = scheduler_addtask( s , task_type_grav_mm , task_subtype_none , 0 , 0 , ci->progeny[j] , NULL , 0 );
+                                for ( k = j+1 ; k < 8 ; k++ )
+                                    if ( ci->progeny[k] != NULL ) {
+                                        if ( t->type == task_type_none ) {
+                                            t->type = task_type_grav_mm;
+                                            t->ci = ci->progeny[j];
+                                            t->cj = ci->progeny[k];
+                                            }
+                                        else
+                                            t = scheduler_addtask( s , task_type_grav_mm , task_subtype_none , 0 , 0 , ci->progeny[j] , ci->progeny[k] , 0 );
+                                        }
+                                }
+                        redo = ( t->type != task_type_none );
+                        
+                        }
+                      
+                    }
+                    
+                /* Otherwise, just make a pp task out of it. */
+                else
+                    t->type = task_type_grav_pp;
+                
+                }
+                
+            /* Nope, pair. */
+            else {
+            
+                /* Make a sub-task? */
+                if ( scheduler_dosub && ci->count < space_subsize/cj->count ) {
+                
+                    t->type = task_type_sub;
+                    t->subtype = task_subtype_grav;
+                
+                    }
+                    
+                /* Otherwise, split the task. */
+                else {
+        
+                    /* Get the opening angle theta. */
+                    float dx[3], theta;
+                    for ( k = 0 ; k < 3 ; k++ ) {
+                        dx[k] = fabsf( ci->loc[k] - cj->loc[k] );
+                        if ( s->space->periodic && dx[k] > 0.5*s->space->dim[k] )
+                            dx[k] = -dx[k] + s->space->dim[k];
+                        if ( dx[k] > 0.0f )
+                            dx[k] -= ci->h[k];
+                        }
+                    theta = ( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ) / 
+                            ( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] );
+
+                    /* Ignore this task if the cell has no gparts. */
+                    if ( ci->gcount == 0 || cj->gcount == 0 )
+                        t->type = task_type_none;
+
+                    /* Split the interacton? */
+                    else if ( theta < const_theta_max*const_theta_max ) {
+
+                        /* Are both ci and cj split? */
+                        if ( ci->split && cj->split ) {
+
+                            /* Split this task into tasks on its progeny. */
+                            t->type = task_type_none;
+                            for ( j = 0 ; j < 8 ; j++ )
+                                if ( ci->progeny[j] != NULL ) {
+                                    for ( k = 0 ; k < 8 ; k++ )
+                                        if ( cj->progeny[k] != NULL ) {
+                                            if ( t->type == task_type_none ) {
+                                                t->type = task_type_grav_mm;
+                                                t->ci = ci->progeny[j];
+                                                t->cj = cj->progeny[k];
+                                                }
+                                            else
+                                                t = scheduler_addtask( s , task_type_grav_mm , task_subtype_none , 0 , 0 , ci->progeny[j] , cj->progeny[k] , 0 );
+                                            }
+                                    }
+                            redo = ( t->type != task_type_none );
+
+                            }
+
+                        /* Otherwise, make a pp task out of it. */
+                        else
+                            t->type = task_type_grav_pp;
+
+                        }
+                        
+                    }
+                
+                } /* gravity pair interaction? */
+        
+            } /* gravity interaction? */
     
         } /* loop over all tasks. */
         
@@ -457,6 +587,12 @@ struct task *scheduler_addtask ( struct scheduler *s , int type , int subtype ,
     
     /* Get the next free task. */
     ind = atomic_inc( &s->tasks_next );
+    
+    /* Overflow? */
+    if ( ind >= s->size )
+        error( "Task list overflow." );
+    
+    /* Get a pointer to the new task. */
     t = &s->tasks[ ind ];
     
     /* Copy the data. */
@@ -637,10 +773,18 @@ void scheduler_reweight ( struct scheduler *s ) {
                     break;
                 case task_type_sub:
                     if ( t->cj != NULL ) {
-                        if ( t->ci->nodeID != nodeID || t->cj->nodeID != nodeID )
-                            t->weight += 3 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
-                        else
-                            t->weight += 2 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
+                        if ( t->ci->nodeID != nodeID || t->cj->nodeID != nodeID ) {
+                            if ( t->flags < 0 )
+                                t->weight += 3 * wscale * t->ci->count * t->cj->count;
+                            else
+                                t->weight += 3 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
+                            }
+                        else {
+                            if ( t->flags < 0 )
+                                t->weight += 2 * wscale * t->ci->count * t->cj->count;
+                            else
+                                t->weight += 2 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ];
+                            }
                         }
                     else
                         t->weight += 1 * wscale * t->ci->count * t->ci->count;
@@ -653,6 +797,8 @@ void scheduler_reweight ( struct scheduler *s ) {
                 case task_type_kick2:
                     t->weight += wscale * t->ci->count;
                     break;
+                default:
+                    break;
                 }
         if ( t->type == task_type_send )
             t->weight  = INT_MAX / 8;
diff --git a/src/space.c b/src/space.c
index 5b0422fb12d1c85d8b9875fdc167bda3a7feeb77..e0bd071626006b08ea3a650548f3b344e065e0ef 100644
--- a/src/space.c
+++ b/src/space.c
@@ -44,6 +44,7 @@
 #include "kernel.h"
 #include "part.h"
 #include "space.h"
+#include "multipole.h"
 #include "cell.h"
 #include "scheduler.h"
 #include "engine.h"
@@ -101,16 +102,16 @@ const int sortlistID[27] = {
  
 int space_getsid ( struct space *s , struct cell **ci , struct cell **cj , double *shift ) {
 
-    int k, sid = 0;
+    int k, sid = 0, periodic = s->periodic;
     struct cell *temp;
     double dx[3];
 
     /* Get the relative distance between the pairs, wrapping. */
     for ( k = 0 ; k < 3 ; k++ ) {
         dx[k] = (*cj)->loc[k] - (*ci)->loc[k];
-        if ( dx[k] < -s->dim[k]/2 )
+        if ( periodic && dx[k] < -s->dim[k]/2 )
             shift[k] = s->dim[k];
-        else if ( dx[k] > s->dim[k]/2 )
+        else if ( periodic && dx[k] > s->dim[k]/2 )
             shift[k] = -s->dim[k];
         else
             shift[k] = 0.0;
@@ -240,6 +241,7 @@ void space_regrid ( struct space *s , double cell_max ) {
                     c->dmin = dmin;
                     c->depth = 0;
                     c->count = 0;
+                    c->gcount = 0;
                     c->super = c;
                     lock_init( &c->lock );
                     }
@@ -265,6 +267,7 @@ void space_regrid ( struct space *s , double cell_max ) {
             s->cells[k].dx_max = 0.0f;
             s->cells[k].sorted = 0;
             s->cells[k].count = 0;
+            s->cells[k].gcount = 0;
             s->cells[k].kick1 = NULL;
             s->cells[k].kick2 = NULL;
             s->cells[k].super = &s->cells[k];
@@ -286,11 +289,11 @@ void space_regrid ( struct space *s , double cell_max ) {
  
 void space_rebuild ( struct space *s , double cell_max ) {
 
-    float h_max = s->cell_min / kernel_gamma / space_stretch, dmin;
-    int i, j, k, cdim[3], nr_parts = s->nr_parts;
+    int j, k, cdim[3], nr_parts = s->nr_parts, nr_gparts = s->nr_gparts;
     struct cell *restrict c, *restrict cells = s->cells;
     struct part *restrict finger, *restrict p, *parts = s->parts;
     struct xpart *xfinger, *xparts = s->xparts;
+    struct gpart *gp, *gparts = s->gparts, *gfinger;
     int *ind;
     double ih[3], dim[3];
     // ticks tic;
@@ -298,110 +301,8 @@ void space_rebuild ( struct space *s , double cell_max ) {
     /* Be verbose about this. */
     // message( "re)building space..." ); fflush(stdout);
     
-    /* Run through the parts and get the current h_max. */
-    // tic = getticks();
-    if ( cells != NULL ) {
-        for ( k = 0 ; k < s->nr_cells ; k++ ) {
-            if ( cells[k].h_max > h_max )
-                h_max = cells[k].h_max;
-            }
-        }
-    else {
-        for ( k = 0 ; k < nr_parts ; k++ ) {
-            if ( s->parts[k].h > h_max )
-                h_max = s->parts[k].h;
-            }
-        s->h_max = h_max;
-        }
-    // message( "h_max is %.3e (cell_max=%.3e)." , h_max , cell_max );
-    // message( "getting h_min and h_max took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
-    
-    /* Get the new putative cell dimensions. */
-    for ( k = 0 ; k < 3 ; k++ )
-        cdim[k] = floor( s->dim[k] / fmax( h_max*kernel_gamma*space_stretch , cell_max ) );
-        
-    /* In MPI-Land, we're not allowed to change the top-level cell size. */
-    #ifdef WITH_MPI
-        if ( cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || cdim[2] < s->cdim[2] )
-            error( "Root-level change of cell size not allowed." );
-    #endif
-        
-    /* Do we need to re-build the upper-level cells? */
-    // tic = getticks();
-    if ( cells == NULL ||
-         cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || cdim[2] < s->cdim[2] ) {
-    
-        /* Free the old cells, if they were allocated. */
-        if ( cells != NULL ) {
-            for ( k = 0 ; k < s->nr_cells ; k++ ) {
-                space_rebuild_recycle( s , &cells[k] );
-                if ( cells[k].sort != NULL )
-                    free( cells[k].sort );
-                }
-            free( cells );
-            s->maxdepth = 0;
-            }
-            
-        /* Set the new cell dimensions only if smaller. */
-        for ( k = 0 ; k < 3 ; k++ ) {
-            s->cdim[k] = cdim[k];
-            s->h[k] = s->dim[k] / cdim[k];
-            s->ih[k] = 1.0 / s->h[k];
-            }
-        dmin = fminf( s->h[0] , fminf( s->h[1] , s->h[2] ) );
-
-        /* Allocate the highest level of cells. */
-        s->tot_cells = s->nr_cells = cdim[0] * cdim[1] * cdim[2];
-        if ( posix_memalign( (void *)&cells , 64 , s->nr_cells * sizeof(struct cell) ) != 0 )
-            error( "Failed to allocate cells." );
-        s->cells = cells;
-        bzero( cells , s->nr_cells * sizeof(struct cell) );
-        for ( k = 0 ; k < s->nr_cells ; k++ )
-            if ( lock_init( &cells[k].lock ) != 0 )
-                error( "Failed to init spinlock." );
-
-        /* Set the cell location and sizes. */
-        for ( i = 0 ; i < cdim[0] ; i++ )
-            for ( j = 0 ; j < cdim[1] ; j++ )
-                for ( k = 0 ; k < cdim[2] ; k++ ) {
-                    c = &cells[ cell_getid( cdim , i , j , k ) ];
-                    c->loc[0] = i*s->h[0]; c->loc[1] = j*s->h[1]; c->loc[2] = k*s->h[2];
-                    c->h[0] = s->h[0]; c->h[1] = s->h[1]; c->h[2] = s->h[2];
-                    c->dmin = dmin;
-                    c->depth = 0;
-                    c->count = 0;
-                    c->super = c;
-                    lock_init( &c->lock );
-                    }
-           
-        /* Be verbose about the change. */         
-        message( "set cell dimensions to [ %i %i %i ]." , cdim[0] , cdim[1] , cdim[2] ); fflush(stdout);
-                    
-        } /* re-build upper-level cells? */
-    // message( "rebuilding upper-level cells took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
-        
-    /* Otherwise, just clean up the cells. */
-    else {
-    
-        /* Free the old cells, if they were allocated. */
-        for ( k = 0 ; k < s->nr_cells ; k++ ) {
-            space_rebuild_recycle( s , &cells[k] );
-            cells[k].sorts = NULL;
-            cells[k].nr_tasks = 0;
-            cells[k].nr_density = 0;
-            cells[k].nr_force = 0;
-            cells[k].density = NULL;
-            cells[k].force = NULL;
-            cells[k].dx_max = 0.0f;
-            cells[k].sorted = 0;
-            cells[k].count = 0;
-            cells[k].super = &cells[k];
-            cells[k].kick1 = NULL;
-            cells[k].kick2 = NULL;
-            }
-        s->maxdepth = 0;
-    
-        }
+    /* Re-grid if necessary, or just re-set the cell data. */
+    space_regrid( s , cell_max );
         
     /* Run through the particles and get their cell index. */
     // tic = getticks();
@@ -458,6 +359,11 @@ void space_rebuild ( struct space *s , double cell_max ) {
     parts_sort( parts , xparts , ind , nr_parts , 0 , s->nr_cells-1 );
     // message( "parts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
     
+    /* Re-link the gparts. */
+    for ( k = 0 ; k < nr_parts ; k++ )
+        if ( parts[k].gpart != NULL )
+            parts[k].gpart->part = &parts[k];
+    
     /* Verify sort. */
     /* for ( k = 1 ; k < nr_parts ; k++ ) {
         if ( ind[k-1] > ind[k] ) {
@@ -470,16 +376,55 @@ void space_rebuild ( struct space *s , double cell_max ) {
     /* We no longer need the indices as of here. */
     free( ind );    
 
+
+
+    /* Run through the gravity particles and get their cell index. */
+    // tic = getticks();
+    if ( ( ind = (int *)malloc( sizeof(int) * s->size_gparts ) ) == NULL )
+        error( "Failed to allocate temporary particle indices." );
+    #pragma omp parallel for private(gp,j)
+    for ( k = 0 ; k < nr_gparts ; k++ )  {
+        gp = &gparts[k];
+        for ( j = 0 ; j < 3 ; j++ )
+            if ( gp->x[j] < 0.0 )
+                gp->x[j] += dim[j];
+            else if ( gp->x[j] >= dim[j] )
+                gp->x[j] -= dim[j];
+        ind[k] = cell_getid( cdim , gp->x[0]*ih[0] , gp->x[1]*ih[1] , gp->x[2]*ih[2] );
+        atomic_inc( &cells[ ind[k] ].gcount );
+        }
+    // message( "getting particle indices took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
+
+    /* TODO: Here we should exchange the gparts as well! */
+
+    /* Sort the parts according to their cells. */
+    // tic = getticks();
+    gparts_sort( gparts ,ind , nr_parts , 0 , s->nr_cells-1 );
+    // message( "gparts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
+    
+    /* Re-link the parts. */
+    for ( k = 0 ; k < nr_gparts ; k++ )
+        if ( gparts[k].id > 0 )
+            gparts[k].part->gpart = &gparts[k];
+
+    /* We no longer need the indices as of here. */
+    free( ind );    
+
+
+
     /* Hook the cells up to the parts. */
     // tic = getticks();
     finger = parts;
     xfinger = xparts;
+    gfinger = gparts;
     for ( k = 0 ; k < s->nr_cells ; k++ ) {
         c = &cells[ k ];
         c->parts = finger;
         c->xparts = xfinger;
+        c->gparts = gfinger;
         finger = &finger[ c->count ];
         xfinger = &xfinger[ c->count ];
+        gfinger = &gfinger[ c->gcount ];
         }
     // message( "hooking up cells took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 );
         
@@ -674,6 +619,165 @@ void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N ,
     }
 
 
+void gparts_sort ( struct gpart *gparts , int *ind , int N , int min , int max ) {
+
+    struct qstack {
+        volatile int i, j, min, max;
+        volatile int ready;
+        };
+    struct qstack *qstack;
+    int qstack_size = 2*(max-min) + 10;
+    volatile unsigned int first, last, waiting;
+    
+    int pivot;
+    int i, ii, j, jj, temp_i, qid;
+    struct gpart temp_p;
+
+    /* for ( int k = 0 ; k < N ; k++ )
+        if ( ind[k] > max || ind[k] < min )
+	    error( "ind[%i]=%i is not in [%i,%i]." , k , ind[k] , min , max ); */
+    
+    /* Allocate the stack. */
+    if ( ( qstack = malloc( sizeof(struct qstack) * qstack_size ) ) == NULL )
+        error( "Failed to allocate qstack." );
+    
+    /* Init the interval stack. */
+    qstack[0].i = 0;
+    qstack[0].j = N-1;
+    qstack[0].min = min;
+    qstack[0].max = max;
+    qstack[0].ready = 1;
+    for ( i = 1 ; i < qstack_size ; i++ )
+        qstack[i].ready = 0;
+    first = 0; last = 1; waiting = 1;
+    
+    /* Parallel bit. */
+    #pragma omp parallel default(none) shared(N,first,last,waiting,qstack,gparts,ind,qstack_size,stderr,engine_rank) private(pivot,i,ii,j,jj,min,max,temp_i,qid,temp_p)
+    {
+    
+        /* Main loop. */
+        if ( omp_get_thread_num() < 8 )
+        while ( waiting > 0 ) {
+        
+            /* Grab an interval off the queue. */
+            qid = atomic_inc( &first ) % qstack_size;
+            
+            /* Wait for the interval to be ready. */
+            while ( waiting > 0 && atomic_cas( &qstack[qid].ready , 1 , 1 ) != 1 );
+            
+            /* Broke loop for all the wrong reasons? */
+            if ( waiting == 0 )
+                break;
+        
+            /* Get the stack entry. */
+            i = qstack[qid].i;
+            j = qstack[qid].j;
+            min = qstack[qid].min;
+            max = qstack[qid].max;
+            qstack[qid].ready = 0;
+            // message( "thread %i got interval [%i,%i] with values in [%i,%i]." , omp_get_thread_num() , i , j , min , max );
+            
+            /* Loop over sub-intervals. */
+            while ( 1 ) {
+            
+                /* Bring beer. */
+                pivot = (min + max) / 2;
+                
+                /* One pass of QuickSort's partitioning. */
+                ii = i; jj = j;
+                while ( ii < jj ) {
+                    while ( ii <= j && ind[ii] <= pivot )
+                        ii++;
+                    while ( jj >= i && ind[jj] > pivot )
+                        jj--;
+                    if ( ii < jj ) {
+                        temp_i = ind[ii]; ind[ii] = ind[jj]; ind[jj] = temp_i;
+                        temp_p = gparts[ii]; gparts[ii] = gparts[jj]; gparts[jj] = temp_p;
+                        }
+                    }
+
+                /* Verify sort. */
+                /* for ( int k = i ; k <= jj ; k++ )
+                    if ( ind[k] > pivot ) {
+                        message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i." , k , ind[k] , pivot , i , j , N );
+                        error( "Partition failed (<=pivot)." );
+                        }
+                for ( int k = jj+1 ; k <= j ; k++ )
+                    if ( ind[k] <= pivot ) {
+                        message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i." , k , ind[k] , pivot , i , j , N );
+                        error( "Partition failed (>pivot)." );
+                        } */
+                        
+                /* Split-off largest interval. */
+                if ( jj - i > j - jj+1 ) {
+
+                    /* Recurse on the left? */
+                    if ( jj > i  && pivot > min ) {
+                        qid = atomic_inc( &last ) % qstack_size;
+                        while ( atomic_cas( &qstack[qid].ready , 0 , 0 ) != 0 );
+                        qstack[qid].i = i;
+                        qstack[qid].j = jj;
+                        qstack[qid].min = min;
+                        qstack[qid].max = pivot;
+                        qstack[qid].ready = 1;
+                        if ( atomic_inc( &waiting ) >= qstack_size )
+                            error( "Qstack overflow." );
+                        }
+
+                    /* Recurse on the right? */
+                    if ( jj+1 < j && pivot+1 < max ) {
+                        i = jj+1;
+                        min = pivot+1;
+                        }
+                    else
+                        break;
+                        
+                    }
+                    
+                else {
+                
+                    /* Recurse on the right? */
+                    if ( jj+1 < j && pivot+1 < max ) {
+                        qid = atomic_inc( &last ) % qstack_size;
+                        while ( atomic_cas( &qstack[qid].ready , 0 , 0 ) != 0 );
+                        qstack[qid].i = jj+1;
+                        qstack[qid].j = j;
+                        qstack[qid].min = pivot+1;
+                        qstack[qid].max = max;
+                        qstack[qid].ready = 1;
+                        if ( atomic_inc( &waiting ) >= qstack_size )
+                            error( "Qstack overflow." );
+                        }
+                        
+                    /* Recurse on the left? */
+                    if ( jj > i  && pivot > min ) {
+                        j = jj;
+                        max = pivot;
+                        }
+                    else
+                        break;
+
+                    }
+                    
+                } /* loop over sub-intervals. */
+    
+            atomic_dec( &waiting );
+
+            } /* main loop. */
+    
+        } /* parallel bit. */
+    
+    /* Verify sort. */
+    /* for ( i = 1 ; i < N ; i++ )
+        if ( ind[i-1] > ind[i] )
+            error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i , ind[i] ); */
+            
+    /* Clean up. */
+    free( qstack );
+
+    }
+
+
 /**
  * @brief Mapping function to free the sorted indices buffers.
  */
@@ -826,7 +930,7 @@ void space_map_cells_pre ( struct space *s , int full , void (*fun)( struct cell
  
 void space_split ( struct space *s , struct cell *c ) {
 
-    int k, count = c->count, maxdepth = 0;
+    int k, count = c->count, gcount = c->gcount, maxdepth = 0;
     float h, h_max = 0.0f, dt, dt_min = c->parts[0].dt, dt_max = dt_min;
     struct cell *temp;
     struct part *p, *parts = c->parts;
@@ -837,7 +941,7 @@ void space_split ( struct space *s , struct cell *c ) {
         s->maxdepth = c->depth;
     
     /* Split or let it be? */
-    if ( count > space_splitsize ) {
+    if ( count > space_splitsize || gcount > space_splitsize ) {
     
         /* No longer just a leaf. */
         c->split = 1;
@@ -846,6 +950,7 @@ void space_split ( struct space *s , struct cell *c ) {
         for ( k = 0 ; k < 8 ; k++ ) {
             temp = space_getcell( s );
             temp->count = 0;
+            temp->gcount = 0;
             temp->loc[0] = c->loc[0];
             temp->loc[1] = c->loc[1];
             temp->loc[2] = c->loc[2];
@@ -873,7 +978,7 @@ void space_split ( struct space *s , struct cell *c ) {
             
         /* Remove any progeny with zero parts. */
         for ( k = 0 ; k < 8 ; k++ )
-            if ( c->progeny[k]->count == 0 ) {
+            if ( c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0 ) {
                 space_recycle( s , c->progeny[k] );
                 c->progeny[k] = NULL;
                 }
@@ -1000,8 +1105,9 @@ struct cell *space_getcell ( struct space *s ) {
     /* Init some things in the cell. */
     bzero( c , sizeof(struct cell) );
     c->nodeID = -1;
-    if ( lock_init( &c->lock ) != 0 )
-        error( "Failed to initialize cell spinlock." );
+    if ( lock_init( &c->lock ) != 0 ||
+         lock_init( &c->glock ) != 0 )
+        error( "Failed to initialize cell spinlocks." );
         
     return c;
 
@@ -1052,6 +1158,28 @@ void space_init ( struct space *s , double dim[3] , struct part *parts , int N ,
         xp->u_hdt = p->u;
         }
         
+        
+    /* For now, clone the parts to make gparts. */
+    if ( posix_memalign( (void *)&s->gparts , part_align , N * sizeof(struct gpart) ) != 0 )
+        error( "Failed to allocate gparts." );
+    bzero( s->gparts , N * sizeof(struct gpart) );
+    for ( int k = 0 ; k < N ; k++ ) {
+        s->gparts[k].x[0] = s->parts[k].x[0];
+        s->gparts[k].x[1] = s->parts[k].x[1];
+        s->gparts[k].x[2] = s->parts[k].x[2];
+        s->gparts[k].v[0] = s->parts[k].v[0];
+        s->gparts[k].v[1] = s->parts[k].v[1];
+        s->gparts[k].v[2] = s->parts[k].v[2];
+        s->gparts[k].mass = s->parts[k].mass;
+        s->gparts[k].dt = s->parts[k].dt;
+        s->gparts[k].id = s->parts[k].id;
+        s->gparts[k].part = &s->parts[k];
+        s->parts[k].gpart = &s->gparts[k];
+        }
+    s->nr_gparts = s->nr_parts;
+    s->size_gparts = s->size_parts;
+    
+        
     /* Init the space lock. */
     if ( lock_init( &s->lock ) != 0 )
         error( "Failed to create space spin-lock." );
diff --git a/src/space.h b/src/space.h
index cbdfe8c0ee004e4e02e408eae70b6c171bf0d5e1..9d1f849d3b29b26d80b12a9767d6505040a1c74c 100644
--- a/src/space.h
+++ b/src/space.h
@@ -83,11 +83,12 @@ struct space {
     
     /* The particle data (cells have pointers to this). */
     struct part *parts;
-    struct cpart *cparts;
     struct xpart *xparts;
+    struct gpart *gparts;
     
     /* The total number of parts in the space. */
     int nr_parts, size_parts;
+    int nr_gparts, size_gparts;
     
     /* Is the space periodic? */
     int periodic;
@@ -110,6 +111,7 @@ struct space {
 
 /* function prototypes. */
 void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N , int min , int max );
+void gparts_sort ( struct gpart *gparts , int *ind , int N , int min , int max );
 struct cell *space_getcell ( struct space *s );
 int space_getsid ( struct space *s , struct cell **ci , struct cell **cj , double *shift );
 void space_init ( struct space *s , double dim[3] , struct part *parts , int N , int periodic , double h_max );
diff --git a/src/swift.h b/src/swift.h
index 13b78ba17aa73075369fc7203394a9c3c6d792f5..07b4ccf13211d511e1498b0385f8722e21629f5b 100644
--- a/src/swift.h
+++ b/src/swift.h
@@ -31,6 +31,7 @@
 #include "task.h"
 #include "scheduler.h"
 #include "part.h"
+#include "multipole.h"
 #include "cell.h"
 #include "space.h"
 #include "queue.h"
@@ -47,3 +48,4 @@
 #else
 #include "runner_iact.h"
 #endif
+#include "runner_iact_grav.h"
diff --git a/src/task.c b/src/task.c
index 7ef068905a2fae92f23e0baa3549fb85ff87a726..ba15bf956f5ee87280da9e48cd4cbe7cb03e2ba4 100644
--- a/src/task.c
+++ b/src/task.c
@@ -42,6 +42,8 @@
 #include "atomic.h"
 #include "lock.h"
 #include "space.h"
+#include "part.h"
+#include "multipole.h"
 #include "cell.h"
 #include "task.h"
 #include "error.h"
@@ -49,7 +51,8 @@
 /* Task type names. */
 const char *taskID_names[task_type_count] = {   
     "none" , "sort" , "self" , "pair" , "sub" , "ghost" , 
-    "kick1" , "kick2" , "send" , "recv" , "link" };
+    "kick1" , "kick2" , "send" , "recv" , "link" , "grav_pp" ,
+    "grav_mm" , "grav_up" , "grav_down" };
 
 
 /**
@@ -72,6 +75,15 @@ void task_unlock ( struct task *t ) {
             if ( t->cj != NULL )
                 cell_unlocktree( t->cj );
             break;
+        case task_type_grav_pp:
+        case task_type_grav_mm:
+        case task_type_grav_down:
+            cell_gunlocktree( t->ci );
+            if ( t->cj != NULL )
+                cell_gunlocktree( t->cj );
+            break;
+        default:
+            break;
         }
         
     }
@@ -118,8 +130,9 @@ int task_lock ( struct task *t ) {
         }
         
     /* Otherwise, binary lock. */
-    else if ( type == task_type_pair || (type == task_type_sub && cj != NULL) ) {
-        if ( ci->hold || cj->hold || ci->wait || cj->wait )
+    else if ( type == task_type_pair || 
+              ( type == task_type_sub && cj != NULL ) ) {
+        if ( ci->hold || cj->hold )
             return 0;
         if ( cell_locktree( ci ) != 0 )
             return 0;
@@ -129,6 +142,20 @@ int task_lock ( struct task *t ) {
             }
         }
         
+    /* Gravity tasks? */
+    else if ( type == task_type_grav_mm ||
+              type == task_type_grav_pp ||
+              type == task_type_grav_down ) {
+        if ( ci->ghold || ( cj != NULL && cj->ghold ) )
+            return 0;
+        if ( cell_glocktree( ci ) != 0 )
+            return 0;
+        if ( cj != NULL && cell_glocktree( cj ) != 0 ) {
+            cell_gunlocktree( ci );
+            return 0;
+            }
+        }
+        
     /* If we made it this far, we've got a lock. */
     return 1;
             
diff --git a/src/task.h b/src/task.h
index 7b3b81a5dbbd3dc6e32d5d410fbd374cd2b414f2..0505815ff2d5dcc186b30011a906458947589bd8 100644
--- a/src/task.h
+++ b/src/task.h
@@ -36,6 +36,10 @@ enum task_types {
     task_type_send,
     task_type_recv,
     task_type_link,
+    task_type_grav_pp,
+    task_type_grav_mm,
+    task_type_grav_up,
+    task_type_grav_down,
     task_type_count
     };
     
@@ -47,6 +51,7 @@ enum task_subtypes {
     task_subtype_none = 0,
     task_subtype_density,
     task_subtype_force,
+    task_subtype_grav,
     task_subtype_count
     };
     
@@ -56,7 +61,9 @@ extern const char *taskID_names[];
 /* Data of a task. */
 struct task {
 
-    char type, subtype, skip, tight, implicit;
+    enum task_types type;
+    enum task_subtypes subtype;
+    char skip, tight, implicit;
     int flags, wait, rank, weight;
     
     lock_type lock;
diff --git a/src/timers.h b/src/timers.h
index 7f1f831c78c28bfc13787e0660d2b41d12310dd6..58c48ac2444e2cd615f711213474d729b4bbbe70 100644
--- a/src/timers.h
+++ b/src/timers.h
@@ -28,10 +28,13 @@ enum {
     timer_dosort,
     timer_doself_density,
     timer_doself_force,
+    timer_doself_grav,
     timer_dopair_density,
     timer_dopair_force,
+    timer_dopair_grav,
     timer_dosub_density,
     timer_dosub_force,
+    timer_dosub_grav,
     timer_dopair_subset,
     timer_doghost,
     timer_gettask,
diff --git a/src/vector.h b/src/vector.h
index 0f4cb8c95ae061d4f06e85aaad6911d2985621de..81efefe9f6218a17149869f3ec5535a4361f6b5c 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -58,7 +58,7 @@
         #define vec_dbl_fmax(a,b) _mm512_max_pd(a,b)
         #define vec_getoffsets(ptrs) _mm512_insertf64x4( _mm512_insertf64x4( _mm512_setzero_pd() , _mm512_cvtepi64_epi32( _mm512_load_epi64(ptrs) - _mm512_set1_epi64(ptrs[0]) ) , 0 ) , _mm512_cvtepi64_epi32( _mm512_load_epi64(&ptrs[4]) - _mm512_set1_epi64(ptrs[0]) ) , 1 ) 
         #define vec_gather(base,offsets) _mm512_i32gather_ps( offsets.m , base , 1 )
-    #elif defined(__AVX__)
+    #elif defined( NO__AVX__ )
         #define VECTORIZE
         #define VEC_SIZE 8
         #define VEC_FLOAT __m256
@@ -90,7 +90,7 @@
             #define VEC_HAVE_GATHER
             #define vec_gather(base,offsets) _mm256_i32gather_ps( base , offsets.m , 1 )
         #endif
-    #elif defined( __SSE2__ )
+    #elif defined( NO__SSE2__ )
         #define VECTORIZE
         #define VEC_SIZE 4
         #define VEC_FLOAT __m128