diff --git a/.gitignore b/.gitignore index 14c768367ee9181c1459b57b93dd92d431ace981..8137ea759b24b3f4ec9909a460da4bcb47b0a1ac 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,9 @@ examples/swift_mindt examples/swift_mindt_mpi examples/swift_mpi +tests/testVectorize +tests/brute_force.dat +tests/swift_dopair.dat tests/testGreetings tests/testReading tests/input.hdf5 diff --git a/configure.ac b/configure.ac index 1a6cd114895b0f0571f18778fb54707e4fd37de6..225796677a706183087283af4463474508a6b979 100644 --- a/configure.ac +++ b/configure.ac @@ -260,8 +260,10 @@ fi AM_CONDITIONAL([HAVEPARALLELHDF5],[test "$have_parallel_hdf5" = "yes"]) # Check for setaffinity. -AC_CHECK_FUNC( pthread_setaffinity_np , AC_DEFINE([HAVE_SETAFFINITY],[true], +AC_CHECK_FUNC(pthread_setaffinity_np, AC_DEFINE([HAVE_SETAFFINITY],[true], [Defined if pthread_setaffinity_np exists.]) ) +AM_CONDITIONAL(HAVESETAFFINITY, + [test "$ac_cv_func_pthread_setaffinity_np" = "yes"]) # Check for timing functions needed by cycle.h. AC_HEADER_TIME diff --git a/doc/RTD/DeveloperGuide/AddingTasks/addingtasks.rst b/doc/RTD/DeveloperGuide/AddingTasks/addingtasks.rst new file mode 100644 index 0000000000000000000000000000000000000000..936fc5b45f901cbb704545727dac1211ac58b5a4 --- /dev/null +++ b/doc/RTD/DeveloperGuide/AddingTasks/addingtasks.rst @@ -0,0 +1,127 @@ +.. _NewTask: + +How to add a new task to SWIFT? +================================= +.. highlight:: c + + + +.. toctree:: + :maxdepth: 0 + +This tutorial will step through how to add a new task to SWIFT. First we will go through the +ideology of adding a new task to SWIFT. This will be followed by an example of how to add a task +for an imposed external gravitational field to SWIFT and a task to include "cooling" to the gas particles. 
+ +In the simplest case adding a new task requires changes to five files, namely: + +* task.h +* cell.h +* timers.h +* task.c +* engine.c + +Further, implementation details of what the task will then do should be added to another file +(for example runner_myviptask.c) which will contain the actual task implementation. + +So now let's look at what needs to change in each of the files above, starting with task.h + +-------------- +**task.h** +-------------- +Within task.h there exists a structure of the form:: + + /* The different task types. */ + enum task_types { + task_type_none = 0, + task_type_sort, + task_type_self, + task_type_pair, + . + . + . + task_type_my_new_task, + task_type_psort, + task_type_split_cell, + task_type_count + }; + +Within this task structure your new task should be added. Add the task entry anywhere in the struct before the +task_type_count member. This last entry is used to count the number of tasks and must always be the last entry. + +-------------- +**task.c** +-------------- + +Within task.c the addition of the new task type must be included in a character list (which at the moment is only +used for debugging purposes):: + + /* Task type names. */ + const char *taskID_names[task_type_count] = { + "none", "sort", "self", "pair", "sub", + "ghost", "kick1", "kick2", "send", "recv", + "link", "grav_pp", "grav_mm", "grav_up", "grav_down", + "my_new_task", "psort", "split_cell"}; + +The new task type should be added to this list in the same order as it was added within the task_types struct in +task.h + +-------------- +**cell.h** +-------------- + +cell.h contains pointers to all of the tasks associated with that cell. You must include your new task type +here e.g:: + + struct task *my_new_task; + +-------------- +**timers.h** +-------------- + +Within timers.h is an enumerated list of timers associated with each task. 
The timers measure the time required +to execute a given task and this information is used to improve scheduling of the task in future iterations:: + + /* The timers themselves. */ + enum { + timer_none = 0, + timer_prepare, + timer_kick1, + . + . + . + timer_new_task, + timer_step, + timer_count, + }; + +-------------- +**engine.c** +-------------- + +Finally, in engine.c the new task is added so that the scheduler knows to include the task in the list of tasks +to be scheduled. Knowing where to add the task in engine.c is a little bit more difficult. This will depend on +the type of task involved and whether it is a task that acts only on an individual particle independent of other +particles (e.g. a cooling task) or whether the task depends on other tasks (e.g. density, force or feedback). + +If we assume that the task is a particle only task then the first place to modify is the engine_mkghosts() +function. Within this function the new task must be added to the list of tasks +(within the c->nodeID == e->nodeID if clause):: + + /* Generate the external gravity task*/ + c->my_new_task = scheduler_addtask(s, task_type_my_new_task, task_subtype_none, 0, 0, + c, NULL, 0); + + +That's pretty much it - but what about dependencies and conflicts? +Remember SWIFT automatically handles conflicts (by understanding which tasks need to write to the same data) so +you (the developer) don't need to worry about conflicts. Dependencies do however need to be managed and they will +be task specific. The following two examples, implementing cooling and an imposed external gravitational field +will illustrate how dependencies should be treated. 
+ + +Examples: + +:ref:`ExternalGravityExample` + +:ref:`CoolingExample` diff --git a/doc/RTD/DeveloperGuide/Examples/Cooling/cooling.rst b/doc/RTD/DeveloperGuide/Examples/Cooling/cooling.rst new file mode 100644 index 0000000000000000000000000000000000000000..5dc0c46a1881601115ceee1b2ac5b205e2fb1e5b --- /dev/null +++ b/doc/RTD/DeveloperGuide/Examples/Cooling/cooling.rst @@ -0,0 +1,11 @@ +.. _CoolingExample: + +Cooling Example +-------------------------- + +An example of how to implement a particle cooling task in SWIFT +=================================================================== + + +.. toctree:: + :maxdepth: 1 diff --git a/doc/RTD/DeveloperGuide/Examples/ExternalGravity/externalgravity.rst b/doc/RTD/DeveloperGuide/Examples/ExternalGravity/externalgravity.rst new file mode 100644 index 0000000000000000000000000000000000000000..aceba80bb96dc69d942cbc675669c49fc7f2cf13 --- /dev/null +++ b/doc/RTD/DeveloperGuide/Examples/ExternalGravity/externalgravity.rst @@ -0,0 +1,225 @@ +.. _ExternalGravityExample: + +External Gravity Task Example +---------------------------------- + +An example of how to implement an external gravity task in SWIFT +===================================================================== + +An external gravitational field can be imposed in SWIFT to mimic self-gravity. This is done by assigning +a gravitational force that falls as $1/ r^2$ (mathjax support to be included). + +In order to do this we update the files as described in :ref:`NewTask`. For the specific case of adding an +external graviational field the additions are as follows: + + +-------------- +**task.h** +-------------- + +Code (snapshot Nov 2015):: + + /* The different task types. 
*/ + enum task_types { + task_type_none = 0, + task_type_sort, + task_type_self, + task_type_pair, + task_type_sub, + task_type_ghost, + task_type_kick1, + task_type_kick2, + task_type_send, + task_type_recv, + task_type_link, + task_type_grav_pp, + task_type_grav_mm, + task_type_grav_up, + task_type_grav_down, + **task_type_grav_external,** + task_type_psort, + task_type_split_cell, + task_type_count + }; + +Task of type - task_type_grav_external - added to list of tasks. + +-------------- +**task.c** +-------------- + +Code (snapshot Nov 2015):: + + /* Task type names. */ + const char *taskID_names[task_type_count] = { + "none", "sort", "self", "pair", "sub", + "ghost", "kick1", "kick2", "send", "recv", + "link", "grav_pp", "grav_mm", "grav_up", "grav_down", "grav_external", + "psort", "split_cell" + }; + +Task added to list of task names (used only for debugging purposes). + + +-------------- +**cell.h** +-------------- + +Code (snapshot Nov 2015):: + + /* The ghost task to link density to interactions. */ + struct task *ghost, *kick1, *kick2, *grav_external; + +Structure of type "task" declared (or pointer to a task at least). + + + +-------------- +**timers.h** +-------------- + +Code (snapshot Nov 2015):: + + /* The timers themselves. */ + enum { + timer_none = 0, + timer_prepare, + timer_kick1, + timer_kick2, + timer_dosort, + timer_doself_density, + timer_doself_force, + timer_doself_grav, + timer_dopair_density, + timer_dopair_force, + timer_dopair_grav, + timer_dosub_density, + timer_dosub_force, + timer_dosub_grav, + timer_dopair_subset, + timer_doghost, + timer_dograv_external, + timer_gettask, + timer_qget, + timer_qsteal, + timer_runners, + timer_step, + timer_count, + }; + +The timer list is updated to include a timer task. 
+ + +-------------- +**engine.c** +-------------- + +Code (snapshot Nov 2015):: + + void engine_mkghosts(struct engine *e, struct cell *c, struct cell *super) { + + int k; + struct scheduler *s = &e->sched; + + /* Am I the super-cell? */ + if (super == NULL && c->nr_tasks > 0) { + + /* Remember me. */ + super = c; + + /* Local tasks only... */ + if (c->nodeID == e->nodeID) { + + /* Generate the external gravity task*/ + c->grav_external = scheduler_addtask(s, task_type_grav_external, task_subtype_none, 0, 0, + c, NULL, 0); + + /* Enforce gravity calculated before kick 2 */ + scheduler_addunlock(s, c->grav_external, c->kick2); + } + } + } + + +The first function call adds the task to the scheduler. The second function call takes care of the dependency +involved in imposing an external gravitational field. These two functions are worth considering due to their +obvious importance. + + + +The function prototype for the addtask function is (**found in scheduler.c**):: + + struct task *scheduler_addtask(struct scheduler *s, int type, int subtype, + int flags, int wait, struct cell *ci, + struct cell *cj, int tight) { + +This function adds a task to the scheduler. In the call to this function in engine.c we used the actual +parameters **s** for the scheduler, **task_type_grav_external** for the (task) type, task_subtype_none for +the (task) subtype, zeros for the flags and wait parameters, **c** for the pointer to our cell, NULL for the +cell we interact with since there is none and 0 for the tight parameter. + +The function prototype for the addunlock function is(**found in scheduler.c**):: + + void scheduler_addunlock(struct scheduler *s, struct task *ta, + struct task *tb) { + +This function signals when the unlock a certain task. In our case we use the external gravity task to unlock the +kick2 task - i.e. kick2 depends on external gravity. 
So when calling the addunlock function the +order is the **ta** task should be the task to unlock and **tb** should the task that does the unlocking. + + +-------------- +**runner.c** +-------------- + +In runner.c the implementation of the external gravity task is taken care of. The function prototype is:: + + void runner_dograv_external(struct runner *r, struct cell *c) { + +The function takes a pointer to a runner struct and a pointer to the cell struct. The entire function call is:: + + + + void runner_dograv_external(struct runner *r, struct cell *c) { + + struct part *p, *parts = c->parts; + float rinv; + int i, ic, k, count = c->count; + float dt_step = r->e->dt_step; + TIMER_TIC + + /* Recurse? */ + if (c->split) { + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) runner_dograv_external(r, c->progeny[k]); + return; + } + + /* Loop over the parts in this cell. */ + for (i = 0; i < count; i++) { + + /* Get a direct pointer on the part. */ + p = &parts[i]; + + /* Is this part within the time step? */ + if (p->dt <= dt_step) { + rinv = 1 / sqrtf((p->x[0])*(p->x[0]) + (p->x[1])*(p->x[1]) + (p->x[2])*(p->x[2])); + for(ic=0;ic<3;ic++){ + p->grav_accel[ic] = - const_G * (p->x[ic]) * rinv * rinv * rinv; + } + } + } + TIMER_TOC(timer_dograv_external); + } + + +The key component of this function is the calculation of **rinv** and then the imposition of the +**grav_accel** to this particle. **rinv** is calculated assuming the centre of the gravitational +potential lies at the origin. The acceleration of each particle then is calculated by multiplying +the graviational constant by the component of the position along one axis divided by R^3. The +gravitational acceleration is then added to the total particle acceleration **a**. + + + +.. 
toctree:: + :maxdepth: 1 diff --git a/doc/RTD/DeveloperGuide/developerguide.rst b/doc/RTD/DeveloperGuide/developerguide.rst new file mode 100644 index 0000000000000000000000000000000000000000..0c2cbe23c681af2dc4302b74e3c8c9f44fce5bef --- /dev/null +++ b/doc/RTD/DeveloperGuide/developerguide.rst @@ -0,0 +1,13 @@ +.. _DeveloperGuide: + +A Developer Guide for SWIFT +================================= + + +.. toctree:: + :maxdepth: 1 + + AddingTasks/addingtasks.rst + Examples/ExternalGravity/externalgravity.rst + Examples/Cooling/cooling.rst + diff --git a/doc/RTD/FAQ/index.rst b/doc/RTD/FAQ/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..0d49b1d5833df205a1a98df78ebae9554686796d --- /dev/null +++ b/doc/RTD/FAQ/index.rst @@ -0,0 +1,17 @@ +.. _GettingStarted: + +Frequently Asked Questions +========================== + +1) + +2) + +3) + +4) + +5) + +.. toctree:: + :maxdepth: 1 diff --git a/doc/RTD/Innovation/AsynchronousComms/index.rst b/doc/RTD/Innovation/AsynchronousComms/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..882492fc53abdab218fa46efe0b6f17c5d32393a --- /dev/null +++ b/doc/RTD/Innovation/AsynchronousComms/index.rst @@ -0,0 +1,8 @@ +.. _GettingStarted: + +Asynchronous Communication +================================= + + +.. toctree:: + :maxdepth: 1 diff --git a/doc/RTD/Innovation/Caching/index.rst b/doc/RTD/Innovation/Caching/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..18e495ebf6b3a523f58e5360e99e5dc6f3337695 --- /dev/null +++ b/doc/RTD/Innovation/Caching/index.rst @@ -0,0 +1,8 @@ +.. _GettingStarted: + +Caching +================================= + + +.. 
toctree:: + :maxdepth: 1 diff --git a/doc/RTD/Innovation/HeirarchicalCellDecomposition/InitialDecomp.png b/doc/RTD/Innovation/HeirarchicalCellDecomposition/InitialDecomp.png new file mode 100644 index 0000000000000000000000000000000000000000..30cb00258b6668a3e9b1060ee7a49538e89b0e9c Binary files /dev/null and b/doc/RTD/Innovation/HeirarchicalCellDecomposition/InitialDecomp.png differ diff --git a/doc/RTD/Innovation/HeirarchicalCellDecomposition/SplitCell.png b/doc/RTD/Innovation/HeirarchicalCellDecomposition/SplitCell.png new file mode 100644 index 0000000000000000000000000000000000000000..597710b6899e6070318006de1065ed4ac323379e Binary files /dev/null and b/doc/RTD/Innovation/HeirarchicalCellDecomposition/SplitCell.png differ diff --git a/doc/RTD/Innovation/HeirarchicalCellDecomposition/SplitPair.png b/doc/RTD/Innovation/HeirarchicalCellDecomposition/SplitPair.png new file mode 100644 index 0000000000000000000000000000000000000000..5d736aff29b94f5c915e9ec1f61e4b79c2323a7a Binary files /dev/null and b/doc/RTD/Innovation/HeirarchicalCellDecomposition/SplitPair.png differ diff --git a/doc/RTD/Innovation/HeirarchicalCellDecomposition/index.rst b/doc/RTD/Innovation/HeirarchicalCellDecomposition/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..5a16095d9de0ec8e6dd35218f0303d6da297ad8d --- /dev/null +++ b/doc/RTD/Innovation/HeirarchicalCellDecomposition/index.rst @@ -0,0 +1,47 @@ +.. _GettingStarted: + +Heirarchical Cell Decomposition +================================= + +Most SPH codes rely on spatial trees to decompose the simulation space. This decomposition makes neighbour-finding simple, at the cost of computational efficiency. Neighbour-finding using the tree-based approach has an average computational cost of ~O(logN) and has a worst case behaviour of ~O(N\ :sup:`2/3`\), both cases grow with the total number of particles N. SWIFT's neighbour-finding algorithm however, has a constant scaling of ~O(1) per particle. 
This results from the way SWIFT decomposes its domain. + +The space is divided up into a grid of rectangular cells with an edge length that is greater than or equal to the maximum smoothing of any particle in the simulation, h\ :sub:`max`\ (See :ref:`cell_decomp`). + +.. _cell_decomp: +.. figure:: InitialDecomp.png + :scale: 40 % + :align: center + :figclass: align-center + + Figure 1: 2D Cell Decomposition + +In this initial decomposition if a particle p\ :sub:`j`\ is within range of particle p\ :sub:`i`\, both will either be in the same cell (self-interaction) or in neighbouring cells (pair-interaction). Each cell then only has to compute its self-interactions and pair-interactions for each of its particles. + +The best case scenario is when each cell only contains particles that have a smoothing length equal to the cell edge length and even then, for any given particle p\ :sub:`i`\ it will only interact with 16% of the total number of particles in the same cell and surrounding neighbours. This percentage decreases if the cell contains particles whose smoothing length is less than the cell edge length. Therefore the cell decomposition needs to be refined recursively by bisecting a cell along each dimension if the following conditions are met: + +1) The cell contains more than a minimum number of particles + +2) The smoothing length of a reasonable number of particles within a cell is less than half the cell's edge length + +.. _split_cell: +.. figure:: SplitCell.png + :scale: 40 % + :align: center + :figclass: align-center + + Figure 2: Refined Cell Decomposition + +Once a cell has been split its self-interactions can be decomposed into self-interactions of its sub-cells and corresponding pair interactions (See :ref:`split_cell`). 
If a pair of split cells share a boundary with each other and all particles in both cells have a smoothing length less than the cell edge length, then their pair-interactions can also be split up into pair-interactions of the sub-cells spanning the boundary (See :ref:`split_pair`). + +.. _split_pair: +.. figure:: SplitPair.png + :scale: 40 % + :align: center + :figclass: align-center + + Figure 3: Split Cell Pair Interactions + +When the cells' particle interactions are split up between self-interactions and pair-interactions, any two particles who are within range of each other will either share a cell for which a cell self-interaction is defined or they will be located in neighbouring cells which share a cell pair-interaction. Therefore to determine whether particles are within range of each other it is sufficient to traverse the list of self-interactions and pair-interactions and compute the interactions therein. + +.. toctree:: + :maxdepth: 1 diff --git a/doc/RTD/Innovation/HybridParallelism/index.rst b/doc/RTD/Innovation/HybridParallelism/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..ed19e98a0d2fbae4494949014bf202416733add2 --- /dev/null +++ b/doc/RTD/Innovation/HybridParallelism/index.rst @@ -0,0 +1,8 @@ +.. _GettingStarted: + +Hybrid Shared/Distributed-Memory Parallelism +============================================ + + +.. 
toctree:: + :maxdepth: 1 diff --git a/doc/RTD/Innovation/TaskBasedParallelism/OMPScaling.png b/doc/RTD/Innovation/TaskBasedParallelism/OMPScaling.png new file mode 100644 index 0000000000000000000000000000000000000000..4e5feaae3a91d1eae1aa3dee0ae38065512de9d7 Binary files /dev/null and b/doc/RTD/Innovation/TaskBasedParallelism/OMPScaling.png differ diff --git a/doc/RTD/Innovation/TaskBasedParallelism/TasksExample.png b/doc/RTD/Innovation/TaskBasedParallelism/TasksExample.png new file mode 100644 index 0000000000000000000000000000000000000000..3177010188e86962ad337c8ceb7b53a2631a5652 Binary files /dev/null and b/doc/RTD/Innovation/TaskBasedParallelism/TasksExample.png differ diff --git a/doc/RTD/Innovation/TaskBasedParallelism/TasksExampleConflicts.png b/doc/RTD/Innovation/TaskBasedParallelism/TasksExampleConflicts.png new file mode 100644 index 0000000000000000000000000000000000000000..368a9643503f213f699c185182d3ccc244210d00 Binary files /dev/null and b/doc/RTD/Innovation/TaskBasedParallelism/TasksExampleConflicts.png differ diff --git a/doc/RTD/Innovation/TaskBasedParallelism/index.rst b/doc/RTD/Innovation/TaskBasedParallelism/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..4cc265b80b3b4bd408c3b417c4a43313b8cec609 --- /dev/null +++ b/doc/RTD/Innovation/TaskBasedParallelism/index.rst @@ -0,0 +1,45 @@ +.. _GettingStarted: + +Task Based Parallelism +================================= + +One of the biggest problems faced by many applications when running on a shared memory system is *load imbalance*; this occurs when the work load is not evenly distributed across the cores. The most well-known paradigm for handling this type of parallel architecture is OpenMP, in which the programmer applies annotations to the code to indicate to the compiler which sections should be executed in parallel. 
If a ``for`` loop has been identified as a parallel section, the iterations of the loop are split between available threads, each executing on a single core. Once all threads have terminated the program becomes serial again and only executes on single thread, this technique is known as branch-and-bound parallelism, shown in :ref:`branch_and_bound`. Unfortunately, this implementation generally leads to low performance and bad scaling as you increase the number of cores. + +.. _branch_and_bound: +.. figure:: OMPScaling.png + :scale: 40 % + :align: center + :figclass: align-center + + Figure 1: Branch-and-bound parallelism + +Another disadvantage with this form of shared-memory parallelism is that there is no implicit handling of concurrency issues between threads. *Race conditions* can occur when two threads attempt to modify the same data simultaneously, unless explicit *critical* regions are defined which prevent more than one thread executing the same code at the same time. These regions degrade parallel performance even further. + +A better way to exploit shared memory systems is to use an approach called *task-based parallelism*. This method describes the entire computation in a way that is more inherently parallelisable. The simulation is divided up into a set of computational tasks which are **dynamically** allocated to a number of processors. In order to ensure that the tasks are executed in the correct order and to avoid *race conditions*, *dependencies* between tasks are identified and strictly enforced by a task scheduler. A Directed Acyclic Graph (DAG) illustrates how a set of computational tasks link together via dependencies. Processors can traverse the graph in topological order, selecting and executing tasks that have no unresolved dependencies or waiting until tasks become available. This selection process continues for all processors until all tasks have been completed. 
An example of a DAG can be seen in :ref:`DAG`, the figure represents tasks as circles, labelled A-E, and dependencies as arrows. Tasks B and C both depend on A, and D depends on B, whereas A and E are independent tasks. Therefore on a shared memory system, tasks A and E could be executed first. Once task A is finished, tasks B and C become available for execution as their dependencies to A have been resolved. Finally, task D can be executed after task B has completed. + +.. _DAG: +.. figure:: TasksExample.png + :scale: 40 % + :align: center + :figclass: align-center + + Figure 2: Tasks and Dependencies + +The main advantages of using this approach are as follows: + +* The order in which the tasks are processed is completely dynamic and adapts automatically to load imbalances. +* If the dependencies and conflicts are specified correctly, there is no need for expensive explicit locking, synchronisation or atomic operations, found in OpenMP to deal with most concurrency problems. +* Each task has exclusive access to the data it is working on, thus improving cache locality and efficiency. + +SWIFT modifies the task-based approach by introducing the concept of *conflicts* between tasks. Conflicts occur when two tasks operate on the same data, but the order in which the operations occur does not matter. :ref:`task_conflicts` illustrates tasks with conflicts, where there is a conflict between tasks B and C, and tasks D and E. In a parallel setup, once task A has finished executing, if one processor selects task B, then no other processor is allowed to execute task C until task B has completed, or vice versa. Without this modification, other task-based models used dependencies to model conflicts between tasks, which introduces an artificial ordering between tasks and imposes unnecessary constraints on the task scheduler. + +.. _task_conflicts: +.. 
figure:: TasksExampleConflicts.png + :scale: 40 % + :align: center + :figclass: align-center + + Figure 3: Tasks and Conflicts + +.. toctree:: + :maxdepth: 1 diff --git a/doc/RTD/Innovation/TaskGraphPartition/index.rst b/doc/RTD/Innovation/TaskGraphPartition/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..c0dd4b44927cbfa6da106e8e93ffdf0b9d9f94d7 --- /dev/null +++ b/doc/RTD/Innovation/TaskGraphPartition/index.rst @@ -0,0 +1,8 @@ +.. _GettingStarted: + +Task Graph Partition +================================= + + +.. toctree:: + :maxdepth: 1 diff --git a/doc/RTD/Innovation/Vectorisation/index.rst b/doc/RTD/Innovation/Vectorisation/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..c944fb98a9c440761d7a8c3955cc4f734ed24bab --- /dev/null +++ b/doc/RTD/Innovation/Vectorisation/index.rst @@ -0,0 +1,8 @@ +.. _GettingStarted: + +Vectorisation +================================= + + +.. toctree:: + :maxdepth: 1 diff --git a/doc/RTD/Innovation/index.rst b/doc/RTD/Innovation/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..da3f2474b4c71f8030634b2f669d3d8093757171 --- /dev/null +++ b/doc/RTD/Innovation/index.rst @@ -0,0 +1,19 @@ +.. _GettingStarted: + +What makes SWIFT different? +=========================== + +SWIFT implements a host of techniques that not only make it faster on one core but produce better scaling on shared and distributed memory architectures, when compared to equivalent codes such as Gadget-2. + +Here is a list outlining how SWIFT approaches parallelism: + +.. 
toctree:: + :maxdepth: 1 + + HeirarchicalCellDecomposition/index.rst + TaskBasedParallelism/index.rst + Caching/index.rst + TaskGraphPartition/index.rst + HybridParallelism/index.rst + AsynchronousComms/index.rst + Vectorisation/index.rst diff --git a/doc/RTD/Makefile b/doc/RTD/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..b1dfebb01c2f55530f7a8efad3c5e5bf96484c18 --- /dev/null +++ b/doc/RTD/Makefile @@ -0,0 +1,177 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
+ +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/SWIFT.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/SWIFT.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/SWIFT" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/SWIFT" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." 
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
diff --git a/doc/RTD/Motivation/index.rst b/doc/RTD/Motivation/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..94daf931959c446592f1a06b7e49c94f92d27516 --- /dev/null +++ b/doc/RTD/Motivation/index.rst @@ -0,0 +1,10 @@ +.. _GettingStarted: + +What is the Motivation for SWIFT? +================================= + +SWIFT is designing with parallelism from the very start. It is a bottom up approach with a task based +parallel infrastructure at it's very core. + +.. toctree:: + :maxdepth: 1 diff --git a/doc/RTD/Physics/Gravity/gravity.rst b/doc/RTD/Physics/Gravity/gravity.rst new file mode 100644 index 0000000000000000000000000000000000000000..82b60bc01747f1693ff2d99fc09db8a5e23d7043 --- /dev/null +++ b/doc/RTD/Physics/Gravity/gravity.rst @@ -0,0 +1,11 @@ +.. _SWIFTGravity: + +Gravity +============================= + +SWIFT implements the calculation of gravitational forces using a version of TreePM. +Matthieu - can you update this part? I know you have implemented a Multipole method here +with some SWIFT specific modifications. Could you write a brief overview of the idea? + +.. toctree:: + :maxdepth: 1 diff --git a/doc/RTD/Physics/SPH/sph.rst b/doc/RTD/Physics/SPH/sph.rst new file mode 100644 index 0000000000000000000000000000000000000000..feadec4bcd48a161c80bc4ecf09345278e20ab87 --- /dev/null +++ b/doc/RTD/Physics/SPH/sph.rst @@ -0,0 +1,9 @@ +.. _SWIFTSPH: + +Smooth Particle Hydrodynamics +============================= + +SWIFT simulates the universe using SPH. + +.. toctree:: + :maxdepth: 1 diff --git a/doc/RTD/Physics/index.rst b/doc/RTD/Physics/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..536c135ec47d83791ba50dd55246f4c6a9559e9e --- /dev/null +++ b/doc/RTD/Physics/index.rst @@ -0,0 +1,11 @@ +.. _SWIFTPhysics: + +What Physics does SWIFT Simulate? +================================= + + +.. 
toctree:: + :maxdepth: 1 + + SPH/sph.rst + Gravity/gravity.rst diff --git a/doc/RTD/conf.py b/doc/RTD/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..b4eab3d354322f8ff5e060b1795c2654eb879f90 --- /dev/null +++ b/doc/RTD/conf.py @@ -0,0 +1,248 @@ +# -*- coding: utf-8 -*- +# +# SWIFT documentation build configuration file, created by +# sphinx-quickstart on Thu Nov 5 15:57:05 2015. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo', 'sphinx.ext.pngmath', 'sphinx.ext.mathjax'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = u'SWIFT' +copyright = u'2015: John Regan, James Willis, Stefan Arridge, Matthieu Schaller' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. 
For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 
+#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'SWIFTdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'SWIFT.tex', u'SWIFT Documentation', + u'John Regan, James Willis, Stefan Arridge, Matthieu Schaller', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
+man_pages = [ + ('index', 'swift', u'SWIFT Documentation', + [u'John Regan, James Willis, Stefan Arridge, Matthieu Schaller'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'SWIFT', u'SWIFT Documentation', + u'John Regan, James Willis, Stefan Arridge, Matthieu Schaller', 'SWIFT', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/doc/RTD/index.rst b/doc/RTD/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..1d2b08ceb8e227ba9fee306ab26ae9a490308dc4 --- /dev/null +++ b/doc/RTD/index.rst @@ -0,0 +1,29 @@ +.. SWIFT documentation master file, created by + sphinx-quickstart on Thu Nov 5 15:57:05 2015. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to SWIFT's documentation! +================================= + +This is the first go at writing some high level documentation for SWIFT. +It will be augmented and linked with the already available documentation. +Contents: + + +.. 
toctree:: + :maxdepth: 2 + + Motivation/index.rst + Innovation/index.rst + Physics/index.rst + DeveloperGuide/developerguide.rst + FAQ/index.rst + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/examples/Makefile.am b/examples/Makefile.am index 5eaaa85bc75d3c404837fccd6f7b6f437e5ec706..f43bfe895bc16612f6f9b71070ba2cd15cd4c0e2 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -29,11 +29,18 @@ MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS) MPI_FLAGS = -DWITH_MPI $(METIS_INCS) # Set-up the library -bin_PROGRAMS = swift +bin_PROGRAMS = swift swift_fixdt swift_mindt # Build MPI versions as well? if HAVEMPI -bin_PROGRAMS += swift_mpi +bin_PROGRAMS += swift_mpi swift_fixdt_mpi swift_mindt_mpi +endif + +# engine_policy_setaffinity is available? +if HAVESETAFFINITY +ENGINE_POLICY_SETAFFINITY=| engine_policy_setaffinity +else +ENGINE_POLICY_SETAFFINITY= endif # Sources for swift @@ -41,8 +48,28 @@ swift_SOURCES = main.c swift_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) -DENGINE_POLICY="engine_policy_multistep | engine_policy_keep | engine_policy_setaffinity" swift_LDADD = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) +# Sources for swift_fixdt +swift_fixdt_SOURCES = main.c +swift_fixdt_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) -DENGINE_POLICY="engine_policy_fixdt | engine_policy_keep | engine_policy_setaffinity" +swift_fixdt_LDADD = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) + +# Sources for swift_mindt +swift_mindt_SOURCES = main.c +swift_mindt_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) -DENGINE_POLICY="engine_policy_keep | engine_policy_setaffinity" +swift_mindt_LDADD = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) + # Sources for swift_mpi swift_mpi_SOURCES = main.c swift_mpi_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) -DENGINE_POLICY="engine_policy_multistep | engine_policy_keep" swift_mpi_LDADD = ../src/.libs/libswiftsim_mpi.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(MPI_LIBS) +# Sources for 
swift_fixdt_mpi +swift_fixdt_mpi_SOURCES = main.c +swift_fixdt_mpi_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) -DENGINE_POLICY="engine_policy_fixdt | engine_policy_keep" +swift_fixdt_mpi_LDADD = ../src/.libs/libswiftsim_mpi.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(MPI_LIBS) + +# Sources for swift_mindt_mpi +swift_mindt_mpi_SOURCES = main.c +swift_mindt_mpi_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) -DENGINE_POLICY="engine_policy_keep" +swift_mindt_mpi_LDADD = ../src/.libs/libswiftsim_mpi.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(MPI_LIBS) + diff --git a/examples/PertubedBox/makeIC.py b/examples/PertubedBox/makeIC.py index 81ae9e5909ec51cb640209ce759d618311ce811f..a5e831eca02463d287ce2c7748eb780ef66aeb33 100644 --- a/examples/PertubedBox/makeIC.py +++ b/examples/PertubedBox/makeIC.py @@ -63,7 +63,7 @@ for i in range(L): v[index,1] = 0. v[index,2] = 0. m[index] = mass - h[index] = 2.251 * boxSize / L + h[index] = 1.1255 * boxSize / L u[index] = internalEnergy ids[index] = index diff --git a/examples/SedovBlast/makeIC.py b/examples/SedovBlast/makeIC.py index 9b4b5443f472edf5bb299ed5e7261d115c96293c..f3ed3288306a1ff7bb0387896a7e4e0f803a1f35 100644 --- a/examples/SedovBlast/makeIC.py +++ b/examples/SedovBlast/makeIC.py @@ -67,7 +67,7 @@ for i in range(L): v[index,1] = 0. v[index,2] = 0. m[index] = mass - h[index] = 2.251 / 2 * boxSize / L + h[index] = 1.1255 * boxSize / L u[index] = internalEnergy ids[index] = index if sqrt((x - boxSize/2.)**2 + (y - boxSize/2.)**2 + (z - boxSize/2.)**2) < 2.01 * boxSize/L: diff --git a/examples/SedovBlast/makeIC_fcc.py b/examples/SedovBlast/makeIC_fcc.py index 88cbaf8042323ea91ed7dd09b1bd63418aff3e3f..8a5d50031d06247afdd9b51ab9fe43bcca87a963 100644 --- a/examples/SedovBlast/makeIC_fcc.py +++ b/examples/SedovBlast/makeIC_fcc.py @@ -70,7 +70,7 @@ for i in range(L): v[index,1] = 0. v[index,2] = 0. 
m[index] = mass - h[index] = 2.251 / 2 * hbox + h[index] = 1.1255 * hbox u[index] = internalEnergy ids[index] = index if sqrt((x - boxSize/2.)**2 + (y - boxSize/2.)**2 + (z - boxSize/2.)**2) < 1.2 * hbox: diff --git a/examples/main.c b/examples/main.c index 2b772fc23d4f21dce8be59327d45283a74698493..e385f173d4b96a2a0a69ff508871e200034bd4b7 100644 --- a/examples/main.c +++ b/examples/main.c @@ -91,14 +91,12 @@ int main(int argc, char *argv[]) { #ifdef WITH_MPI /* Start by initializing MPI. */ int res, prov; - if ((res = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &prov)) != - MPI_SUCCESS) + if ((res = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &prov)) != MPI_SUCCESS) error("Call to MPI_Init failed with error %i.", res); if (prov != MPI_THREAD_MULTIPLE) - error( - "MPI does not provide the level of threading required " - "(MPI_THREAD_MULTIPLE)."); - if ((res = MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes) != MPI_SUCCESS)) + error("MPI does not provide the level of threading required " + "(MPI_THREAD_MULTIPLE)."); + if ((res = MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes)) != MPI_SUCCESS) error("MPI_Comm_size failed with error %i.", res); if ((res = MPI_Comm_rank(MPI_COMM_WORLD, &myrank)) != MPI_SUCCESS) error("Call to MPI_Comm_rank failed with error %i.", res); @@ -154,7 +152,7 @@ int main(int argc, char *argv[]) { break; case 'o': with_outputs = 0; - break; + break; case 'q': if (sscanf(optarg, "%d", &nr_queues) != 1) error("Error parsing number of queues."); @@ -193,17 +191,20 @@ int main(int argc, char *argv[]) { } #if defined(WITH_MPI) - if (myrank == 0) message("Running with %i thread(s) per node.", nr_threads); + if (myrank == 0) { + message("Running with %i thread(s) per node.", nr_threads); + message("grid set to [ %i %i %i ].", grid[0], grid[1], grid[2]); + + if (nr_nodes == 1) { + message("WARNING: you are running with one MPI rank."); + message("WARNING: you should use the non-MPI version of this program." 
); + } + fflush(stdout); + } #else if (myrank == 0) message("Running with %i thread(s).", nr_threads); #endif -#if defined(WITH_MPI) - if (myrank == 0) - message("grid set to [ %i %i %i ].", grid[0], grid[1], grid[2]); - fflush(stdout); -#endif - /* How large are the parts? */ if (myrank == 0) { message("sizeof(struct part) is %li bytes.", (long int)sizeof(struct part)); diff --git a/src/cell.h b/src/cell.h index 38710e8fffa3732e91d2eb0d2720d653fcf7fe98..4cc09bdecd8838fc579c4fb22ea28fd14a0e2416 100644 --- a/src/cell.h +++ b/src/cell.h @@ -107,8 +107,8 @@ struct cell { int sortsize, gsortsize; /* The tasks computing this cell's density. */ - struct link *link_density, *link_force, *link_grav; - int nr_link_density, nr_link_force, nr_link_grav; + struct link *density, *force, *grav; + int nr_density, nr_force, nr_grav; /* The ghost task to link density to interactions. */ struct task *ghost, *init, *drift, *kick; diff --git a/src/debug.c b/src/debug.c index 0d8da8cae01ea00d10b094715e7c2d3387b3b2d7..c4d9b43723386128623fa8cd596cbc6260a70086 100644 --- a/src/debug.c +++ b/src/debug.c @@ -1,7 +1,9 @@ /******************************************************************************* * This file is part of SWIFT. - * Copyright (c) 2013 Matthieu Schaller (matthieu.schaller@durham.ac.uk), - * Pedro Gonnet (pedro.gonnet@durham.ac.uk). + * Copyright (c) 2013- 2015: + * Matthieu Schaller (matthieu.schaller@durham.ac.uk), + * Pedro Gonnet (pedro.gonnet@durham.ac.uk), + * Peter W. Draper (p.w.draper@durham.ac.uk). 
* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -20,8 +22,10 @@ #include <stdio.h> +#include "config.h" #include "const.h" #include "part.h" +#include "debug.h" /** * @brief Looks for the particle with the given id and prints its information to @@ -100,3 +104,140 @@ void printParticle_single(struct part *p) { p->rho_dh, p->density.div_v, p->u, p->force.u_dt, p->force.balsara, p->force.POrho2, p->force.v_sig, p->t_begin, p->t_end); } + +#ifdef HAVE_METIS + +/** + * @brief Dump the METIS graph in standard format, simple format and weights + * only, to a file. + * + * @description The standard format output can be read into the METIS + * command-line tools. The simple format is just the cell connectivity (this + * should not change between calls). The weights format is the standard one, + * minus the cell connectivity. + * + * The output filenames are generated from the prefix and the sequence number + * of calls. So the first is called <prefix>_std_001.dat, <prefix>_simple_001.dat, + * <prefix>_weights_001.dat, etc. 
+ * + * @param prefix base output filename + * @param nvertices the number of vertices + * @param nvertexweights the number vertex weights + * @param cellconruns first part of cell connectivity info (CSR) + * @param cellcon second part of cell connectivity info (CSR) + * @param vertexweights weights of vertices + * @param vertexsizes size of vertices + * @param edgeweights weights of edges + */ +void dumpMETISGraph(const char *prefix, idx_t nvertices, idx_t nvertexweights, + idx_t *cellconruns, idx_t *cellcon, idx_t *vertexweights, + idx_t *vertexsizes, idx_t *edgeweights) { + FILE *stdfile = NULL; + FILE *simplefile = NULL; + FILE *weightfile = NULL; + char fname[200]; + idx_t i; + idx_t j; + int haveedgeweight = 0; + int havevertexsize = 0; + int havevertexweight = 0; + static int nseq = 0; + nseq++; + + if (vertexweights != NULL) { + for (i = 0; i < nvertices * nvertexweights; i++) { + if (vertexweights[i] != 1) { + havevertexweight = 1; + break; + } + } + } + + if (vertexsizes != NULL) { + for (i = 0; i < nvertices; i++) { + if (vertexsizes[i] != 1) { + havevertexsize = 1; + break; + } + } + } + + if (edgeweights != NULL) { + for (i = 0; i < cellconruns[nvertices]; i++) { + if (edgeweights[i] != 1) { + haveedgeweight = 1; + break; + } + } + } + + /* Open output files. */ + sprintf(fname, "%s_std_%03d.dat", prefix, nseq); + stdfile = fopen( fname, "w" ); + + sprintf(fname, "%s_simple_%03d.dat", prefix, nseq); + simplefile = fopen( fname, "w" ); + + if (havevertexweight || havevertexsize || haveedgeweight) { + sprintf(fname, "%s_weights_%03d.dat", prefix, nseq); + weightfile = fopen( fname, "w" ); + } + + /* Write the header lines. 
*/ + fprintf(stdfile, "%" PRIDX " %" PRIDX, nvertices, cellconruns[nvertices] / 2); + fprintf(simplefile, "%" PRIDX " %" PRIDX, nvertices, cellconruns[nvertices] / 2); + if (havevertexweight || havevertexsize || haveedgeweight) { + fprintf(weightfile, "%" PRIDX " %" PRIDX, nvertices, cellconruns[nvertices] / 2); + + fprintf(stdfile, " %d%d%d", havevertexsize, havevertexweight, haveedgeweight); + fprintf(weightfile, " %d%d%d", havevertexsize, havevertexweight, haveedgeweight); + + if (havevertexweight) { + fprintf(stdfile, " %d", (int)nvertexweights); + fprintf(weightfile, " %d", (int)nvertexweights); + } + } + + /* Write the rest of the graph. */ + for (i = 0; i < nvertices; i++) { + fprintf(stdfile, "\n"); + fprintf(simplefile, "\n"); + if (weightfile != NULL) { + fprintf(weightfile, "\n"); + } + + if (havevertexsize) { + fprintf(stdfile, " %" PRIDX, vertexsizes[i]); + fprintf(weightfile, " %" PRIDX, vertexsizes[i]); + } + + if (havevertexweight) { + for (j = 0; j < nvertexweights; j++) { + fprintf(stdfile, " %" PRIDX, vertexweights[i * nvertexweights + j]); + fprintf(weightfile, " %" PRIDX, vertexweights[i * nvertexweights + j]); + } + } + + for (j = cellconruns[i]; j < cellconruns[i + 1]; j++) { + fprintf(stdfile, " %" PRIDX, cellcon[j] + 1); + fprintf(simplefile, " %" PRIDX, cellcon[j] + 1); + if (haveedgeweight) { + fprintf(stdfile, " %" PRIDX, edgeweights[j]); + fprintf(weightfile, " %" PRIDX, edgeweights[j]); + } + } + } + fprintf(stdfile, "\n"); + fprintf(simplefile, "\n"); + if (weightfile != NULL) { + fprintf(weightfile, "\n"); + } + + fclose(stdfile); + fclose(simplefile); + if (weightfile != NULL) { + fclose(weightfile); + } +} + +#endif diff --git a/src/debug.h b/src/debug.h index 83461df45e3c0fb137557fba5fdf68cac9d4915a..27b2f94eff28c0d2fd0bc76f548d5d775414d2c2 100644 --- a/src/debug.h +++ b/src/debug.h @@ -27,4 +27,11 @@ void printParticle(struct part *parts, long long int i, int N); void printgParticle(struct gpart *parts, long long int i, int N); 
void printParticle_single(struct part *p); +#ifdef HAVE_METIS +#include "metis.h" +void dumpMETISGraph(const char *prefix, idx_t nvtxs, idx_t ncon, + idx_t *xadj, idx_t *adjncy, idx_t *vwgt, idx_t *vsize, + idx_t *adjwgt); + +#endif #endif /* SWIFT_DEBUG_H */ diff --git a/src/engine.c b/src/engine.c index 00fd80f7e499ff292e88bc408a1a9bd65fe60467..66017cbd78e64b02956354c26c8546b07c7957f6 100644 --- a/src/engine.c +++ b/src/engine.c @@ -91,16 +91,12 @@ void engine_mkghosts(struct engine *e, struct cell *c, struct cell *super) { int k; struct scheduler *s = &e->sched; - // message("in here"); - /* Am I the super-cell? */ if (super == NULL && c->nr_tasks > 0) { /* Remember me. */ super = c; - // message("Adding tasks"); - /* Local tasks only... */ if (c->nodeID == e->nodeID) { @@ -177,7 +173,7 @@ void engine_redistribute(struct engine *e) { dest[k] = cells[cid].nodeID; counts[nodeID * nr_nodes + dest[k]] += 1; } - parts_sort(s->parts, s->xparts, dest, s->nr_parts, 0, nr_nodes - 1); + space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1); /* Get all the counts from all the nodes. */ if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM, @@ -282,7 +278,7 @@ void engine_redistribute(struct engine *e) { free(dest); #else - error("SWIFT was not compiled with MPI and METIS support."); + error("SWIFT was not compiled with MPI support."); #endif } @@ -310,14 +306,14 @@ void engine_repartition(struct engine *e) { idx_t wtot = 0; idx_t wmax = 1e9 / e->nr_nodes; idx_t wmin; - + /* Clear the repartition flag. */ e->forcerepart = 0; /* Nothing to do if only using a single node. Also avoids METIS * bug that doesn't handle this case well. */ if (nr_nodes == 1) return; - + /* Allocate the inds and weights. 
*/ if ((inds = (idx_t *)malloc(sizeof(idx_t) * 26 *nr_cells)) == NULL || (weights_v = (idx_t *)malloc(sizeof(idx_t) *nr_cells)) == NULL || @@ -577,7 +573,7 @@ void engine_repartition(struct engine *e) { if (METIS_PartGraphRecursive(&idx_nr_cells, &one, offsets, inds, weights_v, NULL, weights_e, &idx_nr_nodes, NULL, NULL, options, &objval, nodeIDs) != METIS_OK) - error("Call to METIS_PartGrapRecursive failed."); + error("Call to METIS_PartGraphRecursive failed."); /* Dump the 3d array of cell IDs. */ /* printf( "engine_repartition: nodeIDs = reshape( [" ); @@ -656,29 +652,28 @@ void engine_repartition(struct engine *e) { #endif } -/* /\** */ -/* * @brief Add up/down gravity tasks to a cell hierarchy. */ -/* * */ -/* * @param e The #engine. */ -/* * @param c The #cell */ -/* * @param up The upward gravity #task. */ -/* * @param down The downward gravity #task. */ -/* *\/ */ - -/* void engine_addtasks_grav(struct engine *e, struct cell *c, struct task *up, +/** + * @brief Add up/down gravity tasks to a cell hierarchy. + * + * @param e The #engine. + * @param c The #cell + * @param up The upward gravity #task. + * @param down The downward gravity #task. */ -/* struct task *down) { */ -/* /\* Link the tasks to this cell. *\/ */ -/* c->grav_up = up; */ -/* c->grav_down = down; */ +void engine_addtasks_grav(struct engine *e, struct cell *c, struct task *up, + struct task *down) { -/* /\* Recurse? *\/ */ -/* if (c->split) */ -/* for (int k = 0; k < 8; k++) */ -/* if (c->progeny[k] != NULL) */ -/* engine_addtasks_grav(e, c->progeny[k], up, down); */ -/* } */ + /* Link the tasks to this cell. */ + c->grav_up = up; + c->grav_down = down; + + /* Recurse? */ + if (c->split) + for (int k = 0; k < 8; k++) + if (c->progeny[k] != NULL) + engine_addtasks_grav(e, c->progeny[k], up, down); +} /** * @brief Add send tasks to a hierarchy of cells. @@ -1104,18 +1099,16 @@ void engine_maketasks(struct engine *e) { } } - /* /\* Add the gravity mm tasks. 
*\/ */ - /* for (i = 0; i < nr_cells; i++) */ - /* if (cells[i].gcount > 0) { */ - /* scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, 0, - */ - /* &cells[i], NULL, 0); */ - /* for (j = i + 1; j < nr_cells; j++) */ - /* if (cells[j].gcount > 0) */ - /* scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, - * 0, */ - /* &cells[i], &cells[j], 0); */ - /* } */ + /* Add the gravity mm tasks. */ + for (i = 0; i < nr_cells; i++) + if (cells[i].gcount > 0) { + scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, 0, + &cells[i], NULL, 0); + for (j = i + 1; j < nr_cells; j++) + if (cells[j].gcount > 0) + scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, 0, + &cells[i], &cells[j], 0); + } /* Split the tasks. */ scheduler_splittasks(sched); @@ -1129,28 +1122,21 @@ void engine_maketasks(struct engine *e) { error("Failed to allocate cell-task links."); e->nr_links = 0; - //space_link_cleanup(s); // MATTHIEU - - /* /\* Add the gravity up/down tasks at the top-level cells and push them - * down. *\/ */ - /* for (k = 0; k < nr_cells; k++) */ - /* if (cells[k].nodeID == nodeID && cells[k].gcount > 0) { */ - - /* /\* Create tasks at top level. *\/ */ - /* struct task *up = */ - /* scheduler_addtask(sched, task_type_grav_up, task_subtype_none, 0, - * 0, */ - /* &cells[k], NULL, 0); */ - /* struct task *down = */ - /* scheduler_addtask(sched, task_type_grav_down, task_subtype_none, 0, - * 0, */ - /* &cells[k], NULL, 0); */ - - /* /\* Push tasks down the cell hierarchy. *\/ */ - /* engine_addtasks_grav(e, &cells[k], up, down); */ - /* } */ - - message("nb tasks: %d", sched->nr_tasks); + /* Add the gravity up/down tasks at the top-level cells and push them down. */ + for (k = 0; k < nr_cells; k++) + if (cells[k].nodeID == nodeID && cells[k].gcount > 0) { + + /* Create tasks at top level. 
*/ + struct task *up = + scheduler_addtask(sched, task_type_grav_up, task_subtype_none, 0, 0, + &cells[k], NULL, 0); + struct task *down = + scheduler_addtask(sched, task_type_grav_down, task_subtype_none, 0, 0, + &cells[k], NULL, 0); + + /* Push tasks down the cell hierarchy. */ + engine_addtasks_grav(e, &cells[k], up, down); + } /* Count the number of tasks associated with each cell and store the density tasks in each cell, and make each sort @@ -1173,42 +1159,42 @@ void engine_maketasks(struct engine *e) { if (t->type == task_type_self) { atomic_inc(&t->ci->nr_tasks); if (t->subtype == task_subtype_density) { - t->ci->link_density = engine_addlink(e, t->ci->link_density, t); - atomic_inc(&t->ci->nr_link_density); + t->ci->density = engine_addlink(e, t->ci->density, t); + atomic_inc(&t->ci->nr_density); } } else if (t->type == task_type_pair) { atomic_inc(&t->ci->nr_tasks); atomic_inc(&t->cj->nr_tasks); if (t->subtype == task_subtype_density) { - t->ci->link_density = engine_addlink(e, t->ci->link_density, t); - atomic_inc(&t->ci->nr_link_density); - t->cj->link_density = engine_addlink(e, t->cj->link_density, t); - atomic_inc(&t->cj->nr_link_density); + t->ci->density = engine_addlink(e, t->ci->density, t); + atomic_inc(&t->ci->nr_density); + t->cj->density = engine_addlink(e, t->cj->density, t); + atomic_inc(&t->cj->nr_density); } } else if (t->type == task_type_sub) { atomic_inc(&t->ci->nr_tasks); if (t->cj != NULL) atomic_inc(&t->cj->nr_tasks); if (t->subtype == task_subtype_density) { - t->ci->link_density = engine_addlink(e, t->ci->link_density, t); - atomic_inc(&t->ci->nr_link_density); + t->ci->density = engine_addlink(e, t->ci->density, t); + atomic_inc(&t->ci->nr_density); if (t->cj != NULL) { - t->cj->link_density = engine_addlink(e, t->cj->link_density, t); - atomic_inc(&t->cj->nr_link_density); + t->cj->density = engine_addlink(e, t->cj->density, t); + atomic_inc(&t->cj->nr_density); } } } - /* /\* Link gravity multipole tasks to the up/down tasks. 
*\/ */ - /* if (t->type == task_type_grav_mm || */ - /* (t->type == task_type_sub && t->subtype == task_subtype_grav)) { */ - /* atomic_inc(&t->ci->nr_tasks); */ - /* scheduler_addunlock(sched, t->ci->grav_up, t); */ - /* scheduler_addunlock(sched, t, t->ci->grav_down); */ - /* if (t->cj != NULL && t->ci->grav_up != t->cj->grav_up) { */ - /* scheduler_addunlock(sched, t->cj->grav_up, t); */ - /* scheduler_addunlock(sched, t, t->cj->grav_down); */ - /* } */ - /* } */ + /* Link gravity multipole tasks to the up/down tasks. */ + if (t->type == task_type_grav_mm || + (t->type == task_type_sub && t->subtype == task_subtype_grav)) { + atomic_inc(&t->ci->nr_tasks); + scheduler_addunlock(sched, t->ci->grav_up, t); + scheduler_addunlock(sched, t, t->ci->grav_down); + if (t->cj != NULL && t->ci->grav_up != t->cj->grav_up) { + scheduler_addunlock(sched, t->cj->grav_up, t); + scheduler_addunlock(sched, t, t->cj->grav_down); + } + } } /* Append a ghost task to each cell, and add kick tasks to the @@ -1235,8 +1221,8 @@ void engine_maketasks(struct engine *e) { t->ci, NULL, 0); scheduler_addunlock(sched, t->ci->super->ghost, t2); scheduler_addunlock(sched, t2, t->ci->super->kick); - t->ci->link_force = engine_addlink(e, t->ci->link_force, t2); - atomic_inc(&t->ci->nr_link_force); + t->ci->force = engine_addlink(e, t->ci->force, t2); + atomic_inc(&t->ci->nr_force); } /* Otherwise, pair interaction? */ @@ -1255,10 +1241,10 @@ void engine_maketasks(struct engine *e) { scheduler_addunlock(sched, t->cj->super->ghost, t2); scheduler_addunlock(sched, t2, t->cj->super->kick); } - t->ci->link_force = engine_addlink(e, t->ci->link_force, t2); - atomic_inc(&t->ci->nr_link_force); - t->cj->link_force = engine_addlink(e, t->cj->link_force, t2); - atomic_inc(&t->cj->nr_link_force); + t->ci->force = engine_addlink(e, t->ci->force, t2); + atomic_inc(&t->ci->nr_force); + t->cj->force = engine_addlink(e, t->cj->force, t2); + atomic_inc(&t->cj->nr_force); } /* Otherwise, sub interaction? 
*/ @@ -1266,23 +1252,21 @@ void engine_maketasks(struct engine *e) { t2 = scheduler_addtask(sched, task_type_sub, task_subtype_force, t->flags, 0, t->ci, t->cj, 0); if (t->ci->nodeID == nodeID) { - scheduler_addunlock(sched, t->ci->super->init, t); scheduler_addunlock(sched, t, t->ci->super->ghost); scheduler_addunlock(sched, t->ci->super->ghost, t2); scheduler_addunlock(sched, t2, t->ci->super->kick); } if (t->cj != NULL && t->cj->nodeID == nodeID && t->ci->super != t->cj->super) { - scheduler_addunlock(sched, t->cj->super->init, t); scheduler_addunlock(sched, t, t->cj->super->ghost); scheduler_addunlock(sched, t->cj->super->ghost, t2); scheduler_addunlock(sched, t2, t->cj->super->kick); } - t->ci->link_force = engine_addlink(e, t->ci->link_force, t2); - atomic_inc(&t->ci->nr_link_force); + t->ci->force = engine_addlink(e, t->ci->force, t2); + atomic_inc(&t->ci->nr_force); if (t->cj != NULL) { - t->cj->link_force = engine_addlink(e, t->cj->link_force, t2); - atomic_inc(&t->cj->nr_link_force); + t->cj->force = engine_addlink(e, t->cj->force, t2); + atomic_inc(&t->cj->nr_force); } } @@ -1341,99 +1325,99 @@ int engine_marktasks(struct engine *e) { struct cell *ci, *cj; // ticks tic = getticks(); - /* /\* Much less to do here if we're on a fixed time-step. *\/ */ - /* if (!(e->policy & engine_policy_multistep)) { */ + /* Much less to do here if we're on a fixed time-step. */ + if (!(e->policy & engine_policy_multistep)) { - /* /\* Run through the tasks and mark as skip or not. *\/ */ - /* for (k = 0; k < nr_tasks; k++) { */ + /* Run through the tasks and mark as skip or not. */ + for (k = 0; k < nr_tasks; k++) { - /* /\* Get a handle on the kth task. *\/ */ - /* t = &tasks[ind[k]]; */ + /* Get a handle on the kth task. */ + t = &tasks[ind[k]]; - /* /\* Pair? *\/ */ - /* if (t->type == task_type_pair || */ - /* (t->type == task_type_sub && t->cj != NULL)) { */ + /* Pair? 
*/ + if (t->type == task_type_pair || + (t->type == task_type_sub && t->cj != NULL)) { - /* /\* Local pointers. *\/ */ - /* ci = t->ci; */ - /* cj = t->cj; */ + /* Local pointers. */ + ci = t->ci; + cj = t->cj; - /* /\* Too much particle movement? *\/ */ - /* if (t->tight && */ - /* (fmaxf(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin - * || */ - /* ci->dx_max > space_maxreldx * ci->h_max || */ - /* cj->dx_max > space_maxreldx * cj->h_max)) */ - /* return 1; */ + /* Too much particle movement? */ + if (t->tight && + (fmaxf(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin || + ci->dx_max > space_maxreldx * ci->h_max || + cj->dx_max > space_maxreldx * cj->h_max)) + return 1; - /* } */ + } - /* /\* Sort? *\/ */ - /* else if (t->type == task_type_sort) { */ + /* Sort? */ + else if (t->type == task_type_sort) { - /* /\* If all the sorts have been done, make this task implicit. *\/ */ - /* if (!(t->flags & (t->flags ^ t->ci->sorted))) t->implicit = 1; */ - /* } */ - /* } */ + /* If all the sorts have been done, make this task implicit. */ + if (!(t->flags & (t->flags ^ t->ci->sorted))) t->implicit = 1; + } + } - /* } else { */ + } else { - /* Run through the tasks and mark as skip or not. */ - for (k = 0; k < nr_tasks; k++) { + /* Run through the tasks and mark as skip or not. */ + for (k = 0; k < nr_tasks; k++) { - /* Get a handle on the kth task. */ - t = &tasks[ind[k]]; + /* Get a handle on the kth task. */ + t = &tasks[ind[k]]; - /* Sort-task? Note that due to the task ranking, the sorts - will all come before the pairs. */ - if (t->type == task_type_sort) { + /* Sort-task? Note that due to the task ranking, the sorts + will all come before the pairs. */ + if (t->type == task_type_sort) { - /* Re-set the flags. */ - t->flags = 0; - t->skip = 1; + /* Re-set the flags. */ + t->flags = 0; + t->skip = 1; - } + } - /* Single-cell task? 
*/ - else if (t->type == task_type_self || t->type == task_type_ghost || - (t->type == task_type_sub && t->cj == NULL)) { + /* Single-cell task? */ + else if (t->type == task_type_self || t->type == task_type_ghost || + (t->type == task_type_sub && t->cj == NULL)) { /* Set this task's skip. */ // t->skip = (t->ci->t_end_min >= t_end); - } + } - /* Pair? */ - else if (t->type == task_type_pair || - (t->type == task_type_sub && t->cj != NULL)) { + /* Pair? */ + else if (t->type == task_type_pair || + (t->type == task_type_sub && t->cj != NULL)) { - /* Local pointers. */ - ci = t->ci; - cj = t->cj; + /* Local pointers. */ + ci = t->ci; + cj = t->cj; /* Set this task's skip. */ // t->skip = (ci->t_end_min >= t_end && cj->t_end_min >= t_end); - /* Too much particle movement? */ - if (t->tight && - (fmaxf(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin || - ci->dx_max > space_maxreldx * ci->h_max || - cj->dx_max > space_maxreldx * cj->h_max)) - return 1; - - /* Set the sort flags. */ - if (!t->skip && t->type == task_type_pair) { - if (!(ci->sorted & (1 << t->flags))) { - ci->sorts->flags |= (1 << t->flags); - ci->sorts->skip = 0; - } - if (!(cj->sorted & (1 << t->flags))) { - cj->sorts->flags |= (1 << t->flags); - cj->sorts->skip = 0; + /* Too much particle movement? */ + if (t->tight && + (fmaxf(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin || + ci->dx_max > space_maxreldx * ci->h_max || + cj->dx_max > space_maxreldx * cj->h_max)) + return 1; + + /* Set the sort flags. */ + if (!t->skip && t->type == task_type_pair) { + if (!(ci->sorted & (1 << t->flags))) { + ci->sorts->flags |= (1 << t->flags); + ci->sorts->skip = 0; + } + if (!(cj->sorted & (1 << t->flags))) { + cj->sorts->flags |= (1 << t->flags); + cj->sorts->skip = 0; + } } + } - } /* Kick? */ else if (t->type == task_type_kick) @@ -1451,7 +1435,7 @@ int engine_marktasks(struct engine *e) { else if (t->type == task_type_none) t->skip = 1; } - //} + } // message( "took %.3f ms." 
, (double)(getticks() - tic)/CPU_TPS*1000 ); @@ -1847,6 +1831,13 @@ void engine_launch(struct engine *e, int nr_runners, unsigned int mask) { */ void engine_init_particles(struct engine *e) { + int k; + float dt_max = 0.0f, dt_min = FLT_MAX; + double epot = 0.0, ekin = 0.0; + float mom[3] = {0.0, 0.0, 0.0}; + float ang[3] = {0.0, 0.0, 0.0}; + int count = 0; + struct cell *c; struct space *s = e->s; // engine_repartition(e); @@ -1973,6 +1964,12 @@ void engine_step(struct engine *e) { count = in[0]; ekin = in[1]; epot = in[2]; +/* int nr_parts; +if ( MPI_Allreduce( &s->nr_parts , &nr_parts , 1 , MPI_INT , MPI_SUM , +MPI_COMM_WORLD ) != MPI_SUCCESS ) + error( "Failed to aggregate particle count." ); +if ( e->nodeID == 0 ) + message( "nr_parts=%i." , nr_parts ); */ #endif message("t_end_min=%f t_end_max=%f", t_end_min, t_end_max); diff --git a/src/engine.h b/src/engine.h index 043998074094d22cb8cbe8f893b52a1a62bd8ecf..f92ab261fe1a1465d5fd4b4cf93f38107994ce4f 100644 --- a/src/engine.h +++ b/src/engine.h @@ -139,6 +139,7 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads, int nr_queues, int nr_nodes, int nodeID, int policy, float timeBegin, float timeEnd, float dt_min, float dt_max); +void engine_launch(struct engine *e, int nr_runners, unsigned int mask); void engine_prepare(struct engine *e); void engine_print(struct engine *e); void engine_init_particles(struct engine *e); diff --git a/src/runner.c b/src/runner.c index 1beb2c3c3b849c90f4dce658c5c846685c834c5e..52e106cb59e7585beef7ede2946ae499d45a9f4e 100644 --- a/src/runner.c +++ b/src/runner.c @@ -34,6 +34,7 @@ #include "runner.h" /* Local headers. */ +#include "atomic.h" #include "const.h" #include "engine.h" #include "error.h" @@ -65,6 +66,9 @@ #define cell_getid(cdim, i, j, k) \ ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i))) +/* Histograms bins. */ +long long int runner_hist_bins[runner_hist_N]; + /* The counters. 
*/ int runner_counter[runner_counter_count]; @@ -715,7 +719,7 @@ void runner_doghost(struct runner *r, struct cell *c) { for (finger = c; finger != NULL; finger = finger->parent) { /* Run through this cell's density interactions. */ - for (struct link *l = finger->link_density; l != NULL; l = l->next) { + for (struct link *l = finger->density; l != NULL; l = l->next) { // message("link: %p next: %p", l, l->next); fflush(stdout); @@ -1105,12 +1109,12 @@ void *runner_main(void *data) { t->rid = r->cpuid; /* Set super to the first cell that I own. */ - if (ci->super != NULL && ci->super->owner == r->qid) - super = ci->super; - else if (cj != NULL && cj->super != NULL && cj->super->owner == r->qid) - super = cj->super; - /* else - super = NULL; */ + if (t->type != task_type_rewait && t->type != task_type_psort) { + if (ci->super != NULL && ci->super->owner == r->qid) + super = ci->super; + else if (cj != NULL && cj->super != NULL && cj->super->owner == r->qid) + super = cj->super; + } /* Different types of tasks... 
*/ switch (t->type) { @@ -1183,6 +1187,19 @@ void *runner_main(void *data) { case task_type_grav_down: runner_dograv_down(r, t->ci); break; + case task_type_psort: + space_do_parts_sort(); + break; + case task_type_split_cell: + space_split(e->s, t->ci); + break; + case task_type_rewait: + for (struct task *t2 = (struct task *)t->ci; + t2 != (struct task *)t->cj; t2++) { + for (k = 0; k < t2->nr_unlock_tasks; k++) + atomic_inc(&t2->unlock_tasks[k]->wait); + } + break; default: error("Unknown task type."); } diff --git a/src/runner.h b/src/runner.h index 2de7a5931c5af4c7c52de1b024cfa2dd41d1f603..ce4b8405bcde69d19990fbcfee257ddafc717b6e 100644 --- a/src/runner.h +++ b/src/runner.h @@ -58,7 +58,7 @@ extern int runner_counter[runner_counter_count]; #define runner_hist_a 1.0 #define runner_hist_b 100.0 #define runner_hist_N 99 -long long int runner_hist_bins[runner_hist_N]; +extern long long int runner_hist_bins[runner_hist_N]; #define runner_hist_hit(x) \ __sync_add_and_fetch( \ &runner_hist_bins[(int)fmax( \ diff --git a/src/scheduler.c b/src/scheduler.c index aa41983e232ad3cd0600f3b7dd4f395fd749daef..8cdc4b0f9ce46bfd3204934f2c13e974af205c54 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -46,10 +46,6 @@ #include "kernel.h" #include "timers.h" -#define to_check 5394 -#define num_checks 11 -struct task *check[num_checks]; - /** * @brief Add an unlock_task to the given task. * @@ -61,40 +57,6 @@ struct task *check[num_checks]; void scheduler_addunlock(struct scheduler *s, struct task *ta, struct task *tb) { - /* /\* Main loop. *\/ */ - /* while (1) { */ - - /* /\* Follow the links. *\/ */ - /* while (ta->nr_unlock_tasks == task_maxunlock + 1) */ - /* ta = ta->unlock_tasks[task_maxunlock]; */ - - /* /\* Get the index of the next free task. *\/ */ - /* const int ind = atomic_inc(&ta->nr_unlock_tasks); */ - - /* /\* Is there room in this task? 
*\/ */ - /* if (ind < task_maxunlock) { */ - /* ta->unlock_tasks[ind] = tb; */ - /* break; */ - /* } */ - - /* /\* Otherwise, generate a link task. *\/ */ - /* else { */ - - /* /\* Only one thread should have to do this. *\/ */ - /* if (ind == task_maxunlock) { */ - /* ta->unlock_tasks[task_maxunlock] = */ - /* scheduler_addtask(s, task_type_link, task_subtype_none, - * ta->flags, */ - /* 0, ta->ci, ta->cj, 0); */ - /* ta->unlock_tasks[task_maxunlock]->implicit = 1; */ - /* } */ - - /* /\* Otherwise, reduce the count. *\/ */ - /* else */ - /* atomic_dec(&ta->nr_unlock_tasks); */ - /* } */ - /* } */ - /* Lock the scheduler since re-allocating the unlocks is not thread-safe. */ if (lock_lock(&s->lock) != 0) error("Unable to lock scheduler."); @@ -164,6 +126,9 @@ void scheduler_splittasks(struct scheduler *s) { break; } + /* Skip sorting tasks. */ + if (t->type == task_type_psort) continue; + /* Empty task? */ if (t->ci == NULL || (t->type == task_type_pair && t->cj == NULL)) { t->type = task_type_none; @@ -989,10 +954,12 @@ void scheduler_reweight(struct scheduler *s) { * @param s The #scheduler. * @param mask The task types to enqueue. */ + void scheduler_start(struct scheduler *s, unsigned int mask) { int nr_tasks = s->nr_tasks, *tid = s->tasks_ind; struct task *t, *tasks = s->tasks; + // ticks tic; /* Store the mask */ s->mask = mask | (1 << task_type_rewait); @@ -1003,6 +970,8 @@ void scheduler_start(struct scheduler *s, unsigned int mask) { s->tasks[k].wait = 1; s->tasks[k].rid = -1; } + // message( "waiting tasks took %.3f ms." , (double)( getticks() - tic ) / + // CPU_TPS * 1000 ); /* Enqueue a set of extraenous tasks to set the task waits. 
*/ struct task *rewait_tasks = &s->tasks[s->nr_tasks]; @@ -1033,152 +1002,21 @@ void scheduler_start(struct scheduler *s, unsigned int mask) { } pthread_mutex_unlock(&s->sleep_mutex); /* message("waiting tasks took %.3f ms.", - (double)(getticks() - tic) / CPU_TPS * 1000); */ + (double)(getticks() - tic) / CPU_TPS * 1000); */ /* Loop over the tasks and enqueue whoever is ready. */ // tic = getticks(); for (int k = 0; k < s->nr_tasks; k++) { t = &tasks[tid[k]]; if (atomic_dec(&t->wait) == 1 && ((1 << t->type) & s->mask) && !t->skip) { - - scheduler_enqueue(s, t); - pthread_cond_broadcast(&s->sleep_cond); - } - } - - // message( "enqueueing tasks took %.3f ms." , (double)( getticks() - tic ) / - // CPU_TPS * 1000 ); -} - - - - - -#if 0 -void scheduler_start(struct scheduler *s, unsigned int mask) { - - int k, j, nr_tasks = s->nr_tasks, *tid = s->tasks_ind; - struct task *t, *tasks = s->tasks; - struct task *store = NULL; - int count = 0; - // ticks tic; - - // message("begin"); - // fflush(stdout); - - /* Store the mask */ - s->mask = mask; - - for (k = 0;k<num_checks; ++k) - check[k] = NULL; - - /* Run through the tasks and set their waits. */ - // tic = getticks(); - for (k = nr_tasks - 1; k >= 0; k--) { - t = &tasks[tid[k]]; - t->wait = 1; - t->rid = -1; - - if(k==to_check) { - - //message("LOOP1: task %d type=%s-%s unlock=%d wait=%d", k, taskID_names[t->type], subtaskID_names[t->subtype], t->nr_unlock_tasks, t->wait); - - store = t; - - } - - if (!((1 << t->type) & mask) || t->skip) continue; - for (j = 0; j < t->nr_unlock_tasks; j++) { - atomic_inc(&t->unlock_tasks[j]->wait); - - /* if(t->unlock_tasks[j] == store) { */ - /* message("task %d type=%s-%s unlocks the pair unlock=%d wait=%d %p", k, taskID_names[t->type], subtaskID_names[t->subtype], t->nr_unlock_tasks, t->wait, t); */ - /* message("Link index: %6li", t->nr_unlock_tasks == task_maxunlock + 1 ? 
t->unlock_tasks[task_maxunlock] - s->tasks : -1); */ - - /* check[count] = t; */ - /* ++count; */ - /* } */ - - /* if(t->unlock_tasks[j] == &tasks[9563] ) { */ - /* message("task %d %s %s unlocking task %d %s %s\n", */ - /* k, taskID_names[t->type], subtaskID_names[t->subtype], */ - /* 9563, taskID_names[t->unlock_tasks[j]->type], */ - /* subtaskID_names[t->unlock_tasks[j]->type]); */ - /* } */ - } - } - - // message( "waiting tasks took %.3f ms." , (double)( getticks() - tic ) / - // CPU_TPS * 1000 ); - - scheduler_print_tasks(s, "tasks_start.dat"); - - //message("All waits set nr_tasks=%d", nr_tasks); - //fflush(stdout); - - /* Don't enqueue link tasks directly. */ - mask &= ~(1 << task_type_link); - s->mask = mask; - - for (k = 0; k < nr_tasks; k++) { - t = &tasks[tid[k]]; - - /* if(k==to_check) { */ - /* message("LOOP2: task %5d type=%s-%s unlock=%d wait=%d t=%p", k, taskID_names[t->type], subtaskID_names[t->subtype], t->nr_unlock_tasks, t->wait, t); */ - /* fflush(stdout); */ - /* } */ - - for (j = 0; j < t->nr_unlock_tasks; j++) { - if(t->unlock_tasks[j] == store) { - //message("task %d type=%s-%s unlocks the pair unlock=%d wait=%d %p", k, taskID_names[t->type], subtaskID_names[t->subtype], t->nr_unlock_tasks, t->wait, t); - //message("Link index: %6li", t->nr_unlock_tasks == task_maxunlock + 1 ? t->unlock_tasks[task_maxunlock] - s->tasks : -1); - - check[count] = t; - ++count; - } - - } - - } - - /* Loop over the tasks and enqueue whoever is ready. 
*/ - // tic = getticks(); - for (k = 0; k < s->nr_tasks; k++) { - t = &tasks[tid[k]]; - - - /* if (((1 << t->type) & mask) && !t->skip) { */ - /* if (t->wait == 0) { */ - /* scheduler_enqueue(s, t); */ - /* pthread_cond_broadcast(&s->sleep_cond); */ - /* } else */ - /* break; */ - /* } */ - - if (atomic_dec(&t->wait) == 1 && - ((1 << t->type) & s->mask) && - !t->skip) { - scheduler_enqueue(s, t); pthread_cond_broadcast(&s->sleep_cond); - } - } - scheduler_dump_queue(s); - - // message("Done enqueieing");fflush(stdout); - // message( "enqueueing tasks took %.3f ms." , (double)( getticks() - tic ) / // CPU_TPS * 1000 ); } -#endif - - - - - /** * @brief Put a task on one of the queues. * @@ -1193,43 +1031,16 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { int err; #endif - // if(t->type == task_type_pair) { - // message("Enqueuing a %s", taskID_names[t->type]); - // fflush(stdout); - // } - /* Fail if this task has already been enqueued before. */ if (t->rid >= 0) error("Task has already been enqueued."); - for (int k = 0; k < num_checks; ++k) { - - if (t == check[k]) { - // message("task %5d type=%s-%s unlock=%d wait=%d %p", 0, - // taskID_names[t->type], subtaskID_names[t->subtype], t->nr_unlock_tasks, - // t->wait, t); - } - } - /* Ignore skipped tasks and tasks not in the mask. */ if (t->skip || (1 << t->type) & ~(s->mask)) { return; } - for (int k = 0; k < num_checks; ++k) { - - if (t == check[k]) { - // message("not ignored !"); - } - } - /* If this is an implicit task, just pretend it's done. 
*/ if (t->implicit) { - - for (int k = 0; k < num_checks; ++k) { - if (t == check[k]) { - // message("implicit"); - } - } for (int j = 0; j < t->nr_unlock_tasks; j++) { struct task *t2 = t->unlock_tasks[j]; if (atomic_dec(&t2->wait) == 1) scheduler_enqueue(s, t2); @@ -1323,14 +1134,9 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { struct task *scheduler_done(struct scheduler *s, struct task *t) { - for (int k = 0; k < num_checks; ++k) { - - if (t == check[k]) { - // message("task %5d type=%s-%s unlock=%d wait=%d %p", 0, - // taskID_names[t->type], subtaskID_names[t->subtype], t->nr_unlock_tasks, - // t->wait, t); - } - } + int k, res; + struct task *t2, *next = NULL; + struct cell *super = t->ci->super; /* Release whatever locks this task held. */ if (!t->implicit) task_unlock(t); @@ -1340,22 +1146,6 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) { for (int k = 0; k < t->nr_unlock_tasks; k++) { struct task *t2 = t->unlock_tasks[k]; int res = atomic_dec(&t2->wait); - /* if (t->type == task_type_init) */ - /* message("Done with init ! Unlocking a %s task. %d dependencies left", - */ - /* taskID_names[t2->type], res); */ - /* if (t->type == task_type_pair) */ - /* message("Done with pair ! Unlocking a %s task. %d dependencies left", - */ - /* taskID_names[t2->type], res); */ - - for (int k = 0; k < num_checks; ++k) { - - if (t2 == check[k]) { - // message("Unlocking the task %p", t2); - } - } - if (res < 1) { error("Negative wait!"); } else if (res == 1) { @@ -1390,6 +1180,9 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) { struct task *scheduler_unlock(struct scheduler *s, struct task *t) { + int k, res; + struct task *t2, *next = NULL; + /* Loop through the dependencies and add them to a queue if they are ready. 
*/ for (int k = 0; k < t->nr_unlock_tasks; k++) { @@ -1548,46 +1341,6 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_queues, s->tasks_next = 0; } -/** - * @brief Print all the tasks in the queue of the scheduler - * - * @param s The #scheduler. - */ -void scheduler_dump_queue(struct scheduler *s) { - - int i, j; - FILE *file; - char buffer[256]; - struct queue *q; - struct task *t; - - for (i = 0; i < s->nr_queues; ++i) { - - /* Open file */ - sprintf(buffer, "queue_%d.dat", i); - file = fopen(buffer, "w"); - - /* Get the queue */ - q = &s->queues[i]; - - /* Some general info */ - fprintf(file, "# Queue %d, size=%d, count=%d\n", i, q->size, q->count); - fprintf(file, "# Index type subtype\n"); - - for (j = 0; j < q->count; ++j) { - - /* Get the task */ - t = &q->tasks[j]; - - /* And print... */ - fprintf(file, "%d %s %s\n", j, taskID_names[t->type], - subtaskID_names[t->subtype]); - } - - /* Be nice and clean */ - fclose(file); - } -} /** * @brief Prints the list of tasks to a file @@ -1595,21 +1348,21 @@ void scheduler_dump_queue(struct scheduler *s) { * @param s The #scheduler * @param fileName Name of the file to write to */ -void scheduler_print_tasks(struct scheduler *s, char *fileName) { + void scheduler_print_tasks(struct scheduler *s, char *fileName) { - const int nr_tasks = s->nr_tasks, *tid = s->tasks_ind; - struct task *t, *tasks = s->tasks; + const int nr_tasks = s->nr_tasks, *tid = s->tasks_ind; + struct task *t, *tasks = s->tasks; - FILE *file = fopen(fileName, "w"); + FILE *file = fopen(fileName, "w"); - fprintf(file, "# Rank Name Subname unlocks waits\n"); + fprintf(file, "# Rank Name Subname unlocks waits\n"); - for (int k = nr_tasks - 1; k >= 0; k--) { - t = &tasks[tid[k]]; - if (!((1 << t->type)) || t->skip) continue; - fprintf(file, "%d %s %s %d %d\n", k, taskID_names[t->type], - subtaskID_names[t->subtype], t->nr_unlock_tasks, t->wait); - } + for (int k = nr_tasks - 1; k >= 0; k--) { + t = &tasks[tid[k]]; + if (!((1 << 
t->type)) || t->skip) continue; + fprintf(file, "%d %s %s %d %d\n", k, taskID_names[t->type], + subtaskID_names[t->subtype], t->nr_unlock_tasks, t->wait); + } - fclose(file); -} + fclose(file); + } diff --git a/src/scheduler.h b/src/scheduler.h index 79ff2738eb56575a9b02b274ad36b53d71035c89..f87734210689dc4ecb7f89572c3494614ee826a3 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -76,7 +76,7 @@ struct scheduler { struct task **unlocks; int *unlock_ind; int nr_unlocks, size_unlocks; - + /* Lock for this scheduler. */ lock_type lock; diff --git a/src/space.c b/src/space.c index 738491e747396b9fcd1db7357fda3f96b81fd959..37eb1bce0a6213ff28076c680797c478ee592ec1 100644 --- a/src/space.c +++ b/src/space.c @@ -1,4 +1,4 @@ -/******************************************************************************* + /******************************************************************************* * This file is part of SWIFT. * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) * @@ -43,6 +43,9 @@ #include "lock.h" #include "runner.h" +/* Shared sort structure. */ +struct parallel_sort space_sort_struct; + /* Split size. */ int space_splitsize = space_splitsize_default; int space_subsize = space_subsize_default; @@ -271,10 +274,10 @@ void space_regrid(struct space *s, double cell_max, int verbose) { space_rebuild_recycle(s, &s->cells[k]); s->cells[k].sorts = NULL; s->cells[k].nr_tasks = 0; - s->cells[k].nr_link_density = 0; - s->cells[k].nr_link_force = 0; - s->cells[k].link_density = NULL; - s->cells[k].link_force = NULL; + s->cells[k].nr_density = 0; + s->cells[k].nr_force = 0; + s->cells[k].density = NULL; + s->cells[k].force = NULL; s->cells[k].dx_max = 0.0f; s->cells[k].sorted = 0; s->cells[k].count = 0; @@ -391,7 +394,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { /* Sort the parts according to their cells. 
*/ // tic = getticks(); - parts_sort(s->parts, s->xparts, ind, nr_parts, 0, s->nr_cells - 1); + space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1); // message( "parts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS // * 1000 ); @@ -399,7 +402,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { for (k = 0; k < nr_parts; k++) if (s->parts[k].gpart != NULL) s->parts[k].gpart->part = &s->parts[k]; - /* Verify sort. */ + /* Verify space_sort_struct. */ /* for ( k = 1 ; k < nr_parts ; k++ ) { if ( ind[k-1] > ind[k] ) { error( "Sort failed!" ); @@ -465,7 +468,11 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { /* At this point, we have the upper-level cells, old or new. Now make sure that the parts in each cell are ok. */ // tic = getticks(); - for (k = 0; k < s->nr_cells; k++) space_split(s, &cells[k]); + // for (k = 0; k < s->nr_cells; k++) space_split(s, &cells[k]); + for (k = 0; k < s->nr_cells; k++) + scheduler_addtask(&s->e->sched, task_type_split_cell, task_subtype_none, + k, 0, &cells[k], NULL, 0); + engine_launch(s->e, s->e->nr_threads, 1 << task_type_split_cell); // message( "space_split took %.3f ms." , (double)(getticks() - tic) / CPU_TPS // * 1000 ); @@ -475,113 +482,131 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { * @brief Sort the particles and condensed particles according to the given *indices. * - * @param parts The list of #part - * @param xparts The list of reduced particles + * @param s The #space. * @param ind The indices with respect to which the parts are sorted. * @param N The number of parts * @param min Lowest index. * @param max highest index. 
*/ -void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N, - int min, int max) { - - struct qstack { - volatile int i, j, min, max; - volatile int ready; - }; - struct qstack *qstack; - unsigned int qstack_size = 2 * (max - min) + 10; - volatile unsigned int first, last, waiting; - - int pivot; - int i, ii, j, jj, temp_i, qid; - struct part temp_p; - struct xpart temp_xp; +void space_parts_sort(struct space *s, int *ind, int N, int min, int max) { + // Populate the global parallel_sort structure with the input data. + space_sort_struct.parts = s->parts; + space_sort_struct.xparts = s->xparts; + space_sort_struct.ind = ind; + space_sort_struct.stack_size = 2 * (max - min + 1) + 10 + s->e->nr_threads; + if ((space_sort_struct.stack = malloc(sizeof(struct qstack) * + space_sort_struct.stack_size)) == NULL) + error("Failed to allocate sorting stack."); + for (int i = 0; i < space_sort_struct.stack_size; i++) + space_sort_struct.stack[i].ready = 0; + + // Add the first interval. + space_sort_struct.stack[0].i = 0; + space_sort_struct.stack[0].j = N - 1; + space_sort_struct.stack[0].min = min; + space_sort_struct.stack[0].max = max; + space_sort_struct.stack[0].ready = 1; + space_sort_struct.first = 0; + space_sort_struct.last = 1; + space_sort_struct.waiting = 1; + + // Launch the sorting tasks. + engine_launch(s->e, s->e->nr_threads, (1 << task_type_psort)); + + /* Verify space_sort_struct. */ + /* for (int i = 1; i < N; i++) + if (ind[i - 1] > ind[i]) + error("Sorting failed (ind[%i]=%i,ind[%i]=%i), min=%i, max=%i.", i - 1, ind[i - 1], i, + ind[i], min, max); + message("Sorting succeeded."); */ + + // Clean up. + free(space_sort_struct.stack); +} - /* for ( int k = 0 ; k < N ; k++ ) - if ( ind[k] > max || ind[k] < min ) - error( "ind[%i]=%i is not in [%i,%i]." , k , ind[k] , min , max ); */ +void space_do_parts_sort() { - /* Allocate the stack. 
*/ - if ((qstack = malloc(sizeof(struct qstack) * qstack_size)) == NULL) - error("Failed to allocate qstack."); - - /* Init the interval stack. */ - qstack[0].i = 0; - qstack[0].j = N - 1; - qstack[0].min = min; - qstack[0].max = max; - qstack[0].ready = 1; - for (i = 1; i < qstack_size; i++) qstack[i].ready = 0; - first = 0; - last = 1; - waiting = 1; + /* Pointers to the sorting data. */ + int *ind = space_sort_struct.ind; + struct part *parts = space_sort_struct.parts; + struct xpart *xparts = space_sort_struct.xparts; /* Main loop. */ - while (waiting > 0) { + while (space_sort_struct.waiting) { /* Grab an interval off the queue. */ - qid = (first++) % qstack_size; + int qid = + atomic_inc(&space_sort_struct.first) % space_sort_struct.stack_size; + /* Wait for the entry to be ready, or for the sorting do be done. */ + while (!space_sort_struct.stack[qid].ready) + if (!space_sort_struct.waiting) return; + /* Get the stack entry. */ - i = qstack[qid].i; - j = qstack[qid].j; - min = qstack[qid].min; - max = qstack[qid].max; - qstack[qid].ready = 0; + int i = space_sort_struct.stack[qid].i; + int j = space_sort_struct.stack[qid].j; + int min = space_sort_struct.stack[qid].min; + int max = space_sort_struct.stack[qid].max; + space_sort_struct.stack[qid].ready = 0; /* Loop over sub-intervals. */ while (1) { /* Bring beer. */ - pivot = (min + max) / 2; + const int pivot = (min + max) / 2; + /* message("Working on interval [%i,%i] with min=%i, max=%i, pivot=%i.", + i, j, min, max, pivot); */ /* One pass of QuickSort's partitioning. 
*/ - ii = i; - jj = j; + int ii = i; + int jj = j; while (ii < jj) { while (ii <= j && ind[ii] <= pivot) ii++; while (jj >= i && ind[jj] > pivot) jj--; if (ii < jj) { - temp_i = ind[ii]; + int temp_i = ind[ii]; ind[ii] = ind[jj]; ind[jj] = temp_i; - temp_p = parts[ii]; + struct part temp_p = parts[ii]; parts[ii] = parts[jj]; parts[jj] = temp_p; - temp_xp = xparts[ii]; + struct xpart temp_xp = xparts[ii]; xparts[ii] = xparts[jj]; xparts[jj] = temp_xp; } } - /* Verify sort. */ - /* for ( int k = i ; k <= jj ; k++ ) - if ( ind[k] > pivot ) { - message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, - N=%i." , k , ind[k] , pivot , i , j , N ); - error( "Partition failed (<=pivot)." ); - } - for ( int k = jj+1 ; k <= j ; k++ ) - if ( ind[k] <= pivot ) { - message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, - N=%i." , k , ind[k] , pivot , i , j , N ); - error( "Partition failed (>pivot)." ); - } */ + /* Verify space_sort_struct. */ + /* for (int k = i; k <= jj; k++) + if (ind[k] > pivot) { + message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k, + ind[k], pivot, i, j); + error("Partition failed (<=pivot)."); + } + for (int k = jj + 1; k <= j; k++) + if (ind[k] <= pivot) { + message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k, + ind[k], pivot, i, j); + error("Partition failed (>pivot)."); + } */ /* Split-off largest interval. */ if (jj - i > j - jj + 1) { /* Recurse on the left? 
*/ if (jj > i && pivot > min) { - qid = (last++) % qstack_size; - qstack[qid].i = i; - qstack[qid].j = jj; - qstack[qid].min = min; - qstack[qid].max = pivot; - qstack[qid].ready = 1; - if (waiting++ >= qstack_size) error("Qstack overflow."); + qid = atomic_inc(&space_sort_struct.last) % + space_sort_struct.stack_size; + while (space_sort_struct.stack[qid].ready); + space_sort_struct.stack[qid].i = i; + space_sort_struct.stack[qid].j = jj; + space_sort_struct.stack[qid].min = min; + space_sort_struct.stack[qid].max = pivot; + if (atomic_inc(&space_sort_struct.waiting) >= + space_sort_struct.stack_size) + error("Qstack overflow."); + space_sort_struct.stack[qid].ready = 1; } /* Recurse on the right? */ @@ -594,14 +619,18 @@ void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N, } else { /* Recurse on the right? */ - if (jj + 1 < j && pivot + 1 < max) { - qid = (last++) % qstack_size; - qstack[qid].i = jj + 1; - qstack[qid].j = j; - qstack[qid].min = pivot + 1; - qstack[qid].max = max; - qstack[qid].ready = 1; - if ((waiting++) >= qstack_size) error("Qstack overflow."); + if (pivot + 1 < max) { + qid = atomic_inc(&space_sort_struct.last) % + space_sort_struct.stack_size; + while (space_sort_struct.stack[qid].ready); + space_sort_struct.stack[qid].i = jj + 1; + space_sort_struct.stack[qid].j = j; + space_sort_struct.stack[qid].min = pivot + 1; + space_sort_struct.stack[qid].max = max; + if (atomic_inc(&space_sort_struct.waiting) >= + space_sort_struct.stack_size) + error("Qstack overflow."); + space_sort_struct.stack[qid].ready = 1; } /* Recurse on the left? */ @@ -614,18 +643,9 @@ void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N, } /* loop over sub-intervals. */ - waiting--; + atomic_dec(&space_sort_struct.waiting); } /* main loop. */ - - /* Verify sort. */ - /* for ( i = 1 ; i < N ; i++ ) - if ( ind[i-1] > ind[i] ) - error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i - , ind[i] ); */ - - /* Clean up. 
*/ - free(qstack); } void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max) { @@ -696,7 +716,7 @@ void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max) { } } - /* Verify sort. */ + /* Verify space_sort_struct. */ /* for ( int k = i ; k <= jj ; k++ ) if ( ind[k] > pivot ) { message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, @@ -734,7 +754,7 @@ void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max) { } else { /* Recurse on the right? */ - if (jj + 1 < j && pivot + 1 < max) { + if (pivot + 1 < max) { qid = (last++) % qstack_size; qstack[qid].i = jj + 1; qstack[qid].j = j; @@ -758,7 +778,7 @@ void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max) { } /* main loop. */ - /* Verify sort. */ + /* Verify space_sort_struct. */ /* for ( i = 1 ; i < N ; i++ ) if ( ind[i-1] > ind[i] ) error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i @@ -1219,11 +1239,11 @@ void space_init(struct space *s, double dim[3], struct part *parts, int N, void space_link_cleanup(struct space *s) { void cell_clean_links(struct cell * c, void * data) { - c->link_density = NULL; - c->nr_link_density = 0; + c->density = NULL; + c->nr_density = 0; - c->link_force = NULL; - c->nr_link_force = 0; + c->force = NULL; + c->nr_force = 0; } space_map_cells_pre(s, 1, cell_clean_links, NULL); diff --git a/src/space.h b/src/space.h index 0ef083cf2e82e5dc2e677c903e57a34f1e78d675..b0b69b8f2f12bd674637f22d72a17debb57713fc 100644 --- a/src/space.h +++ b/src/space.h @@ -111,9 +111,23 @@ struct space { int nr_parts_foreign, size_parts_foreign; }; +/* Interval stack necessary for parallel particle sorting. 
*/ +struct qstack { + volatile int i, j, min, max; + volatile int ready; +}; +struct parallel_sort { + struct part *parts; + struct xpart *xparts; + int *ind; + struct qstack *stack; + unsigned int stack_size; + volatile unsigned int first, last, waiting; +}; +extern struct parallel_sort space_sort_struct; + /* function prototypes. */ -void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N, - int min, int max); +void space_parts_sort(struct space *s, int *ind, int N, int min, int max); void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max); struct cell *space_getcell(struct space *s); int space_getsid(struct space *s, struct cell **ci, struct cell **cj, @@ -133,6 +147,6 @@ void space_map_cells_post(struct space *s, int full, void space_rebuild(struct space *s, double h_max, int verbose); void space_recycle(struct space *s, struct cell *c); void space_split(struct space *s, struct cell *c); +void space_do_parts_sort(); void space_link_cleanup(struct space *s); - #endif /* SWIFT_SPACE_H */ diff --git a/src/task.c b/src/task.c index a911e3c1ffd3d4164dc0b7a57030d459fda0359b..b65bfd21fd194a21533df2b41a57ca84c16ad26f 100644 --- a/src/task.c +++ b/src/task.c @@ -1,7 +1,6 @@ /******************************************************************************* * This file is part of SWIFT. 
* Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * Matthieu Schaller (matthieu.schaller@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -258,6 +257,7 @@ void task_addunlock_old(struct task *ta, struct task *tb) { lock_unlock_blind(&ta->lock); } + void task_print_mask(unsigned int mask) { int k; diff --git a/src/tools.c b/src/tools.c index b6a945ffba39cdba3cbbf460e61133adc6794f97..70c9e1def1946de7aaf00a209fe0983edd54d5f1 100644 --- a/src/tools.c +++ b/src/tools.c @@ -176,6 +176,66 @@ void pairs_single_density(double *dim, long long int pid, fflush(stdout); } +void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { + + float r2, hi, hj, hig2, hjg2, dx[3]; + struct part *pi, *pj; + + /* Implements a double-for loop and checks every interaction */ + for (int i = 0; i < ci->count; ++i) { + + pi = &ci->parts[i]; + hi = pi->h; + hig2 = hi * hi * kernel_gamma2; + + for (int j = 0; j < cj->count; ++j) { + + pj = &cj->parts[j]; + + /* Pairwise distance */ + r2 = 0.0f; + for (int k = 0; k < 3; k++) { + dx[k] = ci->parts[i].x[k] - cj->parts[j].x[k]; + r2 += dx[k] * dx[k]; + } + + /* Hit or miss? */ + if (r2 < hig2) { + + /* Interact */ + runner_iact_nonsym_density(r2, dx, hi, pj->h, pi, pj); + } + } + } + + /* Reverse double-for loop and checks every interaction */ + for (int j = 0; j < cj->count; ++j) { + + pj = &cj->parts[j]; + hj = pj->h; + hjg2 = hj * hj * kernel_gamma2; + + for (int i = 0; i < ci->count; ++i) { + + pi = &ci->parts[i]; + + /* Pairwise distance */ + r2 = 0.0f; + for (int k = 0; k < 3; k++) { + dx[k] = cj->parts[j].x[k] - ci->parts[i].x[k]; + r2 += dx[k] * dx[k]; + } + + /* Hit or miss? 
*/ + if (r2 < hjg2) { + + /* Interact */ + runner_iact_nonsym_density(r2, dx, hj, pi->h, pj, pi); + } + } + } +} + void pairs_single_grav(double *dim, long long int pid, struct gpart *__restrict__ parts, int N, int periodic) { diff --git a/src/tools.h b/src/tools.h index ea7138672a3d2a037f1da98368c185bc4633fe08..59646291bda46a7dd0f5a34e158e3e0a6f21d3ca 100644 --- a/src/tools.h +++ b/src/tools.h @@ -19,6 +19,12 @@ * ******************************************************************************/ +#ifndef SWIFT_TOOL_H +#define SWIFT_TOOL_H + +#include "runner.h" +#include "cell.h" + void factor(int value, int *f1, int *f2); void density_dump(int N); void pairs_single_grav(double *dim, long long int pid, @@ -26,5 +32,9 @@ void pairs_single_grav(double *dim, long long int pid, void pairs_single_density(double *dim, long long int pid, struct part *__restrict__ parts, int N, int periodic); +void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj); + void pairs_n2(double *dim, struct part *__restrict__ parts, int N, int periodic); + +#endif /* SWIFT_TOOL_H */ diff --git a/tests/Makefile.am b/tests/Makefile.am index 2fc5e37578414d83c9858bd7da7c20fb98ff11cf..50665f6b742bdebb1bc5ac6553efe3bb472f3220 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -15,15 +15,16 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. 
# Add the source directory and debug to CFLAGS -AM_CFLAGS = -I../src -DCPU_TPS=2.67e9 $(HDF5_CPPFLAGS) +AM_CFLAGS = -I../src -DCPU_TPS=2.67e9 $(HDF5_CPPFLAGS) -DTIMER + AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) # List of programs and scripts to run in the test suite -TESTS = testGreetings testReading.sh testSingle testTimeIntegration testSPHStep +TESTS = testGreetings testReading.sh testSingle testTimeIntegration # List of test programs to compile -check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration testSPHStep +check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration testSPHStep testVectorize # Sources for the individual programs testGreetings_SOURCES = testGreetings.c @@ -35,5 +36,5 @@ testTimeIntegration_SOURCES = testTimeIntegration.c testSPHStep_SOURCES = testSPHStep.c testSingle_SOURCES = testSingle.c -testSingle_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) -testSingle_LDADD = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) + +testVectorize_SOURCES = testVectorize.c diff --git a/tests/testVectorize.c b/tests/testVectorize.c new file mode 100644 index 0000000000000000000000000000000000000000..d86961e350ca13c0612f4e5d8196e30e15122ad9 --- /dev/null +++ b/tests/testVectorize.c @@ -0,0 +1,216 @@ +#include <fenv.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <unistd.h> +#include "swift.h" + +/* n is both particles per axis and box size: + * particles are generated on a mesh with unit spacing + */ +struct cell *make_cell(size_t n, double *offset, double h, + unsigned long long *partId) { + size_t count = n * n * n; + struct cell *cell = malloc(sizeof *cell); + struct part *part; + size_t x, y, z, size; + + size = count * sizeof(struct part); + if (posix_memalign((void **)&cell->parts, part_align, size) != 0) { + error("couldn't allocate particles, no. 
of particles: %d", (int)count); + } + + part = cell->parts; + for (x = 0; x < n; ++x) { + for (y = 0; y < n; ++y) { + for (z = 0; z < n; ++z) { + // Add .5 for symmetry: 0.5, 1.5, 2.5 vs. 0, 1, 2 + part->x[0] = x + offset[0] + 0.5; + part->x[1] = y + offset[1] + 0.5; + part->x[2] = z + offset[2] + 0.5; + part->v[0] = 1.0f; + part->v[1] = 1.0f; + part->v[2] = 1.0f; + part->dt = 0.01; + part->h = h; + part->id = ++(*partId); + part->mass = 1.0f; + ++part; + } + } + } + + cell->split = 0; + cell->h_max = h; + cell->count = count; + cell->dx_max = 1.; + cell->h[0] = n; + cell->h[1] = n; + cell->h[2] = n; + + cell->sort = malloc(13 * count * sizeof *cell->sort); + runner_dosort(NULL, cell, 0x1FFF, 0); + + return cell; +} + +void clean_up(struct cell *ci) { + free(ci->parts); + free(ci->sort); + free(ci); +} + +/** + * @brief Initializes all particles field to be ready for a density calculation + */ +void zero_particle_fields(struct cell *c) { + + for (size_t pid = 0; pid < c->count; pid++) { + c->parts[pid].rho = 0.f; + c->parts[pid].rho_dh = 0.f; + c->parts[pid].density.wcount = 0.f; + c->parts[pid].density.wcount_dh = 0.f; + c->parts[pid].density.div_v = 0.f; + c->parts[pid].density.curl_v[0] = 0.f; + c->parts[pid].density.curl_v[1] = 0.f; + c->parts[pid].density.curl_v[2] = 0.f; + } +} + +/** + * @brief Dump all the particles to a file + */ +void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) { + + FILE *file = fopen(fileName, "w"); + + fprintf(file, + "# ID rho rho_dh wcount wcount_dh div_v curl_v:[x y z]\n"); + + for (size_t pid = 0; pid < ci->count; pid++) { + fprintf(file, "%6llu %f %f %f %f %f %f %f %f\n", ci->parts[pid].id, + ci->parts[pid].rho, ci->parts[pid].rho_dh, + ci->parts[pid].density.wcount, ci->parts[pid].density.wcount_dh, + ci->parts[pid].density.div_v, ci->parts[pid].density.curl_v[0], + ci->parts[pid].density.curl_v[1], ci->parts[pid].density.curl_v[2]); + } + + fprintf(file, "# -----------------------------------\n"); + + 
for (size_t pjd = 0; pjd < cj->count; pjd++) { + fprintf(file, "%6llu %f %f %f %f %f %f %f %f\n", cj->parts[pjd].id, + cj->parts[pjd].rho, cj->parts[pjd].rho_dh, + cj->parts[pjd].density.wcount, cj->parts[pjd].density.wcount_dh, + cj->parts[pjd].density.div_v, cj->parts[pjd].density.curl_v[0], + cj->parts[pjd].density.curl_v[1], cj->parts[pjd].density.curl_v[2]); + } + + fclose(file); +} + +/* Just a forward declaration... */ +void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); + +int main(int argc, char *argv[]) { + size_t particles = 0, runs = 0, volume, type = 0; + double offset[3] = {0, 0, 0}, h = 1.1255; // * DIM/PARTS_PER_AXIS == * 1 + struct cell *ci, *cj; + struct space space; + struct engine engine; + struct runner runner; + char c; + static unsigned long long partId = 0; + ticks tic, toc, time; + + while ((c = getopt(argc, argv, "h:p:r:t:")) != -1) { + switch (c) { + case 'h': + sscanf(optarg, "%lf", &h); + break; + case 'p': + sscanf(optarg, "%zu", &particles); + break; + case 'r': + sscanf(optarg, "%zu", &runs); + break; + case 't': + sscanf(optarg, "%zu", &type); + break; + } + } + + if (h < 0 || particles == 0 || runs == 0 || type > 2) { + printf( + "\nUsage: %s -p PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n" + "\nGenerates a cell pair, filled with particles on a Cartesian grid." + "\nThese are then interacted using runner_dopair1_density." 
+ "\n\nOptions:" + "\n-t TYPE=0 - cells share face (0), edge (1) or corner (2)" + "\n-h DISTANCE=1.1255 - smoothing length\n", + argv[0]); + exit(1); + } + + volume = particles * particles * particles; + message("particles: %zu B\npositions: 0 B", 2 * volume * sizeof(struct part)); + + ci = make_cell(particles, offset, h, &partId); + for (size_t i = 0; i < type + 1; ++i) offset[i] = particles; + cj = make_cell(particles, offset, h, &partId); + + for (int i = 0; i < 3; ++i) { + space.h_max = h; + space.dt_step = 0.1; + } + + engine.s = &space; + engine.dt_step = 0.1; + runner.e = &engine; + + time = 0; + for (size_t i = 0; i < runs; ++i) { + + /* Zero the fields */ + zero_particle_fields(ci); + zero_particle_fields(cj); + + tic = getticks(); + + /* Run the test */ + runner_dopair1_density(&runner, ci, cj); + + toc = getticks(); + time += toc - tic; + + /* Dump if necessary */ + if (i % 50 == 0) dump_particle_fields("swift_dopair.dat", ci, cj); + } + + /* Output timing */ + message("SWIFT calculation took %lli ticks." , time / runs); + + /* Now perform a brute-force version for accuracy tests */ + + /* Zero the fields */ + zero_particle_fields(ci); + zero_particle_fields(cj); + + tic = getticks(); + + /* Run the test */ + pairs_all_density(&runner, ci, cj); + + toc = getticks(); + + /* Dump */ + dump_particle_fields("brute_force.dat", ci, cj); + + /* Output timing */ + message("Brute force calculation took %lli ticks." , toc - tic); + + /* Clean things to make the sanitizer happy ... */ + clean_up(ci); + clean_up(cj); + + return 0; +}