diff --git a/configure.ac b/configure.ac
index 225796677a706183087283af4463474508a6b979..de0e1b35d4da78d103ddf27304ad527361a342c7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -60,6 +60,7 @@ AC_ARG_ENABLE([mpi],
 good_mpi="yes"
 if test "$enable_mpi" = "yes"; then
     AX_MPI([CC="$MPICC" AC_DEFINE(HAVE_MPI, 1, [Define if you have the MPI library.]) ])
+    MPI_LIBRARY="Unknown MPI"
 
     # Various MPI implementations require additional libraries when also using
     # threads. Use mpirun (on PATH) as that seems to be only command with
@@ -83,14 +84,17 @@ if test "$enable_mpi" = "yes"; then
        case "$version" in
          *Intel*MPI*)
             MPI_THREAD_LIBS="-mt_mpi"
+            MPI_LIBRARY="Intel MPI"
             AC_MSG_RESULT([Intel MPI])
          ;;
          *Platform*)
             MPI_THREAD_LIBS="-lmtmpi"
+            MPI_LIBRARY="PLATFORM MPI"
             AC_MSG_RESULT([PLATFORM MPI])
          ;;
          *"Open MPI"*)
             MPI_THREAD_LIBS=""
+            MPI_LIBRARY="Open MPI"
             AC_MSG_RESULT([Open MPI])
             #  OpenMPI should be 1.8.6 or later, if not complain.
             #  Version is last word on first line of -version output.
@@ -109,6 +113,7 @@ if test "$enable_mpi" = "yes"; then
        esac
        AC_SUBST([MPI_THREAD_LIBS])
     fi
+    AC_DEFINE_UNQUOTED([SWIFT_MPI_LIBRARY], ["$MPI_LIBRARY"], [The MPI library name, if known.])
 fi
 AM_CONDITIONAL([HAVEMPI],[test $enable_mpi = "yes"])
 
@@ -265,6 +270,9 @@ AC_CHECK_FUNC(pthread_setaffinity_np, AC_DEFINE([HAVE_SETAFFINITY],[true],
 AM_CONDITIONAL(HAVESETAFFINITY,
     [test "$ac_cv_func_pthread_setaffinity_np" = "yes"])
 
+# Check for libnuma.
+AC_CHECK_LIB([numa], [numa_available])
+
 # Check for timing functions needed by cycle.h.
 AC_HEADER_TIME
 AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
diff --git a/examples/main.c b/examples/main.c
index 22b1156b63cb148b0a6c974e69bcafa0d0ccdfb6..160e6f97b64b9c380affe80455a425cbfe157a03 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -79,6 +79,7 @@ int main(int argc, char *argv[]) {
   struct engine e;
   struct UnitSystem us;
   char ICfileName[200] = "";
+  char dumpfile[30];
   float dt_max = 0.0f;
   ticks tic;
   int nr_nodes = 1, myrank = 0, grid[3] = {1, 1, 1};
@@ -115,11 +116,26 @@ int main(int argc, char *argv[]) {
   /* Greeting message */
   if (myrank == 0) greetings();
 
+#if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
+  /* Ensure the NUMA node on which we initialise (first touch) everything
+   * doesn't change before engine_init allocates NUMA-local workers. Otherwise,
+   * we may be scheduled elsewhere between the two times.
+   */
+  cpu_set_t affinity;
+  CPU_ZERO(&affinity);
+  CPU_SET(sched_getcpu(), &affinity);
+  if (sched_setaffinity(0, sizeof(cpu_set_t), &affinity) != 0) {
+    message("failed to set entry thread's affinity");
+  } else {
+    message("set entry thread's affinity");
+  }
+#endif
+
   /* Init the space. */
   bzero(&s, sizeof(struct space));
 
   /* Parse the options */
-  while ((c = getopt(argc, argv, "a:c:d:f:g:m:q:r:s:t:w:yz:")) != -1)
+  while ((c = getopt(argc, argv, "a:c:d:f:g:m:q:r:s:t:w:y:z:")) != -1)
     switch (c) {
       case 'a':
         if (sscanf(optarg, "%lf", &scaling) != 1)
@@ -178,7 +194,8 @@ int main(int argc, char *argv[]) {
         if (myrank == 0) message("sub size set to %i.", space_subsize);
         break;
       case 'y':
-        dump_tasks = 1;
+        if(sscanf(optarg, "%d", &dump_tasks) != 1)
+          error("Error parsing dump_tasks (-y)");
         break;
       case 'z':
         if (sscanf(optarg, "%d", &space_splitsize) != 1)
@@ -414,6 +431,69 @@ int main(int argc, char *argv[]) {
 #endif
     }
 
+  /* Dump the task data using the given frequency. */
+    if (dump_tasks && (dump_tasks == 1 || j % dump_tasks == 1)) {
+#ifdef WITH_MPI
+
+      /* Make sure output file is empty, only on one rank. */
+      sprintf(dumpfile, "thread_info_MPI-step%d.dat", j);
+      if (myrank == 0) {
+          file_thread = fopen(dumpfile, "w");
+          fclose(file_thread);
+      }
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      for (int i = 0; i < nr_nodes; i++) {
+
+        /* Rank 0 decides the index of writing node, this happens one-by-one. */
+        int kk = i;
+        MPI_Bcast(&kk, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+        if (i == myrank) {
+
+          /* Open file and position at end. */
+          file_thread = fopen(dumpfile, "a");
+
+          fprintf(file_thread, " %03i 0 0 0 0 %lli 0 0 0 0\n", myrank,
+                  e.tic_step);
+          int count = 0;
+          for (int l = 0; l < e.sched.nr_tasks; l++)
+            if (!e.sched.tasks[l].skip && !e.sched.tasks[l].implicit) {
+              fprintf(
+                  file_thread, " %03i %i %i %i %i %lli %lli %i %i %i\n", myrank,
+                  e.sched.tasks[l].rid, e.sched.tasks[l].type,
+                  e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL),
+                  e.sched.tasks[l].tic, e.sched.tasks[l].toc,
+                  (e.sched.tasks[l].ci != NULL) ? e.sched.tasks[l].ci->count : 0,
+                  (e.sched.tasks[l].cj != NULL) ? e.sched.tasks[l].cj->count : 0,
+                  e.sched.tasks[l].flags);
+              fflush(stdout);
+              count++;
+            }
+          message("rank %d counted %d tasks", myrank, count);
+
+          fclose(file_thread);
+        }
+
+        /* And we wait for all to synchronize. */
+        MPI_Barrier(MPI_COMM_WORLD);
+      }
+
+#else
+      sprintf(dumpfile, "thread_info-step%d.dat", j);
+      file_thread = fopen(dumpfile, "w");
+      for (int l = 0; l < e.sched.nr_tasks; l++)
+        if (!e.sched.tasks[l].skip && !e.sched.tasks[l].implicit)
+          fprintf(file_thread, " %i %i %i %i %lli %lli %i %i\n",
+                  e.sched.tasks[l].rid, e.sched.tasks[l].type,
+                  e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL),
+                  e.sched.tasks[l].tic, e.sched.tasks[l].toc,
+                  (e.sched.tasks[l].ci == NULL) ? 0 : e.sched.tasks[l].ci->count,
+                  (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->count);
+      fclose(file_thread);
+#endif
+    }
+
     /* Dump a line of aggregate output. */
     /*     if (myrank == 0) { */
     /*       printf("%i %e %.16e %.16e %.16e %.3e %.3e %i %.3e %.3e", j, e.time,
@@ -447,43 +527,6 @@ int main(int argc, char *argv[]) {
            (double)runner_hist_bins[k]);
 #endif
 
-  /* Dump the task data. */
-  if (dump_tasks) {
-#ifdef WITH_MPI
-    file_thread = fopen("thread_info_MPI.dat", "w");
-    for (j = 0; j < nr_nodes; j++) {
-      MPI_Barrier(MPI_COMM_WORLD);
-      if (j == myrank) {
-        fprintf(file_thread, " %03i 0 0 0 0 %lli 0 0 0 0\n", myrank,
-                e.tic_step);
-        for (k = 0; k < e.sched.nr_tasks; k++)
-          if (!e.sched.tasks[k].skip && !e.sched.tasks[k].implicit)
-            fprintf(
-                file_thread, " %03i %i %i %i %i %lli %lli %i %i %i\n", myrank,
-                e.sched.tasks[k].rid, e.sched.tasks[k].type,
-                e.sched.tasks[k].subtype, (e.sched.tasks[k].cj == NULL),
-                e.sched.tasks[k].tic, e.sched.tasks[k].toc,
-                e.sched.tasks[k].ci->count,
-                (e.sched.tasks[k].cj != NULL) ? e.sched.tasks[k].cj->count : 0,
-                e.sched.tasks[k].flags);
-        fflush(stdout);
-        sleep(1);
-      }
-    }
-    fclose(file_thread);
-#else
-    file_thread = fopen("thread_info.dat", "w");
-    for (k = 0; k < e.sched.nr_tasks; k++)
-      if (!e.sched.tasks[k].skip && !e.sched.tasks[k].implicit)
-        fprintf(file_thread, " %i %i %i %i %lli %lli %i %i\n",
-                e.sched.tasks[k].rid, e.sched.tasks[k].type,
-                e.sched.tasks[k].subtype, (e.sched.tasks[k].cj == NULL),
-                e.sched.tasks[k].tic, e.sched.tasks[k].toc,
-                e.sched.tasks[k].ci->count,
-                (e.sched.tasks[k].cj == NULL) ? 0 : e.sched.tasks[k].cj->count);
-    fclose(file_thread);
-#endif
-  }
 
   if (with_outputs) {
 /* Write final output. */
diff --git a/examples/plot_tasks.py b/examples/plot_tasks.py
index 52f1a90592120e61bd3e789ee0cb8fe7edb63a8d..bf0002664aa04fb0efccf1c1cbeb86158cb17d23 100644
--- a/examples/plot_tasks.py
+++ b/examples/plot_tasks.py
@@ -3,27 +3,28 @@
  # Copyright (c) 2015 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
  #                    Bert Vandenbroucke (bert.vandenbroucke@ugent.be)
  #                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
- # 
+ #
  # This program is free software: you can redistribute it and/or modify
  # it under the terms of the GNU Lesser General Public License as published
  # by the Free Software Foundation, either version 3 of the License, or
  # (at your option) any later version.
- # 
+ #
  # This program is distributed in the hope that it will be useful,
  # but WITHOUT ANY WARRANTY; without even the implied warranty of
  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  # GNU General Public License for more details.
- # 
+ #
  # You should have received a copy of the GNU Lesser General Public License
  # along with this program.  If not, see <http://www.gnu.org/licenses/>.
- # 
+ #
  ##############################################################################
 
 
 import matplotlib
 matplotlib.use('Agg')
 import pylab as pl
-#import numpy as np
+import numpy as np
+import sys
 
 CPU_CLOCK = 2.7e9
 
@@ -50,18 +51,47 @@ pl.rc('font',**{'family':'sans-serif','sans-serif':['Times']})
 
 
 
-types = {"0": "task_none", "1": "task_sort", "2": "task_self", "3": "task_pair",
-         "4": "task_pair", "5": "task_sub", "6": "task_ghost",
-         "7": "task_kick1", "8": "task_kick2", "9": "task_send",
-         "10": "task_recv", "11": "task_link", "12": "task_grav_pp",
-         "13": "task_grav_mm", "14": "task_grav_up", "15": "task_grav_down"}
-
-subtypes = {"0": "subtask_none", "1": "subtask_density", "2": "subtask_force",
-            "3": "subtask_grav"} 
-
-colors = {"subtask_density": "g", "subtask_force": "b", "subtask_grav": "y", "subtask_none": "k"}
-colors2 = {"task_sort": "r", "task_kick1": "m"}
-
+types = {"0": "task_none",
+         "1": "task_sort",
+         "2": "task_self",
+         "3": "task_pair",
+         "4": "task_sub",
+         "5": "task_ghost",
+         "6": "task_kick1",
+         "7": "task_kick2",
+         "8": "task_send",
+         "9": "task_recv",
+         "10": "task_grav_pp",
+         "11": "task_grav_mm",
+         "12": "task_grav_up",
+         "13": "task_grav_down",
+         "14": "task_psort",
+         "15": "task_split_cell",
+         "16": "task_rewait",
+         "17": "task_count"}
+
+subtypes = {"0": "subtask_none",
+            "1": "subtask_density",
+            "2": "subtask_force",
+            "3": "subtask_grav",
+            "4": "subtask_count"}
+
+# Assign colours for all types.
+colors = ["red","blue","green","yellow","cyan","magenta","black"]
+colors = colors + list(matplotlib.colors.cnames)
+index = 0
+
+subtypecolors = {}
+for key in subtypes:
+    subtypecolors[subtypes[key]] = colors[index]
+    print subtypes[key], " = ", colors[index]
+    index = index + 1
+
+taskcolors = {}
+for key in types:
+    taskcolors[types[key]] = colors[index]
+    print types[key], " = ", colors[index]
+    index = index + 1
 
 data = pl.loadtxt("thread_info.dat")
 
@@ -86,7 +116,7 @@ for line in range(num_lines):
     tasks[thread][-1]["subtype"] = subtypes[str(int(data[line,2]))]
     tasks[thread][-1]["tic"] = int(data[line,4]) / CPU_CLOCK * 1000
     tasks[thread][-1]["toc"] = int(data[line,5]) / CPU_CLOCK * 1000
-    tasks[thread][-1]["t"] = (tasks[thread][-1]["toc"]+tasks[thread][-1]["tic"]) / 2 
+    tasks[thread][-1]["t"] = (tasks[thread][-1]["toc"]+tasks[thread][-1]["tic"]) / 2
 
 combtasks = {}
 combtasks[-1] = []
@@ -107,9 +137,9 @@ for thread in range(nthread):
             combtasks[thread][-1]["tic"] = task["tic"]
             combtasks[thread][-1]["toc"] = task["toc"]
             if task["type"] == 'task_self' or task["type"] == 'task_pair' or task["type"] == 'task_sub':
-                combtasks[thread][-1]["color"] = colors[task["subtype"]]
+                combtasks[thread][-1]["color"] = subtypecolors[task["subtype"]]
             else:
-                combtasks[thread][-1]["color"] = colors2[task["type"]]
+                combtasks[thread][-1]["color"] = taskcolors[task["type"]]
             lasttype = task["type"]
         else:
             combtasks[thread][-1]["toc"] = task["toc"]
diff --git a/examples/plot_tasks_MPI.py b/examples/plot_tasks_MPI.py
new file mode 100755
index 0000000000000000000000000000000000000000..c71d96a4e127f348284b7095a612b5783be73e79
--- /dev/null
+++ b/examples/plot_tasks_MPI.py
@@ -0,0 +1,246 @@
+#!/usr/bin/env python
+"""
+Usage:
+    plot_tasks_MPI.py input.dat png-output-prefix [time-range-ms]
+   
+where input.dat is a thread info file for a step of an MPI run.  Use the '-y
+interval' flag of the swift MPI commands to create these. The output plots
+will be called 'png-output-prefix<mpi-rank>.png', i.e. one each for all the
+threads in each MPI rank. Use the time-range-ms in millisecs to produce
+plots with the same time span.
+
+See the command 'process_plot_tasks_MPI' to efficiently wrap this command to
+process a number of thread info files and create an HTML file to view them.
+
+This file is part of SWIFT.
+
+Copyright (C) 2015 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+                   Bert Vandenbroucke (bert.vandenbroucke@ugent.be)
+                   Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+                   Peter W. Draper (p.w.draper@durham.ac.uk)
+All Rights Reserved.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published
+by the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import matplotlib
+import matplotlib.collections as collections
+matplotlib.use("Agg")
+import pylab as pl
+import numpy as np
+import sys
+
+#  CPU ticks per second.
+CPU_CLOCK = 2.7e9
+
+#  Basic plot configuration.
+PLOT_PARAMS = {"axes.labelsize": 10,
+               "axes.titlesize": 10,
+               "font.size": 12,
+               "legend.fontsize": 12,
+               "xtick.labelsize": 10,
+               "ytick.labelsize": 10,
+               "figure.figsize" : (16., 4.),
+               "figure.subplot.left" : 0.03,
+               "figure.subplot.right" : 0.995,
+               "figure.subplot.bottom" : 0.1,
+               "figure.subplot.top" : 0.99,
+               "figure.subplot.wspace" : 0.,
+               "figure.subplot.hspace" : 0.,
+               "lines.markersize" : 6,
+               "lines.linewidth" : 3.
+               }
+pl.rcParams.update(PLOT_PARAMS)
+
+#  Tasks and subtypes. Indexed as in tasks.h.
+TASKTYPES = ["none", "sort", "self", "pair", "sub", "ghost", "kick1", "kick2",
+             "send", "recv", "grav_pp", "grav_mm", "grav_up", "grav_down",
+             "psort", "split_cell", "rewait", "count"]
+
+TASKCOLOURS = {"none": "black",
+               "sort": "lightblue",
+               "self": "greenyellow",
+               "pair": "navy",
+               "sub": "hotpink",
+               "ghost": "cyan",
+               "kick1": "maroon",
+               "kick2": "green",
+               "send": "yellow",
+               "recv": "magenta",
+               "grav_pp": "mediumorchid",
+               "grav_mm": "mediumturquoise",
+               "grav_up": "mediumvioletred",
+               "grav_down": "mediumnightblue",
+               "psort": "steelblue",
+               "split_cell": "seagreen",
+               "rewait": "olive",
+               "count": "powerblue"}
+
+SUBTYPES = ["none", "density", "force", "grav", "count"]
+
+SUBCOLOURS = {"none": "black",
+              "density": "red",
+              "force": "blue",
+              "grav": "indigo",
+              "count": "purple"}
+
+#  Show docs if help is requested.
+if len( sys.argv ) == 2 and ( sys.argv[1][0:2] == "-h" or sys.argv[1][0:3] == "--h" ):
+    from pydoc import help
+    help( "__main__" )
+    sys.exit( 0 )
+
+#  Handle command-line.
+if len( sys.argv ) != 3 and len( sys.argv ) != 4:
+    print "Usage: ", sys.argv[0], "input.dat png-output-prefix [time-range-ms]"
+    sys.exit(1)
+
+
+infile = sys.argv[1]
+outbase = sys.argv[2]
+delta_t = 0
+if len( sys.argv ) == 4:
+    delta_t = int(sys.argv[3]) * CPU_CLOCK / 1000
+
+#  Read input.
+data = pl.loadtxt( infile )
+
+nranks = int(max(data[:,0])) + 1
+print "Number of ranks:", nranks
+nthread = int(max(data[:,1])) + 1
+print "Number of threads:", nthread
+
+# Avoid start and end times of zero.
+sdata = data[data[:,5] != 0]
+sdata = sdata[sdata[:,6] != 0]
+
+# Each rank can have different clock (compute node), but we want to use the
+# same delta times range for comparisons, so we suck it up and take the hit of
+# precalculating this, unless the user knows better.
+if delta_t == 0:
+    for rank in range(nranks):
+        data = sdata[sdata[:,0] == rank]
+        dt = max(data[:,6]) - min(data[:,5])
+        if dt > delta_t:
+            delta_t = dt
+
+# Once more doing the real gather and plots this time.
+for rank in range(nranks):
+    data = sdata[sdata[:,0] == rank]
+
+    start_t = min(data[:,5])
+    data[:,5] -= start_t
+    data[:,6] -= start_t
+
+    tasks = {}
+    tasks[-1] = []
+    for i in range(nthread):
+        tasks[i] = []
+
+    num_lines = pl.size(data) / 10
+    for line in range(num_lines):
+        thread = int(data[line,1])
+        tasks[thread].append({})
+        tasks[thread][-1]["type"] = TASKTYPES[int(data[line,2])]
+        tasks[thread][-1]["subtype"] = SUBTYPES[int(data[line,3])]
+        tic = int(data[line,5]) / CPU_CLOCK * 1000
+        toc = int(data[line,6]) / CPU_CLOCK * 1000
+        tasks[thread][-1]["tic"] = tic
+        tasks[thread][-1]["toc"] = toc
+        tasks[thread][-1]["t"] = (toc + tic)/ 2
+
+    combtasks = {}
+    combtasks[-1] = []
+    for i in range(nthread):
+        combtasks[i] = []
+
+    for thread in range(nthread):
+        tasks[thread] = sorted(tasks[thread], key=lambda l: l["t"])
+        lasttype = ""
+        types = []
+        for task in tasks[thread]:
+            if task["type"] not in types:
+                types.append(task["type"])
+            if lasttype == "" or not lasttype == task["type"]:
+                combtasks[thread].append({})
+                combtasks[thread][-1]["type"] = task["type"]
+                combtasks[thread][-1]["subtype"] = task["subtype"]
+                combtasks[thread][-1]["tic"] = task["tic"]
+                combtasks[thread][-1]["toc"] = task["toc"]
+                if task["type"] == "self" or task["type"] == "pair" or task["type"] == "sub":
+                    combtasks[thread][-1]["colour"] = SUBCOLOURS[task["subtype"]]
+                else:
+                    combtasks[thread][-1]["colour"] = TASKCOLOURS[task["type"]]
+                lasttype = task["type"]
+            else:
+                combtasks[thread][-1]["toc"] = task["toc"]
+
+    typesseen = []
+    fig = pl.figure()
+    ax = fig.add_subplot(1,1,1)
+    ax.set_xlim(0, delta_t * 1.03 * 1000 / CPU_CLOCK)
+    ax.set_ylim(0, nthread)
+    tictoc = np.zeros(2)
+    for i in range(nthread):
+
+        #  Collect ranges and colours into arrays.
+        tictocs = np.zeros(len(combtasks[i])*2)
+        colours = np.empty(len(combtasks[i])*2, dtype='object')
+        coloursseen = []
+        j = 0
+        for task in combtasks[i]:
+            tictocs[j] = task["tic"]
+            tictocs[j+1] = task["toc"]
+            colours[j] = task["colour"]
+            colours[j+1] = task["colour"]
+            j = j + 2
+            if task["colour"] not in coloursseen:
+                coloursseen.append(task["colour"])
+
+            #  Legend support, collections don't add to this.
+            if task["subtype"] != "none":
+                qtask = task["type"] + "/" + task["subtype"]
+            else:
+                qtask = task["type"]
+            if qtask not in typesseen:
+                pl.plot([], [], color=task["colour"], label=qtask)
+                typesseen.append(qtask)
+
+        #  Now plot each colour, faster to use a mask to select colour ranges.
+        for colour in coloursseen:
+            collection = collections.BrokenBarHCollection.span_where(tictocs, ymin=i+0.05, ymax=i+0.95,
+                                                                     where=colours == colour,
+                                                                     facecolor=colour,
+                                                                     linewidths=0)
+            ax.add_collection(collection)
+
+
+    #  Legend and room for it.
+    nrow = len(typesseen) / 5
+    if len(typesseen) * 5 < nrow:
+        nrow = nrow + 1
+    ax.fill_between([0, 0], nthread+0.5, nthread + nrow + 0.5, facecolor="white")
+    ax.set_ylim(0, nthread + nrow + 1)
+    ax.legend(loc=1, shadow=True, mode="expand", ncol=5)
+
+    ax.set_xlabel("Wall clock time [ms]")
+    ax.set_ylabel("Thread ID for MPI Rank " + str(rank) )
+    ax.set_yticks(pl.array(range(nthread)), True)
+
+    pl.show()
+    outpng = outbase + str(rank) + ".png"
+    pl.savefig(outpng)
+    print "Graphics done, output written to", outpng
+
+sys.exit(0)
diff --git a/examples/process_plot_tasks_MPI b/examples/process_plot_tasks_MPI
new file mode 100755
index 0000000000000000000000000000000000000000..d3eb5d4a5fc5918b287cd5d98efcc5881b6f910c
--- /dev/null
+++ b/examples/process_plot_tasks_MPI
@@ -0,0 +1,96 @@
+#!/bin/bash
+#
+# Usage:
+#  process_plot_tasks_MPI nprocess time-range-ms
+#
+# Description:
+#  Process all the thread info files in the current directory
+#  creating task graphs for all MPI ranks and all threads.
+#
+#  The input files are created by a run using the "-y interval" flag and
+#  should be named "thread_info_MPI-step<n>.dat" in the current directory.
+#  All located files will be processed using "nprocess" concurrent
+#  processes and all plots will have the given time range. An output
+#  HTML file "index.html" will be created to view all the plots.
+#
+#
+# This file is part of SWIFT:
+#
+#  Copyright (C) 2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+#  All Rights Reserved.
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU Lesser General Public License as published
+#  by the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public License
+#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#  Handle command-line
+if test "$2" == ""; then
+    echo "Usage: $0 nprocess time-range-ms"
+    exit 1
+fi
+NPROCS=$1
+TIMERANGE=$2
+
+#  Find all thread info files. Use version sort to get into correct order.
+files=$(ls -v thread_info_MPI-step*.dat)
+if test $? != 0; then
+    echo "Failed to find any thread info files"
+    exit 1
+fi
+
+#  Construct list of names, the step no and names for the graphics.
+list=""
+for f in $files; do
+    s=$(echo $f| sed 's,thread_info_MPI-step\(.*\).dat,\1,')
+    list="$list $f $s step${s}r"
+done
+
+#  Need number of ranks used.
+basefile=$(echo $list | awk '{print $1}')
+nrank=$(cat $basefile | awk '{print $1}' | uniq | wc -l)
+nrank=$(($nrank-1))
+
+#  And process them,
+echo "Processing thread info files..."
+echo $list | xargs -P $NPROCS -n 3 /bin/bash -c "./plot_tasks_MPI.py \$0 \$2 $TIMERANGE"
+
+echo "Writing output index.html file"
+#  Construct document - serial.
+cat <<EOF > index.html
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <title>SWIFT task graphs</title>
+  </head>
+  <body>
+  <h1>SWIFT task graphs</h1>
+EOF
+
+echo $list | xargs -n 3 | while read f s g; do
+    cat <<EOF >> index.html
+<h2>Step $s</h2>
+EOF
+    for i in $(seq 0 $nrank); do
+        cat <<EOF >> index.html
+<a href="step${s}r${i}.png"><img src="step${s}r${i}.png" width=400px/></a>
+EOF
+    done
+done
+
+cat <<EOF >> index.html
+  </body>
+</html>
+EOF
+
+echo "Finished"
+
+exit
diff --git a/src/Makefile.am b/src/Makefile.am
index 0fdc51571424e2a9ad91d1d7c714d91446702161..c71cbbbbd1b1463a61e8b138fb85525d74ebbe03 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -41,7 +41,7 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
 AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
     serial_io.c timers.c debug.c scheduler.c proxy.c parallel_io.c \
     units.c common_io.c single_io.c multipole.c version.c map.c \
-    kernel.c tools.c
+    kernel.c tools.c part.c
 
 # Include files for distribution, not installation.
 noinst_HEADERS = atomic.h cycle.h error.h inline.h kernel.h vector.h \
diff --git a/src/debug.c b/src/debug.c
index c4d9b43723386128623fa8cd596cbc6260a70086..49273e6c1c19f82f308a24f2d4158bde8aa656d6 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -111,14 +111,14 @@ void printParticle_single(struct part *p) {
  * @brief Dump the METIS graph in standard format, simple format and weights
  * only, to a file.
  *
- * @description The standard format output can be read into the METIS
+ * The standard format output can be read into the METIS
  * command-line tools. The simple format is just the cell connectivity (this
  * should not change between calls).  The weights format is the standard one,
  * minus the cell connectivity.
  *
  * The output filenames are generated from the prefix and the sequence number
- * of calls. So the first is called <prefix>_std_001.dat, <prefix>_simple_001.dat,
- * <prefix>_weights_001.dat, etc.
+ * of calls. So the first is called {prefix}_std_001.dat, {prefix}_simple_001.dat,
+ * {prefix}_weights_001.dat, etc.
  *
  * @param prefix base output filename
  * @param nvertices the number of vertices
diff --git a/src/engine.c b/src/engine.c
index 0911f444d14953f73b45c63b0b5de767c79495ba..a000c4eb4157ce2d1b52c767b0e8d14f700aa1ea 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -28,6 +28,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <stdbool.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
@@ -38,6 +39,10 @@
 #endif
 #endif
 
+#ifdef HAVE_LIBNUMA
+#include <numa.h>
+#endif
+
 /* This object's header. */
 #include "engine.h"
 
@@ -47,8 +52,15 @@
 #include "cycle.h"
 #include "debug.h"
 #include "error.h"
+#include "part.h"
 #include "timers.h"
 
+#ifdef LEGACY_GADGET2_SPH
+#include "runner_iact_legacy.h"
+#else
+#include "runner_iact.h"
+#endif
+
 /* Convert cell location to ID. */
 #define cell_getid(cdim, i, j, k) \
   ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
@@ -209,28 +221,24 @@ void engine_redistribute(struct engine *e) {
         offset_send += counts[ind_send];
         offset_recv += counts[ind_recv];
       } else {
-        if (MPI_Isend(&s->parts[offset_send],
-                      sizeof(struct part) * counts[ind_send], MPI_BYTE, k,
-                      2 * ind_send + 0, MPI_COMM_WORLD,
+        if (MPI_Isend(&s->parts[offset_send], counts[ind_send],
+                      e->part_mpi_type, k, 2 * ind_send + 0, MPI_COMM_WORLD,
                       &reqs[4 * k]) != MPI_SUCCESS)
           error("Failed to isend parts to node %i.", k);
-        if (MPI_Isend(&s->xparts[offset_send],
-                      sizeof(struct xpart) * counts[ind_send], MPI_BYTE, k,
-                      2 * ind_send + 1, MPI_COMM_WORLD,
+        if (MPI_Isend(&s->xparts[offset_send], counts[ind_send],
+                      e->xpart_mpi_type, k, 2 * ind_send + 1, MPI_COMM_WORLD,
                       &reqs[4 * k + 1]) != MPI_SUCCESS)
           error("Failed to isend xparts to node %i.", k);
         offset_send += counts[ind_send];
       }
     }
     if (k != nodeID && counts[ind_recv] > 0) {
-      if (MPI_Irecv(&parts_new[offset_recv],
-                    sizeof(struct part) * counts[ind_recv], MPI_BYTE, k,
-                    2 * ind_recv + 0, MPI_COMM_WORLD,
+      if (MPI_Irecv(&parts_new[offset_recv], counts[ind_recv], e->part_mpi_type,
+                    k, 2 * ind_recv + 0, MPI_COMM_WORLD,
                     &reqs[4 * k + 2]) != MPI_SUCCESS)
         error("Failed to emit irecv of parts from node %i.", k);
-      if (MPI_Irecv(&xparts_new[offset_recv],
-                    sizeof(struct xpart) * counts[ind_recv], MPI_BYTE, k,
-                    2 * ind_recv + 1, MPI_COMM_WORLD,
+      if (MPI_Irecv(&xparts_new[offset_recv], counts[ind_recv],
+                    e->xpart_mpi_type, k, 2 * ind_recv + 1, MPI_COMM_WORLD,
                     &reqs[4 * k + 3]) != MPI_SUCCESS)
         error("Failed to emit irecv of parts from node %i.", k);
       offset_recv += counts[ind_recv];
@@ -1114,7 +1122,7 @@ void engine_maketasks(struct engine *e) {
   /* Allocate the list of cell-task links. The maximum number of links
      is the number of cells (s->tot_cells) times the number of neighbours (27)
      times the number of interaction types (2, density and force). */
-  free(e->links);
+  if (e->links != NULL) free(e->links);
   e->size_links = s->tot_cells * 27 * 2;
   if ((e->links = malloc(sizeof(struct link) * e->size_links)) == NULL)
     error("Failed to allocate cell-task links.");
@@ -1825,6 +1833,13 @@ void engine_launch(struct engine *e, int nr_runners, unsigned int mask) {
  */
 void engine_init_particles(struct engine *e) {
 
+  int k;
+  float dt_max = 0.0f, dt_min = FLT_MAX;
+  double epot = 0.0, ekin = 0.0;
+  float mom[3] = {0.0, 0.0, 0.0};
+  float ang[3] = {0.0, 0.0, 0.0};
+  int count = 0;
+  struct cell *c;
   struct space *s = e->s;
 
   // engine_repartition(e);
@@ -2151,6 +2166,26 @@ void engine_split(struct engine *e, int *grid) {
   s->xparts = xparts_new;
 }
 
+
+#if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
+ static bool hyperthreads_present(void) {
+   #ifdef __linux__
+   FILE *f = fopen("/sys/devices/system/cpu/cpu0/topology/thread_siblings_list", "r");
+
+   int c;
+   while ((c = fgetc(f)) != EOF && c != ',');
+   fclose(f);
+
+   return c == ',';
+   #else
+   return true; // just guess
+   #endif
+ }
+#endif
+ 
+
+
+ 
 /**
  * @brief init an engine with the given number of threads, queues, and
  *      the given policy.
@@ -2171,10 +2206,10 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
                  int nr_queues, int nr_nodes, int nodeID, int policy,
                  float timeBegin, float timeEnd, float dt_min, float dt_max) {
 
-  int k;
+  int i, k;
 #if defined(HAVE_SETAFFINITY)
   int nr_cores = sysconf(_SC_NPROCESSORS_ONLN);
-  int i, j, cpuid[nr_cores];
+  int j, cpuid[nr_cores];
   cpu_set_t cpuset;
   if (policy & engine_policy_cputight) {
     for (k = 0; k < nr_cores; k++) cpuid[k] = k;
@@ -2189,6 +2224,42 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
       for (j = maxint / i / 2; j < maxint; j += maxint / i)
         if (j < nr_cores && j != 0) cpuid[k++] = j;
 
+#if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
+    /* Ascending NUMA distance. Bubblesort(!) for stable equidistant CPUs. */
+    if (numa_available() >= 0) {
+      if (nodeID == 0) message("prefer NUMA-local CPUs");
+
+      int home = numa_node_of_cpu(sched_getcpu()), half = nr_cores / 2;
+      bool done = false, swap_hyperthreads = hyperthreads_present();
+      if (swap_hyperthreads) message("prefer physical cores to hyperthreads");
+
+      while (!done) {
+        done = true;
+        for (i = 1; i < nr_cores; i++) {
+          int node_a = numa_node_of_cpu(cpuid[i-1]);
+          int node_b = numa_node_of_cpu(cpuid[i]);
+
+          /* Avoid using local hyperthreads over unused remote physical cores.
+           * Assume two hyperthreads, and that cpuid >= half partitions them.
+           */
+          int thread_a = swap_hyperthreads && cpuid[i-1] >= half;
+          int thread_b = swap_hyperthreads && cpuid[i] >= half;
+
+          bool swap = thread_a > thread_b;
+          if (thread_a == thread_b)
+            swap = numa_distance(home, node_a) > numa_distance(home, node_b);
+
+          if (swap) {
+            int t = cpuid[i-1];
+            cpuid[i-1] = cpuid[i];
+            cpuid[i] = t;
+            done = false;
+          }
+        }
+      }
+    }
+#endif
+
     if (nodeID == 0) {
 #ifdef WITH_MPI
       message("engine_init: cpu map is [ ");
@@ -2250,11 +2321,13 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
   e->barrier_launch = 0;
   e->barrier_launchcount = 0;
 
-  /* Init the scheduler. */
-  scheduler_init(&e->sched, e->s, nr_queues, scheduler_flag_steal, e->nodeID);
-  s->nr_queues = nr_queues;
 
-  scheduler_reset(&e->sched, 2 * s->tot_cells + e->nr_threads);
+  /* Init the scheduler with sufficient tasks for the initial kick1 and sorting
+     tasks. */
+  int nr_tasks = 2 * s->tot_cells + e->nr_threads;
+  scheduler_init(&e->sched, e->s, nr_tasks, nr_queues, scheduler_flag_steal,
+                 e->nodeID);
+  s->nr_queues = nr_queues;
 
   /* Create the sorting tasks. */
   for (i = 0; i < e->nr_threads; i++)
diff --git a/src/engine.h b/src/engine.h
index eef97bb7ce9d3fed0d1044393cb97163b37ff1af..b70e609325d6abe707c8281d95798adb8191dd0f 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -19,6 +19,14 @@
 #ifndef SWIFT_ENGINE_H
 #define SWIFT_ENGINE_H
 
+/* Config parameters. */
+#include "../config.h"
+
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
 /* Some standard headers. */
 #include <pthread.h>
 
@@ -131,6 +139,12 @@ struct engine {
   /* Linked list for cell-task association. */
   struct link *links;
   int nr_links, size_links;
+
+#ifdef WITH_MPI
+  /* MPI data type for the particle transfers */
+  MPI_Datatype part_mpi_type;
+  MPI_Datatype xpart_mpi_type;
+#endif
 };
 
 /* Function prototypes. */
diff --git a/src/kernel.h b/src/kernel.h
index d55e073479d1585f3677389939fa8e7c53dbe1c9..aead6a95adc35028834d671448223a31a57fc2b6 100644
--- a/src/kernel.h
+++ b/src/kernel.h
@@ -516,16 +516,17 @@ __attribute__((always_inline)) INLINE static void kernel_eval(float x,
 #define kernel_name "Wendland C2"
 #define kernel_degree 5
 #define kernel_ivals 1
-#define kernel_gamma 1.f
-#define kernel_gamma2 1.f
-#define kernel_gamma3 1.f
-#define kernel_igamma 1.f
+#define kernel_gamma 2.f
+#define kernel_gamma2 4.f
+#define kernel_gamma3 8.f
+#define kernel_igamma 0.5f
 #define kernel_nwneigh                                                      \
   (4.0 / 3.0 * M_PI *const_eta_kernel *const_eta_kernel *const_eta_kernel * \
    7.261825f)
 static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
-    __attribute__((aligned(16))) = {4.0f, -15.0f, 20.0f, -10.0f, 0.0f, 1.0f,
-                                    0.0f, 0.0f,   0.0f,  0.0f,   0.0f, 0.0f};
+    __attribute__((aligned(16))) = {
+        0.05222272f, -0.39167037f, 1.04445431f, -1.04445431f, 0.f,  0.41778173f,
+        0.0f,        0.0f,         0.0f,        0.0f,         0.0f, 0.0f};
 #define kernel_root (kernel_coeffs[kernel_degree])
 #define kernel_wroot (4.0 / 3.0 * M_PI *kernel_coeffs[kernel_degree])
 
@@ -537,7 +538,7 @@ static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
 __attribute__((always_inline)) INLINE static void kernel_deval(float x,
                                                                float *W,
                                                                float *dW_dx) {
-  int ind = fminf(x, kernel_ivals);
+  int ind = fminf(0.5f * x, kernel_ivals);
   float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
   float w = coeffs[0] * x + coeffs[1];
   float dw_dx = coeffs[0];
@@ -563,7 +564,7 @@ __attribute__((always_inline))
   int j, k;
 
   /* Load x and get the interval id. */
-  ind.m = vec_ftoi(vec_fmin(x->v, vec_set1((float)kernel_ivals)));
+  ind.m = vec_ftoi(vec_fmin(0.5f * x->v, vec_set1((float)kernel_ivals)));
 
   /* load the coefficients. */
   for (k = 0; k < VEC_SIZE; k++)
@@ -590,7 +591,7 @@ __attribute__((always_inline))
 
 __attribute__((always_inline)) INLINE static void kernel_eval(float x,
                                                               float *W) {
-  int ind = fmin(x, kernel_ivals);
+  int ind = fmin(0.5f * x, kernel_ivals);
   float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
   float w = coeffs[0] * x + coeffs[1];
   for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k];
diff --git a/src/part.c b/src/part.c
new file mode 100644
index 0000000000000000000000000000000000000000..6d49887ad72d6afdd1f419cfdaca88090e781f3e
--- /dev/null
+++ b/src/part.c
@@ -0,0 +1,68 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* Config parameters. */
+#include "../config.h"
+
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+/* This object's header. */
+#include "part.h"
+
+#ifdef WITH_MPI
+/**
+ * @brief Registers and returns an MPI type for the particles
+ *
+ * @param part_type The type container
+ */
+void part_create_mpi_type(MPI_Datatype* part_type) {
+
+  /* This is not the recommended way of doing this.
+     One should define the structure field by field
+     But as long as we don't do serialization via MPI-IO
+     we don't really care.
+     Also we would have to modify this function everytime something
+     is added to the part structure. */
+  MPI_Type_contiguous(sizeof(struct part) / sizeof(unsigned char), MPI_BYTE,
+                      part_type);
+  MPI_Type_commit(part_type);
+}
+
+/**
+ * @brief Registers and returns an MPI type for the xparticles
+ *
+ * @param xpart_type The type container
+ */
+void xpart_create_mpi_type(MPI_Datatype* xpart_type) {
+
+  /* This is not the recommended way of doing this.
+     One should define the structure field by field 
+     But as long as we don't do serialization via MPI-IO 
+     we don't really care. 
+     Also we would have to modify this function everytime something 
+     is added to the part structure. */
+  MPI_Type_contiguous(sizeof(struct xpart) / sizeof(unsigned char), MPI_BYTE,
+                      xpart_type);
+  MPI_Type_commit(xpart_type);
+}
+
+#endif
diff --git a/src/part.h b/src/part.h
index 02c9f2d8fb95ba7810f04a521e74b7516a274486..f6e3c9c94b93fe88df732cd8a69402a810bbcb2a 100644
--- a/src/part.h
+++ b/src/part.h
@@ -19,6 +19,18 @@
 #ifndef SWIFT_PART_H
 #define SWIFT_PART_H
 
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
+#include <stdlib.h>
+
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+/* Local headers. */
 #include "const.h"
 
 /* Some constants. */
@@ -72,7 +84,7 @@ struct gpart {
     size_t id;
 
     /* Pointer to corresponding SPH part. */
-    struct part *part;
+    struct part* part;
   };
 
 } __attribute__((aligned(part_align)));
@@ -165,8 +177,13 @@ struct part {
   unsigned long long id;
 
   /* Associated gravitas. */
-  struct gpart *gpart;
+  struct gpart* gpart;
 
 } __attribute__((aligned(part_align)));
 
+#ifdef WITH_MPI
+void part_create_mpi_type(MPI_Datatype* part_type);
+void xpart_create_mpi_type(MPI_Datatype* xpart_type);
+#endif
+
 #endif /* SWIFT_PART_H */
diff --git a/src/queue.c b/src/queue.c
index 4e5330518c3d067a3e8a8ce7806e3560a41cc8ef..af84c4b1f5e7e80fac05e5b122703fa718b7fead 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -126,15 +126,14 @@ void queue_init(struct queue *q, struct task *tasks) {
  * @brief Get a task free of dependencies and conflicts.
  *
  * @param q The task #queue.
- * @param super The super-cell tat might conflict with the #queue
+ * @param prev The previous #task extracted from this #queue.
  * @param blocking Block until access to the queue is granted.
  */
 
-struct task *queue_gettask(struct queue *q, struct cell *super, int blocking) {
+struct task *queue_gettask(struct queue *q, const struct task *prev, int blocking) {
 
-  int k, qcount, *qtid, gotcha;
   lock_type *qlock = &q->lock;
-  struct task *qtasks, *res = NULL;
+  struct task *res = NULL;
 
   /* If there are no tasks, leave immediately. */
   if (q->count == 0) return NULL;
@@ -147,49 +146,78 @@ struct task *queue_gettask(struct queue *q, struct cell *super, int blocking) {
   }
 
   /* Set some pointers we will use often. */
-  qtid = q->tid;
-  qtasks = q->tasks;
-  qcount = q->count;
-  gotcha = 0;
-
-  /* Loop over the task IDs looking for tasks with the same super-cell. */
-  if (super != NULL) {
-    for (k = 0; k < qcount && k < queue_maxsuper; k++) {
-
-      /* Put a finger on the task. */
-      res = &qtasks[qtid[k]];
-
-      /* Try to lock the task and exit if successful. */
-      if ((res->ci->super == super ||
-           (res->cj != NULL && res->cj->super == super)) &&
-          task_lock(res)) {
-        gotcha = 1;
+  int *qtid = q->tid;
+  struct task *qtasks = q->tasks;
+  const int qcount = q->count;
+
+  /* Data for the sliding window in which to try the task with the
+     best overlap with the previous task. */
+  struct {
+    int ind, tid;
+    float score;
+  } window[queue_search_window];
+  int window_count = 0;
+  int tid = -1;
+  int ind = -1;
+
+  /* Loop over the queue entries. */
+  for (int k = 0; k < qcount; k++) {
+    if (k < queue_search_window) {
+      window[window_count].ind = k;
+      window[window_count].tid = qtid[k];
+      window[window_count].score = task_overlap(prev, &qtasks[qtid[k]]);
+      window_count += 1;
+    } else {
+      /* Find the task with the largest overlap. */
+      int ind_max = 0;
+      for (int i = 1; i < window_count; i++)
+        if (window[i].score > window[ind_max].score) ind_max = i;
+
+      /* Try to lock that task. */
+      if (task_lock(&qtasks[window[ind_max].tid])) {
+        tid = window[ind_max].tid;
+        ind = window[ind_max].ind;
+        // message("best task has overlap %f.", window[ind_max].score);
         break;
-      }
 
-    } /* loop over the task IDs. */
+        /* Otherwise, replace it with a new one from the queue. */
+      } else {
+        window[ind_max].ind = k;
+        window[ind_max].tid = qtid[k];
+        window[ind_max].score = task_overlap(prev, &qtasks[qtid[k]]);
+      }
+    }
   }
 
-  /* Loop over the task IDs again if nothing was found, take anything. */
-  if (!gotcha) {
-    for (k = 0; k < qcount; k++) {
-
-      /* Put a finger on the task. */
-      res = &qtasks[qtid[k]];
-
-      /* Try to lock the task and exit if successful. */
-      if (task_lock(res)) break;
-
-    } /* loop over the task IDs. */
+  /* If we didn't get a task, loop through whatever is left in the window. */
+  if (tid < 0) {
+    while (window_count > 0) {
+      int ind_max = 0;
+      for (int i = 1; i < window_count; i++)
+        if (window[i].score > window[ind_max].score) ind_max = i;
+      if (task_lock(&qtasks[window[ind_max].tid])) {
+        tid = window[ind_max].tid;
+        ind = window[ind_max].ind;
+        // message("best task has overlap %f.", window[ind_max].score);
+        break;
+      } else {
+        window_count -= 1;
+        window[ind_max] = window[window_count];
+      }
+    }
   }
 
   /* Did we get a task? */
-  if (k < qcount) {
+  if (ind >= 0) {
 
     /* Another one bites the dust. */
-    qcount = q->count -= 1;
+    const int qcount = q->count -= 1;
+    
+    /* Get a pointer on the task that we want to return. */
+    res = &qtasks[tid];
 
     /* Swap this task with the last task and re-heap. */
+    int k = ind;
     if (k < qcount) {
       qtid[k] = qtid[qcount];
       int w = qtasks[qtid[k]].weight;
diff --git a/src/queue.h b/src/queue.h
index d81627b4c4a709498c31c6c6152d3650388020e2..7f6d13c425f80cac20125bf422fe9da1ed06361f 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -28,6 +28,7 @@
 #define queue_maxsuper 50
 #define queue_sizeinit 100
 #define queue_sizegrow 2
+#define queue_search_window 8
 
 /* Counters. */
 enum {
@@ -54,7 +55,7 @@ struct queue {
 } __attribute__((aligned(64)));
 
 /* Function prototypes. */
-struct task *queue_gettask(struct queue *q, struct cell *super, int blocking);
+struct task *queue_gettask(struct queue *q, const struct task *prev, int blocking);
 void queue_init(struct queue *q, struct task *tasks);
 void queue_insert(struct queue *q, struct task *t);
 
diff --git a/src/runner.c b/src/runner.c
index 30777c446fc40e1ae7cda0bcb4d3b9087965aa09..4440c3ac7d0ac7ffc3436947ab445073cd788629 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -44,9 +44,6 @@
 #include "timers.h"
 #include "timestep.h"
 
-//struct task *store = NULL;
-
-
 /* Include the right variant of the SPH interactions */
 #ifdef LEGACY_GADGET2_SPH
 #include "runner_iact_legacy.h"
@@ -55,25 +52,7 @@
 #endif
 #include "runner_iact_grav.h"
 
-#define PRINT_PART                                                          \
-  if (p->id == 1000) {                                                      \
-    message(                                                                \
-        "p->id=%lld p->h=%3.2f p->N_ngb=%3.2f p->rho=%3.2f p->t_beg=%3.2f p->t_end=%3.2f pos=[%3.2f %3.2f %3.2f] a=[%3.2f %3.2f %3.2f]", \
-        p->id, p->h, p->density.wcount, p->rho, p->t_begin, p->t_end, p->x[0], p->x[1], p->x[2], p->a[0], p->a[1], p->a[2]); \
-  }
-
-
-
-/* Convert cell location to ID. */
-#define cell_getid(cdim, i, j, k) \
-  ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
-
-/* Histograms bins. */
-long long int runner_hist_bins[runner_hist_N];
-
-/* The counters. */
-int runner_counter[runner_counter_count];
-
+/* Orientation of the cell pairs */
 const float runner_shift[13 * 3] = {
     5.773502691896258e-01, 5.773502691896258e-01,  5.773502691896258e-01,
     7.071067811865475e-01, 7.071067811865475e-01,  0.0,
@@ -88,6 +67,8 @@ const float runner_shift[13 * 3] = {
     0.0,                   1.0,                    0.0,
     0.0,                   7.071067811865475e-01,  -7.071067811865475e-01,
     0.0,                   0.0,                    1.0, };
+
+/* Does the axis need flipping ? */
 const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
                               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 
@@ -205,8 +186,6 @@ void runner_dosort(struct runner *r, struct cell *c, int flags, int clock) {
 
   TIMER_TIC
 
-  //    message("sort!");
-
   /* Clean-up the flags, i.e. filter out what's already been sorted. */
   flags &= ~c->sorted;
   if (flags == 0) return;
@@ -547,7 +526,7 @@ void runner_doinit(struct runner *r, struct cell *c) {
       for (k = 0; k < 3; k++) p->density.curl_v[k] = 0.0;
     }
 
-    PRINT_PART;
+    ;
   }
 }
 
@@ -621,7 +600,7 @@ void runner_doghost(struct runner *r, struct cell *c) {
         wcount_dh =
             p->density.wcount_dh * ih * (4.0f / 3.0 * M_PI * kernel_gamma3);
 
-        PRINT_PART;
+        ;
         // if(p->id==1000)
         //  message("wcount_dh=%f", wcount_dh);
 
@@ -723,8 +702,6 @@ void runner_doghost(struct runner *r, struct cell *c) {
         /* Run through this cell's density interactions. */
         for (struct link *l = finger->density; l != NULL; l = l->next) {
 
-          // message("link: %p next: %p", l, l->next); fflush(stdout);
-
           /* Self-interaction? */
           if (l->t->type == task_type_self)
             runner_doself_subset_density(r, finger, parts, pid, count);
@@ -732,8 +709,6 @@ void runner_doghost(struct runner *r, struct cell *c) {
           /* Otherwise, pair interaction? */
           else if (l->t->type == task_type_pair) {
 
-            // message("pair");
-
             /* Left or right? */
             if (l->t->ci == finger)
               runner_dopair_subset_density(r, finger, parts, pid, count,
@@ -747,8 +722,6 @@ void runner_doghost(struct runner *r, struct cell *c) {
           /* Otherwise, sub interaction? */
           else if (l->t->type == task_type_sub) {
 
-            // message("sub");
-
             /* Left or right? */
             if (l->t->ci == finger)
               runner_dosub_subset_density(r, finger, parts, pid, count,
@@ -758,7 +731,6 @@ void runner_doghost(struct runner *r, struct cell *c) {
                                           l->t->ci, -1, 1);
           }
         }
-        // error("done");
       }
     }
   }
@@ -791,6 +763,7 @@ void runner_dodrift(struct runner *r, struct cell *c, int timer) {
   struct xpart *restrict xp, *restrict xparts = c->xparts;
 
   TIMER_TIC
+
   /* No children? */
   if (!c->split) {
 
@@ -805,7 +778,7 @@ void runner_dodrift(struct runner *r, struct cell *c, int timer) {
       h = p->h;
       ih = 1.0f / h;
 
-      PRINT_PART;
+      ;
 
       /* Drift... */
       p->x[0] += xp->v_full[0] * dt;
@@ -907,6 +880,7 @@ void runner_dokick(struct runner *r, struct cell *c, int timer) {
       /* Get a handle on the part. */
       p = &parts[k];
       xp = &xparts[k];
+
       m = p->mass;
       x[0] = p->x[0], x[1] = p->x[1], x[2] = p->x[2];
 
@@ -948,7 +922,6 @@ void runner_dokick(struct runner *r, struct cell *c, int timer) {
         p->t_begin = p->t_end;
         p->t_end = p->t_begin + dt;
 
-        PRINT_PART
 
         /* Kick particles in momentum space */
         xp->v_full[0] += p->a[0] * dt;
@@ -990,7 +963,7 @@ void runner_dokick(struct runner *r, struct cell *c, int timer) {
 
   }
 
-  /* Otherwise, agregate data from children. */
+  /* Otherwise, aggregate data from children. */
   else {
 
     /* Init everything. */
@@ -1074,7 +1047,7 @@ void *runner_main(void *data) {
   struct engine *e = r->e;
   struct scheduler *sched = &e->sched;
   struct task *t = NULL;
-  struct cell *ci, *cj, *super;
+  struct cell *ci, *cj;
   struct part *parts;
   int k, nr_parts;
 
@@ -1084,8 +1057,8 @@ void *runner_main(void *data) {
     /* Wait at the barrier. */
     engine_barrier(e, r->id);
 
-    /* Re-set the pointer to the previous super cell. */
-    super = NULL;
+    /* Re-set the pointer to the previous task, as there is none. */
+    struct task* prev = NULL;
 
     /* Loop while there are tasks... */
     while (1) {
@@ -1095,11 +1068,9 @@ void *runner_main(void *data) {
 
         /* Get the task. */
         TIMER_TIC
-        t = scheduler_gettask(sched, r->qid, super);
+        t = scheduler_gettask(sched, r->qid, prev);
         TIMER_TOC(timer_gettask);
 
-        // message("Got task %p", t->type); fflush(stdout);
-
         /* Did I get anything? */
         if (t == NULL) break;
       }
@@ -1109,18 +1080,9 @@ void *runner_main(void *data) {
       cj = t->cj;
       t->rid = r->cpuid;
 
-      /* Set super to the first cell that I own. */
-      if (t->type != task_type_rewait && t->type != task_type_psort) {
-        if (ci->super != NULL && ci->super->owner == r->qid)
-          super = ci->super;
-        else if (cj != NULL && cj->super != NULL && cj->super->owner == r->qid)
-          super = cj->super;
-      }
-
       /* Different types of tasks... */
       switch (t->type) {
         case task_type_self:
-	  //message("self");
           if (t->subtype == task_subtype_density)
             runner_doself1_density(r, ci);
           else if (t->subtype == task_subtype_force)
@@ -1129,7 +1091,6 @@ void *runner_main(void *data) {
             error("Unknown task subtype.");
           break;
         case task_type_pair:
-	  //message("pair unlocking %d tasks", t->nr_unlock_tasks);
           if (t->subtype == task_subtype_density)
             runner_dopair1_density(r, ci, cj);
           else if (t->subtype == task_subtype_force)
@@ -1155,11 +1116,9 @@ void *runner_main(void *data) {
           runner_doinit(r, ci);
           break;
         case task_type_ghost:
-	  //message("ghost");
           runner_doghost(r, ci);
           break;
         case task_type_drift:
-	  
           runner_dodrift(r, ci, 1);
           break;
         case task_type_kick:
@@ -1195,13 +1154,14 @@ void *runner_main(void *data) {
           space_split(e->s, t->ci);
           break;
         case task_type_rewait:
-	  task_do_rewait(t);
+	  scheduler_do_rewait((struct task *)t->ci, (struct task *)t->cj, t->flags);
           break;
         default:
           error("Unknown task type.");
       }
 
       /* We're done with this task, see if we get a next one. */
+      prev = t;
       t = scheduler_done(sched, t);
 
     } /* main loop. */
diff --git a/src/runner.h b/src/runner.h
index ce4b8405bcde69d19990fbcfee257ddafc717b6e..366c4f7af6ec94c0b366da85b21713600d559dc0 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -19,53 +19,12 @@
 #ifndef SWIFT_RUNNER_H
 #define SWIFT_RUNNER_H
 
-/* Some standard headers. */
-#include <pthread.h>
-
 /* Includes. */
 #include "cell.h"
 #include "inline.h"
 
-/* Forward-declare the engine type to avoid cyclic header dependencies. */
-struct engine;
-
-/* Some constants/flags. */
-#define runner_prefetch 0
-
-/* SID stuff. */
-extern const char runner_flip[];
-
-/* Counters. */
-enum runner_counters {
-  runner_counter_swap = 0,
-  runner_counter_stall,
-  runner_counter_steal_stall,
-  runner_counter_steal_empty,
-  runner_counter_keep,
-  runner_counter_iact,
-  runner_counter_count,
-};
-extern int runner_counter[runner_counter_count];
-
-/* Counter macros. */
-#ifdef COUNTER
-#define COUNT(c) (__sync_add_and_fetch(&runner_counter[c], 1))
-#else
-#define COUNT(c)
-#endif
-
-/* Histogram functions. */
-#define runner_hist_a 1.0
-#define runner_hist_b 100.0
-#define runner_hist_N 99
-extern long long int runner_hist_bins[runner_hist_N];
-#define runner_hist_hit(x)                                                   \
-  __sync_add_and_fetch(                                                      \
-      &runner_hist_bins[(int)fmax(                                           \
-          0.0, fmin(runner_hist_N - 1, ((x) - runner_hist_a) /               \
-                                           (runner_hist_b - runner_hist_a) * \
-                                           runner_hist_N))],                 \
-      1)
+extern const float runner_shift[13 * 3];
+extern const char runner_flip[27];
 
 /* A struct representing a runner's thread and its data. */
 struct runner {
diff --git a/src/scheduler.c b/src/scheduler.c
index 726f71c6ede452ca79bc6fe45611005d9538c812..41826ce22181ebb3ed625aac07e327bb49461640 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -1095,8 +1095,8 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         break;
       case task_type_recv:
 #ifdef WITH_MPI
-        if ((err = MPI_Irecv(t->ci->parts, sizeof(struct part) * t->ci->count,
-                             MPI_BYTE, t->ci->nodeID, t->flags, MPI_COMM_WORLD,
+        if ((err = MPI_Irecv(t->ci->parts, t->ci->count, s->part_mpi_type,
+                             t->ci->nodeID, t->flags, MPI_COMM_WORLD,
                              &t->req)) != MPI_SUCCESS) {
           char buff[MPI_MAX_ERROR_STRING];
           int len;
@@ -1113,9 +1113,9 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         break;
       case task_type_send:
 #ifdef WITH_MPI
-        if ((err = MPI_Isend(t->ci->parts, sizeof(struct part) * t->ci->count,
-                             MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD,
-                             &t->req)) != MPI_SUCCESS) {
+        if ((err = MPI_Isend(t->ci->parts, t->ci->count, s->part_mpi_type, 
+			     t->cj->nodeID, t->flags, MPI_COMM_WORLD, 
+			     &t->req)) != MPI_SUCCESS) {
           char buff[MPI_MAX_ERROR_STRING];
           int len;
           MPI_Error_string(err, buff, &len);
@@ -1242,13 +1242,13 @@ struct task *scheduler_unlock(struct scheduler *s, struct task *t) {
  *
  * @param s The #scheduler.
  * @param qid The ID of the preferred #queue.
- * @param super the super-cell
+ * @param prev the previous task that was run.
  *
  * @return A pointer to a #task or @c NULL if there are no available tasks.
  */
 
 struct task *scheduler_gettask(struct scheduler *s, int qid,
-                               struct cell *super) {
+                               const struct task *prev) {
 
   struct task *res = NULL;
   int k, nr_queues = s->nr_queues;
@@ -1267,7 +1267,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
       /* Try to get a task from the suggested queue. */
       if (s->queues[qid].count > 0) {
         TIMER_TIC
-        res = queue_gettask(&s->queues[qid], super, 0);
+        res = queue_gettask(&s->queues[qid], prev, 0);
         TIMER_TOC(timer_qget);
         if (res != NULL) break;
       }
@@ -1280,7 +1280,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
         for (k = 0; k < scheduler_maxsteal && count > 0; k++) {
           int ind = rand_r(&seed) % count;
           TIMER_TIC
-          res = queue_gettask(&s->queues[qids[ind]], super, 0);
+          res = queue_gettask(&s->queues[qids[ind]], prev, 0);
           TIMER_TOC(timer_qsteal);
           if (res != NULL)
             break;
@@ -1318,15 +1318,14 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
  *
  * @param s The #scheduler.
  * @param space The #space we are working with
+ * @param nr_tasks The number of tasks to allocate initially.
  * @param nr_queues The number of queues in this scheduler.
  * @param flags The #scheduler flags.
  * @param nodeID The MPI rank
  */
 
-void scheduler_init(struct scheduler *s, struct space *space, int nr_queues,
-                    unsigned int flags, int nodeID) {
-
-  int k;
+void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
+                    int nr_queues, unsigned int flags, int nodeID) {
 
   /* Init the lock. */
   lock_init(&s->lock);
@@ -1337,7 +1336,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_queues,
     error("Failed to allocate queues.");
 
   /* Initialize each queue. */
-  for (k = 0; k < nr_queues; k++) queue_init(&s->queues[k], NULL);
+  for (int k = 0; k < nr_queues; k++) queue_init(&s->queues[k], NULL);
 
   /* Init the sleep mutex and cond. */
   if (pthread_cond_init(&s->sleep_cond, NULL) != 0 ||
@@ -1359,13 +1358,17 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_queues,
   s->space = space;
   s->nodeID = nodeID;
 
-  /* Init other values. */
+  /* Init the tasks array. */
+  s->size = 0;
   s->tasks = NULL;
   s->tasks_ind = NULL;
-  s->waiting = 0;
-  s->size = 0;
-  s->nr_tasks = 0;
-  s->tasks_next = 0;
+  scheduler_reset(s, nr_tasks);
+
+/* Construct types for MPI communications */
+#ifdef WITH_MPI
+  part_create_mpi_type(&s->part_mpi_type);
+  xpart_create_mpi_type(&s->xpart_mpi_type);
+#endif
 }
 
 /**
@@ -1392,3 +1395,31 @@ void scheduler_print_tasks(struct scheduler *s, char *fileName) {
 
   fclose(file);
 }
+
+/**
+ * @brief Sets the waits of the dependants of a range of task
+ *
+ * @param t_begin Beginning of the #task range
+ * @param t_end End of the #task range
+ * @param mask The scheduler mask
+ */
+void scheduler_do_rewait(struct task *t_begin, struct task *t_end,
+			  unsigned int mask) {
+  for (struct task *t2 = t_begin; t2 != t_end; t2++) {
+    
+    if (t2->skip) continue;
+    
+    /* Skip tasks not in the mask */
+    if (!((1 << t2->type) & mask)) continue;
+    
+    /* Skip sort tasks that have already been performed */
+    if (t2->type == task_type_sort && t2->flags == 0) continue;
+    
+    /* Sets the waits of the dependances */
+    for (int k = 0; k < t2->nr_unlock_tasks; k++) {
+      struct task *t3 = t2->unlock_tasks[k];
+      atomic_inc(&t3->wait);
+    }
+  }
+}
+ 
diff --git a/src/scheduler.h b/src/scheduler.h
index f87734210689dc4ecb7f89572c3494614ee826a3..fe2a979450f9af7cf4b2450225bdce364543a237 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -20,6 +20,14 @@
 #ifndef SWIFT_SCHEDULER_H
 #define SWIFT_SCHEDULER_H
 
+/* Config parameters. */
+#include "../config.h"
+
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
 /* Some standard headers. */
 #include <pthread.h>
 
@@ -89,13 +97,19 @@ struct scheduler {
 
   /* The node we are working on. */
   int nodeID;
+
+#ifdef WITH_MPI
+  /* MPI data type for the particle transfers */
+  MPI_Datatype part_mpi_type;
+  MPI_Datatype xpart_mpi_type;
+#endif
 };
 
 /* Function prototypes. */
-void scheduler_init(struct scheduler *s, struct space *space, int nr_queues,
-                    unsigned int flags, int nodeID);
+void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
+                    int nr_queues, unsigned int flags, int nodeID);
 struct task *scheduler_gettask(struct scheduler *s, int qid,
-                               struct cell *super);
+                               const struct task* prev);
 void scheduler_enqueue(struct scheduler *s, struct task *t);
 void scheduler_start(struct scheduler *s, unsigned int mask);
 void scheduler_reset(struct scheduler *s, int nr_tasks);
@@ -111,5 +125,7 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta, struct task *tb);
 void scheduler_set_unlocks(struct scheduler *s);
 void scheduler_dump_queue(struct scheduler *s);
 void scheduler_print_tasks(struct scheduler *s, char *fileName);
+void scheduler_do_rewait(struct task *t_begin, struct task *t_end,
+			 unsigned int mask);
 
 #endif /* SWIFT_SCHEDULER_H */
diff --git a/src/task.c b/src/task.c
index a388cb69a3b145ab14b274d3470ce7c192e1aa3f..452ffd0e721fa2e8467e906f6b19f1d02db29fd5 100644
--- a/src/task.c
+++ b/src/task.c
@@ -41,8 +41,6 @@
 #include "error.h"
 #include "lock.h"
 
-//struct task *store;
-
 /* Task type names. */
 const char *taskID_names[task_type_count] = {
     "none",    "sort",    "self",    "pair",     "sub",  "init",
@@ -53,6 +51,56 @@ const char *taskID_names[task_type_count] = {
 const char *subtaskID_names[task_type_count] = {"none",  "density",
                                                 "force", "grav"};
 
+
+/**
+ * @brief Computes the overlap between the parts array of two given cells.
+ */
+
+size_t task_cell_overlap(const struct cell *ci, const struct cell *cj) {
+  if (ci == NULL || cj == NULL) return 0;
+  if (ci->parts <= cj->parts &&
+      ci->parts + ci->count >= cj->parts + cj->count) {
+    return cj->count;
+  } else if (cj->parts <= ci->parts &&
+             cj->parts + cj->count >= ci->parts + ci->count) {
+    return ci->count;
+  }
+  return 0;
+}
+
+/**
+ * @brief Compute the Jaccard similarity of the data used by two
+ *        different tasks.
+ *
+ * @param ta The first #task.
+ * @param tb The second #task.
+ */
+
+float task_overlap(const struct task *ta, const struct task *tb) {
+  /* First check if any of the two tasks are of a type that don't
+     use cells. */
+  if (ta == NULL || tb == NULL || ta->type == task_type_none ||
+      ta->type == task_type_psort || ta->type == task_type_split_cell ||
+      ta->type == task_type_rewait || tb->type == task_type_none ||
+      tb->type == task_type_psort || tb->type == task_type_split_cell ||
+      tb->type == task_type_rewait)
+    return 0.0f;
+
+  /* Compute the union of the cell data. */
+  size_t size_union = 0;
+  if (ta->ci != NULL) size_union += ta->ci->count;
+  if (ta->cj != NULL) size_union += ta->cj->count;
+  if (tb->ci != NULL) size_union += tb->ci->count;
+  if (tb->cj != NULL) size_union += tb->cj->count;
+
+  /* Compute the intersection of the cell data. */
+  const size_t size_intersect =
+      task_cell_overlap(ta->ci, tb->ci) + task_cell_overlap(ta->ci, tb->cj) +
+      task_cell_overlap(ta->cj, tb->ci) + task_cell_overlap(ta->cj, tb->cj);
+
+  return ((float)size_intersect) / (size_union - size_intersect);
+}
+
 /**
  * @brief Unlock the cell held by this task.
  *
@@ -258,48 +306,3 @@ void task_addunlock_old(struct task *ta, struct task *tb) {
 
   lock_unlock_blind(&ta->lock);
 }
-
-
-void task_print_mask(unsigned int mask) {
-
-  int k;
-
-  printf("task_print_mask: The tasks to run are [");
-  for (k = 1; k < task_type_count; k++)
-    printf(" %s=%s", taskID_names[k], (mask & (1 << k)) ? "yes" : "no");
-  printf(" ]\n");
-}
-
-
-void task_do_rewait(struct task *t) {
-
-  const unsigned int mask = t->flags;
-  
-  for (struct task *t2 = (struct task *)t->ci; t2 != (struct task *)t->cj; t2++) {
-    
-    if( t2->skip ) continue;
-    
-    /* Skip tasks not in the mask */
-    if( !((1<<t2->type) & mask) ) continue;
-    
-    /* Skip sort tasks that have already been */
-    if(t2->type == task_type_sort && t2->flags == 0) continue;
-
-	    /* if(store == NULL && t2->type==task_type_pair && t2->subtype==task_subtype_density) { */
-	    /*   message("\n"); */
-	    /*   message("Checking task %s-%s address: %p", taskID_names[t2->type], subtaskID_names[t2->subtype], t2); */
-	    /*   store = t2; */
-	    /* } */
-
-    for (int k = 0; k < t2->nr_unlock_tasks; k++) {
-      
-      struct task *t3=t2->unlock_tasks[k];
-      
-      atomic_inc(&t3->wait);
-
-	      /* if (t3 == store) { */
-	      /* 	message("Unlocked by task %s-%s address: %p" , taskID_names[t2->type], subtaskID_names[t2->subtype], t2); */
-	      /* } */	      
-    }
-  }
-}
diff --git a/src/task.h b/src/task.h
index 8f633a640ef7457a28713fe3fa3f182232d137de..1203a954c9b9b5314683f61428995d47fdea7dc9 100644
--- a/src/task.h
+++ b/src/task.h
@@ -93,6 +93,7 @@ void task_rmunlock_blind(struct task *ta, struct task *tb);
 void task_cleanunlock(struct task *t, int type);
 void task_addunlock(struct task *ta, struct task *tb);
 void task_unlock(struct task *t);
+float task_overlap(const struct task *ta, const struct task *tb);
 int task_lock(struct task *t);
 void task_print_mask(unsigned int mask);
 void task_do_rewait(struct task *t);
diff --git a/src/version.c b/src/version.c
index 7370c061d28a1594213015c3ba8b2108fa4b7a37..4624f036a0178f5b31b1aa24b1607507914444a7 100644
--- a/src/version.c
+++ b/src/version.c
@@ -18,6 +18,21 @@
  *
  ******************************************************************************/
 
+/* Config parameters. */
+#include "../config.h"
+
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#ifdef HAVE_METIS
+#include <metis.h>
+#endif
+#endif
+
+#ifdef HAVE_HDF5
+#include <hdf5.h>
+#endif
+
 /* Some standard headers. */
 #include <stdio.h>
 #include <string.h>
@@ -28,26 +43,50 @@
 /**
  * @brief Return the source code git revision
  *
- * @details The SHA of the code checked out when the library was last built.
+ * The SHA of the code checked out when the library was last built.
  * Will include -dirty if they are local modifications.
+ *
+ * @result the git version.
  */
 const char *git_revision(void) {
+  static char buf[256];
+  static int initialised = 0;
   static const char *revision = GIT_REVISION;
-  return revision;
+  if (!initialised) {
+    if (strlen(revision) == 0)
+      sprintf(buf, "%s", "unknown");
+    else
+      sprintf(buf, "%s", revision);
+    initialised = 1;
+  }
+  return buf;
 }
 
 /**
  * @brief Return the source code git branch
  *
- * @details The name of the current branch when the code was last built.
+ * The name of the current branch when the code was last built.
+ *
+ * @result git branch
  */
 const char *git_branch(void) {
+  static char buf[256];
+  static int initialised = 0;
   static const char *branch = GIT_BRANCH;
-  return branch;
+  if (!initialised) {
+    if (strlen(branch) == 0)
+      sprintf(buf, "%s", "unknown");
+    else
+      sprintf(buf, "%s", branch);
+    initialised = 1;
+  }
+  return buf;
 }
 
 /**
  * @brief The version of SWIFT
+ *
+ * @result the package version
  */
 const char *package_version(void) {
   static const char *version = PACKAGE_VERSION;
@@ -56,6 +95,8 @@ const char *package_version(void) {
 
 /**
  * @brief A description of the package version and code status.
+ *
+ * @result description of the package version
  */
 const char *package_description(void) {
   static char buf[256];
@@ -68,6 +109,11 @@ const char *package_description(void) {
   return buf;
 }
 
+/**
+ * @brief return the name of the compiler used to build SWIFT.
+ *
+ * @result description of the compiler.
+ */
 const char *compiler_name(void) {
   static char compiler[256] = {0};
 #if defined(__INTEL_COMPILER)
@@ -83,6 +129,11 @@ const char *compiler_name(void) {
   return compiler;
 }
 
+/**
+ * @brief return compiler version used to build SWIFT.
+ *
+ * @result description of the compiler.
+ */
 const char *compiler_version(void) {
   static char version[256] = {0};
 #if defined(__INTEL_COMPILER)
@@ -104,6 +155,87 @@ const char *compiler_version(void) {
   return version;
 }
 
+
+/**
+ * @brief return the MPI version, runtime if possible otherwise that used when
+ *        built.
+ *
+ * @result description of the MPI version.
+ */
+const char *mpi_version(void) {
+  static char version[80] = {0};
+
+#ifdef WITH_MPI
+  int std_version, std_subversion;
+
+  /* Check that the library implements the version string routine */
+#ifdef MPI_MAX_LIBRARY_VERSION_STRING
+  static char lib_version[MPI_MAX_LIBRARY_VERSION_STRING] = {0};
+  int len;
+  MPI_Get_library_version(lib_version, &len);
+
+  /* Find first \n and truncate string to this length, can get many lines from
+   * some MPIs (MPICH). */
+  char *ptr = strchr(lib_version, '\n');
+  if (ptr != NULL) *ptr = '\0';
+
+  /* Also arbitrarily truncate to keep down to one line, Open MPI,
+   * check for last comma and keep to ~60 chars max. */
+  strcpy(lib_version+60, "...");
+  ptr = strrchr(lib_version, ',');
+  if (ptr != NULL) *ptr = '\0';
+
+#else
+  /* Use autoconf guessed value. */
+  static char lib_version[60] = {0};
+  snprintf(lib_version, 60, "%s", SWIFT_MPI_LIBRARY);
+#endif
+
+  /* Numeric version. */
+  MPI_Get_version(&std_version, &std_subversion);
+  snprintf(version, 80, "%s (MPI std v%i.%i)", lib_version,
+           std_version, std_subversion);
+#else
+  sprintf(version, "Code was not compiled with MPI support");
+#endif
+  return version;
+}
+
+/**
+ * @brief return the HDF5 version in use at runtime.
+ *
+ * @result description of the current HDF5 version.
+ */
+const char *hdf5_version(void) {
+
+  static char version[256] = {0};
+#ifdef HAVE_HDF5
+  unsigned int majnum, minnum, relnum;
+  H5get_libversion(&majnum, &minnum, &relnum);
+  sprintf(version, "%i.%i.%i", majnum, minnum, relnum);
+#else
+  sprintf(version, "Unknown version");
+#endif
+  return version;
+}
+
+/**
+ * @brief return the METIS version used when SWIFT was built.
+ *
+ * @result description of the METIS version.
+ */
+const char *metis_version(void) {
+
+  static char version[256] = {0};
+#if defined(WITH_MPI) && defined(HAVE_METIS)
+  sprintf(version, "%i.%i.%i", METIS_VER_MAJOR, METIS_VER_MINOR,
+          METIS_VER_SUBMINOR);
+#else
+  sprintf(version, "Unknown version");
+#endif
+  return version;
+}
+
 /**
  * @brief Prints a greeting message to the standard output containing code
  * version and revision number
@@ -120,6 +252,16 @@ void greetings(void) {
 
   printf(" Version : %s\n", package_version());
   printf(" Revision: %s, Branch: %s\n", git_revision(), git_branch());
-  printf(" Webpage : www.swiftsim.com\n");
-  printf(" Compiler: %s, Version: %s\n\n", compiler_name(), compiler_version());
+  printf(" Webpage : www.swiftsim.com\n\n");
+  printf(" Compiler: %s, Version: %s\n", compiler_name(), compiler_version());
+#ifdef HAVE_HDF5
+  printf(" HDF5 library version: %s\n", hdf5_version());
+#endif
+#ifdef WITH_MPI
+  printf(" MPI library: %s\n", mpi_version());
+#ifdef HAVE_METIS
+  printf(" METIS library version: %s\n", metis_version());
+#endif
+#endif
+  printf("\n");
 }
diff --git a/src/version.h.in b/src/version.h.in
index d29ab6d43784c0f64b37a9c0d1a99781c82770bd..7824e06b996dfbb4178b57f1b72365a1e2d24484 100644
--- a/src/version.h.in
+++ b/src/version.h.in
@@ -35,6 +35,9 @@ const char* git_revision(void);
 const char* git_branch(void);
 const char* compiler_name(void);
 const char* compiler_version(void);
+const char* mpi_version(void);
+const char *hdf5_version(void);
+const char *metis_version(void);
 void greetings(void);
 
 #endif /* SWIFT_VERSION_H */
diff --git a/theory/kernel/kernels.py b/theory/kernel/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7bdbe2bf9ba49a30f4c8a2ae136c4843ce5c2cf
--- /dev/null
+++ b/theory/kernel/kernels.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python2                                  
+# -*- coding: utf-8 -*-            
+import matplotlib
+matplotlib.use("Agg")
+from pylab import *
+from scipy import integrate
+import distinct_colours as colours
+from scipy.optimize import curve_fit
+from scipy.optimize import fsolve
+from matplotlib.font_manager import FontProperties
+import numpy
+
+params = {
+    'axes.labelsize': 8,
+    'axes.titlesize': 8,
+    'font.size': 8,
+    'legend.fontsize': 9,
+    'xtick.labelsize': 8,
+    'ytick.labelsize': 8,
+    'xtick.major.pad': 2.5,
+    'ytick.major.pad': 2.5,
+    'text.usetex': True,
+'figure.figsize' : (3.15,3.15),
+'figure.subplot.left'    : 0.12,
+'figure.subplot.right'   : 0.99  ,
+'figure.subplot.bottom'  : 0.08  ,
+'figure.subplot.top'     : 0.99  ,
+'figure.subplot.wspace'  : 0.  ,
+'figure.subplot.hspace'  : 0.  ,
+'lines.markersize' : 6,
+'lines.linewidth' : 2.,
+'text.latex.unicode': True
+}
+rcParams.update(params)
+rc('font',**{'family':'sans-serif','sans-serif':['Times']})
+
+
+#Parameters
+eta = 1.2349
+h = 2.1
+
+#Constants
+PI = math.pi
+
+#Cubic Spline
+cubic_kernel_degree = 3
+cubic_kernel_ivals = 2
+cubic_kernel_gamma = 2.
+cubic_kernel_ngb = 4.0 / 3.0 * PI * eta**3 * 6.0858
+cubic_kernel_coeffs = array([[3./(4.*PI) , -3./(2.*PI), 0.,     1./PI],
+                             [-1./(4.*PI),  3./(2.*PI), -3./PI, 2./PI],
+                             [0.,           0.,         0.,     0.]])
+def cubic_W(x):
+    if size(x) == 1:
+        x = array([x])
+    ind = (minimum(x, cubic_kernel_ivals)).astype(int)
+    coeffs = cubic_kernel_coeffs[ind,:]
+    w = coeffs[:,0] * x + coeffs[:,1]
+    for k in range(2, cubic_kernel_degree+1):
+        w = x * w + coeffs[:,k]
+    return w
+
+
+#Quartic Spline
+quartic_kernel_degree = 4
+quartic_kernel_ivals = 3
+quartic_kernel_gamma = 2.5
+quartic_kernel_ngb = 4.0 / 3.0 * PI * eta**3 * 8.2293
+quartic_kernel_coeffs = array([[3./(10.*PI) , 0.,           -3./(4.*PI) , 0.          ,  23./(32.*PI)],
+                               [-1./(5.*PI) , 1./PI       , -3./(2.*PI) ,1./(4.*PI)   ,  11./(16.*PI)],
+                               [1./(20.*PI) , -1./(2.*PI) , 15./(8.*PI) , -25./(8.*PI), 125./(64.*PI)],
+                               [ 0. , 0.,           0.,         0.,     0.]])
+def quartic_W(x):
+    if size(x) == 1:
+        x = array([x])
+    ind = (minimum(x+0.5, quartic_kernel_ivals)).astype(int)
+    coeffs = quartic_kernel_coeffs[ind,:]
+    w = coeffs[:,0] * x + coeffs[:,1]
+    for k in range(2, quartic_kernel_degree+1):
+        w = x * w + coeffs[:,k]
+    return w
+
+
+# Wendland kernel
+wendland2_kernel_degree = 5
+wendland2_kernel_ivals = 1
+wendland2_kernel_gamma = 2
+wendland2_kernel_ngb = 4.0 / 3.0 * PI * eta**3 * 7.261825
+wendland2_kernel_coeffs = 3.342253804929802 * array([[1./8, -30./32, 80./32, -80./32., 0., 1.],
+                                                     [ 0. , 0.,  0.,   0., 0., 0.]]) / 8.
+
+print wendland2_kernel_coeffs
+def wendland2_W(x):
+    if size(x) == 1:
+        x = array([x])
+    ind = (minimum(0.5*x, wendland2_kernel_ivals)).astype(int)
+    coeffs = wendland2_kernel_coeffs[ind,:]
+    w = coeffs[:,0] * x + coeffs[:,1]
+    for k in range(2, wendland2_kernel_degree+1):
+        w = x * w + coeffs[:,k]
+    return w 
+
+#def wendland2_W(x):
+#    if size(x) == 1:
+#        x = array([x])
+#    x /= 1.936492
+#    x[x>1.] = 1.
+#    oneminusu = 1.-x
+#    oneminusu4 = oneminusu * oneminusu * oneminusu * oneminusu
+#    return 3.342253804929802 * oneminusu4 * (1. + 4.*x) / h**3
+
+
+#Find H
+r = arange(0, 3.5*h, 1./1000.)
+xi = r/h
+cubic_Ws = cubic_W(xi)
+quartic_Ws = quartic_W(xi)
+wendland2_Ws = wendland2_W(xi)
+for j in range(size(r)):
+    if cubic_Ws[j] == 0:
+        cubic_H = r[j]
+        break
+for j in range(size(r)):
+    if quartic_Ws[j] == 0:
+        quartic_H = r[j]
+        break
+for j in range(size(r)):
+    if wendland2_Ws[j] == 0:
+        wendland2_H = r[j]
+        break
+
+    
+print "H=", cubic_H
+print "H=", quartic_H
+print "H=", wendland2_H
+
+
+# Compute sigma -----------------------------------------
+cubic_norm = 4.*PI*integrate.quad(lambda x: x**2*cubic_W(x), 0, cubic_H)[0]
+quartic_norm = 4.*PI*integrate.quad(lambda x: x**2*quartic_W(x), 0, quartic_H)[0]
+wendland2_norm = 4.*PI*integrate.quad(lambda x: x**2*wendland2_W(x), 0, wendland2_H)[0]
+
+print cubic_norm
+print quartic_norm
+print wendland2_norm
+
+
+# Plot kernels ------------------------------------------
+r = arange(0, 3.5*h, 1./100.)
+xi = r/h
+
+cubic_Ws = cubic_W(xi)
+quartic_Ws = quartic_W(xi)
+wendland2_Ws = wendland2_W(xi)
+
+
+
+figure()
+
+text(h-0.1, cubic_Ws[0]/20., "h", ha="right",va="center")
+arrow(h, cubic_Ws[0]/10., 0., -cubic_Ws[0]/10., fc='k', ec='k', length_includes_head=True, head_length=cubic_Ws[0]/30., head_width=0.1)
+
+
+plot(r,cubic_Ws, 'b-' ,label="Cubic")
+plot(r, quartic_Ws, 'r-', label="Quartic")
+plot(r, wendland2_Ws, 'g-', label="Wendland C2")
+
+text(cubic_H-0.1, cubic_Ws[0]/20., "H", ha="right",va="center", color='b')
+arrow(cubic_H, cubic_Ws[0]/10., 0., -cubic_Ws[0]/10., fc='b', ec='b', length_includes_head=True, head_length=cubic_Ws[0]/30., head_width=0.1)
+
+text(quartic_H-0.1, cubic_Ws[0]/20., "H", ha="right",va="center", color='r')
+arrow(quartic_H, cubic_Ws[0]/10., 0., -cubic_Ws[0]/10., fc='r', ec='r', length_includes_head=True, head_length=quartic_Ws[0]/30., head_width=0.1)
+
+text(wendland2_H-0.1, cubic_Ws[0]/20., "H", ha="right",va="center", color='r')
+arrow(wendland2_H, cubic_Ws[0]/10., 0., -cubic_Ws[0]/10., fc='g', ec='g', length_includes_head=True, head_length=wendland2_Ws[0]/30., head_width=0.1)
+
+
+xlabel("r", labelpad=0)
+ylabel("W(r,h)", labelpad=0)
+legend(loc="upper right")
+savefig("kernel.pdf")
+