diff --git a/.gitignore b/.gitignore index 812aa06dd54ee8bb81e87796deb8ee05160fec72..893f786506ed168565fb995812136bcb7f1305f1 100644 --- a/.gitignore +++ b/.gitignore @@ -26,11 +26,13 @@ examples/*/*.xmf examples/*/*.hdf5 examples/*/*.png examples/*/*.txt +examples/*/*.dot examples/*/used_parameters.yml examples/*/*/*.xmf examples/*/*/*.hdf5 examples/*/*/*.png examples/*/*/*.txt +examples/*/*/*.dot examples/*/*/used_parameters.yml examples/*/gravity_checks_*.dat diff --git a/examples/analyse_dump_cells.py b/examples/analyse_dump_cells.py new file mode 100755 index 0000000000000000000000000000000000000000..3140e799566c75fe494a75895db0f4a8dcff4e57 --- /dev/null +++ b/examples/analyse_dump_cells.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +""" +Usage: + analysedumpcells.py nx ny nx cell<1>.dat cell<2>.dat ... + +Analyses a number of output files created by calls to the dumpCells() debug +function (presumably called in engine_step()) to output a list of active +top-level cells, identifying those that are on the edges of the volumes being +processed on by various nodes. The point is that these should be minimised to +reduce the MPI communications. + +The nx, ny and nz arguments are the number of cells in the complete space, +we need these so that we can wrap around the edge of space correctly. + +This file is part of SWIFT. +Copyright (c) 2017 Peter W. Draper (p.w.draper@durham.ac.uk) + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published +by the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +""" +import pylab as pl +import numpy as np +import sys +import pandas + +xcol = 0 +ycol = 1 +zcol = 2 +xwcol = 3 +ywcol = 4 +zwcol = 5 +localcol = 18 +supercol = 15 +activecol = 16 + +# Command-line arguments. +if len(sys.argv) < 5: + print "usage: ", sys.argv[0], " nx ny nz cell1.dat cell2.dat ..." + sys.exit(1) +nx = int(sys.argv[1]) +ny = int(sys.argv[2]) +nz = int(sys.argv[3]) + +print "# x y z onedge" +allactives = [] +onedge = 0 +tcount = 0 +for i in range(4, len(sys.argv)): + + # Read the file. + data = pl.loadtxt(sys.argv[i]) + #print data + + # Select cells that are on the current rank and are super cells. + rdata = data[data[:,localcol] == 1] + tdata = rdata[rdata[:,supercol] == 1] + + # Separation of the cells is in data. + xwidth = tdata[0,xwcol] + ywidth = tdata[0,ywcol] + zwidth = tdata[0,zwcol] + + # Fill space nx, ny,n nz with all toplevel cells and flag their active + # state. + space = np.zeros((nx,ny,nz)) + actives = [] + for line in tdata: + ix = int(np.rint(line[xcol] / xwidth)) + iy = int(np.rint(line[ycol] / ywidth)) + iz = int(np.rint(line[zcol] / zwidth)) + active = int(line[activecol]) + space[ix,iy,iz] = 1 + active + tcount = tcount + 1 + if active == 1: + actives.append([ix, iy, iz, line]) + + # Report all active cells and flag any without 26 neighbours. These are + # on the edge of the partition volume and will have foreign neighbour + # cells. 
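    # Aside (illustrative only, not called below): the explicit if/elif
    # wrapping in the loop that follows is equivalent to a periodic modulo,
    # assuming a box of n cells per dimension:
    #
    #   def wrap(index, n):
    #       return index % n   # wrap(-1, nx) == nx - 1, wrap(nx, nx) == 0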
+ for active in actives: + count = 0 + for ii in [-1, 0, 1]: + i = active[0] + ii + if i < 0: + i = i + nx + elif i >= nx: + i = i - nx + + for jj in [-1, 0, 1]: + j = active[1] + jj + if j < 0: + j = j + ny + elif j >= ny: + j = j - ny + + for kk in [-1, 0, 1]: + k = active[2] + kk + if k < 0: + k = k + nz + elif k >= nz: + k = k - nz + if space[i, j, k] > 0: + count = count + 1 + if count < 27: + onedge = onedge + 1 + print active[3][0], active[3][1], active[3][2], 1 + else: + print active[3][0], active[3][1], active[3][2], 0 + + allactives.extend(actives) + +print "# top cells: ", tcount, " active: ", len(allactives), " on edge: ", onedge + +sys.exit(0) + diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml index a5791306752c92bbe63985d029594178faae7213..948731d7879ec3e34f6e53f58dc72f6d7a6145ec 100644 --- a/examples/parameter_example.yml +++ b/examples/parameter_example.yml @@ -15,6 +15,7 @@ Scheduler: cell_split_size: 400 # (Optional) Maximal number of particles per cell (this is the default value). max_top_level_cells: 12 # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value). tasks_per_cell: 0 # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (means guess...). + mpi_message_limit: 4096 # (Optional) Maximum MPI task message size to send non-buffered, KB. # Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.) TimeIntegration: @@ -79,8 +80,13 @@ DomainDecomposition: initial_grid_x: 10 # (Optional) Grid size if the "grid" strategy is chosen. initial_grid_y: 10 # "" initial_grid_z: 10 # "" - repartition_type: task_weights # (Optional) The re-decomposition strategy: "none", "task_weights", "particle_weights", - # "edge_task_weights" or "hybrid_weights". + + repartition_type: costs/costs # (Optional) The re-decomposition strategy, one of: + # "none/none", "costs/costs", "counts/none", "none/costs", "counts/costs", + # "costs/time" or "none/time". + # These are vertex/edge weights with "costs" as task timing, "counts" as + # sum of particles and "time" as the expected time of the next updates + trigger: 0.05 # (Optional) Fractional (<1) CPU time difference between MPI ranks required to trigger a # new decomposition, or number of steps (>1) between decompositions minfrac: 0.9 # (Optional) Fractional of all particles that should be updated in previous step when @@ -120,7 +126,7 @@ SineWavePotential: amplitude: 10. # Amplitude of the sine wave (internal units) timestep_limit: 1. # Time-step dimensionless pre-factor. growth_time: 0. # (Optional) Time for the potential to grow to its final size. - + # Parameters related to cooling function ---------------------------------------------- # Constant du/dt cooling function diff --git a/examples/plot_task_dependencies.sh b/examples/plot_task_dependencies.sh new file mode 100755 index 0000000000000000000000000000000000000000..77784d8a9cdd3720621c9ad35c4cfbdaf0167ff1 --- /dev/null +++ b/examples/plot_task_dependencies.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Creates a graphic from the task graph file dependency_graph.dot. +# Requires the graphviz command "dot". + +if [ ! -e dependency_graph.dot ]; then + echo "Missing task-graph output 'dependency_graph.dot'! Cannot generate figure." 
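    # For reference, the .dot input written by scheduler_write_dependencies()
    # is a plain graphviz digraph; a minimal hand-written example (task names
    # made up for illustration) looks like:
    #
    #   digraph task_dep {
    #     label="Task dependencies for SWIFT <git revision>"
    #     compound=true;
    #     ratio=0.66;
    #     node[nodesep=0.15];
    #     sort->"self density";
    #     "self density"->ghost;
    #   }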
+else + dot -Tpng dependency_graph.dot -o task_graph.png + echo "Output written to task_graph.png" +fi + +exit diff --git a/examples/process_cells b/examples/process_cells new file mode 100755 index 0000000000000000000000000000000000000000..eead4572387f2732cf0b86265f14d20039f73ae1 --- /dev/null +++ b/examples/process_cells @@ -0,0 +1,64 @@ +#!/bin/bash +# +# Usage: +# process_cells nx ny nz nprocess +# +# Description: +# Process all the cell dumps in the current directory. + +# Outputs file per rank with the active cells identified and marked as to +# whether they are near an edge or not. Note requires the numbers of cells +# per dimension of the space. +# +# Also outputs a graphic showing the fraction of active cells on edges +# for each step. + +# Handle command-line +if test "$4" = ""; then + echo "Usage: $0 nx ny nz nprocess" + exit 1 +fi +NX=$1 +NY=$2 +NZ=$3 +NPROCS=$4 + +# Locate script. +SCRIPTHOME=$(dirname "$0") + +# Find all files. Use version sort to get into correct order. +files=$(ls -v cells_*.dat) +if test $? != 0; then + echo "Failed to find any cell dump files" + exit 1 +fi + +# Construct list of names need the number of ranks. +ranks=$(ls -v cells_*.dat | sed 's,cells_\(.*\)_.*.dat,\1,' | sort | uniq | wc -l) +echo "Number of ranks = $ranks" + +# Now construct a list of files ordered by rank, not step. +files=$(ls cells_*.dat | sort -t "_" -k 3,3 -n | xargs -n 4) + +# Need number of steps. +nfiles=$(echo $files| wc -w) +echo "Number of files = $nfiles" +steps=$(( $nfiles / $ranks + 1 )) +echo "Number of steps = $steps" + +# And process them, +echo "Processing cell dumps files..." +#echo $files | xargs -P $NPROCS -n 4 /bin/bash -c "${SCRIPTHOME}/process_cells_helper $NX $NY $NZ \$0 \$1 \$2 \$3" + +# Create summary. +grep "top cells" step*-active-cells.dat | sort -h > active_cells.log + +# And plot of active cells to edge cells. +stilts plot2plane ifmt=ascii in=active_cells.log xmin=-0.1 xmax=1.1 ymin=0 ymax=$steps grid=1 \ + legend=false xpix=600 ypix=500 xlabel="Edge cells/Active cells" ylabel="Step" \ + layer1=mark x1="col9/1.0/col6" y1="index" size1=3 shading1=aux auxmap=rainbow \ + aux=col6 auxfunc=log auxlabel="Active cells" layer2=histogram x2="col9/1.0/col6" \ + color2=grey binsize2=0.01 phase2=0.5 barform2=semi_steps thick2=1 \ + out=active_cells.png + +exit diff --git a/examples/process_cells_helper b/examples/process_cells_helper new file mode 100755 index 0000000000000000000000000000000000000000..a96bdba0de56e04c28ed6d8675c705241ce241d9 --- /dev/null +++ b/examples/process_cells_helper @@ -0,0 +1,12 @@ +#!/bin/bash + +# Helper for process_cells. + +# Locate script. +SCRIPTHOME=$(dirname "$0") + +step=$(echo $4|sed 's,cells_\(.*\)_\(.*\).dat,\2,') +echo "${SCRIPTHOME}/analyse_dump_cells.py $* > step${step}-active-cells.dat" +${SCRIPTHOME}/analyse_dump_cells.py $* > step${step}-active-cells.dat + +exit diff --git a/src/debug.c b/src/debug.c index ac886fc8d75a03ac67b213e0639cd384f0980529..4c521397b12c555923f8cea8a4f0cdb4c4551b97 100644 --- a/src/debug.c +++ b/src/debug.c @@ -276,10 +276,13 @@ int checkCellhdxmax(const struct cell *c, int *depth) { * only. */ static void dumpCells_map(struct cell *c, void *data) { - uintptr_t *ldata = (uintptr_t *)data; + size_t *ldata = (size_t *)data; FILE *file = (FILE *)ldata[0]; struct engine *e = (struct engine *)ldata[1]; float ntasks = c->nr_tasks; + int active = (int)ldata[2]; + int mpiactive = (int)ldata[3]; + int pactive = (int)ldata[4]; #if SWIFT_DEBUG_CHECKS /* The c->nr_tasks field does not include all the tasks. 
So let's check this @@ -300,46 +303,94 @@ static void dumpCells_map(struct cell *c, void *data) { } #endif - /* Only locally active cells are dumped. */ - if (c->count > 0 || c->gcount > 0 || c->scount > 0) - fprintf(file, - " %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d " - "%6.1f %20lld %6d %6d %6d %6d\n", - c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1], - c->width[2], c->count, c->gcount, c->scount, c->depth, ntasks, - c->ti_end_min, get_time_bin(c->ti_end_min), (c->super == c), - cell_is_active(c, e), c->nodeID); + /* Only cells with particles are dumped. */ + if (c->count > 0 || c->gcount > 0 || c->scount > 0) { + +/* In MPI mode we may only output cells with foreign partners. + * These define the edges of the partitions. */ +#if WITH_MPI + if (mpiactive) + mpiactive = (c->send_xv != NULL); + else + mpiactive = 1; +#else + mpiactive = 1; +#endif + + /* Active cells, otherwise all. */ + if (active) + active = cell_is_active(c, e); + else + active = 1; + + /* So output local super cells that are active and have MPI tasks as + * requested. */ + if (c->nodeID == e->nodeID && (c->super == c) && active && mpiactive) { + + /* If requested we work out how many particles are active in this cell. */ + int pactcount = 0; + if (pactive) { + const struct part *parts = c->parts; + for (int k = 0; k < c->count; k++) + if (part_is_active(&parts[k], e)) pactcount++; + struct gpart *gparts = c->gparts; + for (int k = 0; k < c->gcount; k++) + if (gpart_is_active(&gparts[k], e)) pactcount++; + struct spart *sparts = c->sparts; + for (int k = 0; k < c->scount; k++) + if (spart_is_active(&sparts[k], e)) pactcount++; + } + + fprintf(file, + " %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d %6d %6d " + "%6.1f %20lld %6d %6d %6d %6d %6d\n", + c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1], + c->width[2], e->step, c->count, c->gcount, c->scount, pactcount, + c->depth, ntasks, c->ti_end_min, get_time_bin(c->ti_end_min), + (c->super == c), cell_is_active(c, e), c->nodeID, + c->nodeID == e->nodeID); + } + } } /** * @brief Dump the location, depth, task counts and timebins and active state, - * for all cells to a simple text file. + * for all cells to a simple text file. A more costly count of the active + * particles in a cell can also be output. * - * @param prefix base output filename + * @param prefix base output filename, result is written to + * %prefix%_%rank%_%step%.dat + * @param active just output active cells. + * @param mpiactive just output MPI active cells, i.e. those with foreign cells. + * @param pactive also output a count of active particles. * @param s the space holding the cells to dump. + * @param rank node ID of MPI rank, or 0 if not relevant. + * @param step the current engine step, or some unique integer. */ -void dumpCells(const char *prefix, struct space *s) { +void dumpCells(const char *prefix, int active, int mpiactive, int pactive, + struct space *s, int rank, int step) { FILE *file = NULL; /* Name of output file. */ - static int nseq = 0; char fname[200]; - int uniq = atomic_inc(&nseq); - sprintf(fname, "%s_%03d.dat", prefix, uniq); - + sprintf(fname, "%s_%03d_%03d.dat", prefix, rank, step); file = fopen(fname, "w"); /* Header. 
*/ fprintf(file, - "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s " - "%20s %6s %6s %6s\n", - "x", "y", "z", "xw", "yw", "zw", "count", "gcount", "scount", "depth", - "tasks", "ti_end_min", "timebin", "issuper", "active", "rank"); + "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s " + "%20s %6s %6s %6s %6s %6s\n", + "x", "y", "z", "xw", "yw", "zw", "step", "count", "gcount", "scount", + "actcount", "depth", "tasks", "ti_end_min", "timebin", "issuper", + "active", "rank", "local"); - uintptr_t data[2]; + size_t data[5]; data[0] = (size_t)file; data[1] = (size_t)s->e; + data[2] = (size_t)active; + data[3] = (size_t)mpiactive; + data[4] = (size_t)pactive; space_map_cells_pre(s, 1, dumpCells_map, &data); fclose(file); } diff --git a/src/debug.h b/src/debug.h index 20ae8fc2c20fa458b7be82f75b2fbb3be9acd32f..2cff37fa6f6f03185dc769bb770fe3a98a424ce1 100644 --- a/src/debug.h +++ b/src/debug.h @@ -36,7 +36,8 @@ void printgParticle_single(struct gpart *gp); int checkSpacehmax(struct space *s); int checkCellhdxmax(const struct cell *c, int *depth); -void dumpCells(const char *prefix, struct space *s); +void dumpCells(const char *prefix, int active, int mpiactive, int pactive, + struct space *s, int rank, int step); #ifdef HAVE_METIS #include "metis.h" diff --git a/src/engine.c b/src/engine.c index 128cb15dab09ffd55a098a3f3ce01cf845913498..84b4237df26771b5f17399405d43a5ffa18863dd 100644 --- a/src/engine.c +++ b/src/engine.c @@ -4008,6 +4008,8 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, gravity_exact_force_compute(e->s, e); #endif + if (e->nodeID == 0) scheduler_write_dependencies(&e->sched, e->verbose); + /* Run the 0th time-step */ engine_launch(e); @@ -4162,6 +4164,9 @@ void engine_step(struct engine *e) { /* Print the number of active tasks ? */ if (e->verbose) engine_print_task_counts(e); +/* Dump local cells and active particle counts. */ +/* dumpCells("cells", 0, 0, 1, e->s, e->nodeID, e->step); */ + #ifdef SWIFT_DEBUG_CHECKS /* Check that we have the correct total mass in the top-level multipoles */ long long num_gpart_mpole = 0; @@ -5113,6 +5118,12 @@ void engine_init(struct engine *e, struct space *s, scheduler_init(&e->sched, e->s, engine_estimate_nr_tasks(e), nr_queues, (policy & scheduler_flag_steal), e->nodeID, &e->threadpool); + /* Maximum size of MPI task messages, in KB, that should not be buffered, + * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. + */ + e->sched.mpi_message_limit = + parser_get_opt_param_int(params, "Scheduler:mpi_message_limit", 4) * 1024; + /* Allocate and init the threads. */ if ((e->runners = (struct runner *)malloc(sizeof(struct runner) * e->nr_threads)) == NULL) diff --git a/src/equation_of_state.h b/src/equation_of_state.h index eadc73c919e3593b2b8433d4215ba562ceb7a659..76b8bb922133c9c4f29453a14068fe3f9044d66f 100644 --- a/src/equation_of_state.h +++ b/src/equation_of_state.h @@ -185,7 +185,8 @@ struct eos_parameters { /** * @brief Returns the internal energy given density and entropy * - * Since we are using an isothermal EoS, the entropy value is ignored + * Since we are using an isothermal EoS, the entropy and density values are + * ignored. * Computes \f$u = u_{cst}\f$. * * @param density The density \f$\rho\f$. 
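As a cross-check of the isothermal relations documented in the surrounding hunks, here is a minimal standalone sketch (illustrative only, not part of the patch; u_cst and gamma stand in for eos.isothermal_internal_energy and the adiabatic index):

#include <math.h>

/* u = u_cst; P = (gamma - 1) u_cst rho; A = (gamma - 1) u_cst rho^(1 - gamma);
 * c = sqrt(u_cst gamma (gamma - 1)). */
static double iso_pressure(double rho, double u_cst, double gamma) {
  return (gamma - 1.) * u_cst * rho;
}
static double iso_entropy(double rho, double u_cst, double gamma) {
  return (gamma - 1.) * u_cst * pow(rho, -(gamma - 1.));
}
static double iso_soundspeed(double u_cst, double gamma) {
  return sqrt(u_cst * gamma * (gamma - 1.));
}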
@@ -196,10 +197,11 @@ gas_internal_energy_from_entropy(float density, float entropy) { return eos.isothermal_internal_energy; } + /** * @brief Returns the pressure given density and entropy * - * Since we are using an isothermal EoS, the entropy value is ignored + * Since we are using an isothermal EoS, the entropy value is ignored. * Computes \f$P = (\gamma - 1)u_{cst}\rho\f$. * * @param density The density \f$\rho\f$. @@ -214,7 +216,8 @@ __attribute__((always_inline)) INLINE static float gas_pressure_from_entropy( /** * @brief Returns the entropy given density and pressure. * - * Computes \f$A = \frac{P}{\rho^\gamma}\f$. + * Since we are using an isothermal EoS, the pressure value is ignored. + * Computes \f$A = (\gamma - 1)u_{cst}\rho^{-(\gamma-1)}\f$. * * @param density The density \f$\rho\f$. * @param pressure The pressure \f$P\f$ (ignored). @@ -230,8 +233,9 @@ __attribute__((always_inline)) INLINE static float gas_entropy_from_pressure( /** * @brief Returns the sound speed given density and entropy * - * Since we are using an isothermal EoS, the entropy value is ignored - * Computes \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$. + * Since we are using an isothermal EoS, the entropy and density values are + * ignored. + * Computes \f$c = \sqrt{u_{cst} \gamma (\gamma-1)}\f$. * * @param density The density \f$\rho\f$. * @param entropy The entropy \f$S\f$. @@ -246,7 +250,7 @@ __attribute__((always_inline)) INLINE static float gas_soundspeed_from_entropy( /** * @brief Returns the entropy given density and internal energy * - * Since we are using an isothermal EoS, the energy value is ignored + * Since we are using an isothermal EoS, the energy value is ignored. * Computes \f$S = \frac{(\gamma - 1)u_{cst}}{\rho^{\gamma-1}}\f$. * * @param density The density \f$\rho\f$ @@ -262,7 +266,7 @@ gas_entropy_from_internal_energy(float density, float u) { /** * @brief Returns the pressure given density and internal energy * - * Since we are using an isothermal EoS, the energy value is ignored + * Since we are using an isothermal EoS, the energy value is ignored. * Computes \f$P = (\gamma - 1)u_{cst}\rho\f$. * * @param density The density \f$\rho\f$ @@ -291,8 +295,9 @@ gas_internal_energy_from_pressure(float density, float pressure) { /** * @brief Returns the sound speed given density and internal energy * - * Since we are using an isothermal EoS, the energy value is ignored - * Computes \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$. + * Since we are using an isothermal EoS, the energy and density values are + * ignored. + * Computes \f$c = \sqrt{u_{cst} \gamma (\gamma-1)}\f$. * * @param density The density \f$\rho\f$ * @param u The internal energy \f$u\f$ @@ -307,8 +312,9 @@ gas_soundspeed_from_internal_energy(float density, float u) { /** * @brief Returns the sound speed given density and pressure * - * Since we are using an isothermal EoS, the pressure value is ignored - * Computes \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$. + * Since we are using an isothermal EoS, the pressure and density values are + * ignored. + * Computes \f$c = \sqrt{u_{cst} \gamma (\gamma-1)}\f$. * * @param density The density \f$\rho\f$ * @param P The pressure \f$P\f$ diff --git a/src/parallel_io.c b/src/parallel_io.c index a542a5ff6f7f18d4e616abf355120844652075f2..b31d3b6351b00af4a042e4b28bf492258b17b073 100644 --- a/src/parallel_io.c +++ b/src/parallel_io.c @@ -146,7 +146,7 @@ void readArray_chunk(hid_t h_data, hid_t h_plist_id, * @brief Reads a data array from a given HDF5 group. 
* * @param grp The group from which to read. - * @param prop The #io_props of the field to read. + * @param props The #io_props of the field to read. * @param N The number of particles on that rank. * @param N_total The total number of particles. * @param mpi_rank The MPI rank of this node. diff --git a/src/partition.c b/src/partition.c index d9aade781e824defedb05f5a201fec8401da7ee2..567fcb38c030a0dec1f052329111b0cc6d82d8f7 100644 --- a/src/partition.c +++ b/src/partition.c @@ -63,10 +63,15 @@ const char *initial_partition_name[] = { /* Simple descriptions of repartition types for reports. */ const char *repartition_name[] = { - "no", "METIS edge and vertex time weighted cells", - "METIS particle count vertex weighted cells", - "METIS time edge weighted cells", - "METIS particle count vertex and time edge cells"}; + "no", + "METIS edge and vertex task cost weights", + "METIS particle count vertex weights", + "METIS task cost edge weights", + "METIS particle count vertex and task cost edge weights", + "METIS vertex task costs and edge delta timebin weights", + "METIS particle count vertex and edge delta timebin weights", + "METIS edge delta timebin weights", +}; /* Local functions, if needed. */ static int check_complete(struct space *s, int verbose, int nregions); @@ -236,14 +241,14 @@ static void graph_init_metis(struct space *s, idx_t *adjncy, idx_t *xadj) { * @param counts the number of particles per cell. Should be * allocated as size s->nr_parts. */ -static void accumulate_counts(struct space *s, int *counts) { +static void accumulate_counts(struct space *s, double *counts) { struct part *parts = s->parts; int *cdim = s->cdim; double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]}; double dim[3] = {s->dim[0], s->dim[1], s->dim[2]}; - bzero(counts, sizeof(int) * s->nr_cells); + bzero(counts, sizeof(double) * s->nr_cells); for (size_t k = 0; k < s->nr_parts; k++) { for (int j = 0; j < 3; j++) { @@ -274,6 +279,9 @@ static void accumulate_counts(struct space *s, int *counts) { static void split_metis(struct space *s, int nregions, int *celllist) { for (int i = 0; i < s->nr_cells; i++) s->cells_top[i].nodeID = celllist[i]; + + /* To check or visualise the partition dump all the cells. */ + /* dumpCellRanks("metis_partition", s->cells_top, s->nr_cells); */ } #endif @@ -300,15 +308,16 @@ static int indexvalcmp(const void *p1, const void *p2) { * @param s the space of cells to partition. * @param nregions the number of regions required in the partition. * @param vertexw weights for the cells, sizeof number of cells if used, - * NULL for unit weights. + * NULL for unit weights. Need to be in the range of idx_t. * @param edgew weights for the graph edges between all cells, sizeof number * of cells * 26 if used, NULL for unit weights. Need to be packed - * in CSR format, so same as adjncy array. + * in CSR format, so same as adjncy array. Need to be in the range of + * idx_t. * @param celllist on exit this contains the ids of the selected regions, * sizeof number of cells. */ -static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, - int *celllist) { +static void pick_metis(struct space *s, int nregions, double *vertexw, + double *edgew, int *celllist) { /* Total number of cells. 
*/ int ncells = s->cdim[0] * s->cdim[1] * s->cdim[2]; @@ -380,8 +389,8 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, idx_t objval; /* Dump graph in METIS format */ - /* dumpMETISGraph("metis_graph", idx_ncells, one, xadj, adjncy, - * weights_v, NULL, weights_e); + /*dumpMETISGraph("metis_graph", idx_ncells, one, xadj, adjncy, + * weights_v, NULL, weights_e); */ if (METIS_PartGraphKway(&idx_ncells, &one, xadj, adjncy, weights_v, NULL, weights_e, &idx_nregions, NULL, NULL, options, @@ -465,31 +474,28 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, #if defined(WITH_MPI) && defined(HAVE_METIS) /** - * @brief Repartition the cells amongst the nodes using task timings - * as edge weights and vertex weights also from task timings + * @brief Repartition the cells amongst the nodes using task costs + * as edge weights and vertex weights also from task costs * or particle cells counts. * * @param partweights whether particle counts will be used as vertex weights. * @param bothweights whether vertex and edge weights will be used, otherwise * only edge weights will be used. + * @param timebins use timebins as edge weights. * @param nodeID our nodeID. * @param nr_nodes the number of nodes. * @param s the space of cells holding our local particles. * @param tasks the completed tasks from the last engine step for our node. * @param nr_tasks the number of tasks. */ -static void repart_edge_metis(int partweights, int bothweights, int nodeID, - int nr_nodes, struct space *s, struct task *tasks, - int nr_tasks) { +static void repart_edge_metis(int partweights, int bothweights, int timebins, + int nodeID, int nr_nodes, struct space *s, + struct task *tasks, int nr_tasks) { /* Create weight arrays using task ticks for vertices and edges (edges * assume the same graph structure as used in the part_ calls). */ int nr_cells = s->nr_cells; struct cell *cells = s->cells_top; - float wscale = 1.f, wscale_buff = 0.0; - int wtot = 0; - int wmax = 1e9 / nr_nodes; - int wmin; /* Allocate and fill the adjncy indexing array defining the graph of * cells. */ @@ -499,16 +505,16 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, graph_init_metis(s, inds, NULL); /* Allocate and init weights. */ - int *weights_v = NULL; - int *weights_e = NULL; + double *weights_v = NULL; + double *weights_e = NULL; if (bothweights) { - if ((weights_v = (int *)malloc(sizeof(int) * nr_cells)) == NULL) + if ((weights_v = (double *)malloc(sizeof(double) * nr_cells)) == NULL) error("Failed to allocate vertex weights arrays."); - bzero(weights_v, sizeof(int) * nr_cells); + bzero(weights_v, sizeof(double) * nr_cells); } - if ((weights_e = (int *)malloc(sizeof(int) * 26 * nr_cells)) == NULL) + if ((weights_e = (double *)malloc(sizeof(double) * 26 * nr_cells)) == NULL) error("Failed to allocate edge weights arrays."); - bzero(weights_e, sizeof(int) * 26 * nr_cells); + bzero(weights_e, sizeof(double) * 26 * nr_cells); /* Generate task weights for vertices. */ int taskvweights = (bothweights && !partweights); @@ -521,19 +527,8 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, /* Skip un-interesting tasks. */ if (t->cost == 0) continue; - /* Get the task weight. */ - int w = t->cost * wscale; - - /* Do we need to re-scale? 
*/ - wtot += w; - while (wtot > wmax) { - wscale /= 2; - wtot /= 2; - w /= 2; - for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5; - if (taskvweights) - for (int k = 0; k < nr_cells; k++) weights_v[k] *= 0.5; - } + /* Get the task weight based on costs. */ + double w = (double)t->cost; /* Get the top-level cells involved. */ struct cell *ci, *cj; @@ -552,9 +547,9 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, if (t->type == task_type_ghost || t->type == task_type_kick1 || t->type == task_type_kick2 || t->type == task_type_timestep || t->type == task_type_drift_part || t->type == task_type_drift_gpart) { + /* Particle updates add only to vertex weight. */ if (taskvweights) weights_v[cid] += w; - } /* Self interaction? */ @@ -575,64 +570,72 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, } - /* Distinct cells with local ci? */ - else if (ci->nodeID == nodeID) { + /* Distinct cells. */ + else { /* Index of the jth cell. */ int cjd = cj - cells; - /* Add half of weight to each cell. */ - if (taskvweights) { - if (ci->nodeID == nodeID) weights_v[cid] += 0.5 * w; + /* Local cells add weight to vertices. */ + if (taskvweights && ci->nodeID == nodeID) { + weights_v[cid] += 0.5 * w; if (cj->nodeID == nodeID) weights_v[cjd] += 0.5 * w; } - /* Add weights to edge. */ - int kk; - for (kk = 26 * cid; inds[kk] != cjd; kk++) - ; - weights_e[kk] += w; - for (kk = 26 * cjd; inds[kk] != cid; kk++) - ; - weights_e[kk] += w; + if (timebins) { + /* Add weights to edge for all cells based on the expected + * interaction time (calculated as the time to the last expected + * time) as we want to avoid having active cells on the edges, so we + * cut for that. Note that weight is added to the local and remote + * cells, as we want to keep both away from any cuts, this can + * overflow int, so take care. */ + int dti = num_time_bins - get_time_bin(ci->ti_end_min); + int dtj = num_time_bins - get_time_bin(cj->ti_end_min); + double dt = (double)(1 << dti) + (double)(1 << dtj); + + /* ci */ + int kk; + for (kk = 26 * cid; inds[kk] != cjd; kk++) + ; + weights_e[kk] += dt; + + /* cj */ + for (kk = 26 * cjd; inds[kk] != cid; kk++) + ; + weights_e[kk] += dt; + } else { + + /* Add weights from task costs to the edge. */ + + /* ci */ + int kk; + for (kk = 26 * cid; inds[kk] != cjd; kk++) + ; + weights_e[kk] += w; + + /* cj */ + for (kk = 26 * cjd; inds[kk] != cid; kk++) + ; + weights_e[kk] += w; + } } } } /* Re-calculate the vertices if using particle counts. */ - if (partweights && bothweights) { - accumulate_counts(s, weights_v); - - /* Rescale to balance times. */ - float vwscale = (float)wtot / (float)nr_tasks; - for (int k = 0; k < nr_cells; k++) { - weights_v[k] *= vwscale; - } - } - - /* Get the minimum scaling and re-scale if necessary. */ - int res; - if ((res = MPI_Allreduce(&wscale, &wscale_buff, 1, MPI_FLOAT, MPI_MIN, - MPI_COMM_WORLD)) != MPI_SUCCESS) - mpi_error(res, "Failed to allreduce the weight scales."); - - if (wscale_buff != wscale) { - float scale = wscale_buff / wscale; - for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale; - if (bothweights) - for (int k = 0; k < nr_cells; k++) weights_v[k] *= scale; - } + if (partweights && bothweights) accumulate_counts(s, weights_v); /* Merge the weights arrays across all nodes. */ + int res; if (bothweights) { if ((res = MPI_Reduce((nodeID == 0) ? 
MPI_IN_PLACE : weights_v, weights_v, - nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) != + nr_cells, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD)) != MPI_SUCCESS) mpi_error(res, "Failed to allreduce vertex weights."); } if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e, - 26 * nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) != - MPI_SUCCESS) + 26 * nr_cells, MPI_DOUBLE, MPI_SUM, 0, + MPI_COMM_WORLD)) != MPI_SUCCESS) mpi_error(res, "Failed to allreduce edge weights."); /* Allocate cell list for the partition. */ @@ -641,39 +644,53 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, /* As of here, only one node needs to compute the partition. */ if (nodeID == 0) { - /* Final rescale of all weights to avoid a large range. Large ranges - * have been seen to cause an incomplete graph. */ - wmin = wmax; - wmax = 0; - for (int k = 0; k < 26 * nr_cells; k++) { - wmax = weights_e[k] > wmax ? weights_e[k] : wmax; - wmin = weights_e[k] < wmin ? weights_e[k] : wmin; - } + + /* We need to rescale the weights into the range of an integer for METIS + * (really range of idx_t). Also we would like the range of vertex and + * edges weights to be similar so they balance. */ + double wminv = 0.0; + double wmaxv = 0.0; if (bothweights) { + wminv = weights_v[0]; + wmaxv = weights_v[0]; for (int k = 0; k < nr_cells; k++) { - wmax = weights_v[k] > wmax ? weights_v[k] : wmax; - wmin = weights_v[k] < wmin ? weights_v[k] : wmin; + wmaxv = weights_v[k] > wmaxv ? weights_v[k] : wmaxv; + wminv = weights_v[k] < wminv ? weights_v[k] : wminv; } } - if ((wmax - wmin) > metis_maxweight) { - wscale = metis_maxweight / (wmax - wmin); - for (int k = 0; k < 26 * nr_cells; k++) { - weights_e[k] = (weights_e[k] - wmin) * wscale + 1; + double wmine = weights_e[0]; + double wmaxe = weights_e[0]; + for (int k = 0; 26 * k < nr_cells; k++) { + wmaxe = weights_e[k] > wmaxe ? weights_e[k] : wmaxe; + wmine = weights_e[k] < wmine ? weights_e[k] : wmine; + } + + double wscalev = 1.0; + double wscalee = 1.0; + if (bothweights) { + + /* Make maximum value same in both weights systems. */ + if (wmaxv > wmaxe) { + wscalee = wmaxv / wmaxe; + wmaxe = wmaxv; + } else { + wscalev = wmaxe / wmaxv; + wmaxv = wmaxe; } - if (bothweights) { - for (int k = 0; k < nr_cells; k++) { - weights_v[k] = (weights_v[k] - wmin) * wscale + 1; - } + + /* Scale to the METIS range. */ + wscalev *= metis_maxweight / (wmaxv - wminv); + for (int k = 0; k < nr_cells; k++) { + weights_v[k] = (weights_v[k] - wminv) * wscalev + 1.0; } } - /* Make sure there are no zero weights. */ - for (int k = 0; k < 26 * nr_cells; k++) - if (weights_e[k] == 0) weights_e[k] = 1; - if (bothweights) - for (int k = 0; k < nr_cells; k++) - if (weights_v[k] == 0) weights_v[k] = 1; + /* Scale to the METIS range. */ + wscalee *= metis_maxweight / (wmaxe - wmine); + for (int k = 0; k < 26 * nr_cells; k++) { + weights_e[k] = (weights_e[k] - wmine) * wscalee + 1.0; + } /* And partition, use both weights or not as requested. */ if (bothweights) @@ -736,8 +753,8 @@ static void repart_vertex_metis(struct space *s, int nodeID, int nr_nodes) { /* Use particle counts as vertex weights. */ /* Space for particles per cell counts, which will be used as weights. */ - int *weights = NULL; - if ((weights = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL) + double *weights = NULL; + if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL) error("Failed to allocate weights buffer."); /* Check each particle and accumulate the counts per cell. 
*/ @@ -745,8 +762,8 @@ static void repart_vertex_metis(struct space *s, int nodeID, int nr_nodes) { /* Get all the counts from all the nodes. */ int res; - if ((res = MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, MPI_SUM, - MPI_COMM_WORLD)) != MPI_SUCCESS) + if ((res = MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD)) != MPI_SUCCESS) mpi_error(res, "Failed to allreduce particle cell weights."); /* Main node does the partition calculation. */ @@ -787,35 +804,32 @@ void partition_repartition(struct repartition *reparttype, int nodeID, #if defined(WITH_MPI) && defined(HAVE_METIS) - if (reparttype->type == REPART_METIS_BOTH || - reparttype->type == REPART_METIS_EDGE || - reparttype->type == REPART_METIS_VERTEX_EDGE) { + if (reparttype->type == REPART_METIS_VERTEX_COSTS_EDGE_COSTS) { + repart_edge_metis(0, 1, 0, nodeID, nr_nodes, s, tasks, nr_tasks); - int partweights; - int bothweights; - if (reparttype->type == REPART_METIS_VERTEX_EDGE) - partweights = 1; - else - partweights = 0; + } else if (reparttype->type == REPART_METIS_EDGE_COSTS) { + repart_edge_metis(0, 0, 0, nodeID, nr_nodes, s, tasks, nr_tasks); - if (reparttype->type == REPART_METIS_BOTH) - bothweights = 1; - else - bothweights = 0; + } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS_EDGE_COSTS) { + repart_edge_metis(1, 1, 0, nodeID, nr_nodes, s, tasks, nr_tasks); + + } else if (reparttype->type == REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS) { + repart_edge_metis(0, 1, 1, nodeID, nr_nodes, s, tasks, nr_tasks); - repart_edge_metis(partweights, bothweights, nodeID, nr_nodes, s, tasks, - nr_tasks); + } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS) { + repart_edge_metis(1, 1, 1, nodeID, nr_nodes, s, tasks, nr_tasks); - } else if (reparttype->type == REPART_METIS_VERTEX) { + } else if (reparttype->type == REPART_METIS_EDGE_TIMEBINS) { + repart_edge_metis(0, 0, 1, nodeID, nr_nodes, s, tasks, nr_tasks); + } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS) { repart_vertex_metis(s, nodeID, nr_nodes); } else if (reparttype->type == REPART_NONE) { - /* Doing nothing. */ } else { - error("Unknown repartition type"); + error("Impossible repartition type"); } #else error("SWIFT was not compiled with METIS support."); @@ -884,17 +898,17 @@ void partition_initial_partition(struct partition *initial_partition, /* Space for particles per cell counts, which will be used as weights or * not. */ - int *weights = NULL; + double *weights = NULL; if (initial_partition->type == INITPART_METIS_WEIGHT) { - if ((weights = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL) + if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL) error("Failed to allocate weights buffer."); - bzero(weights, sizeof(int) * s->nr_cells); + bzero(weights, sizeof(double) * s->nr_cells); /* Check each particle and accumilate the counts per cell. */ accumulate_counts(s, weights); /* Get all the counts from all the nodes. 
*/ - if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, MPI_SUM, + if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS) error("Failed to allreduce particle cell weights."); } @@ -969,10 +983,10 @@ void partition_init(struct partition *partition, /* Defaults make use of METIS if available */ #ifdef HAVE_METIS - const char *default_repart = "task_weights"; + const char *default_repart = "costs/costs"; const char *default_part = "simple_metis"; #else - const char *default_repart = "none"; + const char *default_repart = "none/none"; const char *default_part = "grid"; #endif @@ -1029,32 +1043,40 @@ void partition_init(struct partition *partition, parser_get_opt_param_string(params, "DomainDecomposition:repartition_type", part_type, default_repart); - switch (part_type[0]) { - case 'n': - repartition->type = REPART_NONE; - break; + if (strcmp("none/none", part_type) == 0) { + repartition->type = REPART_NONE; + #ifdef HAVE_METIS - case 't': - repartition->type = REPART_METIS_BOTH; - break; - case 'e': - repartition->type = REPART_METIS_EDGE; - break; - case 'p': - repartition->type = REPART_METIS_VERTEX; - break; - case 'h': - repartition->type = REPART_METIS_VERTEX_EDGE; - break; - default: - message("Invalid choice of re-partition type '%s'.", part_type); - error( - "Permitted values are: 'none', 'task_weights', 'particle_weights'," - "'edge_task_weights' or 'hybrid_weights'"); + } else if (strcmp("costs/costs", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COSTS_EDGE_COSTS; + + } else if (strcmp("counts/none", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COUNTS; + + } else if (strcmp("none/costs", part_type) == 0) { + repartition->type = REPART_METIS_EDGE_COSTS; + + } else if (strcmp("counts/costs", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COUNTS_EDGE_COSTS; + + } else if (strcmp("costs/time", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS; + + } else if (strcmp("counts/time", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS; + + } else if (strcmp("none/time", part_type) == 0) { + repartition->type = REPART_METIS_EDGE_TIMEBINS; + } else { + message("Invalid choice of re-partition type '%s'.", part_type); + error( + "Permitted values are: 'none/none', 'costs/costs'," + "'counts/none', 'none/costs', 'counts/costs', " + "'costs/time', 'counts/time' or 'none/time'"); #else - default: - message("Invalid choice of re-partition type '%s'.", part_type); - error("Permitted values are: 'none' when compiled without METIS."); + } else { + message("Invalid choice of re-partition type '%s'.", part_type); + error("Permitted values are: 'none/none' when compiled without METIS."); #endif } diff --git a/src/partition.h b/src/partition.h index 03523b165a930b085224e458ac0dd8c8232a578d..c3eade190c9514efb4c44011e3990745e20846fd 100644 --- a/src/partition.h +++ b/src/partition.h @@ -43,10 +43,13 @@ struct partition { /* Repartition type to use. */ enum repartition_type { REPART_NONE = 0, - REPART_METIS_BOTH, - REPART_METIS_VERTEX, - REPART_METIS_EDGE, - REPART_METIS_VERTEX_EDGE + REPART_METIS_VERTEX_COSTS_EDGE_COSTS, + REPART_METIS_VERTEX_COUNTS, + REPART_METIS_EDGE_COSTS, + REPART_METIS_VERTEX_COUNTS_EDGE_COSTS, + REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS, + REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS, + REPART_METIS_EDGE_TIMEBINS }; /* Repartition preferences. 
*/ diff --git a/src/scheduler.c b/src/scheduler.c index dd6cfed6480b4f42b025c62c8574e72cefeb196b..263466c8841ebb3d04053bca55fb9200236750f5 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -50,6 +50,7 @@ #include "space.h" #include "task.h" #include "timers.h" +#include "version.h" /** * @brief Re-set the list of active tasks. @@ -111,6 +112,125 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta, atomic_inc(&s->completed_unlock_writes); } +/** + * @brief Write a dot file with the task dependencies. + * + * Run plot_task_dependencies.sh for an example of how to use it + * to generate the figure. + * + * @param s The #scheduler we are working in. + * @param verbose Are we verbose about this? + */ +void scheduler_write_dependencies(struct scheduler *s, int verbose) { + + const ticks tic = getticks(); + + /* Conservative number of dependencies per task type */ + const int max_nber_dep = 128; + + /* Number of possible relations between tasks */ + const int nber_relation = + 2 * task_type_count * task_subtype_count * max_nber_dep; + + /* To get the table of max_nber_dep for a task: + * ind = (ta * task_subtype_count + sa) * max_nber_dep * 2 + * where ta is the value of task_type and sa is the value of + * task_subtype */ + int *table = malloc(nber_relation * sizeof(int)); + if (table == NULL) + error("Error allocating memory for task-dependency graph."); + + /* Reset everything */ + for (int i = 0; i < nber_relation; i++) table[i] = -1; + + /* Create file */ + char filename[200] = "dependency_graph.dot"; + FILE *f = fopen(filename, "w"); + if (f == NULL) error("Error opening dependency graph file."); + + /* Write header */ + fprintf(f, "digraph task_dep {\n"); + fprintf(f, "label=\"Task dependencies for SWIFT %s\"", git_revision()); + fprintf(f, "\t compound=true;\n"); + fprintf(f, "\t ratio=0.66;\n"); + fprintf(f, "\t node[nodesep=0.15];\n"); + + /* loop over all tasks */ + for (int i = 0; i < s->nr_tasks; i++) { + const struct task *ta = &s->tasks[i]; + + /* and their dependencies */ + for (int j = 0; j < ta->nr_unlock_tasks; j++) { + const struct task *tb = ta->unlock_tasks[j]; + + /* check if dependency already written */ + int written = 0; + + /* Current index */ + int ind = ta->type * task_subtype_count + ta->subtype; + ind *= 2 * max_nber_dep; + + int k = 0; + int *cur = &table[ind]; + while (k < max_nber_dep) { + + /* not written yet */ + if (cur[0] == -1) { + cur[0] = tb->type; + cur[1] = tb->subtype; + break; + } + + /* already written */ + if (cur[0] == tb->type && cur[1] == tb->subtype) { + written = 1; + break; + } + + k += 1; + cur = &cur[3]; + } + + /* max_nber_dep is too small */ + if (k == max_nber_dep) + error("Not enough memory, please increase max_nber_dep"); + + /* Not written yet => write it */ + if (!written) { + + /* text to write */ + char ta_name[200]; + char tb_name[200]; + + /* construct line */ + if (ta->subtype == task_subtype_none) + sprintf(ta_name, "%s", taskID_names[ta->type]); + else + sprintf(ta_name, "\"%s %s\"", taskID_names[ta->type], + subtaskID_names[ta->subtype]); + + if (tb->subtype == task_subtype_none) + sprintf(tb_name, "%s", taskID_names[tb->type]); + else + sprintf(tb_name, "\"%s %s\"", taskID_names[tb->type], + subtaskID_names[tb->subtype]); + + /* Write to the ffile */ + fprintf(f, "\t %s->%s;\n", ta_name, tb_name); + } + } + } + + /* Be clean */ + fprintf(f, "}"); + fclose(f); + free(table); + + if (verbose) + message("Printing task graph took %.3f %s.", + clocks_from_ticks(getticks() - tic), clocks_getunit()); +} + /** * 
@brief Split a hydrodynamic task if too large. * @@ -1319,23 +1439,41 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { if (t->subtype == task_subtype_tend) { t->buff = malloc(sizeof(struct pcell_step) * t->ci->pcell_size); cell_pack_end_step(t->ci, t->buff); - err = MPI_Isend( - t->buff, t->ci->pcell_size * sizeof(struct pcell_step), MPI_BYTE, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if ((t->ci->pcell_size * sizeof(struct pcell_step)) > + s->mpi_message_limit) + err = MPI_Isend( + t->buff, t->ci->pcell_size * sizeof(struct pcell_step), + MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend( + t->buff, t->ci->pcell_size * sizeof(struct pcell_step), + MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); } else if (t->subtype == task_subtype_xv || t->subtype == task_subtype_rho || t->subtype == task_subtype_gradient) { - err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if ((t->ci->count * sizeof(struct part)) > s->mpi_message_limit) + err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend(t->ci->parts, t->ci->count, part_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); // message( "sending %i parts with tag=%i from %i to %i." , // t->ci->count , t->flags , s->nodeID , t->cj->nodeID ); // fflush(stdout); } else if (t->subtype == task_subtype_gpart) { - err = MPI_Isend(t->ci->gparts, t->ci->gcount, gpart_mpi_type, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if ((t->ci->gcount * sizeof(struct gpart)) > s->mpi_message_limit) + err = MPI_Isend(t->ci->gparts, t->ci->gcount, gpart_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend(t->ci->gparts, t->ci->gcount, gpart_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); } else if (t->subtype == task_subtype_spart) { - err = MPI_Isend(t->ci->sparts, t->ci->scount, spart_mpi_type, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if ((t->ci->scount * sizeof(struct spart)) > s->mpi_message_limit) + err = MPI_Isend(t->ci->sparts, t->ci->scount, spart_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend(t->ci->sparts, t->ci->scount, spart_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); } else if (t->subtype == task_subtype_multipole) { t->buff = malloc(sizeof(struct gravity_tensors) * t->ci->pcell_size); cell_pack_multipoles(t->ci, t->buff); diff --git a/src/scheduler.h b/src/scheduler.h index c5ccbf43f3048dfb47d2d3eb7a7db6634b646700..1a75544de12b8402e553e3ae2b84e2d8a65c56e8 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -103,6 +103,10 @@ struct scheduler { /* The node we are working on. */ int nodeID; + /* Maximum size of task messages, in bytes, to sent using non-buffered + * MPI. 
*/ + size_t mpi_message_limit; + /* 'Pointer' to the seed for the random number generator */ pthread_key_t local_seed_pointer; }; @@ -168,5 +172,6 @@ void scheduler_dump_queue(struct scheduler *s); void scheduler_print_tasks(const struct scheduler *s, const char *fileName); void scheduler_clean(struct scheduler *s); void scheduler_free_tasks(struct scheduler *s); +void scheduler_write_dependencies(struct scheduler *s, int verbose); #endif /* SWIFT_SCHEDULER_H */ diff --git a/src/space.c b/src/space.c index eccf7e910bf558425463f4174c02e6264263244a..6051320e26c9c463cb85598ac8c106abd769d32e 100644 --- a/src/space.c +++ b/src/space.c @@ -2708,7 +2708,7 @@ void space_init_gparts(struct space *s, int verbose) { if (s->nr_gparts > 0) threadpool_map(&s->e->threadpool, space_init_gparts_mapper, s->gparts, - s->nr_gparts, sizeof(struct part), 0, NULL); + s->nr_gparts, sizeof(struct gpart), 0, NULL); if (verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), clocks_getunit());
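For reference, a minimal sketch of the size-based switch between MPI_Isend and MPI_Issend added to scheduler_enqueue() above (the helper name and the use of MPI_BYTE are illustrative assumptions, not part of the patch):

#include <mpi.h>
#include <stddef.h>

/* Messages larger than the limit take the (potentially buffered) MPI_Isend
 * path; smaller ones use the synchronous MPI_Issend so they are not
 * buffered, mirroring the branches in scheduler_enqueue(). */
static int send_with_limit(void *buff, size_t size, int dest, int tag,
                           size_t mpi_message_limit, MPI_Request *req) {
  if (size > mpi_message_limit)
    return MPI_Isend(buff, (int)size, MPI_BYTE, dest, tag, MPI_COMM_WORLD, req);
  else
    return MPI_Issend(buff, (int)size, MPI_BYTE, dest, tag, MPI_COMM_WORLD, req);
}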