diff --git a/examples/main.c b/examples/main.c
index e5417c628680bacdc8e896b36639aaa57aed1d28..19d2d7a7871200ba6e4902d5918a59511bf32b47 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -236,8 +236,8 @@ int main(int argc, char *argv[]) {
         "Executing a dry run. No i/o or time integration will be performed.");
 
   /* Report CPU frequency. */
+  cpufreq = clocks_get_cpufreq();
   if (myrank == 0) {
-    cpufreq = clocks_get_cpufreq();
     message("CPU frequency used for tick conversion: %llu Hz", cpufreq);
   }
 
@@ -252,6 +252,8 @@ int main(int argc, char *argv[]) {
     message("sizeof(struct part)  is %4zi bytes.", sizeof(struct part));
     message("sizeof(struct xpart) is %4zi bytes.", sizeof(struct xpart));
     message("sizeof(struct gpart) is %4zi bytes.", sizeof(struct gpart));
+    message("sizeof(struct task)  is %4zi bytes.", sizeof(struct task));
+    message("sizeof(struct cell)  is %4zi bytes.", sizeof(struct cell));
   }
 
   /* How vocal are we ? */
@@ -514,19 +516,23 @@ int main(int argc, char *argv[]) {
           /* Open file and position at end. */
           file_thread = fopen(dumpfile, "a");
 
-          fprintf(file_thread, " %03i 0 0 0 0 %lli 0 0 0 0\n", myrank,
-                  e.tic_step);
+          fprintf(file_thread, " %03i 0 0 0 0 %lli %lli 0 0 0 0 %lli\n", myrank,
+                  e.tic_step, e.toc_step, cpufreq);
           int count = 0;
           for (int l = 0; l < e.sched.nr_tasks; l++)
             if (!e.sched.tasks[l].skip && !e.sched.tasks[l].implicit) {
-              fprintf(file_thread, " %03i %i %i %i %i %lli %lli %i %i %i\n",
-                      myrank, e.sched.tasks[l].rid, e.sched.tasks[l].type,
+              fprintf(file_thread, " %03i %i %i %i %i %lli %lli %i %i %i %i %i\n",
+                      myrank, e.sched.tasks[l].last_rid, e.sched.tasks[l].type,
                       e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL),
                       e.sched.tasks[l].tic, e.sched.tasks[l].toc,
                       (e.sched.tasks[l].ci != NULL) ? e.sched.tasks[l].ci->count
                                                     : 0,
                       (e.sched.tasks[l].cj != NULL) ? e.sched.tasks[l].cj->count
                                                     : 0,
+                      (e.sched.tasks[l].ci != NULL) ? e.sched.tasks[l].ci->gcount
+                                                    : 0,
+                      (e.sched.tasks[l].cj != NULL) ? e.sched.tasks[l].cj->gcount
+                                                    : 0,
                       e.sched.tasks[l].flags);
               fflush(stdout);
               count++;
@@ -545,15 +551,20 @@ int main(int argc, char *argv[]) {
       snprintf(dumpfile, 30, "thread_info-step%d.dat", j);
       FILE *file_thread;
       file_thread = fopen(dumpfile, "w");
+      /* Header row: step tic/toc and CPU frequency for the plots. */
+      fprintf(file_thread, " %i %i %i %i %lli %lli %i %i %i %lli\n", -2, -1, -1, 1,
+              e.tic_step, e.toc_step, 0, 0, 0, cpufreq);
       for (int l = 0; l < e.sched.nr_tasks; l++)
         if (!e.sched.tasks[l].skip && !e.sched.tasks[l].implicit)
           fprintf(
-              file_thread, " %i %i %i %i %lli %lli %i %i\n",
-              e.sched.tasks[l].rid, e.sched.tasks[l].type,
+              file_thread, " %i %i %i %i %lli %lli %i %i %i %i\n",
+              e.sched.tasks[l].last_rid, e.sched.tasks[l].type,
               e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL),
               e.sched.tasks[l].tic, e.sched.tasks[l].toc,
               (e.sched.tasks[l].ci == NULL) ? 0 : e.sched.tasks[l].ci->count,
-              (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->count);
+              (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->count,
+              (e.sched.tasks[l].ci == NULL) ? 0 : e.sched.tasks[l].ci->gcount,
+              (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->gcount);
       fclose(file_thread);
 #endif
     }
diff --git a/examples/plot_tasks.py b/examples/plot_tasks.py
index c96d063e0bf1adf614a447f0dd524302a070e9dd..ffabdf06f9324fba35770e7fbba4cfadc8add770 100755
--- a/examples/plot_tasks.py
+++ b/examples/plot_tasks.py
@@ -35,9 +35,6 @@ import pylab as pl
 import numpy as np
 import sys
 
-#  CPU ticks per second.
-CPU_CLOCK = 2.7e9
-
 #  Basic plot configuration.
 PLOT_PARAMS = {"axes.labelsize": 10,
                "axes.titlesize": 10,
@@ -108,7 +105,7 @@ infile = sys.argv[1]
 outpng = sys.argv[2]
 delta_t = 0
 if len( sys.argv ) == 4:
-    delta_t = int(sys.argv[3]) * CPU_CLOCK / 1000
+    delta_t = int(sys.argv[3])
 
 #  Read input.
 data = pl.loadtxt( infile )
@@ -116,20 +113,31 @@ data = pl.loadtxt( infile )
 nthread = int(max(data[:,0])) + 1
 print "Number of threads:", nthread
 
+# Recover the step start/end ticks and the CPU frequency from the first row.
+full_step = data[0,:]
+tic_step = int(full_step[4])
+toc_step = int(full_step[5])
+CPU_CLOCK = float(full_step[-1])
+data = data[1:,:]
+
+print "CPU frequency:", CPU_CLOCK / 1.e9
+
 # Avoid start and end times of zero.
 data = data[data[:,4] != 0]
 data = data[data[:,5] != 0]
 
-# Calculate the time range, it not given.
+# Calculate the time range, if not given.
+delta_t = delta_t * CPU_CLOCK / 1000
 if delta_t == 0:
     dt = max(data[:,5]) - min(data[:,4])
     if dt > delta_t:
         delta_t = dt
 
 # Once more doing the real gather and plots this time.
-start_t = min(data[:,4])
+start_t = tic_step 
 data[:,4] -= start_t
 data[:,5] -= start_t
+end_t = (toc_step - start_t) / CPU_CLOCK * 1000
 
 tasks = {}
 tasks[-1] = []
@@ -147,7 +155,7 @@ for line in range(num_lines):
     tasks[thread][-1]["tic"] = tic
     tasks[thread][-1]["toc"] = toc
     tasks[thread][-1]["t"] = (toc + tic)/ 2
-
+    
 combtasks = {}
 combtasks[-1] = []
 for i in range(nthread):
@@ -173,11 +181,11 @@ for thread in range(nthread):
             lasttype = task["type"]
         else:
             combtasks[thread][-1]["toc"] = task["toc"]
-
+            
 typesseen = []
 fig = pl.figure()
 ax = fig.add_subplot(1,1,1)
-ax.set_xlim(0, delta_t * 1.03 * 1000 / CPU_CLOCK)
+ax.set_xlim(-delta_t * 0.03 * 1000 / CPU_CLOCK, delta_t * 1.03 * 1000 / CPU_CLOCK)
 ax.set_ylim(0, nthread)
 tictoc = np.zeros(2)
 for i in range(nthread):
@@ -222,6 +230,10 @@ ax.fill_between([0, 0], nthread+0.5, nthread + nrow + 0.5, facecolor="white")
 ax.set_ylim(0, nthread + nrow + 1)
 ax.legend(loc=1, shadow=True, mode="expand", ncol=5)
 
+# Mark the start and end of the time-step with dashed vertical lines.
+ax.plot([0, 0], [0, nthread + nrow + 1], 'k--', linewidth=1)
+ax.plot([end_t, end_t], [0, nthread + nrow + 1], 'k--', linewidth=1)
+
 ax.set_xlabel("Wall clock time [ms]")
 ax.set_ylabel("Thread ID" )
 ax.set_yticks(pl.array(range(nthread)), True)
diff --git a/examples/plot_tasks_MPI.py b/examples/plot_tasks_MPI.py
index ae84b0177bfa01d4bb4c2c9c3e44c088b8ae7776..cf591028b6dc3f724847b52c9efac4355d27f87e 100755
--- a/examples/plot_tasks_MPI.py
+++ b/examples/plot_tasks_MPI.py
@@ -41,9 +41,6 @@ import pylab as pl
 import numpy as np
 import sys
 
-#  CPU ticks per second.
-CPU_CLOCK = 2.7e9
-
 #  Basic plot configuration.
 PLOT_PARAMS = {"axes.labelsize": 10,
                "axes.titlesize": 10,
@@ -115,11 +112,20 @@ infile = sys.argv[1]
 outbase = sys.argv[2]
 delta_t = 0
 if len( sys.argv ) == 4:
-    delta_t = int(sys.argv[3]) * CPU_CLOCK / 1000
-
+    delta_t = int(sys.argv[3])
+    
 #  Read input.
 data = pl.loadtxt( infile )
 
+# Recover the step start/end ticks and the CPU frequency from the first row.
+full_step = data[0,:]
+tic_step = int(full_step[5])
+toc_step = int(full_step[6])
+CPU_CLOCK = float(full_step[-1])
+
+print "CPU frequency:", CPU_CLOCK / 1.e9
+
+
 nranks = int(max(data[:,0])) + 1
 print "Number of ranks:", nranks
 nthread = int(max(data[:,1])) + 1
@@ -132,6 +138,7 @@ sdata = sdata[sdata[:,6] != 0]
 # Each rank can have different clock (compute node), but we want to use the
 # same delta times range for comparisons, so we suck it up and take the hit of
 # precalculating this, unless the user knows better.
+delta_t = delta_t * CPU_CLOCK / 1000
 if delta_t == 0:
     for rank in range(nranks):
         data = sdata[sdata[:,0] == rank]
@@ -143,16 +150,22 @@ if delta_t == 0:
 for rank in range(nranks):
     data = sdata[sdata[:,0] == rank]
 
-    start_t = min(data[:,5])
+    full_step = data[0,:]
+    tic_step = int(full_step[5])
+    toc_step = int(full_step[6])
+    data = data[1:,:]
+
+    start_t = tic_step
     data[:,5] -= start_t
     data[:,6] -= start_t
+    end_t = (toc_step - start_t) / CPU_CLOCK * 1000
 
     tasks = {}
     tasks[-1] = []
     for i in range(nthread):
         tasks[i] = []
 
-    num_lines = pl.size(data) / 10
+    num_lines = pl.shape(data)[0]
     for line in range(num_lines):
         thread = int(data[line,1])
         tasks[thread].append({})
@@ -193,7 +206,7 @@ for rank in range(nranks):
     typesseen = []
     fig = pl.figure()
     ax = fig.add_subplot(1,1,1)
-    ax.set_xlim(0, delta_t * 1.03 * 1000 / CPU_CLOCK)
+    ax.set_xlim(-delta_t * 0.03 * 1000 / CPU_CLOCK, delta_t * 1.03 * 1000 / CPU_CLOCK)
     ax.set_ylim(0, nthread)
     tictoc = np.zeros(2)
     for i in range(nthread):
@@ -238,6 +251,10 @@ for rank in range(nranks):
     ax.set_ylim(0, nthread + nrow + 1)
     ax.legend(loc=1, shadow=True, mode="expand", ncol=5)
 
+    # Mark the start and end of the time-step with dashed vertical lines.
+    ax.plot([0, 0], [0, nthread + nrow + 1], 'k--', linewidth=1)
+    ax.plot([end_t, end_t], [0, nthread + nrow + 1], 'k--', linewidth=1)
+
     ax.set_xlabel("Wall clock time [ms]")
     ax.set_ylabel("Thread ID for MPI Rank " + str(rank) )
     ax.set_yticks(pl.array(range(nthread)), True)
diff --git a/src/engine.c b/src/engine.c
index 76243b1e481191c095d223a6db973aaa82c227a0..819e6907c5e62864fb69517502143101d90d442c 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -1729,7 +1729,6 @@ void engine_prepare(struct engine *e) {
     error("Failed to aggregate the rebuild flag across nodes.");
   rebuild = buff;
 #endif
-  e->tic_step = getticks();
 
   /* Did this not go through? */
   if (rebuild) {
@@ -1984,6 +1983,7 @@ void engine_step(struct engine *e) {
   double e_pot = 0.0, e_int = 0.0, e_kin = 0.0;
   float mom[3] = {0.0, 0.0, 0.0};
   float ang[3] = {0.0, 0.0, 0.0};
+  double snapshot_drift_time = 0.;
   struct space *s = e->s;
 
   TIMER_TIC2;
@@ -1991,6 +1991,8 @@ void engine_step(struct engine *e) {
   struct clocks_time time1, time2;
   clocks_gettime(&time1);
 
+  e->tic_step = getticks();
+    
   /* Collect the cell data. */
   for (int k = 0; k < s->nr_cells; k++)
     if (s->cells[k].nodeID == e->nodeID) {
@@ -2057,6 +2059,7 @@ void engine_step(struct engine *e) {
     e->time = e->ti_current * e->timeBase + e->timeBegin;
     e->timeOld = e->ti_old * e->timeBase + e->timeBegin;
     e->timeStep = (e->ti_current - e->ti_old) * e->timeBase;
+    snapshot_drift_time = e->timeStep;
 
     /* Drift everybody to the snapshot position */
     engine_launch(e, e->nr_threads, 1 << task_type_drift, 0);
@@ -2074,7 +2077,7 @@ void engine_step(struct engine *e) {
   e->step += 1;
   e->time = e->ti_current * e->timeBase + e->timeBegin;
   e->timeOld = e->ti_old * e->timeBase + e->timeBegin;
-  e->timeStep = (e->ti_current - e->ti_old) * e->timeBase;
+  e->timeStep = (e->ti_current - e->ti_old) * e->timeBase + snapshot_drift_time;
 
   /* Drift everybody */
   engine_launch(e, e->nr_threads, 1 << task_type_drift, 0);
@@ -2147,6 +2150,7 @@ void engine_step(struct engine *e) {
   clocks_gettime(&time2);
 
   e->wallclock_time = (float)clocks_diff(&time1, &time2);
+  e->toc_step = getticks();
 }
 
 /**
diff --git a/src/engine.h b/src/engine.h
index 9ef7d57599d30aad5e8000e64148812493299d23..cc6ed9bb038667d4bd548f33dafad07176be0750 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -161,8 +161,8 @@ struct engine {
   struct proxy *proxies;
   int nr_proxies, *proxy_ind;
 
-  /* Tic at the start of a step. */
-  ticks tic_step;
+  /* Tic/toc at the start/end of a step. */
+  ticks tic_step, toc_step;
 
   /* Wallclock time of the last time-step */
   float wallclock_time;
diff --git a/src/runner.c b/src/runner.c
index 3349fa35138c717d0994d6cbcce16657641a5a27..5c79c5fe08af41306d4f59a41a213fd4ceed105a 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -1327,6 +1327,7 @@ void *runner_main(void *data) {
       struct cell *ci = t->ci;
       struct cell *cj = t->cj;
       t->rid = r->cpuid;
+      t->last_rid = r->cpuid;
 
       /* Different types of tasks... */
       switch (t->type) {
diff --git a/src/scheduler.c b/src/scheduler.c
index d1d343240b37f5afd5f41fecacf106b0e85f726f..278b1d0cedb7dde293bda1765120111e1be27903 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -687,6 +687,8 @@ struct task *scheduler_addtask(struct scheduler *s, int type, int subtype,
   t->tic = 0;
   t->toc = 0;
   t->nr_unlock_tasks = 0;
+  t->rid = -1;
+  t->last_rid = -1;
 
   /* Init the lock. */
   lock_init(&t->lock);
diff --git a/src/task.h b/src/task.h
index 25cc886f4b38456a0431fb6c7d0b7b1864053dd9..3e6bdc7370b005b32bafcbb20dd2ddbf807996ae 100644
--- a/src/task.h
+++ b/src/task.h
@@ -85,7 +85,7 @@ struct task {
   MPI_Request req;
 #endif
 
-  int rid;
+  int rid, last_rid;
   ticks tic, toc;
 
   int nr_unlock_tasks;