diff --git a/src/CUDA/queue_cuda.h b/src/CUDA/queue_cuda.h
index 99a7a0dd357bea2d5ef09757678e48d91de520f7..6e4a19a00fb6f17614f57e3e72e30c9a681e8d71 100644
--- a/src/CUDA/queue_cuda.h
+++ b/src/CUDA/queue_cuda.h
@@ -34,6 +34,8 @@ extern "C" {
 #define type_unload task_type_unload
 #define type_implicit_load task_type_implicit_load
 #define type_implicit_unload task_type_implicit_unload
+#define type_recv_load task_type_recv_load
+#define type_send_unload task_type_send_unload
 
 const int num_gpu_types = 5;
 const int gpu_work_task_array[num_gpu_types] = {task_type_self, task_type_pair, task_type_sub_self,
diff --git a/src/CUDA/runner_cuda_main.cu b/src/CUDA/runner_cuda_main.cu
index 9c0095ae79c09b0643a4fe0bc7606ddd259110b2..772f61d3d2e48743ea34d49aa41d6bf5718ff9b0 100644
--- a/src/CUDA/runner_cuda_main.cu
+++ b/src/CUDA/runner_cuda_main.cu
@@ -475,10 +475,10 @@ __device__ void doself_density(struct cell_cuda *ci) {
   /* Is the cell active? */
   if (!cuda_cell_is_active(ci)) {
     if(threadIdx.x ==0)
-    printf(
+/*    printf(
         "Cell isn't active..., ti_end_min=%lli, ti_current=%lli, "
         "max_active_bin=%i, cell_id = %lli\n",
-        ci->ti_end_min, ti_current, max_active_bin, (ci-cells_cuda));
+        ci->ti_end_min, ti_current, max_active_bin, (ci-cells_cuda));*/
     return;
   }
 
@@ -2229,6 +2229,9 @@ __host__ void create_transfer_tasks(struct cell *c, int *k,
     tasks_host[*k].skip = 0;
     tasks_host[*k].implicit = 0;
     tasks_host[*k].task = NULL;
+#ifdef REDUCED_TRANSFER
+    tasks_host[*k].cell = c;
+#endif
     /* The load implicit tasks unlocks the parent's task */
     if (parent_load_task >= 0) {
       tasks_host[*k].unlocks[tasks_host[*k].nr_unlock_tasks++] =
@@ -2254,7 +2257,9 @@ __host__ void create_transfer_tasks(struct cell *c, int *k,
     tasks_host[*k].skip = 0;
     tasks_host[*k].implicit = 0;
     tasks_host[*k].task = NULL;
-
+#ifdef REDUCED_TRANSFER
+    tasks_host[*k].cell = c;
+#endif
     /* The unload implicit task is unlocked by the parent task */
     if (parent_unload_task >= 0) {
       tasks_host[parent_unload_task]
@@ -2284,6 +2289,9 @@ __host__ void create_transfer_tasks(struct cell *c, int *k,
     tasks_host[*k].skip = 0;
     tasks_host[*k].implicit = 0;
     tasks_host[*k].task = NULL;
+#ifdef REDUCED_TRANSFER
+    tasks_host[*k].cell = c;
+#endif
     /* This load task unlocks the parent's task. */
     if (parent_load_task >= 0) {
       tasks_host[*k].unlocks[tasks_host[*k].nr_unlock_tasks++] =
@@ -2305,6 +2313,9 @@ __host__ void create_transfer_tasks(struct cell *c, int *k,
     tasks_host[*k].skip = 0;
     tasks_host[*k].implicit = 0;
     tasks_host[*k].task = NULL;
+#ifdef REDUCED_TRANSFER
+    tasks_host[*k].cell = c;
+#endif
     /* The unload task is unlocked by the parent task */
     if (parent_unload_task >= 0) {
       tasks_host[parent_unload_task]
@@ -2488,7 +2499,7 @@ __host__ void update_tasks(struct engine *e) {
   
 /*  Relies on assumption implicit unloads are always before unloads in host_tasks, which i believe to be true by conscruction.
    for(int i = 0; i < nr_gpu_tasks; i++){
-    if(host_tasks[i].type == type_unload && host_tasks[i].type == type_implicit_unload){
+    if(host_tasks[i].type == type_unload || host_tasks[i].type == type_implicit_unload){
       if(host_tasks[i].wait==1){
         host_tasks[i].skip = 1;
         task_count--;
@@ -2496,16 +2507,24 @@ __host__ void update_tasks(struct engine *e) {
         int *unlocks = host_unlock_copy + (temp_t->unlocks-host_unlock_pointer);
         for(int ii = 0; ii < temp_t->nr_unlock_tasks; ii++){
           if(!host_tasks[unlocks[ii].skip)
-            host_tasks[unlock[ii]].wait--;
+            host_tasks[unlocks[ii]].wait--;
         }
         *Find the corresponding load task*
-        Have to search the cells for this at the moment.
-
+         struct task_cuda *l_task = &host_tasks[host_tasks[i].cell->load_task];
+         l_task->skip = 1;
+         task_count--;
+         int *l_unlocks = host_unlock_copy + (l_task->unlocks-host_unlock_pointer);
+         for(int ii=0; ii < l_task->nr_unlock_tasks; ii++){
+           if(!host_tasks[l_unlocks[ii]].skip)
+             host_tasks[l_unlocks[ii]].wait--;
+         }
       }
         
     }
   }*/
 
+  /* TODO Reset the waits again.*/
+
   cudaErrCheck(cudaMemcpyToSymbol(tot_num_tasks, &task_count, sizeof(int)));
   /* Reset the queue data.*/
   int qsize;
diff --git a/src/CUDA/task_cuda.h b/src/CUDA/task_cuda.h
index d1583a344e73feab7cdd3130d07bc237004cbeb0..ae709412740d0ca242de8664d9199c633297ab11 100644
--- a/src/CUDA/task_cuda.h
+++ b/src/CUDA/task_cuda.h
@@ -46,6 +46,11 @@ struct task_cuda {
   /* Size of unlock array during initialisation. */
   int size_unlocks;
 
+#ifdef REDUCED_TRANSFER
+  /* Pointer to the loaded/unloaded cell for load/unload tasks*/
+  struct cell *cell;
+#endif
+
 #ifdef CUDA_TASK_TIMERS
   /* Executing block*/
   int blockID;
diff --git a/src/task.h b/src/task.h
index 13fcc5cb363d69773e37b48d6db8bfdf0aad86fb..c6a163bb9a4e593567f8543100077d88d7fc5e7e 100644
--- a/src/task.h
+++ b/src/task.h
@@ -43,6 +43,8 @@ enum task_types {
   task_type_unload = -102,
   task_type_implicit_load = -103,
   task_type_implicit_unload = -104,
+  task_type_recv_load = -201,
+  task_type_send_unload = -202,
   task_type_none = 0,
   task_type_sort,
   task_type_self,