diff --git a/configure.ac b/configure.ac
index 929e21900933b64dc5349933aaa959b4a5de0ec2..13612d1c3bdb9e1792e60c3a289374d1c0ebf54b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -424,6 +424,7 @@ if test "x$with_cuda" != "xno"; then
    if test "x$have_cuda" != "xno"; then
       AC_DEFINE([HAVE_CUDA], 1, [The CUDA compiler is installed.])
    fi
+   CFLAGS="${CFLAGS} -m64"
 fi
 AC_SUBST(CUDA_CFLAGS)
 AC_SUBST(CUDA_LIBS)
diff --git a/src/CUDA/Makefile.am b/src/CUDA/Makefile.am
index 8a27092277f24594909507c545dba6e5e1077006..59cff8bce9cc22c7134f231f2ed47aa8c867d23c 100644
--- a/src/CUDA/Makefile.am
+++ b/src/CUDA/Makefile.am
@@ -5,7 +5,7 @@ EXTRA_DIST = $(SOURCES_CUDA) $(include_HEADERS)
 if HAVECUDA
 
 AM_CFLAGS = -I.. $(HDF5_CPPFLAGS) -g
-CUDA_MYFLAGS = -D_FORCE_INLINES -O0 -g -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -G -ccbin=gcc-4.8 -m64
+CUDA_MYFLAGS = -D_FORCE_INLINES -O3 -g -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -ccbin=gcc-4.8 -m64
 #-dc
 
 # Assign a "safe" version number
diff --git a/src/CUDA/runner_cuda_main.cu b/src/CUDA/runner_cuda_main.cu
index 3de932bc109fc4f2eea39f802ac2904272226ba0..7ec692675b44dfcf031434f837bc08f9280d57da 100644
--- a/src/CUDA/runner_cuda_main.cu
+++ b/src/CUDA/runner_cuda_main.cu
@@ -437,7 +437,6 @@ __device__ void doself_density(struct cell_cuda *ci) {
 
     /* If the particle isn't active skip it. */
     if (!cuda_part_is_active(pid)) {
-      printf("Particle %i isn't active\n", pid);
       continue;
     }
 
@@ -1331,7 +1330,6 @@ __device__ void do_ghost(struct cell_cuda *c) {
       }
       /* Did we get the right number of neighbours? */
       if (fabsf(h_new - h_old) > eps * h_old) {
-
         cuda_parts.h[i] = h_new;
 
         // If below absolute max try again
@@ -1341,7 +1339,6 @@ __device__ void do_ghost(struct cell_cuda *c) {
         } else {
           /* The particle is a lost cause */
           cuda_parts.h[i] = hydro_h_max;
-
           /* Do some damage control if no neighbours were found */
           if (cuda_parts.wcount[i] == cuda_kernel_root * kernel_norm)
             cuda_hydro_part_has_no_neighbours(i);
@@ -1464,8 +1461,11 @@ __global__ void swift_device_kernel() {
         doself_force(ci);
       }
     } else if (type == task_type_ghost) {
-      struct cell_cuda *ci = &cells_cuda[tasks[tid].ci];
-      do_ghost(ci);
+      if(! tasks[tid].implicit){
+        struct cell_cuda *ci = &cells_cuda[tasks[tid].ci];
+
+        do_ghost(ci);
+      }
     }
 
     __syncthreads();
@@ -1893,7 +1893,7 @@ __host__ void update_tasks(struct engine *e) {
                                     sizeof(struct queue_cuda)));
 
   int *data = (int *)malloc(sizeof(int) * qsize);
-  int nr_unload;
+  int nr_unload=0;
   unload_host.count = 0;
   for (int i = 0; i < nr_gpu_tasks; i++) {
     if (host_tasks[i].type <= type_unload &&
@@ -2050,6 +2050,7 @@ __host__ int is_gpu_task(struct task *t) {
   for (int i = 0; i < num_gpu_types; i++) {
     if (t->type == gpu_work_task_array[i]) result = 1;
   }
+  if(t->subtype != task_subtype_none && t->subtype != task_subtype_density && t->subtype != task_subtype_force) result = 0;
   return result;
 }
 
@@ -2079,9 +2080,12 @@ __host__ void create_tasks(struct engine *e) {
   /* Create the task to call the GPU kernel */
   struct task *gpu_mega = scheduler_addtask(sched, task_type_GPU_mega,
                             task_subtype_none, 0, 0, NULL, NULL );
+  e->s->GPU_task = gpu_mega;
   /* Create a task for the GPU work call on the host */
   /* Loop through tke tasks and sort the unlocks... */
   for(i = 0; i < sched->nr_tasks; i++) {
+    if(&sched->tasks[i] == gpu_mega)
+      continue;
      if(!sched->tasks[i].gpu){
        /* Loop through the non-gpu tasks and move the dependency to GPU tasks to the mega.*/
        for(int j = 0; j < sched->tasks[i].nr_unlock_tasks; j++){
@@ -2095,7 +2099,7 @@ __host__ void create_tasks(struct engine *e) {
        }
      }
   }
-
+//  gpu_mega->skip = 0;
   scheduler_set_unlocks(sched);
   /* We also create a load and unload task for every cell in the system */
   num_gpu_tasks += s->tot_cells * 2;
@@ -2114,7 +2118,6 @@ __host__ void create_tasks(struct engine *e) {
   k = 0;
   /* Create the tasks. */
   for (i = 0; i < sched->nr_tasks; i++) {
-
     if (is_gpu_task(&sched->tasks[i])) {
       /* Copy the data to the CUDA task. */
       struct task *t = &sched->tasks[i];
@@ -2175,6 +2178,11 @@ __host__ void create_tasks(struct engine *e) {
       }
     }
 
+    /* Ghost tasks also unlock the unload tasks. */
+    if(t->type == task_type_ghost)
+      deps++;
+
+
     /* Allocate some CPU memory for the unlocks. */
     t->unlocks = (int *)malloc(sizeof(int) * deps);
     t->size_unlocks = deps;
@@ -2193,6 +2201,9 @@ __host__ void create_tasks(struct engine *e) {
         t->unlocks[t->nr_unlock_tasks++] = t->task->cj->unload_task;
       }
     }
+    if(t->type == task_type_ghost) {
+        t->unlocks[t->nr_unlock_tasks++] = t->task->ci->unload_task;
+    }
 
     /* If it is a density task then it is unlocked by the load task. */
     if (t->subtype == task_subtype_density) {
@@ -2650,7 +2661,6 @@ __host__ void create_tasks(struct engine *e) {
 }
 
 __host__ void run_cuda() {
-  printf("running cuda\n");
   swift_device_kernel << <num_blocks, num_cuda_threads>>> ();
   cudaErrCheck(cudaDeviceSynchronize());
 }
@@ -3029,6 +3039,7 @@ __host__ void free_cell(void *cell) { cudaErrCheck(cudaFreeHost(cell)); }
 __host__ void printTaskTimers(){
 #ifdef CUDA_TASK_TIMERS
 /* Load the task structures back from the GPU */
+  int nr_gpu_tasks, nr_tasks;
   cudaErrCheck(cudaMemcpyFromSymbol(&nr_gpu_tasks, cuda_numtasks, sizeof(int)));
   cudaErrCheck(cudaMemcpyFromSymbol(&nr_tasks, tot_num_tasks, sizeof(int)));
   struct task_cuda *gpu_pointer = NULL;
@@ -3041,11 +3052,11 @@ __host__ void printTaskTimers(){
                           sizeof(struct task_cuda ) * nr_gpu_tasks,
                           cudaMemcpyDeviceToHost));
   FILE *file = fopen("CUDA_TASKTIMERS", "w");
-  for(int i = 0; i < ...; i++){
+  for(int i = 0; i < nr_gpu_tasks; i++){
     struct task_cuda *t = &host_tasks[i];
     fprintf(file, "%i %i %i %lli %lli\n",t->type, t->subtype, t->blockID, t->tic, t->toc);
   } 
-
+  fclose(file);
   free(host_tasks);
 #else
   error("CUDA Task Timers unavailable, compile with CUDA_TASK_TIMERS defined.");
diff --git a/src/CUDA/test125cells.c b/src/CUDA/test125cells.c
index 154a5f2bafaa0bbbc4959af099aeed15cf1298a6..9abe414c6ee9ae6b98f045aa3b3d9a3d2f0d01a1 100644
--- a/src/CUDA/test125cells.c
+++ b/src/CUDA/test125cells.c
@@ -843,8 +843,8 @@ int main(int argc, char *argv[]) {
   dump_particle_fields(outputFileName, main_cell, solution, 0);
 
   /* Clean things to make the sanitizer happy ... */
-  for (int i = 0; i < 125; ++i) clean_up(cells[i]);
-  free(solution);
+//  for (int i = 0; i < 125; ++i) clean_up(cells[i]);
+//  free(solution);
 
   return 0;
 }
diff --git a/src/engine.c b/src/engine.c
index f22d5be707c6302df8e1e2140cb0840200536baa..f889ab3062d66cfdd13c533b21d497e76d00f8f3 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -2908,7 +2908,11 @@ void engine_print_task_counts(struct engine *e) {
   int counts[task_type_count + 1];
   for (int k = 0; k <= task_type_count; k++) counts[k] = 0;
   for (int k = 0; k < nr_tasks; k++) {
+#ifdef WITH_CUDA
+    if (tasks[k].skip || tasks[k].gpu)
+#else
     if (tasks[k].skip)
+#endif
       counts[task_type_count] += 1;
     else
       counts[(int)tasks[k].type] += 1;
@@ -2945,7 +2949,6 @@ void engine_rebuild(struct engine *e, int clean_h_values) {
 
   const ticks tic = getticks();
 
-  message("We got to a rebuild");
   /* Clear the forcerebuild flag, whatever it was. */
   e->forcerebuild = 0;
 
@@ -2963,6 +2966,9 @@ void engine_rebuild(struct engine *e, int clean_h_values) {
   /* Re-build the tasks. */
   engine_maketasks(e);
 
+#ifdef WITH_CUDA
+  create_tasks(e);
+#endif
   /* Run through the tasks and mark as skip or not. */
   if (engine_marktasks(e))
     error("engine_marktasks failed after space_rebuild.");
@@ -3011,11 +3017,6 @@ void engine_prepare(struct engine *e) {
   if (e->forcerebuild){
    
     engine_rebuild(e, 0);
-#ifdef WITH_CUDA
-    message("Creating CUDA tasks\n");
-    printf(" s = {dim= {%f, %f, %f}, periodic = %i, hs = {<No data fields>}, gravity = %i, width = {%f, %f, %f}, iwidth = {%f, %f, %f}, cell_min = %e, dx_max = %e, cdim = {%i, %i, %i,}, maxdepth = %i, nr_cells = %i, tot_cells = %i\n", e->s->dim[0], e->s->dim[1], e->s->dim[2], e->s->periodic, e->s->gravity, e->s->width[0], e->s->width[1], e->s->width[2], e->s->iwidth[0], e->s->iwidth[1], e->s->iwidth[2], e->s->cell_min, e->s->dx_max, e->s->cdim[0], e->s->cdim[1], e->s->cdim[2], e->s->maxdepth, e->s->nr_cells, e->s->tot_cells);
-    create_tasks(e);
-#endif
   }
 
   /* Unskip active tasks and check for rebuild */
@@ -3385,14 +3386,14 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs,
     hydro_init_part(&s->parts[k], &e->s->hs);
   for (size_t k = 0; k < s->nr_gparts; k++) gravity_init_gpart(&s->gparts[k]);
 
+#ifdef WITH_CUDA
+  update_tasks(e);
+#endif
   /* Now, launch the calculation */
   TIMER_TIC;
   engine_launch(e);
   TIMER_TOC(timer_runners);
 
-#ifdef WITH_CUDA
-//  run_cuda();
-#endif
 
   /* Apply some conversions (e.g. internal energy -> entropy) */
   if (!flag_entropy_ICs) {
@@ -3448,6 +3449,9 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs,
     gravity_exact_force_compute(e->s, e);
 #endif
 
+#ifdef WITH_CUDA
+  update_tasks(e);
+#endif
   /* Run the 0th time-step */
   engine_launch(e);
 
@@ -3622,7 +3626,6 @@ void engine_step(struct engine *e) {
 #endif
 
 #ifdef WITH_CUDA
-  message("Updating tasks\n");
   update_tasks(e);
 #endif
   /* Start all the tasks. */
@@ -3630,11 +3633,6 @@ void engine_step(struct engine *e) {
   engine_launch(e);
   TIMER_TOC(timer_runners);
 
-#ifdef WITH_CUDA
-  message("Running cuda\n");
-  run_cuda();
-#endif
-
 #ifdef SWIFT_GRAVITY_FORCE_CHECKS
   /* Check the accuracy of the gravity calculation */
   if (e->policy & engine_policy_self_gravity)
@@ -3725,6 +3723,9 @@ void engine_unskip(struct engine *e) {
   if (e->s->periodic && (e->policy & engine_policy_self_gravity))
     scheduler_activate(&e->sched, e->s->grav_top_level);
 
+#ifdef WITH_CUDA
+  scheduler_activate(&e->sched, e->s->GPU_task);
+#endif
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
diff --git a/src/runner.c b/src/runner.c
index 01489eefcdb3ac7400b2c316df7a674a4604b3bd..750f99f8a6834f42643a0eaf2e1475d5af72d1fb 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -1753,12 +1753,12 @@ void *runner_main(void *data) {
   struct engine *e = r->e;
   struct scheduler *sched = &e->sched;
 
+
   /* Main loop. */
   while (1) {
 
     /* Wait at the barrier. */
     engine_barrier(e);
-
     /* Re-set the pointer to the previous task, as there is none. */
     struct task *t = NULL;
     struct task *prev = NULL;
@@ -1775,7 +1775,9 @@ void *runner_main(void *data) {
         TIMER_TOC(timer_gettask);
 
         /* Did I get anything? */
-        if (t == NULL) break;
+        if (t == NULL){
+          break;
+        }
       }
 
       /* Get the cells. */
@@ -1907,8 +1909,10 @@ void *runner_main(void *data) {
           break;
 
         case task_type_sub_pair:
-          if (t->subtype == task_subtype_density)
+          if (t->subtype == task_subtype_density){
             runner_dosub_pair1_density(r, ci, cj, t->flags, 1);
+            message("Found a density task...");
+          }
 #ifdef EXTRA_HYDRO_LOOP
           else if (t->subtype == task_subtype_gradient)
             runner_dosub_pair1_gradient(r, ci, cj, t->flags, 1);
diff --git a/src/scheduler.c b/src/scheduler.c
index ef9432de6b0591ba78a4de1aadff5051e7eb4f97..a02e0dd9c3f8d390e3ec7189b9ad7ec5e0f01689 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -795,8 +795,10 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
   t->toc = 0;
 #endif
 
+
 #ifdef WITH_CUDA
   t->cuda_task = -1;
+  t->gpu = 0;
 #endif
 
   /* Add an index for it. */
@@ -816,10 +818,19 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
 void scheduler_set_unlocks(struct scheduler *s) {
 
   /* Store the counts for each task. */
+#ifdef WITH_CUDA
+  int *counts;
+  if ((counts = ( int *)malloc(sizeof(int) * s->nr_tasks)) == NULL)
+#else
   short int *counts;
-  if ((counts = (short int *)malloc(sizeof(short int) * s->nr_tasks)) == NULL)
+  if ((counts = (short int *)malloc(sizeof(short int) * s->nr_tasks)) == NULL)
+#endif
     error("Failed to allocate temporary counts array.");
+#ifdef WITH_CUDA
+  bzero(counts, sizeof(int) * s->nr_tasks);
+#else
   bzero(counts, sizeof(short int) * s->nr_tasks);
+#endif
   for (int k = 0; k < s->nr_unlocks; k++) {
     counts[s->unlock_ind[k]] += 1;
 
@@ -851,15 +862,21 @@ void scheduler_set_unlocks(struct scheduler *s) {
   if ((unlocks = (struct task **)malloc(sizeof(struct task *) *
                                         s->size_unlocks)) == NULL)
     error("Failed to allocate temporary unlocks array.");
+  int *volatile indices;
+  if ((indices = (int *volatile)malloc(sizeof(int) * s->size_unlocks)) == NULL)
+    error("Failed to allocate temporary indices array.");
   for (int k = 0; k < s->nr_unlocks; k++) {
     const int ind = s->unlock_ind[k];
     unlocks[offsets[ind]] = s->unlocks[k];
+    indices[offsets[ind]] = ind;
     offsets[ind] += 1;
   }
 
   /* Swap the unlocks. */
   free(s->unlocks);
   s->unlocks = unlocks;
+  free(s->unlock_ind);
+  s->unlock_ind = indices;
 
   /* Re-set the offsets. */
   offsets[0] = 0;
@@ -873,6 +890,7 @@ void scheduler_set_unlocks(struct scheduler *s) {
     t->unlock_tasks = &s->unlocks[offsets[k]];
   }
 
+
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify that there are no duplicate unlocks. */
   for (int k = 0; k < s->nr_tasks; k++) {
@@ -1105,7 +1123,6 @@ void scheduler_rewait_mapper(void *map_data, int num_elements,
 
   for (int ind = 0; ind < num_elements; ind++) {
     struct task *t = &s->tasks[tid[ind]];
-
     /* Ignore skipped tasks. */
 #ifdef WITH_CUDA
     if (t->skip || t->gpu) continue;
@@ -1123,6 +1140,7 @@ void scheduler_rewait_mapper(void *map_data, int num_elements,
             (1 << (8 * sizeof(t->wait) - 1)) - 1);
 #endif
 
+
     /* Sets the waits of the dependances */
     for (int k = 0; k < t->nr_unlock_tasks; k++) {
       struct task *u = t->unlock_tasks[k];
@@ -1417,10 +1435,17 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) {
      they are ready. */
   for (int k = 0; k < t->nr_unlock_tasks; k++) {
     struct task *t2 = t->unlock_tasks[k];
+#ifdef WITH_CUDA
+    if (t2->skip || t2->gpu) continue;
+#else
     if (t2->skip) continue;
-
+#endif
     const int res = atomic_dec(&t2->wait);
+    if(t2->type == task_type_kick2 && t->type != task_type_GPU_mega)
+      message("Unlocked by type not GPU_mega");
     if (res < 1) {
+    if(t->type == task_type_GPU_mega)
+      message("GPU_mega");
       error("Negative wait!");
     } else if (res == 1) {
       scheduler_enqueue(s, t2);
diff --git a/src/scheduler.h b/src/scheduler.h
index a4f9592672f3eb57b0a553481b288905d8634143..dc898e96325ca304897f11f026a0fc4a3f9121ce 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -113,7 +113,11 @@ struct scheduler {
  */
 __attribute__((always_inline)) INLINE static void scheduler_activate(
     struct scheduler *s, struct task *t) {
-  if (atomic_cas(&t->skip, 1, 0)) {
+#ifdef WITH_CUDA
+  if (!t->gpu && atomic_cas(&t->skip, 1, 0)) {
+#else
+  if (atomic_cas(&t->skip, 1, 0)) {
+#endif
     t->wait = 0;
     int ind = atomic_inc(&s->active_count);
     s->tid_active[ind] = t - s->tasks;
diff --git a/src/space.h b/src/space.h
index dbbba714c2b3c9841905b2ba54e4f2d854b820a6..df67f8ef20fd3b484687ce3da5ade5dd7c25d537 100644
--- a/src/space.h
+++ b/src/space.h
@@ -161,6 +161,9 @@ struct space {
   size_t nr_sparts_foreign, size_sparts_foreign;
 
 #endif
+#ifdef WITH_CUDA
+  struct task *GPU_task;
+#endif
 };
 
 /* function prototypes. */
diff --git a/src/task.c b/src/task.c
index 43da1d35680783d977ea743dd4f43c52f0f291bc..b466ffcae35a4e87d8c1c98f4e77a24e119736a3 100644
--- a/src/task.c
+++ b/src/task.c
@@ -152,7 +152,9 @@ __attribute__((always_inline)) INLINE static enum task_actions task_acts_on(
           break;
       }
       break;
-
+    case task_type_GPU_mega:
+      return task_action_part;
+      break;
     case task_type_kick1:
     case task_type_kick2:
     case task_type_timestep:
diff --git a/src/task.h b/src/task.h
index 3b1c640df0d9c4258936143052380040d5fb97e9..13fcc5cb363d69773e37b48d6db8bfdf0aad86fb 100644
--- a/src/task.h
+++ b/src/task.h
@@ -147,10 +147,18 @@ struct task {
 #endif
 
   /*! Number of tasks unlocked by this one */
+#ifdef WITH_CUDA
+  int nr_unlock_tasks;
+#else
   short int nr_unlock_tasks;
+#endif
 
   /*! Number of unsatisfied dependencies */
+#ifdef WITH_CUDA
+  int wait;
+#else
   short int wait;
+#endif
 
   /*! Type of the task */
   enum task_types type;