diff --git a/src/CUDA/Makefile.am b/src/CUDA/Makefile.am
index 2440d6c1d5a6541483e8b5ef40ada065e07d67db..8a27092277f24594909507c545dba6e5e1077006 100644
--- a/src/CUDA/Makefile.am
+++ b/src/CUDA/Makefile.am
@@ -5,7 +5,7 @@ EXTRA_DIST = $(SOURCES_CUDA) $(include_HEADERS)
 if HAVECUDA
 
 AM_CFLAGS = -I.. $(HDF5_CPPFLAGS) -g
-CUDA_MYFLAGS = -D_FORCE_INLINES -O0 -g -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -G -ccbin=gcc-4.8
+CUDA_MYFLAGS = -D_FORCE_INLINES -O0 -g -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -G -ccbin=gcc-4.8 -m64
 #-dc
 
 # Assign a "safe" version number
@@ -29,7 +29,8 @@ libswiftCUDA_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
         $(LDFLAGS) -o $@
 
 libswiftCUDA_la_SOURCES = $(SOURCES_CUDA)
-libswiftCUDA_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS)
+libswiftCUDA_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) -I../
+libswiftCUDA_la_LIBADD = ../.libs/libswiftsim_cuda.la ../.libs/libswiftsim_mpicuda.la
 libswiftCUDA_la_LDFLAGS = $(AM_LDFLAGS)
 
 libswiftdummy_la_SOURCES = dummy.c
@@ -38,12 +39,12 @@ libswiftdummy_la_LDFLAGS = $(AM_LDFLAGS)
 
 test_27_cells_SOURCES=test27cells.c
 test_27_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS) 
-test_27_cells_LDADD= libswiftCUDA.la ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -lcudart
+test_27_cells_LDADD= ../.libs/libswiftsim_cuda.la ../.libs/libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart
 test_27_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS)
 
 test_125_cells_SOURCES=test125cells.c
 test_125_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS)
-test_125_cells_LDADD= libswiftCUDA.la ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart
+test_125_cells_LDADD= ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart
 test_125_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS) 
 
 
diff --git a/src/CUDA/runner_cuda_main.cu b/src/CUDA/runner_cuda_main.cu
index 0241d066c914625e06ca157de9ad08f3539523d9..4f1ec53a7ca6a12ba9af8cb3532043199e55fa00 100644
--- a/src/CUDA/runner_cuda_main.cu
+++ b/src/CUDA/runner_cuda_main.cu
@@ -2055,9 +2055,37 @@ __host__ void create_tasks(struct engine *e) {
 
   /* We only create density, ghost and force tasks on the device at current. */
   for (i = 0; i < sched->nr_tasks; i++) {
-    if (is_gpu_task(&sched->tasks[i])) num_gpu_tasks++;
+    /* Flag GPU tasks so that the CPU side knows to skip them. */
+    if (is_gpu_task(&sched->tasks[i])) {
+      num_gpu_tasks++;
+      sched->tasks[i].gpu = 1;
+    }
   }
 
+  /* Create a single "mega" task that launches the GPU work from the host. */
+  struct task *gpu_mega = scheduler_addtask(sched, task_type_GPU_mega,
+                                            task_subtype_none, 0, 0, NULL, NULL);
+
+  /* Loop over the tasks and rewire their unlocks around the mega task. */
+  for (i = 0; i < sched->nr_tasks; i++) {
+    if (!sched->tasks[i].gpu) {
+      /* Non-GPU task: redirect any dependency on a GPU task to the mega task. */
+      for (int j = 0; j < sched->tasks[i].nr_unlock_tasks; j++) {
+        if (sched->tasks[i].unlock_tasks[j]->gpu)
+          sched->tasks[i].unlock_tasks[j] = gpu_mega;
+      }
+    } else {
+      /* GPU task: its CPU successors are unlocked by the mega task instead. */
+      for (int j = 0; j < sched->tasks[i].nr_unlock_tasks; j++) {
+        if (!sched->tasks[i].unlock_tasks[j]->gpu)
+          scheduler_addunlock(sched, gpu_mega, sched->tasks[i].unlock_tasks[j]);
+      }
+    }
+  }
+
+  /* Rebuild the unlock arrays now that the dependency graph has changed. */
+  scheduler_set_unlocks(sched);
+
   /* We also create a load and unload task for every cell in the system */
   num_gpu_tasks += s->tot_cells * 2;
 
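Taken together, the hunk above funnels every dependency that crosses the CPU/GPU boundary through the single gpu_mega task: a CPU task that used to unlock a GPU task now unlocks the mega task, and every CPU successor of a GPU task is unlocked by the mega task instead, so the CPU-side graph never points at a GPU task directly. The self-contained toy sketch below (not SWIFT code; toy_task, the task names and MAX_UNLOCKS are invented for illustration) performs the same graph surgery on a four-task example:

    /* Toy illustration only: redirect dependencies that cross the CPU/GPU
     * boundary through a single "mega" task. */
    #include <stdio.h>

    #define MAX_UNLOCKS 4

    struct toy_task {
      const char *name;
      int gpu; /* 1 if the task runs on the GPU */
      int nr_unlock_tasks;
      struct toy_task *unlock_tasks[MAX_UNLOCKS];
    };

    int main(void) {
      struct toy_task density = {"density (GPU)", 1, 0, {0}};
      struct toy_task force = {"force (GPU)", 1, 0, {0}};
      struct toy_task sort = {"sort (CPU)", 0, 1, {&density}};
      struct toy_task kick = {"kick (CPU)", 0, 0, {0}};
      struct toy_task mega = {"GPU mega", 0, 0, {0}};

      density.nr_unlock_tasks = 1;
      density.unlock_tasks[0] = &force;
      force.nr_unlock_tasks = 1;
      force.unlock_tasks[0] = &kick;

      struct toy_task *tasks[] = {&density, &force, &sort, &kick};
      const int nr_tasks = 4;

      for (int i = 0; i < nr_tasks; i++) {
        if (!tasks[i]->gpu) {
          /* CPU task unlocking a GPU task: point the edge at the mega task. */
          for (int j = 0; j < tasks[i]->nr_unlock_tasks; j++)
            if (tasks[i]->unlock_tasks[j]->gpu) tasks[i]->unlock_tasks[j] = &mega;
        } else {
          /* GPU task unlocking a CPU task: the mega task takes over that edge. */
          for (int j = 0; j < tasks[i]->nr_unlock_tasks; j++)
            if (!tasks[i]->unlock_tasks[j]->gpu)
              mega.unlock_tasks[mega.nr_unlock_tasks++] = tasks[i]->unlock_tasks[j];
        }
      }

      /* The CPU-side graph now only ever sees sort -> mega -> kick. */
      printf("%s unlocks %s\n", sort.name, sort.unlock_tasks[0]->name);
      for (int j = 0; j < mega.nr_unlock_tasks; j++)
        printf("%s unlocks %s\n", mega.name, mega.unlock_tasks[j]->name);
      return 0;
    }

Compiled on its own, the sketch prints the two surviving CPU-visible edges, sort -> mega and mega -> kick. In the real code the new mega edges are registered with scheduler_addunlock(), which is why scheduler_set_unlocks() has to be called again once the loop has finished.
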
diff --git a/src/engine.c b/src/engine.c
index 9aba0a722fc1ffdcfca56e106288c76ce958823f..f22d5be707c6302df8e1e2140cb0840200536baa 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -2665,6 +2665,12 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
       }
     }
 
+#ifdef WITH_CUDA
+    /* With CUDA we also need to activate the mega task. */
+    else if (t->type == task_type_GPU_mega) {
+      scheduler_activate(s, t);
+    }
+#endif
     /* Pair? */
     else if (t->type == task_type_pair || t->type == task_type_sub_pair) {
 
diff --git a/src/runner.c b/src/runner.c
index bb5c5cb91c7ca0ab0040f1ff637e124b1c673aeb..01489eefcdb3ac7400b2c316df7a674a4604b3bd 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -63,6 +63,9 @@
 #include "task.h"
 #include "timers.h"
 #include "timestep.h"
+#ifdef WITH_CUDA
+#include "CUDA/runner_cuda_main.h"
+#endif
 
 /* Import the density loop functions. */
 #define FUNCTION density
@@ -1996,6 +1999,11 @@ void *runner_main(void *data) {
         case task_type_sourceterms:
           runner_do_sourceterms(r, t->ci, 1);
           break;
+#ifdef WITH_CUDA
+        case task_type_GPU_mega:
+          run_cuda();
+          break;
+#endif
         default:
           error("Unknown/invalid task type (%d).", t->type);
       }
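On the CPU side the entire GPU workload thus appears as one extra case in runner_main(): the runner thread that grabs the mega task calls run_cuda(), declared in CUDA/runner_cuda_main.h, and for the rewired dependencies above to be safe that call presumably does not return until the device work has finished. The real run_cuda() lives in runner_cuda_main.cu and is not shown in this diff; the sketch below is only a hypothetical illustration of the usual shape of such a blocking host entry point (the kernel, run_cuda_sketch and the toy workload are made up):

    /* Hypothetical sketch only: not the run_cuda() implementation. */
    #include <cuda_runtime.h>
    #include <stdio.h>

    __global__ void do_gpu_work(float *data, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= 2.0f; /* stand-in for the real density/force loops */
    }

    extern "C" void run_cuda_sketch(float *host_data, int n) {
      float *d_data = NULL;
      cudaMalloc(&d_data, n * sizeof(float));
      cudaMemcpy(d_data, host_data, n * sizeof(float), cudaMemcpyHostToDevice);

      /* One blocking round trip: the runner thread that picked up the mega
       * task waits here until all of the device work has drained. */
      do_gpu_work<<<(n + 255) / 256, 256>>>(d_data, n);
      cudaError_t err = cudaDeviceSynchronize();
      if (err != cudaSuccess)
        printf("CUDA error: %s\n", cudaGetErrorString(err));

      cudaMemcpy(host_data, d_data, n * sizeof(float), cudaMemcpyDeviceToHost);
      cudaFree(d_data);
    }

The important property is the synchronisation before returning: because the call blocks, the mega task's unlocks fire only after all GPU work is done, which is what allows it to own every GPU-to-CPU dependency.
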
diff --git a/src/scheduler.c b/src/scheduler.c
index 1445adef5824cf96b8ee4155e1f660dce917d31b..ef9432de6b0591ba78a4de1aadff5051e7eb4f97 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -1107,7 +1107,11 @@ void scheduler_rewait_mapper(void *map_data, int num_elements,
     struct task *t = &s->tasks[tid[ind]];
 
     /* Ignore skipped tasks. */
+#ifdef WITH_CUDA
+    if (t->skip || t->gpu) continue;
+#else
     if (t->skip) continue;
+#endif
 
     /* Increment the task's own wait counter for the enqueueing. */
     atomic_inc(&t->wait);
@@ -1262,7 +1266,11 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
   int qid = -1;
 
   /* Ignore skipped tasks */
+#ifdef WITH_CUDA
+  if (t->skip || t->gpu) return;
+#else
   if (t->skip) return;
+#endif
 
   /* If this is an implicit task, just pretend it's done. */
   if (t->implicit) {
diff --git a/src/scheduler.h b/src/scheduler.h
index cdb3a28f1a3482e934480b0c0ccd18f18b66d437..a4f9592672f3eb57b0a553481b288905d8634143 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -121,6 +121,9 @@ __attribute__((always_inline)) INLINE static void scheduler_activate(
 }
 
 /* Function prototypes. */
+struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
+                               enum task_subtypes subtype, int flags,
+                               int implicit, struct cell *ci, struct cell *cj);
 void scheduler_clear_active(struct scheduler *s);
 void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
                     int nr_queues, unsigned int flags, int nodeID,
diff --git a/src/task.h b/src/task.h
index 5a134a0bef1e9527992c34bd489806186a832a8b..3b1c640df0d9c4258936143052380040d5fb97e9 100644
--- a/src/task.h
+++ b/src/task.h
@@ -66,6 +66,7 @@ enum task_types {
   task_type_grav_down,
   task_type_cooling,
   task_type_sourceterms,
+  task_type_GPU_mega,
   task_type_count
 } __attribute__((packed));
 
@@ -182,6 +183,8 @@ struct task {
 #ifdef WITH_CUDA
 /* Index of the CUDA task in initial array. */
   int cuda_task;
+  /* Is this a GPU task (and therefore skipped by the CPU scheduler)? */
+  char gpu;
 #endif
 
 } SWIFT_STRUCT_ALIGN;
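
One follow-up the diff does not show: task_type_GPU_mega is inserted just before task_type_count, so any table indexed by the task type needs a matching entry. SWIFT keeps a taskID_names string array in src/task.c for debug output; that file is not touched here, so treating it as needing the extra entry is an assumption about the surrounding code. The toy example below only illustrates the sync requirement between an enum and a name table:

    /* Self-contained illustration (not SWIFT source): a name table indexed by
     * a task-type enum must gain an entry for every new enum value, otherwise
     * the names shift relative to the values. */
    #include <stdio.h>

    enum toy_task_types { toy_type_sourceterms, toy_type_GPU_mega, toy_type_count };

    static const char *toy_task_names[toy_type_count] = {"sourceterms", "GPU_mega"};

    int main(void) {
      /* With the table kept in sync, the new type prints its own name. */
      printf("%d -> %s\n", toy_type_GPU_mega, toy_task_names[toy_type_GPU_mega]);
      return 0;
    }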