From 7bedebfcc77a826b2b1aecdee322876d19b3dff4 Mon Sep 17 00:00:00 2001
From: d74ksy <aidan.chalk@durham.ac.uk>
Date: Thu, 17 Aug 2017 15:42:14 +0100
Subject: [PATCH] If we can compile, we can maybe run a GPU version now...
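
Flag device-capable tasks with a new gpu field, add a single
task_type_GPU_mega task that launches the CUDA work via run_cuda(), and
rewire the task unlocks so that CPU tasks depend on the mega task rather
than on individual GPU tasks, while the mega task takes over the GPU
tasks' unlocks of CPU tasks.  GPU-flagged tasks are skipped when
rewaiting and enqueueing, and engine_marktasks activates the mega task
when running with CUDA.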

---
 src/CUDA/Makefile.am         |  9 +++++----
 src/CUDA/runner_cuda_main.cu | 27 ++++++++++++++++++++++++++-
 src/engine.c                 |  6 ++++++
 src/runner.c                 |  8 ++++++++
 src/scheduler.c              |  8 ++++++++
 src/scheduler.h              |  3 +++
 src/task.h                   |  3 +++
 7 files changed, 59 insertions(+), 5 deletions(-)
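
Note (this text sits after the "---" marker and is not applied by git am):
the core of the patch is the unlock rewiring in create_tasks().  The sketch
below is a minimal, standalone model of that rewiring; toy_task, rewire,
cpu_A, gpu_B, cpu_C and GPU_mega are illustrative stand-ins, not SWIFT
scheduler structures.

/* Standalone sketch of the unlock-rewiring idea.  Names are illustrative. */
#include <stdio.h>

#define TOY_MAX_UNLOCKS 4

struct toy_task {
  const char *name;
  int gpu;                                   /* 1 if the task runs on the GPU. */
  struct toy_task *unlocks[TOY_MAX_UNLOCKS]; /* Tasks this one unlocks. */
  int nr_unlocks;
};

/* Mirror of the rewiring loop: CPU tasks that unlocked a GPU task now unlock
   the mega task instead, and the mega task takes over the GPU tasks' unlocks
   of CPU tasks. */
static void rewire(struct toy_task **tasks, int n, struct toy_task *mega) {
  for (int i = 0; i < n; i++) {
    if (!tasks[i]->gpu) {
      for (int j = 0; j < tasks[i]->nr_unlocks; j++)
        if (tasks[i]->unlocks[j]->gpu) tasks[i]->unlocks[j] = mega;
    } else {
      for (int j = 0; j < tasks[i]->nr_unlocks; j++)
        if (!tasks[i]->unlocks[j]->gpu)
          mega->unlocks[mega->nr_unlocks++] = tasks[i]->unlocks[j];
    }
  }
}

int main(void) {
  struct toy_task cpu_A = {"cpu_A", 0, {NULL}, 0};
  struct toy_task gpu_B = {"gpu_B", 1, {NULL}, 0};
  struct toy_task cpu_C = {"cpu_C", 0, {NULL}, 0};
  struct toy_task mega = {"GPU_mega", 1, {NULL}, 0};

  /* Original dependency chain: cpu_A -> gpu_B -> cpu_C. */
  cpu_A.unlocks[cpu_A.nr_unlocks++] = &gpu_B;
  gpu_B.unlocks[gpu_B.nr_unlocks++] = &cpu_C;

  struct toy_task *tasks[] = {&cpu_A, &gpu_B, &cpu_C};
  rewire(tasks, 3, &mega);

  /* After rewiring: cpu_A -> GPU_mega and GPU_mega -> cpu_C; gpu_B keeps its
     own unlock list but would be skipped by the CPU scheduler. */
  for (int i = 0; i < 3; i++)
    for (int j = 0; j < tasks[i]->nr_unlocks; j++)
      printf("%s -> %s\n", tasks[i]->name, tasks[i]->unlocks[j]->name);
  for (int j = 0; j < mega.nr_unlocks; j++)
    printf("%s -> %s\n", mega.name, mega.unlocks[j]->name);
  return 0;
}

For the chain cpu_A -> gpu_B -> cpu_C this prints cpu_A -> GPU_mega,
gpu_B -> cpu_C and GPU_mega -> cpu_C, matching what the loop in
create_tasks() does to the real unlock lists.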

diff --git a/src/CUDA/Makefile.am b/src/CUDA/Makefile.am
index 2440d6c1d5..8a27092277 100644
--- a/src/CUDA/Makefile.am
+++ b/src/CUDA/Makefile.am
@@ -5,7 +5,7 @@ EXTRA_DIST = $(SOURCES_CUDA) $(include_HEADERS)
 if HAVECUDA
 
 AM_CFLAGS = -I.. $(HDF5_CPPFLAGS) -g
-CUDA_MYFLAGS = -D_FORCE_INLINES -O0 -g -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -G -ccbin=gcc-4.8
+CUDA_MYFLAGS = -D_FORCE_INLINES -O0 -g -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -G -ccbin=gcc-4.8 -m64
 #-dc
 
 # Assign a "safe" version number
@@ -29,7 +29,8 @@ libswiftCUDA_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
         $(LDFLAGS) -o $@
 
 libswiftCUDA_la_SOURCES = $(SOURCES_CUDA)
-libswiftCUDA_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS)
+libswiftCUDA_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) -I../
+libswiftCUDA_la_LIBADD = ../.libs/libswiftsim_cuda.la ../.libs/libswiftsim_mpicuda.la
 libswiftCUDA_la_LDFLAGS = $(AM_LDFLAGS)
 
 libswiftdummy_la_SOURCES = dummy.c
@@ -38,12 +39,12 @@ libswiftdummy_la_LDFLAGS = $(AM_LDFLAGS)
 
 test_27_cells_SOURCES=test27cells.c
 test_27_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS) 
-test_27_cells_LDADD= libswiftCUDA.la ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -lcudart
+test_27_cells_LDADD= ../.libs/libswiftsim_cuda.la ../.libs/libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart
 test_27_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS)
 
 test_125_cells_SOURCES=test125cells.c
 test_125_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS)
-test_125_cells_LDADD= libswiftCUDA.la ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart
+test_125_cells_LDADD= ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart
 test_125_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS) 
 
 
diff --git a/src/CUDA/runner_cuda_main.cu b/src/CUDA/runner_cuda_main.cu
index 0241d066c9..4f1ec53a7c 100644
--- a/src/CUDA/runner_cuda_main.cu
+++ b/src/CUDA/runner_cuda_main.cu
@@ -2055,9 +2055,34 @@ __host__ void create_tasks(struct engine *e) {
 
   /* We only create density, ghost and force tasks on the device at current. */
   for (i = 0; i < sched->nr_tasks; i++) {
-    if (is_gpu_task(&sched->tasks[i])) num_gpu_tasks++;
+    if (is_gpu_task(&sched->tasks[i])) {
+      num_gpu_tasks++;
+      /* Mark the task so the CPU-side scheduler can skip it. */
+      sched->tasks[i].gpu = 1;
+    }
   }
 
+  /* Create the single host-side task that launches the GPU kernel. */
+  struct task *gpu_mega = scheduler_addtask(sched, task_type_GPU_mega,
+                                            task_subtype_none, 0, 0, NULL, NULL);
+  /* Loop over the tasks and rewire the unlocks around the mega task. */
+  for (i = 0; i < sched->nr_tasks; i++) {
+    if (!sched->tasks[i].gpu) {
+      /* Non-GPU task: redirect any unlock of a GPU task to the mega task. */
+      for (int j = 0; j < sched->tasks[i].nr_unlock_tasks; j++) {
+        if (sched->tasks[i].unlock_tasks[j]->gpu)
+          sched->tasks[i].unlock_tasks[j] = gpu_mega;
+      }
+    } else {
+      /* GPU task: the mega task takes over its unlocks of CPU tasks. */
+      for (int j = 0; j < sched->tasks[i].nr_unlock_tasks; j++) {
+        if (!sched->tasks[i].unlock_tasks[j]->gpu)
+          scheduler_addunlock(sched, gpu_mega, sched->tasks[i].unlock_tasks[j]);
+      }
+    }
+  }
+
+  scheduler_set_unlocks(sched);
   /* We also create a load and unload task for every cell in the system */
   num_gpu_tasks += s->tot_cells * 2;
 
diff --git a/src/engine.c b/src/engine.c
index 9aba0a722f..f22d5be707 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -2665,6 +2665,12 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
       }
     }
 
+#ifdef WITH_CUDA
+    /* With CUDA, the GPU mega task also needs to be activated. */
+    else if (t->type == task_type_GPU_mega) {
+      scheduler_activate(s, t);
+    }
+#endif
     /* Pair? */
     else if (t->type == task_type_pair || t->type == task_type_sub_pair) {
 
diff --git a/src/runner.c b/src/runner.c
index bb5c5cb91c..01489eefcd 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -63,6 +63,9 @@
 #include "task.h"
 #include "timers.h"
 #include "timestep.h"
+#ifdef WITH_CUDA
+#include "CUDA/runner_cuda_main.h"
+#endif
 
 /* Import the density loop functions. */
 #define FUNCTION density
@@ -1996,6 +1999,11 @@ void *runner_main(void *data) {
         case task_type_sourceterms:
           runner_do_sourceterms(r, t->ci, 1);
           break;
+#ifdef WITH_CUDA
+        case task_type_GPU_mega:
+          run_cuda();
+          break;
+#endif
         default:
           error("Unknown/invalid task type (%d).", t->type);
       }
diff --git a/src/scheduler.c b/src/scheduler.c
index 1445adef58..ef9432de6b 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -1107,7 +1107,11 @@ void scheduler_rewait_mapper(void *map_data, int num_elements,
     struct task *t = &s->tasks[tid[ind]];
 
     /* Ignore skipped tasks. */
+#ifdef WITH_CUDA
+    if (t->skip || t->gpu) continue;
+#else
     if (t->skip) continue;
+#endif
 
     /* Increment the task's own wait counter for the enqueueing. */
     atomic_inc(&t->wait);
@@ -1262,7 +1266,11 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
   int qid = -1;
 
   /* Ignore skipped tasks */
+#ifdef WITH_CUDA
+  if (t->skip || t->gpu) return;
+#else
   if (t->skip) return;
+#endif
 
   /* If this is an implicit task, just pretend it's done. */
   if (t->implicit) {
diff --git a/src/scheduler.h b/src/scheduler.h
index cdb3a28f1a..a4f9592672 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -121,6 +121,9 @@ __attribute__((always_inline)) INLINE static void scheduler_activate(
 }
 
 /* Function prototypes. */
+struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
+                               enum task_subtypes subtype, int flags,
+                               int implicit, struct cell *ci, struct cell *cj);
 void scheduler_clear_active(struct scheduler *s);
 void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
                     int nr_queues, unsigned int flags, int nodeID,
diff --git a/src/task.h b/src/task.h
index 5a134a0bef..3b1c640df0 100644
--- a/src/task.h
+++ b/src/task.h
@@ -66,6 +66,7 @@ enum task_types {
   task_type_grav_down,
   task_type_cooling,
   task_type_sourceterms,
+  task_type_GPU_mega,
   task_type_count
 } __attribute__((packed));
 
@@ -182,6 +183,8 @@ struct task {
 #ifdef WITH_CUDA
 /* Index of the CUDA task in initial array. */
   int cuda_task;
+  /* Is this a GPU task (and therefore skipped by the CPU scheduler)? */
+  char gpu;
 #endif
 
 } SWIFT_STRUCT_ALIGN;
-- 
GitLab