diff --git a/src/CUDA/Makefile.am b/src/CUDA/Makefile.am index 2440d6c1d5a6541483e8b5ef40ada065e07d67db..8a27092277f24594909507c545dba6e5e1077006 100644 --- a/src/CUDA/Makefile.am +++ b/src/CUDA/Makefile.am @@ -5,7 +5,7 @@ EXTRA_DIST = $(SOURCES_CUDA) $(include_HEADERS) if HAVECUDA AM_CFLAGS = -I.. $(HDF5_CPPFLAGS) -g -CUDA_MYFLAGS = -D_FORCE_INLINES -O0 -g -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -G -ccbin=gcc-4.8 +CUDA_MYFLAGS = -D_FORCE_INLINES -O0 -g -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -G -ccbin=gcc-4.8 -m64 #-dc # Assign a "safe" version number @@ -29,7 +29,8 @@ libswiftCUDA_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ $(LDFLAGS) -o $@ libswiftCUDA_la_SOURCES = $(SOURCES_CUDA) -libswiftCUDA_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) +libswiftCUDA_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la -I../ +libswiftCUDA_la_LIBADD = ../.libs/libswiftsim_cuda.la ../.libs/libswiftsim_mpicuda.la libswiftCUDA_la_LDFLAGS = $(AM_LDFLAGS) libswiftdummy_la_SOURCES = dummy.c @@ -38,12 +39,12 @@ libswiftdummy_la_LDFLAGS = $(AM_LDFLAGS) test_27_cells_SOURCES=test27cells.c test_27_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS) -test_27_cells_LDADD= libswiftCUDA.la ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -lcudart +test_27_cells_LDADD= ../.libs/libswiftsim_cuda.la ../.libs/libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart test_27_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS) test_125_cells_SOURCES=test125cells.c test_125_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS) -test_125_cells_LDADD= libswiftCUDA.la ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart +test_125_cells_LDADD= ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) 
$(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart test_125_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS) diff --git a/src/CUDA/runner_cuda_main.cu b/src/CUDA/runner_cuda_main.cu index 0241d066c914625e06ca157de9ad08f3539523d9..4f1ec53a7ca6a12ba9af8cb3532043199e55fa00 100644 --- a/src/CUDA/runner_cuda_main.cu +++ b/src/CUDA/runner_cuda_main.cu @@ -2055,9 +2055,34 @@ __host__ void create_tasks(struct engine *e) { /* We only create density, ghost and force tasks on the device at current. */ for (i = 0; i < sched->nr_tasks; i++) { - if (is_gpu_task(&sched->tasks[i])) num_gpu_tasks++; + if (is_gpu_task(&sched->tasks[i])) + { + num_gpu_tasks++; + sched->tasks[i].gpu = 1; + } } + /* Create the task to call the GPU kernel */ + struct task *gpu_mega = scheduler_addtask(sched, task_type_GPU_mega, + task_subtype_none, 0, 0, NULL, NULL ); + /* Create a task for the GPU work call on the host */ + /* Loop through the tasks and sort the unlocks... */ + for(i = 0; i < sched->nr_tasks; i++) { + if(!sched->tasks[i].gpu){ + /* Loop through the non-gpu tasks and move the dependency to GPU tasks to the mega.*/ + for(int j = 0; j < sched->tasks[i].nr_unlock_tasks; j++){ + if(sched->tasks[i].unlock_tasks[j]->gpu) + sched->tasks[i].unlock_tasks[j] = gpu_mega; + } + }else{ + for(int j = 0; j < sched->tasks[i].nr_unlock_tasks; j++){ + if(!sched->tasks[i].unlock_tasks[j]->gpu) + scheduler_addunlock(sched, gpu_mega, sched->tasks[i].unlock_tasks[j]); + } + } + } + + scheduler_set_unlocks(sched); /* We also create a load and unload task for every cell in the system */ num_gpu_tasks += s->tot_cells * 2; diff --git a/src/engine.c b/src/engine.c index 9aba0a722fc1ffdcfca56e106288c76ce958823f..f22d5be707c6302df8e1e2140cb0840200536baa 100644 --- a/src/engine.c +++ b/src/engine.c @@ -2665,6 +2665,12 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } } +#ifdef WITH_CUDA + /* If with CUDA we need the mega task*/ + else if(t->type == task_type_GPU_mega){ + scheduler_activate(s, t); + } 
+#endif /* Pair? */ else if (t->type == task_type_pair || t->type == task_type_sub_pair) { diff --git a/src/runner.c b/src/runner.c index bb5c5cb91c7ca0ab0040f1ff637e124b1c673aeb..01489eefcdb3ac7400b2c316df7a674a4604b3bd 100644 --- a/src/runner.c +++ b/src/runner.c @@ -63,6 +63,9 @@ #include "task.h" #include "timers.h" #include "timestep.h" +#ifdef WITH_CUDA +#include "CUDA/runner_cuda_main.h" +#endif /* Import the density loop functions. */ #define FUNCTION density @@ -1996,6 +1999,11 @@ void *runner_main(void *data) { case task_type_sourceterms: runner_do_sourceterms(r, t->ci, 1); break; +#ifdef WITH_CUDA + case task_type_GPU_mega: + run_cuda(); + break; +#endif default: error("Unknown/invalid task type (%d).", t->type); } diff --git a/src/scheduler.c b/src/scheduler.c index 1445adef5824cf96b8ee4155e1f660dce917d31b..ef9432de6b0591ba78a4de1aadff5051e7eb4f97 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1107,7 +1107,11 @@ void scheduler_rewait_mapper(void *map_data, int num_elements, struct task *t = &s->tasks[tid[ind]]; /* Ignore skipped tasks. */ +#ifdef WITH_CUDA + if (t->skip || t->gpu) continue; +#else if (t->skip) continue; +#endif /* Increment the task's own wait counter for the enqueueing. */ atomic_inc(&t->wait); @@ -1262,7 +1266,11 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { int qid = -1; /* Ignore skipped tasks */ +#ifdef WITH_CUDA + if (t->skip || t->gpu) return; +#else if (t->skip) return; +#endif /* If this is an implicit task, just pretend it's done. */ if (t->implicit) { diff --git a/src/scheduler.h b/src/scheduler.h index cdb3a28f1a3482e934480b0c0ccd18f18b66d437..a4f9592672f3eb57b0a553481b288905d8634143 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -121,6 +121,9 @@ __attribute__((always_inline)) INLINE static void scheduler_activate( } /* Function prototypes. 
*/ +struct task *scheduler_addtask(struct scheduler *s, enum task_types type, + enum task_subtypes subtype, int flags, + int implicit, struct cell *ci, struct cell *cj); void scheduler_clear_active(struct scheduler *s); void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks, int nr_queues, unsigned int flags, int nodeID, diff --git a/src/task.h b/src/task.h index 5a134a0bef1e9527992c34bd489806186a832a8b..3b1c640df0d9c4258936143052380040d5fb97e9 100644 --- a/src/task.h +++ b/src/task.h @@ -66,6 +66,7 @@ enum task_types { task_type_grav_down, task_type_cooling, task_type_sourceterms, + task_type_GPU_mega, task_type_count } __attribute__((packed)); @@ -182,6 +183,8 @@ struct task { #ifdef WITH_CUDA /* Index of the CUDA task in initial array. */ int cuda_task; + /* Is the task a GPU task (and therefore skipped on the CPU) */ + char gpu; #endif } SWIFT_STRUCT_ALIGN;