diff --git a/configure.ac b/configure.ac
index 929e21900933b64dc5349933aaa959b4a5de0ec2..13612d1c3bdb9e1792e60c3a289374d1c0ebf54b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -424,6 +424,7 @@ if test "x$with_cuda" != "xno"; then
    if test "x$have_cuda" != "xno"; then
       AC_DEFINE([HAVE_CUDA], 1, [The CUDA compiler is installed.])
    fi
+   CFLAGS="${CFLAGS} -m64"
 fi
 AC_SUBST(CUDA_CFLAGS)
 AC_SUBST(CUDA_LIBS)
diff --git a/src/CUDA/Makefile.am b/src/CUDA/Makefile.am
index 8a27092277f24594909507c545dba6e5e1077006..59cff8bce9cc22c7134f231f2ed47aa8c867d23c 100644
--- a/src/CUDA/Makefile.am
+++ b/src/CUDA/Makefile.am
@@ -5,7 +5,7 @@ EXTRA_DIST = $(SOURCES_CUDA) $(include_HEADERS)
 if HAVECUDA

 AM_CFLAGS = -I.. $(HDF5_CPPFLAGS) -g
-CUDA_MYFLAGS = -D_FORCE_INLINES -O0 -g -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -G -ccbin=gcc-4.8 -m64
+CUDA_MYFLAGS = -D_FORCE_INLINES -O3 -g -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -ccbin=gcc-4.8 -m64
 #-dc

 # Assign a "safe" version number
diff --git a/src/CUDA/runner_cuda_main.cu b/src/CUDA/runner_cuda_main.cu
index 3de932bc109fc4f2eea39f802ac2904272226ba0..7ec692675b44dfcf031434f837bc08f9280d57da 100644
--- a/src/CUDA/runner_cuda_main.cu
+++ b/src/CUDA/runner_cuda_main.cu
@@ -437,7 +437,6 @@ __device__ void doself_density(struct cell_cuda *ci) {

     /* If the particle isn't active skip it. */
     if (!cuda_part_is_active(pid)) {
-      printf("Particle %i isn't active\n", pid);
       continue;
     }

@@ -1331,7 +1330,6 @@ __device__ void do_ghost(struct cell_cuda *c) {
       }
       /* Did we get the right number of neighbours? */
       if (fabsf(h_new - h_old) > eps * h_old) {
-
         cuda_parts.h[i] = h_new;

         // If below absolute max try again
@@ -1341,7 +1339,6 @@ __device__ void do_ghost(struct cell_cuda *c) {
       } else {
         /* The particle is a lost cause */
         cuda_parts.h[i] = hydro_h_max;
-
         /* Do some damage control if no neighbours were found */
         if (cuda_parts.wcount[i] == cuda_kernel_root * kernel_norm)
           cuda_hydro_part_has_no_neighbours(i);
@@ -1464,8 +1461,11 @@ __global__ void swift_device_kernel() {
         doself_force(ci);
       }
     } else if (type == task_type_ghost) {
-      struct cell_cuda *ci = &cells_cuda[tasks[tid].ci];
-      do_ghost(ci);
+      if(! tasks[tid].implicit){
+        struct cell_cuda *ci = &cells_cuda[tasks[tid].ci];
+
+        do_ghost(ci);
+      }
     }

     __syncthreads();
@@ -1893,7 +1893,7 @@ __host__ void update_tasks(struct engine *e) {
                                    sizeof(struct queue_cuda)));
   int *data = (int *)malloc(sizeof(int) * qsize);

-  int nr_unload;
+  int nr_unload=0;
   unload_host.count = 0;
   for (int i = 0; i < nr_gpu_tasks; i++) {
     if (host_tasks[i].type <= type_unload &&
@@ -2050,6 +2050,7 @@ __host__ int is_gpu_task(struct task *t) {
   for (int i = 0; i < num_gpu_types; i++) {
     if (t->type == gpu_work_task_array[i]) result = 1;
   }
+  if(t->subtype != task_subtype_none && t->subtype != task_subtype_density && t->subtype != task_subtype_force) result = 0;
   return result;
 }
@@ -2079,9 +2080,12 @@ __host__ void create_tasks(struct engine *e) {
   /* Create the task to call the GPU kernel */
   struct task *gpu_mega = scheduler_addtask(sched, task_type_GPU_mega,
                                             task_subtype_none, 0, 0, NULL, NULL );
+  e->s->GPU_task = gpu_mega;

   /* Create a task for the GPU work call on the host */

   /* Loop through tke tasks and sort the unlocks... */
   for(i = 0; i < sched->nr_tasks; i++) {
+    if(&sched->tasks[i] == gpu_mega)
+      continue;
     if(!sched->tasks[i].gpu){
       /* Loop through the non-gpu tasks and move the dependency to GPU tasks to the mega.*/
       for(int j = 0; j < sched->tasks[i].nr_unlock_tasks; j++){
@@ -2095,7 +2099,7 @@ __host__ void create_tasks(struct engine *e) {
       }
     }
   }
-
+//  gpu_mega->skip = 0;
   scheduler_set_unlocks(sched);
   /* We also create a load and unload task for every cell in the system */
   num_gpu_tasks += s->tot_cells * 2;
@@ -2114,7 +2118,6 @@ __host__ void create_tasks(struct engine *e) {
   k = 0;
   /* Create the tasks. */
   for (i = 0; i < sched->nr_tasks; i++) {
-
     if (is_gpu_task(&sched->tasks[i])) {
       /* Copy the data to the CUDA task. */
       struct task *t = &sched->tasks[i];
@@ -2175,6 +2178,11 @@ __host__ void create_tasks(struct engine *e) {
         }
       }

+      /* Ghost tasks also unlock the unload tasks. */
+      if(t->type == task_type_ghost)
+        deps++;
+
+
       /* Allocate some CPU memory for the unlocks. */
       t->unlocks = (int *)malloc(sizeof(int) * deps);
       t->size_unlocks = deps;
@@ -2193,6 +2201,9 @@ __host__ void create_tasks(struct engine *e) {
           t->unlocks[t->nr_unlock_tasks++] = t->task->cj->unload_task;
         }
       }
+      if(t->type == task_type_ghost) {
+        t->unlocks[t->nr_unlock_tasks++] = t->task->ci->unload_task;
+      }

       /* If it is a density task then it is unlocked by the load task. */
       if (t->subtype == task_subtype_density) {
@@ -2650,7 +2661,6 @@
 }

 __host__ void run_cuda() {
-  printf("running cuda\n");
   swift_device_kernel << <num_blocks, num_cuda_threads>>> ();
   cudaErrCheck(cudaDeviceSynchronize());
 }
@@ -3029,6 +3039,7 @@ __host__ void free_cell(void *cell) { cudaErrCheck(cudaFreeHost(cell)); }
 __host__ void printTaskTimers(){
 #ifdef CUDA_TASK_TIMERS
   /* Load the task structures back from the GPU */
+  int nr_gpu_tasks, nr_tasks;
   cudaErrCheck(cudaMemcpyFromSymbol(&nr_gpu_tasks, cuda_numtasks, sizeof(int)));
   cudaErrCheck(cudaMemcpyFromSymbol(&nr_tasks, tot_num_tasks, sizeof(int)));
   struct task_cuda *gpu_pointer = NULL;
@@ -3041,11 +3052,11 @@ __host__ void printTaskTimers(){
                            sizeof(struct task_cuda ) * nr_gpu_tasks,
                            cudaMemcpyDeviceToHost));

   FILE *file = fopen("CUDA_TASKTIMERS", "w");
-  for(int i = 0; i < ...; i++){
+  for(int i = 0; i < nr_gpu_tasks; i++){
     struct task_cuda *t = &host_tasks[i];
     fprintf(file, "%i %i %i %lli %lli\n",t->type, t->subtype, t->blockID, t->tic, t->toc);
   }
-
+  fclose(file);
   free(host_tasks);
 #else
   error("CUDA Task Timers unavailable, compile with CUDA_TASK_TIMERS defined.");
diff --git a/src/CUDA/test125cells.c b/src/CUDA/test125cells.c
index 154a5f2bafaa0bbbc4959af099aeed15cf1298a6..9abe414c6ee9ae6b98f045aa3b3d9a3d2f0d01a1 100644
--- a/src/CUDA/test125cells.c
+++ b/src/CUDA/test125cells.c
@@ -843,8 +843,8 @@ int main(int argc, char *argv[]) {
   dump_particle_fields(outputFileName, main_cell, solution, 0);

   /* Clean things to make the sanitizer happy ... */
-  for (int i = 0; i < 125; ++i) clean_up(cells[i]);
-  free(solution);
+//  for (int i = 0; i < 125; ++i) clean_up(cells[i]);
+//  free(solution);

   return 0;
 }
diff --git a/src/engine.c b/src/engine.c
index f22d5be707c6302df8e1e2140cb0840200536baa..f889ab3062d66cfdd13c533b21d497e76d00f8f3 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -2908,7 +2908,11 @@ void engine_print_task_counts(struct engine *e) {
   int counts[task_type_count + 1];
   for (int k = 0; k <= task_type_count; k++) counts[k] = 0;
   for (int k = 0; k < nr_tasks; k++) {
+#ifdef WITH_CUDA
+    if (tasks[k].skip || tasks[k].gpu)
+#else
     if (tasks[k].skip)
+#endif
       counts[task_type_count] += 1;
     else
       counts[(int)tasks[k].type] += 1;
@@ -2945,7 +2949,6 @@ void engine_rebuild(struct engine *e, int clean_h_values) {

   const ticks tic = getticks();

-  message("We got to a rebuild");
   /* Clear the forcerebuild flag, whatever it was. */
   e->forcerebuild = 0;
@@ -2963,6 +2966,9 @@ void engine_rebuild(struct engine *e, int clean_h_values) {
   /* Re-build the tasks. */
   engine_maketasks(e);
+#ifdef WITH_CUDA
+  create_tasks(e);
+#endif

   /* Run through the tasks and mark as skip or not. */
   if (engine_marktasks(e))
     error("engine_marktasks failed after space_rebuild.");
@@ -3011,11 +3017,6 @@ void engine_prepare(struct engine *e) {

   if (e->forcerebuild){
     engine_rebuild(e, 0);
-#ifdef WITH_CUDA
-    message("Creating CUDA tasks\n");
-    printf(" s = {dim= {%f, %f, %f}, periodic = %i, hs = {<No data fields>}, gravity = %i, width = {%f, %f, %f}, iwidth = {%f, %f, %f}, cell_min = %e, dx_max = %e, cdim = {%i, %i, %i,}, maxdepth = %i, nr_cells = %i, tot_cells = %i\n", e->s->dim[0], e->s->dim[1], e->s->dim[2], e->s->periodic, e->s->gravity, e->s->width[0], e->s->width[1], e->s->width[2], e->s->iwidth[0], e->s->iwidth[1], e->s->iwidth[2], e->s->cell_min, e->s->dx_max, e->s->cdim[0], e->s->cdim[1], e->s->cdim[2], e->s->maxdepth, e->s->nr_cells, e->s->tot_cells);
-    create_tasks(e);
-#endif
   }

   /* Unskip active tasks and check for rebuild */
@@ -3385,14 +3386,14 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs,
     hydro_init_part(&s->parts[k], &e->s->hs);
   for (size_t k = 0; k < s->nr_gparts; k++) gravity_init_gpart(&s->gparts[k]);

+#ifdef WITH_CUDA
+  update_tasks(e);
+#endif
   /* Now, launch the calculation */
   TIMER_TIC;
   engine_launch(e);
   TIMER_TOC(timer_runners);

-#ifdef WITH_CUDA
-//  run_cuda();
-#endif
   /* Apply some conversions (e.g. internal energy -> entropy) */
   if (!flag_entropy_ICs) {
@@ -3448,6 +3449,9 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs,
     gravity_exact_force_compute(e->s, e);
 #endif

+#ifdef WITH_CUDA
+  update_tasks(e);
+#endif
   /* Run the 0th time-step */
   engine_launch(e);
@@ -3622,7 +3626,6 @@ void engine_step(struct engine *e) {
 #endif

 #ifdef WITH_CUDA
-  message("Updating tasks\n");
   update_tasks(e);
 #endif
   /* Start all the tasks. */
@@ -3630,11 +3633,6 @@
   engine_launch(e);
   TIMER_TOC(timer_runners);

-#ifdef WITH_CUDA
-  message("Running cuda\n");
-  run_cuda();
-#endif
-
 #ifdef SWIFT_GRAVITY_FORCE_CHECKS
   /* Check the accuracy of the gravity calculation */
   if (e->policy & engine_policy_self_gravity)
@@ -3725,6 +3723,9 @@ void engine_unskip(struct engine *e) {
   if (e->s->periodic && (e->policy & engine_policy_self_gravity))
     scheduler_activate(&e->sched, e->s->grav_top_level);

+#ifdef WITH_CUDA
+  scheduler_activate(&e->sched, e->s->GPU_task);
+#endif
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
diff --git a/src/runner.c b/src/runner.c
index 01489eefcdb3ac7400b2c316df7a674a4604b3bd..750f99f8a6834f42643a0eaf2e1475d5af72d1fb 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -1753,12 +1753,12 @@ void *runner_main(void *data) {
   struct engine *e = r->e;
   struct scheduler *sched = &e->sched;

+
   /* Main loop. */
   while (1) {

     /* Wait at the barrier. */
     engine_barrier(e);

-
     /* Re-set the pointer to the previous task, as there is none. */
     struct task *t = NULL;
     struct task *prev = NULL;
@@ -1775,7 +1775,9 @@
       TIMER_TOC(timer_gettask);

       /* Did I get anything? */
-      if (t == NULL) break;
+      if (t == NULL){
+        break;
+      }
     }

     /* Get the cells. */
@@ -1907,8 +1909,10 @@
           break;

         case task_type_sub_pair:
-          if (t->subtype == task_subtype_density)
+          if (t->subtype == task_subtype_density){
             runner_dosub_pair1_density(r, ci, cj, t->flags, 1);
+            message("Found a density task...");
+          }
 #ifdef EXTRA_HYDRO_LOOP
           else if (t->subtype == task_subtype_gradient)
             runner_dosub_pair1_gradient(r, ci, cj, t->flags, 1);
diff --git a/src/scheduler.c b/src/scheduler.c
index ef9432de6b0591ba78a4de1aadff5051e7eb4f97..a02e0dd9c3f8d390e3ec7189b9ad7ec5e0f01689 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -795,8 +795,10 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
   t->toc = 0;
 #endif

+
 #ifdef WITH_CUDA
   t->cuda_task = -1;
+  t->gpu = 0;
 #endif

   /* Add an index for it. */
@@ -816,10 +818,19 @@
 void scheduler_set_unlocks(struct scheduler *s) {

   /* Store the counts for each task. */
+#ifdef WITH_CUDA
+  int *counts;
+  if ((counts = ( int *)malloc(sizeof(int) * s->nr_tasks)) == NULL)
+#else
   short int *counts;
-  if ((counts = (short int *)malloc(sizeof(short int) * s->nr_tasks)) == NULL)
+  if ((counts = ( short int *)malloc(sizeof(short int) * s->nr_tasks)) == NULL)
+#endif
     error("Failed to allocate temporary counts array.");
+#ifdef WITH_CUDA
+  bzero(counts, sizeof(int) * s->nr_tasks);
+#else
   bzero(counts, sizeof(short int) * s->nr_tasks);
+#endif
   for (int k = 0; k < s->nr_unlocks; k++) {
     counts[s->unlock_ind[k]] += 1;
@@ -851,15 +862,21 @@ void scheduler_set_unlocks(struct scheduler *s) {
   if ((unlocks = (struct task **)malloc(sizeof(struct task *) *
                                         s->size_unlocks)) == NULL)
     error("Failed to allocate temporary unlocks array.");
+  int *volatile indices;
+  if( (indices = (int *volatile) malloc( sizeof(int) * s->size_unlocks)) == NULL)
+    error("Failed to allocate temporary indices array.");
   for (int k = 0; k < s->nr_unlocks; k++) {
     const int ind = s->unlock_ind[k];
     unlocks[offsets[ind]] = s->unlocks[k];
+    indices[offsets[ind]] = ind;
     offsets[ind] += 1;
   }

   /* Swap the unlocks. */
   free(s->unlocks);
   s->unlocks = unlocks;
+  free(s->unlock_ind);
+  s->unlock_ind = indices;

   /* Re-set the offsets. */
   offsets[0] = 0;
@@ -873,6 +890,7 @@
     t->unlock_tasks = &s->unlocks[offsets[k]];
   }

+
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify that there are no duplicate unlocks. */
   for (int k = 0; k < s->nr_tasks; k++) {
@@ -1105,7 +1123,6 @@ void scheduler_rewait_mapper(void *map_data, int num_elements,
   for (int ind = 0; ind < num_elements; ind++) {
     struct task *t = &s->tasks[tid[ind]];

-
     /* Ignore skipped tasks. */
 #ifdef WITH_CUDA
     if (t->skip || t->gpu) continue;
@@ -1123,6 +1140,7 @@
           (1 << (8 * sizeof(t->wait) - 1)) - 1);
 #endif

+    /* Sets the waits of the dependencies */
     for (int k = 0; k < t->nr_unlock_tasks; k++) {
       struct task *u = t->unlock_tasks[k];
@@ -1417,10 +1435,17 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) {
      they are ready. */
   for (int k = 0; k < t->nr_unlock_tasks; k++) {
     struct task *t2 = t->unlock_tasks[k];
+#ifdef WITH_CUDA
+    if (t2->skip || t2->gpu) continue;
+#else
     if (t2->skip) continue;
-
+#endif
     const int res = atomic_dec(&t2->wait);
+    if(t2->type == task_type_kick2 && t->type != task_type_GPU_mega)
+      message("Unlocked by type not GPU_mega");
     if (res < 1) {
+      if(t->type == task_type_GPU_mega)
+        message("GPU_mega");
       error("Negative wait!");
     } else if (res == 1) {
       scheduler_enqueue(s, t2);
diff --git a/src/scheduler.h b/src/scheduler.h
index a4f9592672f3eb57b0a553481b288905d8634143..dc898e96325ca304897f11f026a0fc4a3f9121ce 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -113,7 +113,11 @@ struct scheduler {
  */
 __attribute__((always_inline)) INLINE static void scheduler_activate(
     struct scheduler *s, struct task *t) {
-  if (atomic_cas(&t->skip, 1, 0)) {
+#ifdef WITH_CUDA
+  if (!t->gpu && atomic_cas(&t->skip, 1, 0)) {
+#else
+  if (atomic_cas(&t->skip, 1, 0)){
+#endif
     t->wait = 0;
     int ind = atomic_inc(&s->active_count);
     s->tid_active[ind] = t - s->tasks;
diff --git a/src/space.h b/src/space.h
index dbbba714c2b3c9841905b2ba54e4f2d854b820a6..df67f8ef20fd3b484687ce3da5ade5dd7c25d537 100644
--- a/src/space.h
+++ b/src/space.h
@@ -161,6 +161,9 @@ struct space {
   size_t nr_sparts_foreign, size_sparts_foreign;
 #endif

+#ifdef WITH_CUDA
+  struct task *GPU_task;
+#endif
 };

 /* function prototypes. */
diff --git a/src/task.c b/src/task.c
index 43da1d35680783d977ea743dd4f43c52f0f291bc..b466ffcae35a4e87d8c1c98f4e77a24e119736a3 100644
--- a/src/task.c
+++ b/src/task.c
@@ -152,7 +152,9 @@ __attribute__((always_inline)) INLINE static enum task_actions task_acts_on(
           break;
       }
       break;
-
+    case task_type_GPU_mega:
+      return task_action_part;
+      break;
     case task_type_kick1:
     case task_type_kick2:
     case task_type_timestep:
diff --git a/src/task.h b/src/task.h
index 3b1c640df0d9c4258936143052380040d5fb97e9..13fcc5cb363d69773e37b48d6db8bfdf0aad86fb 100644
--- a/src/task.h
+++ b/src/task.h
@@ -147,10 +147,18 @@ struct task {
 #endif

   /*! Number of tasks unlocked by this one */
+#ifdef WITH_CUDA
+  int nr_unlock_tasks;
+#else
   short int nr_unlock_tasks;
+#endif

   /*! Number of unsatisfied dependencies */
+#ifdef WITH_CUDA
+  int wait;
+#else
   short int wait;
+#endif

   /*! Type of the task */
   enum task_types type;
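
The final task.h hunk widens `nr_unlock_tasks` and `wait` from `short int` to `int` in WITH_CUDA builds. A plausible motivation, consistent with the scheduler.c hunks above (the temporary `counts` array is widened the same way, `create_tasks` adds a load and an unload task per cell, and all former GPU unlocks are funnelled into the single `GPU_mega` task): a single task's dependency fan-in can exceed what a short holds. A minimal standalone C sketch of that arithmetic follows; the cell count is hypothetical, not a value from the patch.

#include <stdio.h>

int main(void) {
  /* Hypothetical run size; not taken from the patch. */
  const long tot_cells = 40000;

  /* One load plus one unload task per cell, all feeding the one
   * GPU_mega task, gives a fan-in of this order. */
  const long fan_in = 2 * tot_cells;

  /* The same bound scheduler_rewait_mapper checks against:
   * (1 << (8 * sizeof(t->wait) - 1)) - 1, i.e. 32767 for short int. */
  const long short_max = (1L << (8 * sizeof(short int) - 1)) - 1;

  if (fan_in > short_max)
    printf("fan-in %ld > %ld: a short int wait counter would overflow\n",
           fan_in, short_max);
  return 0;
}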