diff --git a/src/CUDA/runner_cuda_main.cu b/src/CUDA/runner_cuda_main.cu index edccab1bece77bb32f970daaef711a2ddc643855..0241d066c914625e06ca157de9ad08f3539523d9 100644 --- a/src/CUDA/runner_cuda_main.cu +++ b/src/CUDA/runner_cuda_main.cu @@ -1462,7 +1462,7 @@ __global__ void swift_device_kernel() { __syncthreads(); /* Unlock dependencies*/ - for (i = threadIdx.x; i < tasks[tid].nr_unlock_tasks; i++) { + for (i = threadIdx.x; i < tasks[tid].nr_unlock_tasks; i+=blockDim.x) { int dependant = tasks[tid].unlocks[i]; if (atomicSub(&tasks[dependant].wait, 1) == 1 && !tasks[dependant].skip) { if (tasks[dependant].type <= type_unload && @@ -1822,9 +1822,10 @@ __host__ int find_priority_cutoff(struct task_cuda *tasks, int count) { __host__ void update_tasks(struct engine *e) { int nr_gpu_tasks; - + int nr_tasks; /* Download the cuda_tasks from the GPU. */ cudaErrCheck(cudaMemcpyFromSymbol(&nr_gpu_tasks, cuda_numtasks, sizeof(int))); + cudaErrCheck(cudaMemcpyFromSymbol(&nr_tasks, tot_num_tasks, sizeof(int))); struct task_cuda *gpu_pointer = NULL; cudaErrCheck(cudaMemcpyFromSymbol( &gpu_pointer, tasks, sizeof(struct task_cuda *))); // TODO check. @@ -1842,6 +1843,7 @@ __host__ void update_tasks(struct engine *e) { cudaErrCheck( cudaDeviceSynchronize()); cudaErrCheck( cudaMemcpy( host_unlock_copy, host_unlock_pointer, sizeof(int) * cuda_unlock_count, cudaMemcpyDeviceToHost) ); + int task_count=0; for (int i = 0; i < nr_gpu_tasks; i++) { // Update the skip flag and reset the wait to 0. @@ -1855,14 +1857,17 @@ __host__ void update_tasks(struct engine *e) { /* Reset the waits. */ for (int i = 0; i < nr_gpu_tasks; i++) { if (!host_tasks[i].skip) { + task_count++; struct task_cuda *temp_t = &host_tasks[i]; int *unlocks = host_unlock_copy + (temp_t->unlocks - host_unlock_pointer); for (int ii = 0; ii < temp_t->nr_unlock_tasks; ii++) { - host_tasks[unlocks[ii]].wait++; + if(!host_tasks[unlocks[ii]].skip) + host_tasks[unlocks[ii]].wait++; } } } + cudaErrCheck(cudaMemcpyToSymbol(tot_num_tasks, &task_count, sizeof(int))); /* Reset the queue data.*/ int qsize; cudaErrCheck(cudaMemcpyFromSymbol(&qsize, cuda_queue_size, sizeof(int))); @@ -1888,7 +1893,6 @@ __host__ void update_tasks(struct engine *e) { for (int i = unload_host.count; i < qsize; i++) { data[i] = -1; } - /* Allocate and copy the data to the device. */ cudaErrCheck(cudaMemcpy((void *)unload_host.data, data, sizeof(int) * qsize, cudaMemcpyHostToDevice)); @@ -1995,7 +1999,7 @@ __host__ void update_tasks(struct engine *e) { /* Copy the tasks back to the GPU. */ cudaErrCheck(cudaMemcpy(gpu_pointer, host_tasks, - sizeof(struct task_cuda *) * nr_gpu_tasks, + sizeof(struct task_cuda) * nr_gpu_tasks, cudaMemcpyHostToDevice)); /* Update simulation constants @@ -2293,7 +2297,7 @@ __host__ void create_tasks(struct engine *e) { /* Allocate space on the device for the cells. */ struct cell_cuda *cell_device = NULL; - struct cell **pointers_device = NULL; + struct cell *pointers_device = NULL; if (firstrun) { /* If we already have an array for this we need to remove it. */ cudaErrCheck(cudaMemcpyFromSymbol(cell_device, &cells_cuda, @@ -2316,14 +2320,14 @@ __host__ void create_tasks(struct engine *e) { cudaMemcpyHostToDevice)); cudaErrCheck( - cudaMemcpyToSymbol(cells_cuda, cell_device, sizeof(struct cell_cuda *))); + cudaMemcpyToSymbol(cells_cuda, &cell_device, sizeof(struct cell_cuda *))); cudaErrCheck(cudaMemcpy(pointers_device, host_pointers, sizeof(struct cell *) * s->tot_cells, cudaMemcpyHostToDevice)); cudaErrCheck( - cudaMemcpyToSymbol(cpu_cells, pointers_device, sizeof(struct cell **))); + cudaMemcpyToSymbol(cpu_cells, &pointers_device, sizeof(struct cell **))); /* Setup the queues. */ /* We have 4 queues, one containing unload & implicit tasks. */ @@ -2609,6 +2613,7 @@ __host__ void create_tasks(struct engine *e) { __host__ void run_cuda() { printf("running cuda\n"); swift_device_kernel << <num_blocks, num_cuda_threads>>> (); + cudaErrCheck(cudaDeviceSynchronize()); } /* Make the tests! */ diff --git a/src/engine.c b/src/engine.c index 43a730cf63c48a76329a01b2e6e5bdaf076c1250..9aba0a722fc1ffdcfca56e106288c76ce958823f 100644 --- a/src/engine.c +++ b/src/engine.c @@ -3624,6 +3624,11 @@ void engine_step(struct engine *e) { engine_launch(e); TIMER_TOC(timer_runners); +#ifdef WITH_CUDA + message("Running cuda\n"); + run_cuda(); +#endif + #ifdef SWIFT_GRAVITY_FORCE_CHECKS /* Check the accuracy of the gravity calculation */ if (e->policy & engine_policy_self_gravity)