diff --git a/src/CUDA/queue_cuda.h b/src/CUDA/queue_cuda.h index 99a7a0dd357bea2d5ef09757678e48d91de520f7..6e4a19a00fb6f17614f57e3e72e30c9a681e8d71 100644 --- a/src/CUDA/queue_cuda.h +++ b/src/CUDA/queue_cuda.h @@ -34,6 +34,8 @@ extern "C" { #define type_unload task_type_unload #define type_implicit_load task_type_implicit_load #define type_implicit_unload task_type_implicit_unload +#define type_recv_load task_type_recv_load +#define type_send_unload task_type_send_unload const int num_gpu_types = 5; const int gpu_work_task_array[num_gpu_types] = {task_type_self, task_type_pair, task_type_sub_self, diff --git a/src/CUDA/runner_cuda_main.cu b/src/CUDA/runner_cuda_main.cu index 9c0095ae79c09b0643a4fe0bc7606ddd259110b2..772f61d3d2e48743ea34d49aa41d6bf5718ff9b0 100644 --- a/src/CUDA/runner_cuda_main.cu +++ b/src/CUDA/runner_cuda_main.cu @@ -475,10 +475,10 @@ __device__ void doself_density(struct cell_cuda *ci) { /* Is the cell active? */ if (!cuda_cell_is_active(ci)) { if(threadIdx.x ==0) - printf( +/* printf( "Cell isn't active..., ti_end_min=%lli, ti_current=%lli, " "max_active_bin=%i, cell_id = %lli\n", - ci->ti_end_min, ti_current, max_active_bin, (ci-cells_cuda)); + ci->ti_end_min, ti_current, max_active_bin, (ci-cells_cuda));*/ return; } @@ -2229,6 +2229,9 @@ __host__ void create_transfer_tasks(struct cell *c, int *k, tasks_host[*k].skip = 0; tasks_host[*k].implicit = 0; tasks_host[*k].task = NULL; +#ifdef REDUCED_TRANSFER + tasks_host[*k].cell = c; +#endif /* The load implicit tasks unlocks the parent's task */ if (parent_load_task >= 0) { tasks_host[*k].unlocks[tasks_host[*k].nr_unlock_tasks++] = @@ -2254,7 +2257,9 @@ __host__ void create_transfer_tasks(struct cell *c, int *k, tasks_host[*k].skip = 0; tasks_host[*k].implicit = 0; tasks_host[*k].task = NULL; - +#ifdef REDUCED_TRANSFER + tasks_host[*k].cell = c; +#endif /* The unload implicit task is unlocked by the parent task */ if (parent_unload_task >= 0) { tasks_host[parent_unload_task] @@ -2284,6 +2289,9 @@ __host__ void create_transfer_tasks(struct cell *c, int *k, tasks_host[*k].skip = 0; tasks_host[*k].implicit = 0; tasks_host[*k].task = NULL; +#ifdef REDUCED_TRANSFER + tasks_host[*k].cell = c; +#endif /* This load task unlocks the parent's task. */ if (parent_load_task >= 0) { tasks_host[*k].unlocks[tasks_host[*k].nr_unlock_tasks++] = @@ -2305,6 +2313,9 @@ __host__ void create_transfer_tasks(struct cell *c, int *k, tasks_host[*k].skip = 0; tasks_host[*k].implicit = 0; tasks_host[*k].task = NULL; +#ifdef REDUCED_TRANSFER + tasks_host[*k].cell = c; +#endif /* The unload task is unlocked by the parent task */ if (parent_unload_task >= 0) { tasks_host[parent_unload_task] @@ -2488,7 +2499,7 @@ __host__ void update_tasks(struct engine *e) { /* Relies on assumption implicit unloads are always before unloads in host_tasks, which i believe to be true by conscruction. for(int i = 0; i < nr_gpu_tasks; i++){ - if(host_tasks[i].type == type_unload && host_tasks[i].type == type_implicit_unload){ + if(host_tasks[i].type == type_unload || host_tasks[i].type == type_implicit_unload){ if(host_tasks[i].wait==1){ host_tasks[i].skip = 1; task_count--; @@ -2496,16 +2507,24 @@ __host__ void update_tasks(struct engine *e) { int *unlocks = host_unlock_copy + (temp_t->unlocks-host_unlock_pointer); for(int ii = 0; ii < temp_t->nr_unlock_tasks; ii++){ if(!host_tasks[unlocks[ii].skip) - host_tasks[unlock[ii]].wait--; + host_tasks[unlocks[ii]].wait--; } *Find the corresponding load task* - Have to search the cells for this at the moment. - + struct task_cuda *l_task = &host_tasks[host_tasks[i].cell->load_task]; + l_tasks->skip = 1; + task_count--; + int *unlocks = host_unlock_copy + (l_task->unlocks-host_unlock_pointer); + for(int ii=0; ii < l_task->nr_unlock_tasks; ii++){ + if(!host_tasks[unlocks[ii].skip) + host_tasks[unlocks[ii]].wait--; + } } } }*/ + /* TODO Reset the waits again.*/ + cudaErrCheck(cudaMemcpyToSymbol(tot_num_tasks, &task_count, sizeof(int))); /* Reset the queue data.*/ int qsize; diff --git a/src/CUDA/task_cuda.h b/src/CUDA/task_cuda.h index d1583a344e73feab7cdd3130d07bc237004cbeb0..ae709412740d0ca242de8664d9199c633297ab11 100644 --- a/src/CUDA/task_cuda.h +++ b/src/CUDA/task_cuda.h @@ -46,6 +46,11 @@ struct task_cuda { /* Size of unlock array during initialisation. */ int size_unlocks; +#ifdef REDUCED_TRANSFER + /* Pointer to the loaded/unloaded cell for load/unload tasks*/ + struct cell *cell; +#endif + #ifdef CUDA_TASK_TIMERS /* Executing block*/ int blockID; diff --git a/src/task.h b/src/task.h index 13fcc5cb363d69773e37b48d6db8bfdf0aad86fb..c6a163bb9a4e593567f8543100077d88d7fc5e7e 100644 --- a/src/task.h +++ b/src/task.h @@ -43,6 +43,8 @@ enum task_types { task_type_unload = -102, task_type_implicit_load = -103, task_type_implicit_unload = -104, + task_type_recv_load = -201, + task_type_send_unload = -202, task_type_none = 0, task_type_sort, task_type_self,