diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu index 4477f61be656a653bb98c98b8d1dd67e68a5d15d..0b50bb1b715420f6384a6752e24a9b5620b9db78 100644 --- a/src/cuda_queue.cu +++ b/src/cuda_queue.cu @@ -16,11 +16,24 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. * ******************************************************************************/ +/* Config parameters. */ +#include "../config.h" + +/* Standard includes. */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include "cuda_queue.h" +#include "quicksched.h" +#include "res.h" -/*Define if conflicts are enabled or not.*/ +/*Define task types required for GPU */ +#define type_load -100 +#define type_unload -101 + +/*Define if conflicts are enabled or not.*/ //#define GPU_locks /*Declare variables required for the queue*/ @@ -29,9 +42,9 @@ __constant__ int cuda_nrqueues; __constant__ int cuda_queue_size; __device__ struct task *tasks_cuda; -__device__ int *locks_cuda; -__device__ int *uses_cuda; -__device__ int *deps_cuda; +__device__ qsched_res_t *locks_cuda; +__device__ qsched_res_t *uses_cuda; +__device__ qsched_task_t *deps_cuda; __device__ struct res *res_cuda; /** @@ -149,30 +162,151 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) { -int qsched_prepare_cuda ( struct qsched *s , ) { + +void qsched_prepare_loads ( struct qsched *s ) { + +int i, task, unload, tb, j, k, unlocked=0; +struct task *t; +/* Create a load task for each resource. */ +for(i = 0; i < s->count_res; i++) +{ + + if(s->res[i].size == 0) + continue; + + + + + task = qsched_addtask( s , type_load , 0 , &i , sizeof(int) , 0 ); + unload = qsched_addtask( s , type_unload, 0 , &i, sizeof(int), 0 ); + /*Load task unlocks each task that uses or locks the specified resource */ + for(j = 0; j < s->count; j++) + { + t = &s->tasks[j]; + + for(k = 0; k < t->nr_uses; k++) + { + if(t->uses[k] == i){ + qsched_addunlock( s , task, j ); + qsched_addunlock( s , j , unload); + unlocked = 1; + break; + } + } + if(unlocked == 1) + { + unlocked = 0; + continue; + } + for(k = 0; k < t->nr_locks; k++) + { + if(t->locks[k] == i){ + qsched_addunlock( s , task , j ); + qsched_addunlock( s , j , unload); + break; + } + + } + } + + +} + + + + + +} + + + + +int qsched_prepare_cuda ( struct qsched *s ) { + +int i; +struct task *cuda_t; +qsched_res_t *setup; +qsched_task_t *setup_t; +struct res *res_t; +int *data; + +/*Setup Load and unload tasks*/ + + + + + + + + + + + /*Copy the qsched data to the device*/ -if( cudaMalloc( &tasks_cuda , sizeof(struct task) * s->count != cudaSuccess ) +if( cudaMalloc( &cuda_t , sizeof(struct task) * s->count ) != cudaSuccess ) error("Failed to allocate task array on the device."); -if( cudaMemcpy( ) +if( cudaMemcpy( cuda_t, s->tasks, sizeof(struct task) * s->count , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy tasks to the device."); +if( cudaMemcpyToSymbol ( cuda_t , tasks_cuda, sizeof(struct task *) , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy task pointer to the device."); -if( cudaMalloc( &locks_cuda , sizeof(int) * s->count_lock != cudaSuccess ) +if( cudaMalloc( &setup , sizeof(qsched_res_t) * s->count_locks ) != cudaSuccess ) error("Failed to allocate locks array on the device."); +if( cudaMemcpy( &setup , s->locks , sizeof(qsched_res_t) * s->count_locks, cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy locks to the device."); +if( cudaMemcpyToSymbol ( setup , locks_cuda, sizeof(qsched_res_t *), cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy locks pointer to the device."); -if( cudaMalloc( &deps_cuda , sizeof(int) * s->count_deps != cudaSuccess ) +if( cudaMalloc( &setup_t , sizeof(qsched_task_t) * s->count_deps ) != cudaSuccess ) error("Failed to allocate deps array on the device."); +if( cudaMemcpy( &setup_t , s->deps , sizeof(qsched_task_t) * s->count_deps, cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy deps to the device."); +if( cudaMemcpyToSymbol ( &setup_t , deps_cuda, sizeof(qsched_task_t *) , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy deps pointer to the device."); -if( cudaMalloc( &uses_cuda , sizeof(int) * s->count_uses != cudaSuccess ) +if( cudaMalloc( &uses_cuda , sizeof(qsched_res_t) * s->count_uses ) != cudaSuccess ) error("Failed to allocate use array on the device."); +if( cudaMemcpy( &setup , s->uses , sizeof(qsched_res_t) * s->count_uses, cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy locks to the device."); +if( cudaMemcpyToSymbol ( setup , uses_cuda, sizeof(qsched_res_t *), cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy locks pointer to the device."); -if( cudaMalloc( &res_cuda , sizeof( res) * s->count_res != cudaSuccess ) +if( cudaMalloc( &res_t , sizeof(struct res) * s->count_res ) != cudaSuccess ) error("Failed to allocated on the device."); +for(i = 0; i < s->count_res; i++) +{ + if(s->res[i].size == 0) + continue; + if( cudaMalloc( &data, sizeof(int) * s->res[i].size) != cudaSuccess ) + error("Failed to allocate data space on the device."); + s->res[i].gpu_data = data; +} + + +if( cudaMemcpy( &res_t , s->res , sizeof(struct res) * s->count_res , cudaMemcpyHostToDevice) != cudaSuccess ) + error("Failed to copy resources to the device."); +if( cudaMemcpyToSymbol( res_t , res_cuda , sizeof(struct res *) * s->count_res , cudaMemcpyHostToDevice) != cudaSuccess ) + error("Failed to copy res pointer to the device."); + + + + + + + + + + + + + /* Initialize the queues. */ int nr_queues= 2, k, qsize; -int *data; -struct queue_cuda queues[ cuda_maxqueues ]; +//int *data; +struct queue_cuda queues[ cuda_numqueues ]; diff --git a/src/cycle.h b/src/cycle.h index 16f57e7e1ef942d2736f4328be9117b2deab6d6e..f92d1d973fa84aaf166e5154b0627a9fad955d32 100644 --- a/src/cycle.h +++ b/src/cycle.h @@ -78,6 +78,10 @@ /***************************************************************************/ +#ifdef WITH_CUDA +#define INLINE inline +#endif + #if TIME_WITH_SYS_TIME # include <sys/time.h> # include <time.h> @@ -526,3 +530,6 @@ INLINE_ELAPSED(inline) #endif #endif /* HAVE_MIPS_ZBUS_TIMER */ +#ifdef WITH_CUDA +#undef INLINE +#endif diff --git a/src/lock.h b/src/lock.h index abdfc5ddf1df0ee03438ca3aa99385c9ccf6f3ad..f814a09e378b570fc606ff24535cc21adb60a5c3 100644 --- a/src/lock.h +++ b/src/lock.h @@ -35,16 +35,16 @@ #define lock_type volatile int #define lock_init( l ) ( *l = 0 ) #define lock_destroy( l ) 0 - INLINE static int lock_lock ( volatile int *l ) { - while( atomicCAS( l , 0 , 1 ) != 0 ); + __device__ inline static int lock_lock ( volatile int *l ) { + while( atomicCAS( (int *)l , 0 , 1 ) != 0 ); return 0; } - INLINE static int lock_trylock ( volatile int *l ) { - int res = atomicCAS( l , 0 , 1 ); + __device__ inline static int lock_trylock ( volatile int *l ) { + int res = atomicCAS( (int *)l , 0 , 1 ); return res; } - INLINE static int lock_unlock ( volatile int *l ) { - int res = atomicCAS ( l , 1 , 0 ) != 1; + __device__ inline static int lock_unlock ( volatile int *l ) { + int res = atomicCAS ( (int *)l , 1 , 0 ) != 1; return res; } #define lock_unlock_blind( l ) atomicCAS( &l , 1 , 0 ) diff --git a/src/qsched.c b/src/qsched.c index a183dd24611a3b9e8f6b7c8ac95e5b9a168bb148..5bcf1487d8b321ae308f5aa46254e8fdbb9902b6 100644 --- a/src/qsched.c +++ b/src/qsched.c @@ -884,6 +884,10 @@ void qsched_prepare ( struct qsched *s ) { /* Lock the sched. */ lock_lock( &s->lock ); + + #ifdef WITH_CUDA + qsched_prepare_loads( s ); + #endif /* Get a pointer to the tasks, set the count. */ tasks = s->tasks; diff --git a/src/qsched.h b/src/qsched.h index ee08fe41f7a58d530e8af0277a499aafa2167b4e..a2d18d3818d074a5e74c9c640a85e0503f6fee7e 100644 --- a/src/qsched.h +++ b/src/qsched.h @@ -163,8 +163,8 @@ struct qsched { /* Function prototypes. */ /* Internal functions. */ -void qsched_sort ( int *restrict data , int *restrict ind , int N , int min , int max ); -void qsched_sort_rec ( int *restrict data , int *restrict ind , int N , int min , int max ); +void qsched_sort ( int *data , int *ind , int N , int min , int max ); +void qsched_sort_rec ( int *data , int *ind , int N , int min , int max ); struct task *qsched_gettask ( struct qsched *s , int qid ); void qsched_done ( struct qsched *s , struct task *t ); void *qsched_getdata( struct qsched *s , struct task *t ); diff --git a/src/task.h b/src/task.h index c9c57d6436fe659352133ce38b79fc63c719da7e..dccc2dae7e078510db30c7d368ad1c9769668978 100644 --- a/src/task.h +++ b/src/task.h @@ -22,7 +22,6 @@ #define task_flag_skip 1 #define task_flag_virtual 2 - /* The task data structure. */ struct task {