Added files I had missed from the svn repository

165c6ccc · aidan · 80576f37 · 165c6ccc · 165c6ccc · 165c6ccc
Commit 165c6ccc authored 10 years ago by aidan
--- a/examples/test_bh_2.cu
+++ b/examples/test_bh_2.cu
+/*******************************************************************************
+ * This file is part of QuickSched.
+ * Copyright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+ *                    Aidan Chalk (aidan.chalk@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+* *****************************************************************************/
+
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Standard includes. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <omp.h>
+#include <fenv.h>
+
+/* Local includes. */
+#include "quicksched.h"
+
+
+/**
+ * @brief One node of the cell tree.
+ *
+ * Cells are stored in the global cell_pool array and refer to each other
+ * by index into that array rather than by pointer.
+ */
+struct cell {
+  /* Location of the cell along x/y and along z. A child cell is placed at
+   * its parent's location, shifted by the child's own h on selected axes. */
+  double2 loc_xy;
+  double loc_z;
+
+  /* Edge length of the cell; each level of children halves it. */
+  double h;
+
+  /* Number of particles contained in the cell. */
+  int count;
+
+  /* split becomes 1 once the cell has been subdivided. sorted is not
+   * referenced by the code visible in this file. */
+  unsigned short int split;
+  unsigned short int sorted;
+
+  /* parts: index of the cell's first particle in the host particle arrays.
+   * firstchild / sibling: cell_pool indices, -1 while unset. */
+  int parts;
+  int firstchild;
+  int sibling;
+
+  /* res: scheduler resource id of the cell, qsched_res_none while unset.
+   * com_tid: presumably the id of a centre-of-mass task -- TODO confirm. */
+  int res;
+  int com_tid;
+} __attribute__((aligned(64)));
+
+/* Required variables to obtain cells. */
+#define CELL_STRETCH 2    /* Growth factor applied when the pool is full. */
+#define INITIAL_CELLS 256 /* Initial pool capacity, in cells. */
+struct cell *cell_pool = NULL; /* Backing array for all cells; NULL until first use. */
+int used_cells = 0;            /* Number of cells handed out so far. */
+int num_cells = INITIAL_CELLS; /* Pool capacity, in cells. */
+int cell_size = INITIAL_CELLS * sizeof(struct cell); /* Pool capacity, in bytes. */
+
+/* Device locations for the particle values.
+ * These are device-side symbols only; no code visible in this file
+ * allocates them or copies data into them. */
+__device__ double2 *parts_pos_xy;
+__device__ double *parts_pos_z;
+__device__ float4 *parts_a_m;
+__device__ double2 *com_xy;
+__device__ double *com_z;
+__device__ float com_mass; /* NOTE(review): a scalar, unlike com_xy / com_z; presumably meant to be float * -- confirm. */
+
+
+/* Host locations for the particle values. */
+double2 *parts_pos_xy_host;
+double *parts_pos_z_host;
+float4 *parts_a_m_host; /* The .w component is read as the particle mass. */
+double2 *com_xy_host;
+double *com_z_host;
+/* One mass per cell, indexed by cell_pool index like com_xy_host and
+ * com_z_host. It is indexed, copied and freed as an array elsewhere in
+ * this file, so it has to be a pointer rather than a scalar. */
+float *com_mass_host;
+
+
+/**
+ * @brief Checks whether the cells are direct neighbours or not. Both cells have
+ * to be of the same size.
+ *
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ *
+ * @return Non-zero if the two cell locations differ by at most one cell edge
+ *         length along each of the three axes, zero otherwise.
+ */
+static inline int are_neighbours(struct cell *ci, struct cell *cj) {
+
+  float dx[3];
+
+#ifdef SANITY_CHECKS
+  if (ci->h != cj->h)
+    error(" Cells of different size in distance calculation.");
+#endif
+
+  /* Maximum allowed per-axis distance: one cell edge. */
+  float max_dist = ci->h;
+
+  /* Per-axis distance between the cell locations. Each axis is tested on
+   * its own, so this is not a Manhattan (summed) distance. */
+  double2 loc_i = ci->loc_xy, loc_j = cj->loc_xy;
+  float center_i = loc_i.x;
+  float center_j = loc_j.x;
+  dx[0] = fabs(center_i - center_j);
+  center_i = loc_i.y;
+  center_j = loc_j.y;
+  dx[1] = fabs(center_i - center_j);
+  center_i = ci->loc_z;
+  center_j = cj->loc_z; /* Was misspelled center_J, leaving the y value in place. */
+  dx[2] = fabs(center_i - center_j);
+
+  return (dx[0] <= max_dist) && (dx[1] <= max_dist) && (dx[2] <= max_dist);
+}
+
+
+/**
+ * @brief Hand out the next unused #cell from the global pool.
+ *
+ * The pool and the per-cell centre-of-mass arrays (com_xy_host, com_z_host,
+ * com_mass_host) are allocated on the first call and grown by a factor of
+ * CELL_STRETCH whenever the pool is full, so that all four arrays always
+ * have room for num_cells entries.
+ *
+ * WARNING: growing the pool moves it in memory. Pointers returned by earlier
+ * calls become invalid; callers that keep cells across a call to cell_get()
+ * must hold cell_pool indices and re-derive their pointers.
+ *
+ * @return Pointer to the new cell, with sibling and firstchild set to -1 and
+ *         res set to qsched_res_none. All other fields are zero.
+ */
+struct cell *cell_get()
+{
+    struct cell *res;
+
+    if(cell_pool == NULL || used_cells >= num_cells)
+    {
+        /* The first call allocates the initial capacity, later calls stretch. */
+        int new_num = (cell_pool == NULL) ? num_cells : num_cells * CELL_STRETCH;
+
+        struct cell *new_pool = (struct cell *) calloc(new_num, sizeof(struct cell));
+        double2 *new_com_xy = (double2 *) calloc(new_num, sizeof(double2));
+        double *new_com_z = (double *) calloc(new_num, sizeof(double));
+        float *new_com_mass = (float *) calloc(new_num, sizeof(float));
+        if(new_pool == NULL || new_com_xy == NULL || new_com_z == NULL || new_com_mass == NULL)
+            error("Failed to allocate the cell pool.");
+
+        /* Carry over the entries that are in use. On the first call
+         * used_cells is 0, so nothing is read from the old arrays. */
+        if(cell_pool != NULL)
+            memcpy(new_pool, cell_pool, used_cells*sizeof(struct cell));
+        if(com_xy_host != NULL)
+            memcpy(new_com_xy, com_xy_host, used_cells*sizeof(double2));
+        if(com_z_host != NULL)
+            memcpy(new_com_z, com_z_host, used_cells*sizeof(double));
+        if(com_mass_host != NULL)
+            memcpy(new_com_mass, com_mass_host, used_cells*sizeof(float));
+
+        free(cell_pool);
+        free(com_xy_host);
+        free(com_z_host);
+        free(com_mass_host);
+
+        cell_pool = new_pool;
+        com_xy_host = new_com_xy;
+        com_z_host = new_com_z;
+        com_mass_host = new_com_mass;
+        num_cells = new_num;
+        cell_size = new_num*sizeof(struct cell);
+    }
+
+    res = &cell_pool[used_cells];
+    used_cells++;
+    res->sibling = -1;
+    res->firstchild = -1;
+    res->res = qsched_res_none;
+    return res;
+}
+
+/**
+ * @brief Compute the centre of mass and total mass of a cell and store them
+ *        in com_xy_host, com_z_host and com_mass_host at the cell's pool index.
+ *
+ * For a split cell the values are accumulated from the children's stored
+ * centres of mass, so those must already have been computed. For a leaf they
+ * are accumulated from the cell's particles. A cell without mass gets an
+ * all-zero entry.
+ *
+ * @param c The #cell to process; must point into cell_pool.
+ */
+void comp_com(struct cell *c){
+
+    int k, count = c->count;
+    int cpi;
+    struct cell *cp;
+    int parts = c->parts;
+    double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
+
+    if(c->split) {
+        /* Walk the children: start at firstchild and follow the sibling
+         * links until the parent's own sibling is reached. The increment
+         * has to assign cp, otherwise the loop never advances. */
+        for(cp = &cell_pool[(cpi = c->firstchild)]; cp != &cell_pool[c->sibling]; cp = &cell_pool[(cpi = cp->sibling)]) {
+            float cp_mass = com_mass_host[cpi];
+            com[0] += com_xy_host[cpi].x * cp_mass;
+            com[1] += com_xy_host[cpi].y * cp_mass;
+            com[2] += com_z_host[cpi] * cp_mass;
+            mass += cp_mass;
+        }
+
+    /* Otherwise collect the multipole from the particles. */
+    } else {
+
+        for(k = parts; k < parts+count; k++)
+        {
+            float p_mass = parts_a_m_host[k].w;
+            com[0] += parts_pos_xy_host[k].x * p_mass;
+            com[1] += parts_pos_xy_host[k].y * p_mass;
+            com[2] += parts_pos_z_host[k] * p_mass;
+            mass += p_mass;
+        }
+    }
+
+    /* Index of this cell in the pool, which is also its COM array index. */
+    k = c - cell_pool;
+
+    /* Store the COM data, if it was collected. */
+    if(mass > 0.0) {
+        /* Keep the inverse in double: the sums and the outputs are double. */
+        double imass = 1.0/mass;
+        com_xy_host[k].x = com[0] * imass;
+        com_xy_host[k].y = com[1] * imass;
+        com_z_host[k] = com[2] * imass;
+        com_mass_host[k] = mass;
+    }else
+    {
+        com_xy_host[k].x = 0.0;
+        com_xy_host[k].y = 0.0;
+        com_z_host[k] = 0.0;
+        com_mass_host[k] = 0.0f;
+    }
+}
+
+/**
+ * @brief Read one coordinate of a host particle.
+ *
+ * @param p Index of the particle in the host particle arrays.
+ * @param axis 0 for x, 1 for y, anything else for z.
+ */
+static inline double bh_parts_coord(int p, int axis) {
+    if(axis == 0) return parts_pos_xy_host[p].x;
+    if(axis == 1) return parts_pos_xy_host[p].y;
+    return parts_pos_z_host[p];
+}
+
+/**
+ * @brief Swap particles i and j in all three host particle arrays.
+ */
+static void bh_parts_swap(int i, int j) {
+    double2 pos_xy = parts_pos_xy_host[i];
+    double pos_z = parts_pos_z_host[i];
+    float4 a_m = parts_a_m_host[i];
+    parts_pos_xy_host[i] = parts_pos_xy_host[j];
+    parts_pos_z_host[i] = parts_pos_z_host[j];
+    parts_a_m_host[i] = parts_a_m_host[j];
+    parts_pos_xy_host[j] = pos_xy;
+    parts_pos_z_host[j] = pos_z;
+    parts_a_m_host[j] = a_m;
+}
+
+/**
+ * @brief Partition the particles in [first, last] so that those below the
+ *        pivot on the given axis come first.
+ *
+ * @return Index of the first particle at or above the pivot (last + 1 if
+ *         there is none). The lower part is [first, result - 1]. An empty
+ *         range (first > last) returns first.
+ */
+static int bh_parts_partition(int first, int last, int axis, double pivot) {
+    int i = first, j = last;
+    while(i <= j) {
+        while(i <= last && bh_parts_coord(i, axis) < pivot) i += 1;
+        while(j >= first && bh_parts_coord(j, axis) >= pivot) j -= 1;
+        if(i < j) bh_parts_swap(i, j);
+    }
+    return i;
+}
+
+/**
+ * @brief Sort the parts into eight bins along the given pivots and
+ *        fill the multipoles. Also adds the hierarchical resources
+ *        to the sched (TODO).
+ *
+ * The first cell ever passed in is remembered as the root of the tree.
+ * Cells with more than cell_maxparts particles are split into eight
+ * progenitors and the function recurses into the non-empty ones; other
+ * cells are leaves and get a particle-cell task.
+ *
+ * cell_get() may move cell_pool, so cells are tracked by pool index here
+ * and pointers are re-derived after every call that can allocate cells.
+ * For the same reason the caller's pointer @c c may be stale on return.
+ *
+ * @param c The #cell to be split; must point into cell_pool.
+ * @param s The #sched to store the resources.
+ */
+void cell_split(struct cell *c, struct qsched *s) {
+    int k, kk, split;
+    int count = c->count;
+    int parts = c->parts;
+    int cid = c - cell_pool;
+    struct cell *cp;
+    int left[8], right[8];
+    double pivot[3];
+    static int root_id = -1;
+    int progenitors[8];
+
+    /* Set the root cell. */
+    if (root_id < 0) {
+        root_id = cid;
+        c->sibling = 0;
+    }
+
+    if(c->res == qsched_res_none)
+        error("Cell has no resource");
+
+    if(c->count > cell_maxparts )
+    {
+        c->split = 1;
+
+        for(k = 0; k < 8; k++)
+        {
+            cp = cell_get();
+            c = &cell_pool[cid]; /* The pool may just have moved. */
+            progenitors[k] = cp - cell_pool;
+            cp->loc_xy = c->loc_xy;
+            cp->loc_z = c->loc_z;
+            cp->h = c->h*0.5;
+            if(k & 4) cp->loc_xy.x += cp->h;
+            if(k & 2) cp->loc_xy.y += cp->h;
+            if(k & 1) cp->loc_z += cp->h;
+        }
+
+        /* Init the pivots.*/
+        pivot[0] = c->loc_xy.x + c->h * 0.5;
+        pivot[1] = c->loc_xy.y + c->h * 0.5;
+        pivot[2] = c->loc_z + c->h * 0.5;
+
+        /* Split along the x axis. */
+        split = bh_parts_partition(parts, parts+count-1, 0, pivot[0]);
+        left[1] = split;
+        right[1] = parts+count-1;
+        left[0] = parts;
+        right[0] = split-1;
+
+        /* Split along the y axis twice. Go downwards so that bin k is read
+         * before bins 2k and 2k+1 overwrite it. */
+        for (k = 1; k >= 0; k--) {
+            int lo = left[k], hi = right[k];
+            split = bh_parts_partition(lo, hi, 1, pivot[1]);
+            left[2*k+1] = split;
+            right[2*k+1] = hi;
+            left[2*k] = lo;
+            right[2*k] = split-1;
+        }
+
+        /* Split along the z axis four times.*/
+        for(k = 3; k >= 0; k--)
+        {
+            int lo = left[k], hi = right[k];
+            split = bh_parts_partition(lo, hi, 2, pivot[2]);
+            left[2*k+1] = split;
+            right[2*k+1] = hi;
+            left[2*k] = lo;
+            right[2*k] = split-1;
+        }
+
+        /* Store the counts and offsets. */
+        for(k = 0; k < 8; k++)
+        {
+            cp = &cell_pool[progenitors[k]];
+            cp->count = right[k]-left[k]+1;
+            cp->parts = left[k];
+            //TODO ADD RESOURCES HERE.
+        }
+
+        /* Find the first non-empty progenitor */
+        for(k = 0; k < 8; k++)
+        {
+            if(cell_pool[progenitors[k]].count > 0)
+            {
+                c->firstchild = progenitors[k];
+                break;
+            }
+        }
+
+        #ifdef SANITY_CHECKS
+            if(c->firstchild == -1)
+                error("Cell has been split but all children have 0 parts");
+        #endif
+
+        /*Prepare the pointers*/
+        for(k = 0; k < 8; k++)
+        {
+            /* Find the next non-empty sibling */
+            for(kk = k+1; kk < 8; ++kk){
+                if(cell_pool[progenitors[kk]].count > 0){
+                    cell_pool[progenitors[k]].sibling = progenitors[kk];
+                    break;
+                }
+            }
+
+            /* No non-empty sibling, go back a level.*/
+            if(kk == 8) cell_pool[progenitors[k]].sibling = c->sibling;
+
+        }
+
+        /* Recurse. Every call may grow the pool, so index it afresh. */
+        for(k = 0; k < 8; k++)
+            if(cell_pool[progenitors[k]].count > 0) cell_split(&cell_pool[progenitors[k]], s);
+        c = &cell_pool[cid];
+
+    /* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */
+    } else {
+
+        /* NOTE(review): the task payload holds raw cell pointers, which go
+         * stale if the pool grows after this point; create_tasks() stores
+         * pool indices instead. */
+        struct cell *data[2] = {&cell_pool[root_id], c};
+        int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
+                                 2 * sizeof(struct cell *), 1);
+        qsched_addlock(s, tid, c->res);
+        //TODO Create task.
+        //TODO Deal with multiple resources.
+    }
+
+#ifndef COM_AS_TASK
+    comp_com(c);
+#endif
+}
+
+/**
+ * @brief Create the tasks for the cell pair/self.
+ *
+ * With @c cj == NULL the function handles the self-interaction of @c ci: a
+ * split cell recurses over its children and all pairs of children, a leaf
+ * gets a single self task. Otherwise it handles the pair (ci, cj): only
+ * neighbouring cells are considered, and a pair task is created as soon as
+ * at least one of the two is not split. Non-neighbouring pairs get no task
+ * here.
+ *
+ * Task payloads are two ints: the cell_pool indices of the cells involved,
+ * with -1 in the second slot for a self task.
+ *
+ * @param s The #sched in which to create the tasks.
+ * @param ci The first #cell.
+ * @param cj The second #cell, or NULL for a self-interaction.
+ */
+void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
+
+    qsched_task_t tid;
+    int data[2];
+    struct cell *cp, *cps;
+
+    if(cj == NULL)
+    {
+        if(ci->split)
+        {
+            for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling])
+            {
+                //Self Interaction.
+                create_tasks(s, cp, NULL);
+
+                /* Pair the child with each of its later siblings. */
+                for(cps = &cell_pool[cp->sibling]; cps != &cell_pool[ci->sibling]; cps = &cell_pool[cps->sibling])
+                    create_tasks(s, cp, cps);
+            }
+        }
+        /* Self task */
+        else{
+            data[0] = ci - cell_pool;
+            data[1] = -1;
+
+            tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, ci->count*ci->count/2);
+            qsched_addlock(s, tid, ci->res);
+        }
+    }
+    /* Else its a pair!*/
+    else{
+        if(are_neighbours(ci,cj)){/* Cells are neighbours */
+            /*Are both split? */
+            if(ci->split && cj->split)
+            {
+                /* Recurse over both cells. */
+                for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling])
+                    for(cps = &cell_pool[cj->firstchild]; cps != &cell_pool[cj->sibling]; cps = &cell_pool[cps->sibling])
+                        create_tasks(s, cp, cps);
+
+            /* Otherwise, at least one of the cells is not split, build a direct
+             * interaction. */
+            }else{
+                data[0] = ci-cell_pool;
+                data[1] = cj-cell_pool;
+
+                /* Create the task. The payload is the two ints above, so its
+                 * size is that of data, not of two pointers. */
+                tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
+                                     sizeof(int) * 2, ci->count * cj->count);
+
+                /* Add the resources. */
+                qsched_addlock(s, tid, ci->res);
+                qsched_addlock(s, tid, cj->res);
+            }
+        }
+
+    }
+}
+
+/**
+ * @brief Set up and run a task-based Barnes-Hut N-body solver.
+ *
+ * Only the start of the set-up exists so far: the scheduler is initialised
+ * and the pinned host array for the particle x/y positions is allocated.
+ *
+ * @param N The number of random particles to use.
+ * @param runs Number of force evaluations to use as a benchmark (not used
+ *        by the code present so far).
+ * @param fileName Input file name. If @c NULL or an empty string, random
+ *        particle positions will be used (not used by the code present so
+ *        far).
+ */
+void test_bh(int N, int runs, char *fileName) {
+  /* Locals reserved for the parts of the test that are still to be written. */
+  int i, k;
+  struct cell *root;
+  struct part *parts;
+  FILE *file;
+  ticks tic, toc_run, tot_setup = 0, tot_run = 0;
+  int countMultipoles = 0, countPairs = 0, countCoMs = 0;
+
+  /* The scheduler, set up with a single queue. */
+  struct qsched s;
+  qsched_init(&s, 1, qsched_flag_none);
+
+  /* Pinned host storage for the x/y positions of all N particles. */
+  if( cudaSuccess != cudaMallocHost(&parts_pos_xy_host, N * sizeof(double2)) )
+  {
+    error("Failed to allocated parts array");
+  }
+}
--- a/examples/test_gpu_simple.cu
+++ b/examples/test_gpu_simple.cu
+/*******************************************************************************
+ * This file is part of QuickSched.
+ * Copyright (c) 2013 Aidan Chalk (aidan.chalk@durham.ac.uk)
+ * 
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * 
+ ******************************************************************************/
+/* Standard includes. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include <omp.h>
+
+/* Local includes. */
+extern "C"{
+#include "quicksched.h"
+}
+#include "cuda_queue.h"
+
+#define type_square 1 /* Task type: square every element of a chunk (see gpuTest). */
+#define type_double 2 /* Task type: double every element of a chunk. */
+#define type_quarter 3 /* Task type: multiply every element of a chunk by 0.25. */
+
+#define M 100000 /* Total number of floats processed by main(). */
+
+#define TID threadIdx.x /* Shorthand for the calling thread's index within its block. */
+
+__device__ float *cuda_data; /* Working array on the device; main() sets it with cudaMemcpyToSymbol. */
+
+
+
+/**
+ * @brief Square the first 1000 floats of @c data in place.
+ *
+ * The work is shared by the threads of the calling block: thread TID
+ * handles elements TID, TID + blockDim.x, and so on.
+ */
+__device__ void square(float *data)
+{
+    for( int idx = TID; idx < 1000; idx += blockDim.x )
+    {
+        const float v = data[idx];
+        data[idx] = v * v;
+    }
+}
+
+/**
+ * @brief Double the first 1000 floats of @c data in place.
+ *
+ * The work is shared by the threads of the calling block: thread TID
+ * handles elements TID, TID + blockDim.x, and so on.
+ */
+__device__ void doubles(float *data)
+{
+    for( int idx = TID; idx < 1000; idx += blockDim.x )
+    {
+        const float v = data[idx];
+        data[idx] = 2.0f * v;
+    }
+}
+
+
+/**
+ * @brief Multiply the first 1000 floats of @c data by 0.25 in place.
+ *
+ * The work is shared by the threads of the calling block: thread TID
+ * handles elements TID, TID + blockDim.x, and so on.
+ */
+__device__ void quarter(float *data)
+{
+    for( int idx = TID; idx < 1000; idx += blockDim.x )
+    {
+        const float v = data[idx];
+        data[idx] = v * 0.25f;
+    }
+}
+
+/**
+ * @brief Reference kernel: apply square, doubles and quarter to @c src
+ *        without going through the scheduler.
+ *
+ * Block b processes floats [1000*b, 1000*b + 1000). The chunk is copied
+ * into cuda_data, transformed there and copied back to @c src.
+ *
+ * The copies use the same strided loop as the helpers, so each thread only
+ * ever touches its own elements. The kernel is therefore correct for any
+ * block size and needs no barrier between the steps; previously it was
+ * only correct when launched with exactly 1000 threads per block.
+ *
+ * @param src Device-accessible array of at least 1000 * gridDim.x floats.
+ */
+__global__ void Manual(float *src)
+{
+    const int base = blockIdx.x * 1000;
+    int i;
+
+    for( i = TID; i < 1000; i += blockDim.x )
+        cuda_data[base + i] = src[base + i];
+
+    square(&cuda_data[base]);
+    doubles(&cuda_data[base]);
+    quarter(&cuda_data[base]);
+
+    for( i = TID; i < 1000; i += blockDim.x )
+        src[base + i] = cuda_data[base + i];
+}
+
+
+/**
+ * @brief Task dispatcher handed to the scheduler.
+ *
+ * @param type One of type_square, type_double or type_quarter; any other
+ *        value does nothing.
+ * @param data Points to an int holding the index of the 1000-float chunk
+ *        of cuda_data to work on.
+ */
+__device__ __noinline__ void gpuTest(int type , void *data)
+{
+    const int chunk = *(int*)data;
+
+    switch(type)
+    {
+        case type_square:
+            square(&cuda_data[chunk*1000]);
+            break;
+        case type_double:
+            doubles(&cuda_data[chunk*1000]);
+            break;
+        case type_quarter:
+            quarter(&cuda_data[chunk*1000]);
+            break;
+        default:
+            break;
+    }
+}
+
+__device__ qsched_funtype function = gpuTest;
+
+/**
+ * @brief Print the device-side value of the task function pointer.
+ *
+ * The pointer is printed with %p; passing a pointer to %i does not match
+ * the format specifier.
+ */
+__global__ void Setup()
+{
+    printf("%p\n", (void *)function);
+}
+
+
+
+
+
+
+
+/**
+ * @brief Run the square / double / quarter chain on M floats, first through
+ *        the QuickSched CUDA scheduler and then through the hand-written
+ *        Manual kernel, and report every element that differs from
+ *        0.5 * i * i.
+ *
+ * @return 0; failures of CUDA calls are reported through error().
+ */
+int main ( int argc , char *argv[] ) {
+    float *array, *cuda_array, *device_array;
+    int i, k=0;
+    qsched_funtype func;
+    struct qsched s;
+    qsched_task_t *tid;
+    qsched_res_t *rid;
+    ticks tic, toc_run, tot_run = 0;
+
+    /* Initialize the scheduler and the device. */
+    qsched_init( &s , 1 , qsched_flag_none );
+    cudaDeviceReset();
+    cudaSetDevice(0);
+    Setup<<<1,1>>>();
+    if(cudaDeviceSynchronize() != cudaSuccess)
+        error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError()));
+
+    /* Device working array, published to the kernels through cuda_data. */
+    if( cudaMalloc(&device_array , M*sizeof(float) ) != cudaSuccess )
+        error("Failed to allocate device array: %s", cudaGetErrorString(cudaPeekAtLastError()));
+    if( cudaMemcpyToSymbol( cuda_data , &device_array,sizeof(float*), 0 ,  cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy array pointer to device: %s", cudaGetErrorString(cudaPeekAtLastError()));
+
+    /* Allocate and fill the original array*/
+    if( cudaHostAlloc( &array , M*sizeof(float) , cudaHostAllocMapped) != cudaSuccess)
+        error("Failed to allocate array");
+    for( i = 0; i < M ; i++ ){
+        array[i] = (float)i;
+    }
+
+    /* Allocate and init the task ID and resource ID matrix. There are three
+     * tasks and one resource per chunk of 1000 floats. */
+    if( cudaMallocHost(&tid , sizeof(qsched_task_t) * 3 * M/1000 ) != cudaSuccess )
+        error("Failed to allocate tid");
+    /* rid holds resource ids, so size it by qsched_res_t. */
+    if( cudaMallocHost(&rid , sizeof(qsched_res_t) * M/1000) != cudaSuccess)
+        error("Failed to allocate rid");
+
+    for ( k = 0 ; k < M / 1000 ; k++ ) {
+        tid[k] = qsched_task_none;
+        if(cudaHostGetDevicePointer(&cuda_array , &array[k*1000] , 0) != cudaSuccess)
+            error("Failed to get device pointer for data %i: %s",k, cudaGetErrorString(cudaPeekAtLastError()));
+        rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none , cuda_array, sizeof(float) * 1000, &device_array[k*1000]);
+    }
+    for ( k = M/1000 ; k < 3*M / 1000 ; k++ ){
+        tid[k] = qsched_task_none;
+    }
+
+    /*Build the tasks: per chunk, square -> double -> quarter, chained with
+     * unlocks so that they run in that order. */
+    for( k  = 0; k < M/1000; k++){
+        tid[3*k] = qsched_addtask(&s , type_square , task_flag_none , &k , sizeof(int) , 0 );
+        qsched_adduse(&s , tid[3*k] , rid[k] );
+        tid[3*k+1] = qsched_addtask(&s , type_double , task_flag_none , &k , sizeof(int) , 0 );
+        qsched_adduse( &s , tid[3*k+1] , rid[k] );
+        qsched_addunlock( &s , tid[3*k] , tid[3*k+1] );
+        tid[3*k+2] = qsched_addtask(&s , type_quarter , task_flag_none , &k , sizeof(int) , 0 );
+        qsched_adduse( &s , tid[3*k+2] , rid[k] );
+        qsched_addunlock( &s , tid[3*k+1] , tid[3*k+2] );
+    }
+
+    /* The scheduler needs the device address of the task function. */
+    if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
+        error("Failed to copy function pointer from device");
+
+    tic = getticks();
+    qsched_run_CUDA( &s , func );
+    toc_run = getticks();
+    double itpms = 1000.0 / CPU_TPS;
+    message( "qsched_run_CUDA took %.3f ms..." , ((double)(toc_run - tic)) * itpms );
+    tot_run += toc_run - tic;
+
+    /* square, double, quarter: x -> x*x * 2 * 0.25 = 0.5 * x*x. */
+    for(i = 0; i < M; i++ )
+        if(array[i] != ((float)i)*((float)i)*0.5f)
+            printf("%i wrong, %.3f != %.3f\n", i, array[i], ((float)i)*((float)i)*0.5f );
+    printf("Starting second run\n");
+    qsched_run_CUDA( &s , func );
+    printf("Second run complete\n");
+
+    struct task *completed_tasks = qsched_get_timers( &s, s.count );
+
+    /* Start from a clean device for the manual reference run. The reset
+     * releases all earlier CUDA allocations, so everything is set up again. */
+    cudaDeviceReset();
+
+    if( cudaHostAlloc( &array , M*sizeof(float) , cudaHostAllocMapped) != cudaSuccess)
+        error("Failed to allocate array");
+    for( i = 0; i < M ; i++ )
+        array[i] = i;
+
+    if(cudaHostGetDevicePointer(&cuda_array , array , 0) != cudaSuccess)
+        error("Failed to get device pointer for data: %s",cudaGetErrorString(cudaPeekAtLastError()));
+    if( cudaMalloc(&device_array , M*sizeof(float) ) != cudaSuccess )
+        error("Failed to allocate device array: %s", cudaGetErrorString(cudaPeekAtLastError()));
+    if( cudaMemcpyToSymbol( cuda_data , &device_array,sizeof(float*), 0 ,  cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy array pointer to device: %s", cudaGetErrorString(cudaPeekAtLastError()));
+    tic = getticks();
+    Manual<<<100, 1000>>>(cuda_array);
+    cudaDeviceSynchronize();
+    toc_run = getticks();
+    message( "Manual run took %.3f ms" , ((double)(toc_run - tic)) * itpms );
+
+    for(i = 0; i < M; i++ )
+        if(array[i] != ((float)i)*((float)i)*0.5f)
+            printf("%i wrong, %.3f != %.3f\n", i, array[i], ((float)i)*((float)i)*0.5f );
+
+    return 0;
+}
--- a/examples/test_hierarchy.cu
+++ b/examples/test_hierarchy.cu
+/* Config parameters. */
+#include "../config.h"
+
+/* Standard includes. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+
+/* Local includes. */
+extern "C"{
+#include "quicksched.h"
+}
+#include "cuda_queue.h"
+
+#define size 6 /* Number of ints in the resource data array allocated by main(). */
+
+__device__ int device_array[size]; /* Not referenced by the code visible in this file. */
+
+/**
+ * @brief Task function handed to the scheduler. Prints "Hi" on every call
+ *        and ignores both arguments.
+ */
+__device__ void runner ( int type , void *data ) {
+    (void)type;
+    (void)data;
+    printf("Hi\n");
+}
+
+__device__ qsched_funtype function = runner;
+
+/**
+ * @brief Print the device-side value of the task function pointer.
+ *
+ * The pointer is printed with %p; passing a pointer to %i does not match
+ * the format specifier.
+ */
+__global__ void Setup()
+{
+    printf("%p\n", (void *)function);
+}
+
+qsched_funtype func;
+/**
+ * @brief Build a two-level resource hierarchy with two tasks and run it
+ *        through the CUDA scheduler.
+ *
+ * res_data is split into two parent resources of three ints each, and each
+ * parent gets two single-int child resources. Task 1 locks the first parent
+ * and its children, task 2 the second parent and its children.
+ *
+ * @return 0; failures are reported through error().
+ */
+int main ( int argc , char *argv[] ) {
+    int *data;
+    int *res_data;
+    int res1, res2;
+    int res11, res12;
+    int res21, res22;
+    int task_id;
+    qsched s;
+
+    cudaDeviceReset();
+    cudaSetDevice(0);
+    Setup<<<1,1>>>();
+    if(cudaDeviceSynchronize() != cudaSuccess)
+        error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError()));
+
+    qsched_init( &s , 1 , qsched_flag_none );
+
+    /* Zero-initialised, so the scheduler is never handed indeterminate
+     * task or resource data. */
+    data = (int*) calloc(1, sizeof(int));
+    res_data = (int*) calloc(size, sizeof(int));
+    if(data == NULL || res_data == NULL)
+        error("Failed to allocate host data");
+
+    /* Parents: res1 covers res_data[0..2], res2 covers res_data[3..5]. */
+    res2 = qsched_addres(&s , qsched_owner_none , qsched_res_none , &res_data[3], sizeof(int)*3, NULL);
+    res1 = qsched_addres(&s , qsched_owner_none , qsched_res_none , &res_data[0], sizeof(int)*3, NULL);
+    /* Children, each a single int inside its parent's range. */
+    res11 = qsched_addres( &s, qsched_owner_none, res1, &res_data[2], sizeof(int), NULL);
+    res21 = qsched_addres( &s, qsched_owner_none, res2, &res_data[4], sizeof(int), NULL);
+    res12 = qsched_addres( &s, qsched_owner_none, res1, &res_data[1], sizeof(int), NULL);
+    res22 = qsched_addres( &s, qsched_owner_none, res2, &res_data[5], sizeof(int), NULL);
+
+    task_id = qsched_addtask(&s, 1, task_flag_none, data, sizeof(int), 1);
+    qsched_addlock(&s, task_id, res1);
+    qsched_addlock(&s, task_id, res11);
+    qsched_addlock(&s, task_id, res12);
+    task_id = qsched_addtask(&s, 2, task_flag_none, data, sizeof(int), 1);
+    qsched_addlock(&s, task_id, res2);
+    qsched_addlock(&s, task_id, res21);
+    qsched_addlock(&s, task_id, res22);
+
+    /* The scheduler needs the device address of the task function. */
+    if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
+        error("Failed to copy function pointer from device");
+
+    qsched_run_CUDA( &s , func );
+
+    free(data);
+    free(res_data);
+
+    return 0;
+}
--- a/src/CUDACompile.sh
+++ b/src/CUDACompile.sh
+#!/bin/bash
+FLAGS2="-Xcompiler=-fsanitize=address -Xcompiler=-fno-omit-frame-pointer"
+DEBUG_FLAGS="-G -DDEBUG_GPU"
+FLAGS="-O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
+# -DGPU_locks -Xptxas -dlcm=cg  -Xptxas="-v"" 
+# -DNO_LOADS
+
+CFLAGS="-fsanitize=address -fno-omit-frame-pointer"
+
+gcc-4.8 -std=gnu99 -DHAVE_CONFIG_H -I. -I.. -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize -funroll-loops -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -fopenmp -DTIMERS -DWITH_CUDA -g -O2 -march=native -pthread -MT qsched.lo -MD -MP -MF .deps/qsched.Tpo -c qsched.c -o qsched.o -lprofiler
+gcc-4.8 -std=gnu99 -DHAVE_CONFIG_H -I. -I.. -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize -funroll-loops -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -fopenmp -DTIMERS -DWITH_CUDA -g -O2 -march=native -pthread -MT qsched.lo -MD -MP -MF .deps/qsched.Tpo -c queue.c -o queue.o -lprofiler
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc cuda_queue.cu qsched.o queue.o -lprofiler
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -lib cuda_queue.o qsched.o queue.o -o .libs/libquicksched_cuda.a -lprofiler
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -DGPU_locks -dc cuda_queue.cu qsched.o queue.o -lprofiler
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -lib cuda_queue.o qsched.o queue.o -o .libs/libquicksched_cuda_locks.a -lprofiler
+
+cd ../examples
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda  test_gpu_simple.cu -lprofiler
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_gpu_simple.o ../src/.libs/libquicksched_cuda.a -o simple_cuda_test -lprofiler
+
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -lf77blas -lcblas -latlas -lm  -L/home/aidan/ATLAS/ATLAS_linux/lib/ -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a test_qr.cu -lprofiler
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -m64 -I../src -lf77blas -lcblas -latlas -lm  -L/home/aidan/ATLAS/ATLAS_linux/lib/ -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_qr.o ../src/.libs/libquicksched_cuda.a /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a -o qr_cuda_test -lprofiler
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda  test_hierarchy.cu -lprofiler
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_hierarchy.o ../src/.libs/libquicksched_cuda.a -o test_heirarchy -lprofiler
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda  test_bh.cu -lprofiler
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh.o ../src/.libs/libquicksched_cuda.a -o test_heirarchy -lprofiler
--- a/src/Makefile.in
+++ b/src/Makefile.in