Skip to content
Snippets Groups Projects
Commit 165c6ccc authored by aidan's avatar aidan
Browse files

Added files I had missed from the svn repository

parent 80576f37
Branches
No related tags found
No related merge requests found
/*******************************************************************************
* This file is part of QuickSched.
* Copyright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
* Aidan Chalk (aidan.chalk@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* *****************************************************************************/
/* Config parameters. */
#include "../config.h"
/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <omp.h>
#include <fenv.h>
/* Local includes. */
#include "quicksched.h"
/* A cell of the spatial tree. Cells are stored in cell_pool and refer to each
 * other by index into that pool. */
struct cell{
/* x/y components of the cell location (used as the lower corner when the
 * cell is split). */
double2 loc_xy;
/* z component of the cell location. */
double loc_z;
/* Cell size; child cells are created with half the parent's h. */
double h;
/* Number of particles in the cell. */
int count;
/* split: set to 1 once the cell has been given child cells.
 * sorted: not referenced by the code visible in this file. */
unsigned short int split, sorted;
/* parts: index of the cell's first particle in the host particle arrays.
 * firstchild, sibling: indices into cell_pool; cell_get() initialises both
 * to -1. */
int parts, firstchild, sibling;
/* res: scheduler resource locked by the cell's tasks (qsched_res_none until
 * one is assigned).
 * com_tid: presumably the ID of the cell's centre-of-mass task -- TODO
 * confirm, it is not used in the code visible here. */
int res, com_tid;
}__attribute__((aligned(64)));
/* Required variables to obtain cells. */
/* Factor by which the pool capacity is multiplied whenever it runs out. */
#define CELL_STRETCH 2
/* Capacity (in cells) of the pool before it is stretched for the first time. */
#define INITIAL_CELLS 256
/* Pool that cell_get() hands cells out of; NULL while no storage exists. */
struct cell *cell_pool=NULL;
/* Number of cells of the pool handed out so far. */
int used_cells=0;
/* Number of cells the pool has room for. */
int num_cells = INITIAL_CELLS;
/* Size of the pool in bytes. */
int cell_size = INITIAL_CELLS*sizeof(struct cell);
/* Device locations for the particle values. */
/* x/y components of the particle positions. */
__device__ double2 *parts_pos_xy;
/* z component of the particle positions. */
__device__ double *parts_pos_z;
/* Presumably mirrors parts_a_m_host, whose .w component is read as the
 * particle mass -- TODO confirm the meaning of x/y/z. */
__device__ float4 *parts_a_m;
/* x/y components of the per-cell centres of mass. */
__device__ double2 *com_xy;
/* z component of the per-cell centres of mass. */
__device__ double *com_z;
/* NOTE(review): declared as a single float while com_xy/com_z are arrays --
 * presumably this should be a float pointer holding one mass per cell;
 * confirm against the kernels that use it. */
__device__ float com_mass;
/* Host locations for the particle values. */
/* x/y components of the particle positions. */
double2 *parts_pos_xy_host;
/* z component of the particle positions. */
double *parts_pos_z_host;
/* Per-particle float4 whose .w component holds the particle mass. */
float4 *parts_a_m_host;
/* x/y components of the centre of mass of each cell, indexed like cell_pool. */
double2 *com_xy_host;
/* z component of the centre of mass of each cell, indexed like cell_pool. */
double *com_z_host;
/* Total mass of each cell, indexed like cell_pool. This has to be an array:
 * comp_com() and cell_get() index, copy and free it per cell. */
float *com_mass_host;
/**
 * @brief Checks whether the cells are direct neighbours or not. Both cells have
 * to be of the same size.
 *
 * @param ci The first #cell.
 * @param cj The second #cell.
 *
 * @return 1 if the cell locations differ by at most one cell size along every
 *         axis, 0 otherwise.
 */
static inline int are_neighbours(struct cell *ci, struct cell *cj) {
  float dx[3];

#ifdef SANITY_CHECKS
  if (ci->h != cj->h)
    error(" Cells of different size in distance calculation.");
#endif

  /* Maximum allowed distance */
  const float min_dist = ci->h;

  /* Per-axis distance between the cell locations. The comparison is done in
   * single precision, as before. */
  const double2 loc_i = ci->loc_xy, loc_j = cj->loc_xy;
  float center_i = loc_i.x;
  float center_j = loc_j.x;
  dx[0] = fabsf(center_i - center_j);

  center_i = loc_i.y;
  center_j = loc_j.y;
  dx[1] = fabsf(center_i - center_j);

  /* Both centres must be refreshed here, otherwise the z distance would be
   * measured against cj's y location. */
  center_i = ci->loc_z;
  center_j = cj->loc_z;
  dx[2] = fabsf(center_i - center_j);

  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
}
/**
 * @brief Hand out the next unused cell of the cell pool.
 *
 * The pool and the per-cell centre-of-mass host arrays are allocated on the
 * first call and stretched by CELL_STRETCH whenever the pool is exhausted, so
 * that every cell index is valid in all of them.
 *
 * The returned cell has @c sibling and @c firstchild set to -1 and @c res set
 * to qsched_res_none.
 *
 * WARNING: growing the pool moves it in memory, so a pointer returned by an
 * earlier call may be invalid after a later call. Callers that keep cells
 * across calls should keep their index into cell_pool instead.
 *
 * NOTE(review): the centre-of-mass host arrays are released with free(), as
 * before; this assumes they are either NULL or were obtained from the malloc
 * family -- confirm no other code allocates them with cudaMallocHost.
 *
 * @return Pointer to the new cell inside cell_pool.
 */
struct cell *cell_get()
{
  struct cell *res;

  if (cell_pool == NULL || used_cells >= num_cells) {
    /* The first allocation uses the initial capacity, later ones stretch it. */
    const int old_num_cells = num_cells;
    const int new_num_cells =
        (cell_pool == NULL) ? old_num_cells : old_num_cells * CELL_STRETCH;

    struct cell *new_pool =
        (struct cell *)calloc(new_num_cells, sizeof(struct cell));
    double2 *new_com_xy = (double2 *)calloc(new_num_cells, sizeof(double2));
    double *new_com_z = (double *)calloc(new_num_cells, sizeof(double));
    float *new_com_mass = (float *)calloc(new_num_cells, sizeof(float));
    if (new_pool == NULL || new_com_xy == NULL || new_com_z == NULL ||
        new_com_mass == NULL)
      error("Failed to allocate the cell pool.");

    /* Carry over whatever existed before; the new tail stays zeroed. */
    if (cell_pool != NULL)
      memcpy(new_pool, cell_pool, old_num_cells * sizeof(struct cell));
    if (com_xy_host != NULL)
      memcpy(new_com_xy, com_xy_host, old_num_cells * sizeof(double2));
    if (com_z_host != NULL)
      memcpy(new_com_z, com_z_host, old_num_cells * sizeof(double));
    if (com_mass_host != NULL)
      memcpy(new_com_mass, com_mass_host, old_num_cells * sizeof(float));

    free(cell_pool);
    free(com_xy_host);
    free(com_z_host);
    free(com_mass_host);

    cell_pool = new_pool;
    com_xy_host = new_com_xy;
    com_z_host = new_com_z;
    com_mass_host = new_com_mass;
    num_cells = new_num_cells;
    cell_size = (int)(new_num_cells * sizeof(struct cell));
  }

  res = &cell_pool[used_cells];
  used_cells++;
  res->sibling = -1;
  res->firstchild = -1;
  res->res = qsched_res_none;
  return res;
}
/**
 * @brief Compute the centre of mass and the total mass of a cell and store
 *        them in the host centre-of-mass arrays at the cell's pool index.
 *
 * A split cell combines the values already stored for its child cells; any
 * other cell sums over its own particles. A cell without mass gets a zero
 * centre of mass and zero mass.
 *
 * @param c The #cell to process; must point into cell_pool.
 */
void comp_com(struct cell *c){
  int k, cpi;
  const int count = c->count;
  const int parts = c->parts;
  double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;

  if(c->split) {
    /* Walk the children: start at the first child and follow the sibling
     * links until the cell's own sibling is reached. */
    for(cpi = c->firstchild; cpi != c->sibling; cpi = cell_pool[cpi].sibling) {
      const float cp_mass = com_mass_host[cpi];
      com[0] += com_xy_host[cpi].x * cp_mass;
      com[1] += com_xy_host[cpi].y * cp_mass;
      com[2] += com_z_host[cpi] * cp_mass;
      mass += cp_mass;
    }
  /* Otherwise collect the multipole from the particles */
  } else {
    for(k = parts; k < parts+count; k++)
    {
      const float p_mass = parts_a_m_host[k].w;
      com[0] += parts_pos_xy_host[k].x * p_mass;
      com[1] += parts_pos_xy_host[k].y * p_mass;
      com[2] += parts_pos_z_host[k] * p_mass;
      mass += p_mass;
    }
  }

  k = c - cell_pool;

  /* Store the COM data, if it was collected. */
  if(mass > 0.0) {
    /* Keep the reciprocal in double precision: the positions are doubles. */
    const double imass = 1.0/mass;
    com_xy_host[k].x = com[0] * imass;
    com_xy_host[k].y = com[1] * imass;
    com_z_host[k] = com[2] * imass;
    com_mass_host[k] = mass;
  }else
  {
    com_xy_host[k].x = 0.0;
    com_xy_host[k].y = 0.0;
    com_z_host[k] = 0.0;
    com_mass_host[k] = 0.0f;
  }
}
/* Swap all host-side per-particle data of particles a and b. */
static void cell_split_swap_parts(int a, int b) {
  const double2 pos_xy = parts_pos_xy_host[a];
  const double pos_z = parts_pos_z_host[a];
  const float4 a_m = parts_a_m_host[a];
  parts_pos_xy_host[a] = parts_pos_xy_host[b];
  parts_pos_z_host[a] = parts_pos_z_host[b];
  parts_a_m_host[a] = parts_a_m_host[b];
  parts_pos_xy_host[b] = pos_xy;
  parts_pos_z_host[b] = pos_z;
  parts_a_m_host[b] = a_m;
}

/* Coordinate of particle p along the given axis (0 = x, 1 = y, 2 = z). */
static double cell_split_part_coord(int p, int axis) {
  if (axis == 0) return parts_pos_xy_host[p].x;
  if (axis == 1) return parts_pos_xy_host[p].y;
  return parts_pos_z_host[p];
}

/* Reorder the particles lo..hi (inclusive) so that those below the pivot along
 * the given axis come first. Returns the index of the first particle of the
 * upper part: the lower part is lo..ret-1 and the upper part is ret..hi.
 * Coordinates are assumed not to be NaN. */
static int cell_split_partition(int lo, int hi, int axis, double pivot) {
  int i = lo, j = hi;
  while (i <= j) {
    while (i <= hi && cell_split_part_coord(i, axis) < pivot) i += 1;
    while (j >= lo && cell_split_part_coord(j, axis) >= pivot) j -= 1;
    if (i < j) cell_split_swap_parts(i, j);
  }
  return i;
}

/**
 * @brief Sort the parts into eight bins along the given pivots and
 * fill the multipoles. Also adds the hierarchical resources
 * to the sched (TODO).
 *
 * A cell holding more than cell_maxparts particles gets eight child cells
 * (only the non-empty ones are linked into the tree) and the function recurses
 * into them; any other cell is a leaf and gets its particle-cell task. The
 * first cell ever passed in is remembered as the root of the tree.
 *
 * Cells are tracked by their index into cell_pool, because cell_get() may move
 * the pool whenever it has to grow.
 *
 * NOTE(review): cell_get() returns cells without a resource and no resource is
 * assigned to the children yet (see the TODO below), so recursing into a child
 * currently triggers the "Cell has no resource" error.
 *
 * NOTE(review): the leaf task payload consists of two pointers into cell_pool,
 * which become stale if the pool is stretched afterwards.
 *
 * @param c The #cell to be split; must point into cell_pool.
 * @param s The #sched to store the resources.
 */
void cell_split(struct cell *c, struct qsched *s) {
  int k, kk;
  int left[8], right[8];
  int progenitors[8];
  double pivot[3];
  struct cell *cp;
  static int root_ind = -1;

  const int c_ind = (int)(c - cell_pool);
  const int count = c->count;
  const int parts = c->parts;

  /* Set the root cell. */
  if (root_ind < 0) {
    root_ind = c_ind;
    c->sibling = 0;
  }

  if(c->res == qsched_res_none)
    error("Cell has no resource");

  if(count > cell_maxparts )
  {
    c->split = 1;

    /* Create the eight progenitors; bits 4, 2 and 1 of k select the upper
     * half along x, y and z respectively. */
    for(k = 0; k < 8; k++)
    {
      /* Take the index in a separate statement: cell_get() may replace
       * cell_pool. */
      cp = cell_get();
      progenitors[k] = (int)(cp - cell_pool);
      c = &cell_pool[c_ind];
      cp->loc_xy = c->loc_xy;
      cp->loc_z = c->loc_z;
      cp->h = c->h*0.5;
      if(k & 4) cp->loc_xy.x += cp->h;
      if(k & 2) cp->loc_xy.y += cp->h;
      if(k & 1) cp->loc_z += cp->h;
    }

    /* Init the pivots.*/
    pivot[0] = c->loc_xy.x + c->h * 0.5;
    pivot[1] = c->loc_xy.y + c->h * 0.5;
    pivot[2] = c->loc_z + c->h * 0.5;

    /* Split along the x axis. */
    left[0] = parts;
    right[1] = parts+count-1;
    left[1] = cell_split_partition(parts, parts+count-1, 0, pivot[0]);
    right[0] = left[1]-1;

    /* Split along the y axis twice. Bins are processed from the back so that
     * bin k is read before bins 2k and 2k+1 overwrite it. */
    for (k = 1; k >= 0; k--) {
      const int lo = left[k], hi = right[k];
      const int mid = cell_split_partition(lo, hi, 1, pivot[1]);
      left[2*k+1] = mid;
      right[2*k+1] = hi;
      left[2*k] = lo;
      right[2*k] = mid-1;
    }

    /* Split along the z axis four times.*/
    for(k = 3; k >=0; k--)
    {
      const int lo = left[k], hi = right[k];
      const int mid = cell_split_partition(lo, hi, 2, pivot[2]);
      left[2*k+1] = mid;
      right[2*k+1] = hi;
      left[2*k] = lo;
      right[2*k] = mid-1;
    }

    /* Store the counts and offsets. */
    for(k = 0; k < 8; k++)
    {
      cp = &cell_pool[progenitors[k]];
      cp->count = right[k]-left[k]+1;
      cp->parts = left[k];
      //TODO ADD RESOURCES HERE.
    }

    /* Find the first non-empty progenitor */
    for(k = 0; k < 8; k++)
    {
      if(cell_pool[progenitors[k]].count > 0)
      {
        c->firstchild = progenitors[k];
        break;
      }
    }

#ifdef SANITY_CHECKS
    if(c->firstchild == -1)
      error("Cell has been split but all children have 0 parts");
#endif

    /*Prepare the pointers*/
    for(k = 0; k < 8; k++)
    {
      /* Find the next non-empty sibling */
      for(kk = k+1; kk < 8; ++kk){
        if(cell_pool[progenitors[kk]].count > 0){
          cell_pool[progenitors[k]].sibling = progenitors[kk];
          break;
        }
      }
      /* No non-empty sibling, go back a level.*/
      if(kk == 8) cell_pool[progenitors[k]].sibling = c->sibling;
    }

    /* Recurse. The cell is looked up again for every call because the
     * previous recursion may have moved the pool. */
    for(k = 0; k < 8; k++)
      if(cell_pool[progenitors[k]].count > 0)
        cell_split(&cell_pool[progenitors[k]], s);

  /* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */
  } else {
    struct cell *data[2] = {&cell_pool[root_ind], c};
    int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
                             2 * sizeof(struct cell *), 1);
    qsched_addlock(s, tid, c->res);
    //TODO Create task.
    //TODO Deal with multiple resources.
  }

#ifndef COM_AS_TASK
  /* The children are done by now, so their values can be combined. */
  comp_com(&cell_pool[c_ind]);
#endif
}
/**
 * @brief Create the tasks for the cell pair/self.
 *
 * With @c cj == NULL the self interactions of @c ci are created: a split cell
 * recurses into each child and into each pair of distinct children, any other
 * cell gets a single self task. With two cells, a task is only created if the
 * cells are neighbours: two split cells recurse over all pairs of their
 * children, otherwise a direct pair task is built. Pairs that are not
 * neighbours get no task here.
 *
 * The task payload is a pair of cell indices into cell_pool; the second index
 * is -1 for a self task.
 *
 * @param s The #sched in which to create the tasks.
 * @param ci The first #cell.
 * @param cj The second #cell, or @c NULL for the self interaction of @c ci.
 */
void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
  qsched_task_t tid;
  int data[2];
  struct cell *cp, *cps;

  if(cj == NULL)
  {
    if(ci->split)
    {
      for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling])
      {
        //Self Interaction.
        create_tasks(s, cp, NULL);
        /* Pair the child with each of the children that follow it. */
        for(cps = &cell_pool[cp->sibling]; cps != &cell_pool[ci->sibling]; cps = &cell_pool[cps->sibling])
          create_tasks(s, cp, cps);
      }
    }
    /* Self task */
    else{
      data[0] = ci - cell_pool;
      data[1] = -1;
      tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, ci->count*ci->count/2);
      qsched_addlock(s, tid, ci->res);
    }
  }
  /* Else its a pair!*/
  else{
    if(are_neighbours(ci,cj)){/* Cells are neighbours */
      /*Are both split? */
      if(ci->split && cj->split)
      {
        /* Recurse over both cells. */
        for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling])
          for(cps = &cell_pool[cj->firstchild]; cps != &cell_pool[cj->sibling]; cps = &cell_pool[cps->sibling])
            create_tasks(s, cp, cps);
      /* Otherwise, at least one of the cells is not split, build a direct
       * interaction. */
      }else{
        data[0] = ci-cell_pool;
        data[1] = cj-cell_pool;
        /* Create the task. The payload is the two indices, so its size is
         * that of two ints, not of two pointers. */
        tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
                             sizeof(int) * 2, ci->count * cj->count);
        /* Add the resources. */
        qsched_addlock(s, tid, ci->res);
        qsched_addlock(s, tid, cj->res);
      }
    }
  }
}
/**
 * @brief Set up and run a task-based Barnes-Hut N-body solver.
 *
 * Only the first steps exist so far: the scheduler is initialised and the
 * pinned host array for the x/y particle positions is allocated. The remaining
 * locals are declared but not used yet.
 *
 * @param N The number of random particles to use.
 * @param runs Number of force evaluations to use as a benchmark.
 * @param fileName Input file name. If @c NULL or an empty string, random
 * particle positions will be used.
 */
void test_bh(int N, int runs, char *fileName) {
  int i, k;
  struct cell *root;
  struct part *parts;
  FILE *file;
  struct qsched s;
  ticks tic, toc_run, tot_setup = 0, tot_run = 0;
  int countMultipoles = 0, countPairs = 0, countCoMs = 0;

  /* Bring up the scheduler. */
  qsched_init(&s, 1, qsched_flag_none);

  /* Pinned host storage for the x/y positions of the N particles. */
  const int alloc_failed =
      (cudaMallocHost(&parts_pos_xy_host, sizeof(double2) * N) != cudaSuccess);
  if (alloc_failed) {
    error("Failed to allocated parts array");
  }
}
/*******************************************************************************
* This file is part of QuickSched.
* Copyright (c) 2013 Aidan Chalk (aidan.chalk@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
******************************************************************************/
/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <omp.h>
/* Local includes. */
extern "C"{
#include "quicksched.h"
}
#include "cuda_queue.h"
/* Task types dispatched by gpuTest(). */
#define type_square 1
#define type_double 2
#define type_quarter 3
/* Total number of floats in the test array; it is processed in chunks of
 * 1000 floats. */
#define M 100000
/* Index of the calling thread within its block. */
#define TID threadIdx.x
/* Device-side pointer to the working array; main() sets it with
 * cudaMemcpyToSymbol. */
__device__ float *cuda_data;
/* Squares the first 1000 floats of data in place. Each thread handles the
 * elements TID, TID + blockDim.x, TID + 2*blockDim.x, ... */
__device__ void square(float *data)
{
    int idx = TID;
    while( idx < 1000 )
    {
        const float value = data[idx];
        data[idx] = value * value;
        idx += blockDim.x;
    }
}
/* Doubles the first 1000 floats of data in place. Each thread handles the
 * elements TID, TID + blockDim.x, TID + 2*blockDim.x, ... */
__device__ void doubles(float *data)
{
    int idx = TID;
    while( idx < 1000 )
    {
        const float value = data[idx];
        data[idx] = 2.0f * value;
        idx += blockDim.x;
    }
}
/* Multiplies the first 1000 floats of data by 0.25 in place. Each thread
 * handles the elements TID, TID + blockDim.x, TID + 2*blockDim.x, ... */
__device__ void quarter(float *data)
{
    int idx = TID;
    while( idx < 1000 )
    {
        const float value = data[idx];
        data[idx] = value * 0.25f;
        idx += blockDim.x;
    }
}
/* Reference kernel that does by hand what the three scheduled tasks do: each
 * block copies its chunk of 1000 floats from src into cuda_data, squares,
 * doubles and quarters it there, and copies the result back into src.
 *
 * The copies use the same per-thread stride as square()/doubles()/quarter(),
 * so every thread only ever touches its own elements: the whole chunk is
 * covered, and nothing outside of it is touched, for any block size rather
 * than only for exactly 1000 threads per block. */
__global__ void Manual(float *src)
{
    int i;
    const int base = blockIdx.x * 1000;

    for( i = TID; i < 1000; i += blockDim.x )
        cuda_data[base+i] = src[base+i];

    square(&cuda_data[base]);
    doubles(&cuda_data[base]);
    quarter(&cuda_data[base]);

    for( i = TID; i < 1000; i += blockDim.x )
        src[base+i] = cuda_data[base+i];
}
/* Task function handed to the scheduler: data points to the index of a chunk
 * of 1000 floats in cuda_data, and type selects the operation applied to that
 * chunk. Unknown types are ignored. */
__device__ __noinline__ void gpuTest(int type , void *data)
{
    const int chunk_index = *(int*)data;
    float *chunk = &cuda_data[chunk_index*1000];

    switch( type )
    {
        case type_square:
            square(chunk);
            break;
        case type_double:
            doubles(chunk);
            break;
        case type_quarter:
            quarter(chunk);
            break;
        default:
            break;
    }
}
/* Device-side pointer to gpuTest; main() copies its value to the host with
 * cudaMemcpyFromSymbol and passes it to qsched_run_CUDA. */
__device__ qsched_funtype function = gpuTest;
/* Prints the value of the device-side task function pointer. It is a pointer,
 * so it is printed with %p instead of being passed to an int conversion. */
__global__ void Setup()
{
    printf("%p\n", (void *)function);
}
/* Test driver: runs a square -> double -> quarter task chain over every chunk
 * of 1000 floats through the scheduler, checks the result against
 * i*i*0.5, runs the scheduler a second time, and finally times the hand-written
 * Manual kernel on fresh data for comparison. */
int main ( int argc , char *argv[] ) {
    float *array, *cuda_array, *device_array;
    int i, k=0;
    qsched_funtype func;
    struct qsched s;
    qsched_task_t *tid;
    qsched_res_t *rid;
    ticks tic, toc_run, tot_run = 0;

    /* Initialize the scheduler. */
    qsched_init( &s , 1 , qsched_flag_none );

    cudaDeviceReset();
    cudaSetDevice(0);
    Setup<<<1,1>>>();
    if(cudaDeviceSynchronize() != cudaSuccess)
        error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError()));

    /* Device working array, published to the kernels through cuda_data. */
    if( cudaMalloc(&device_array , M*sizeof(float) ) != cudaSuccess )
        error("Failed to allocate device array: %s", cudaGetErrorString(cudaPeekAtLastError()));
    if( cudaMemcpyToSymbol( cuda_data , &device_array,sizeof(float*), 0 , cudaMemcpyHostToDevice) != cudaSuccess )
        error("Failed to copy array pointer to device: %s", cudaGetErrorString(cudaPeekAtLastError()));

    /* Allocate and fill the original array*/
    if( cudaHostAlloc( &array , M*sizeof(float) , cudaHostAllocMapped) != cudaSuccess)
        error("Failed to allocate array");
    for( i = 0; i < M ; i++ ){
        array[i] = (float)i;
    }

    /* Allocate and init the task ID and resource ID matrix. */
    if( cudaMallocHost(&tid , sizeof(qsched_task_t) * 3 * M/1000 ) != cudaSuccess )
        error("Failed to allocate tid");
    /* rid holds resource IDs, so it is sized with the resource ID type. */
    if( cudaMallocHost(&rid , sizeof(qsched_res_t) * M/1000) != cudaSuccess)
        error("Failed to allocate rid");

    /* One resource per chunk of 1000 floats. */
    for ( k = 0 ; k < M / 1000 ; k++ ) {
        tid[k] = qsched_task_none;
        if(cudaHostGetDevicePointer(&cuda_array , &array[k*1000] , 0) != cudaSuccess)
            error("Failed to get device pointer for data %i: %s",k, cudaGetErrorString(cudaPeekAtLastError()));
        rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none , cuda_array, sizeof(float) * 1000, &device_array[k*1000]);
    }
    for ( k = M/1000 ; k < 3*M / 1000 ; k++ ){
        tid[k] = qsched_task_none;
    }

    /*Build the tasks*/
    for( k = 0; k < M/1000; k++){
        /* Three tasks per chunk, each one unlocking the next. */
        tid[3*k] = qsched_addtask(&s , type_square , task_flag_none , &k , sizeof(int) , 0 );
        qsched_adduse(&s , tid[3*k] , rid[k] );
        tid[3*k+1] = qsched_addtask(&s , type_double , task_flag_none , &k , sizeof(int) , 0 );
        qsched_adduse( &s , tid[3*k+1] , rid[k] );
        qsched_addunlock( &s , tid[3*k] , tid[3*k+1] );
        tid[3*k+2] = qsched_addtask(&s , type_quarter , task_flag_none , &k , sizeof(int) , 0 );
        qsched_adduse( &s , tid[3*k+2] , rid[k] );
        qsched_addunlock( &s , tid[3*k+1] , tid[3*k+2] );
    }

    /* Fetch the device address of the task function. */
    if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
        error("Failed to copy function pointer from device");

    tic = getticks();
    qsched_run_CUDA( &s , func );
    toc_run = getticks();
    double itpms = 1000.0 / CPU_TPS;
    message( "qsched_run_CUDA took %.3f ms..." , ((double)(toc_run - tic)) * itpms );
    tot_run += toc_run - tic;

    /* square, double, quarter: x -> x*x * 2 * 0.25 = x*x * 0.5 */
    for(i = 0; i < M; i++ )
        if(array[i] != ((float)i)*((float)i)*0.5f)
            printf("%i wrong, %.3f != %.3f\n", i, array[i], ((float)i)*((float)i)*0.5f );

    printf("Starting second run\n");
    qsched_run_CUDA( &s , func );
    printf("Second run complete\n");
    struct task *completed_tasks = qsched_get_timers( &s, s.count );

    /* Start from a clean device for the manual comparison run. */
    cudaDeviceReset();
    if( cudaHostAlloc( &array , M*sizeof(float) , cudaHostAllocMapped) != cudaSuccess)
        error("Failed to allocate array");
    for( i = 0; i < M ; i++ )
        array[i] = i;
    if(cudaHostGetDevicePointer(&cuda_array , array , 0) != cudaSuccess)
        error("Failed to get device pointer for data: %s",cudaGetErrorString(cudaPeekAtLastError()));
    if( cudaMalloc(&device_array , M*sizeof(float) ) != cudaSuccess )
        error("Failed to allocate device array: %s", cudaGetErrorString(cudaPeekAtLastError()));
    if( cudaMemcpyToSymbol( cuda_data , &device_array,sizeof(float*), 0 , cudaMemcpyHostToDevice) != cudaSuccess )
        error("Failed to copy array pointer to device: %s", cudaGetErrorString(cudaPeekAtLastError()));

    tic = getticks();
    Manual<<<100, 1000>>>(cuda_array);
    /* A failed launch or kernel would otherwise only show up as wrong data. */
    if(cudaDeviceSynchronize() != cudaSuccess)
        error("Manual run failed: %s", cudaGetErrorString(cudaPeekAtLastError()));
    toc_run = getticks();
    message( "Manual run took %.3f ms" , ((double)(toc_run - tic)) * itpms );

    for(i = 0; i < M; i++ )
        if(array[i] != ((float)i)*((float)i)*0.5f)
            printf("%i wrong, %.3f != %.3f\n", i, array[i], ((float)i)*((float)i)*0.5f );

    return 0;
}
/* Config parameters. */
#include "../config.h"
/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
/* Local includes. */
extern "C"{
#include "quicksched.h"
}
#include "cuda_queue.h"
/* Number of ints in the host buffer the test resources point into. */
#define size 6
/* NOTE(review): not referenced by any of the code visible in this file. */
__device__ int device_array[size];
/* Task function handed to the scheduler for this test: it uses neither of its
 * arguments and only prints a greeting. */
__device__ void runner ( int type , void *data )
{
    (void)type;
    (void)data;
    printf( "Hi\n" );
}
/* Device-side pointer to runner; main() copies its value to the host with
 * cudaMemcpyFromSymbol and passes it to qsched_run_CUDA. */
__device__ qsched_funtype function = runner;
/* Prints the value of the device-side task function pointer. It is a pointer,
 * so it is printed with %p instead of being passed to an int conversion. */
__global__ void Setup()
{
    printf("%p\n", (void *)function);
}
/* Host-side copy of the device function pointer; filled in by main(). */
qsched_funtype func;
/* Test driver for hierarchical resources: builds two parent resources with two
 * child resources each, adds one task per parent that locks the parent and
 * both of its children, and runs the scheduler on the device. */
int main ( int argc , char *argv[] ) {
    int *data;
    int *res_data;
    int res1, res2;
    int res11, res12;
    int res21, res22;
    int task_id;
    qsched s;

    cudaDeviceReset();
    cudaSetDevice(0);
    Setup<<<1,1>>>();
    if(cudaDeviceSynchronize() != cudaSuccess)
        error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError()));

    qsched_init( &s , 1 , qsched_flag_none );

    /* Zero-initialised, so that neither the task payload nor the resource
     * data is indeterminate memory. */
    data = (int*) calloc(1, sizeof(int));
    res_data = (int*) calloc(size, sizeof(int));
    if( data == NULL || res_data == NULL )
        error("Failed to allocate the host data");

    /* Two parents covering three ints each ... */
    res2 = qsched_addres(&s , qsched_owner_none , qsched_res_none , &res_data[3], sizeof(int)*3, NULL);
    res1 = qsched_addres(&s , qsched_owner_none , qsched_res_none , &res_data[0], sizeof(int)*3, NULL);
    /* ... and two single-int children per parent. */
    res11 = qsched_addres( &s, qsched_owner_none, res1, &res_data[2], sizeof(int), NULL);
    res21 = qsched_addres( &s, qsched_owner_none, res2, &res_data[4], sizeof(int), NULL);
    res12 = qsched_addres( &s, qsched_owner_none, res1, &res_data[1], sizeof(int), NULL);
    res22 = qsched_addres( &s, qsched_owner_none, res2, &res_data[5], sizeof(int), NULL);

    task_id = qsched_addtask(&s, 1, task_flag_none, data, sizeof(int), 1);
    qsched_addlock(&s, task_id, res1);
    qsched_addlock(&s, task_id, res11);
    qsched_addlock(&s, task_id, res12);

    task_id = qsched_addtask(&s, 2, task_flag_none, data, sizeof(int), 1);
    qsched_addlock(&s, task_id, res2);
    qsched_addlock(&s, task_id, res21);
    qsched_addlock(&s, task_id, res22);

    /* Fetch the device address of the task function. */
    if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
        error("Failed to copy function pointer from device");

    qsched_run_CUDA( &s , func );

    free(data);
    free(res_data);
    return 0;
}
#!/bin/bash
# Builds the QuickSched CUDA libraries and the CUDA example programs.
# Run it from the library source directory (the one holding qsched.c, queue.c
# and cuda_queue.cu); the examples are built in ../examples afterwards.

# Location of the CUDA toolkit. Defaults to the original hard-coded path and
# can be overridden from the environment.
CUDA_HOME="${CUDA_HOME:-/home/aidan/cuda_6.0}"
NVCC="$CUDA_HOME/bin/nvcc"

# Alternative flag sets kept for debugging; not used by the commands below.
FLAGS2="-Xcompiler=-fsanitize=address -Xcompiler=-fno-omit-frame-pointer"
DEBUG_FLAGS="-G -DDEBUG_GPU"
FLAGS="-O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
# -DGPU_locks -Xptxas -dlcm=cg -Xptxas="-v""
# -DNO_LOADS
CFLAGS="-fsanitize=address -fno-omit-frame-pointer"

# Host objects. Each source gets its own dependency target and file, so that
# compiling queue.c no longer overwrites the dependencies recorded for qsched.c.
gcc-4.8 -std=gnu99 -DHAVE_CONFIG_H -I. -I.. -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize -funroll-loops -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -fopenmp -DTIMERS -DWITH_CUDA -g -O2 -march=native -pthread -MT qsched.lo -MD -MP -MF .deps/qsched.Tpo -c qsched.c -o qsched.o -lprofiler
gcc-4.8 -std=gnu99 -DHAVE_CONFIG_H -I. -I.. -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize -funroll-loops -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -fopenmp -DTIMERS -DWITH_CUDA -g -O2 -march=native -pthread -MT queue.lo -MD -MP -MF .deps/queue.Tpo -c queue.c -o queue.o -lprofiler

# CUDA library, without and with -DGPU_locks.
"$NVCC" $FLAGS -dc cuda_queue.cu qsched.o queue.o -lprofiler
"$NVCC" $FLAGS -lib cuda_queue.o qsched.o queue.o -o .libs/libquicksched_cuda.a -lprofiler
"$NVCC" $FLAGS -DGPU_locks -dc cuda_queue.cu qsched.o queue.o -lprofiler
"$NVCC" $FLAGS -lib cuda_queue.o qsched.o queue.o -o .libs/libquicksched_cuda_locks.a -lprofiler

cd ../examples

# Simple GPU test.
"$NVCC" $FLAGS -dc -m64 -I../src -dc -L"$CUDA_HOME/lib" -L"$CUDA_HOME/lib64" -lcudart -lcuda test_gpu_simple.cu -lprofiler
"$NVCC" $FLAGS -m64 -I../src -L"$CUDA_HOME/lib" -L"$CUDA_HOME/lib64" -Xnvlink -v test_gpu_simple.o ../src/.libs/libquicksched_cuda.a -o simple_cuda_test -lprofiler

# QR test (needs ATLAS).
"$NVCC" $FLAGS -dc -m64 -I../src -dc -lf77blas -lcblas -latlas -lm -L/home/aidan/ATLAS/ATLAS_linux/lib/ -L"$CUDA_HOME/lib" -L"$CUDA_HOME/lib64" -lcudart -lcuda /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a test_qr.cu -lprofiler
"$NVCC" $FLAGS -m64 -I../src -lf77blas -lcblas -latlas -lm -L/home/aidan/ATLAS/ATLAS_linux/lib/ -L"$CUDA_HOME/lib" -L"$CUDA_HOME/lib64" -Xnvlink -v test_qr.o ../src/.libs/libquicksched_cuda.a /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a -o qr_cuda_test -lprofiler

# Hierarchical resource test.
"$NVCC" $FLAGS -dc -m64 -I../src -dc -L"$CUDA_HOME/lib" -L"$CUDA_HOME/lib64" -lcudart -lcuda test_hierarchy.cu -lprofiler
"$NVCC" $FLAGS -m64 -I../src -L"$CUDA_HOME/lib" -L"$CUDA_HOME/lib64" -Xnvlink -v test_hierarchy.o ../src/.libs/libquicksched_cuda.a -o test_heirarchy -lprofiler

# Barnes-Hut test. It gets its own output name, so that it no longer
# overwrites the hierarchy test binary built just above.
"$NVCC" $FLAGS -dc -m64 -I../src -dc -L"$CUDA_HOME/lib" -L"$CUDA_HOME/lib64" -lcudart -lcuda test_bh.cu -lprofiler
"$NVCC" $FLAGS -m64 -I../src -L"$CUDA_HOME/lib" -L"$CUDA_HOME/lib64" -Xnvlink -v test_bh.o ../src/.libs/libquicksched_cuda.a -o test_bh -lprofiler
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment