Commit 60922b1a authored by aidan

Hopefully fixed the issues with hierarchical task generation

parent 045b696d
@@ -90,7 +90,9 @@ float4 *parts_a_m_host;
 double2 *com_xy_host;
 double *com_z_host;
 float *com_mass_host;
+double2 *parts_pos_xy_temp;
+double *parts_pos_z_temp;
+float4 *parts_a_m_temp;
@@ -113,9 +115,9 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
     /* Loop over cell i.*/
     for(i = parts_i + threadIdx.x; i < parts_i + count_i; i += blockDim.x) {
-        xi[0] = parts_xy[i].x;
-        xi[1] = parts_xy[i].y;
-        xi[2] = parts_z[i];
+        xi[0] = parts_pos_xy[i].x;
+        xi[1] = parts_pos_xy[i].y;
+        xi[2] = parts_pos_z[i];
         for(k = 0; k < 3; k++) {
             ai[k] = 0.0f;
         }
@@ -154,9 +156,9 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
         }
     /* Loop over cell j. */
     for(i = parts_j + threadIdx.x; i < parts_j + count_j; i += blockDim.x) {
-        xi[0] = parts_xy[i].x;
-        xi[1] = parts_xy[i].y;
-        xi[2] = parts_z[i];
+        xi[0] = parts_pos_xy[i].x;
+        xi[1] = parts_pos_xy[i].y;
+        xi[2] = parts_pos_z[i];
         for(k = 0; k < 3; k++) {
             ai[k] = 0.0f;
         }
@@ -290,23 +292,40 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
     struct cell *cp, *cps;
+    if(leaf->split)
+    {
+        printf("Leaf split = 1, oh dear.");
+        asm("trap;");
+    }
+    if(ci->split > 1)
+    {
+        printf("Cell %i had split > 1\n", ci - cells);
+        asm("trap;");
+    }
+    if(cj->split > 1)
+    {
+        printf("cell %i had split > 1\n", cj - cells);
+        asm("trap;");
+    }
     for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
         if(is_inside(leaf, cp)) break;
     }
     if(are_neighbours_different_size(cp, cj)) {
-        for(cps = &cells[cj->firstchild]; cps != &cells[ci->sibling]; cps = &cells[cps->sibling]) {
+        for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
             if(are_neighbours(cp, cps)) {
                 if(cp->split && cps->split) {
                     iact_pair_pc(cp, cps, leaf);
                 }
             } else {
                 make_interact_pc(leaf, cps);
+                __syncthreads();
             }
         }
     }
+    __syncthreads();
 }
/**
@@ -320,6 +339,17 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
     struct cell *cp, *cps;
+    if(leaf->split)
+    {
+        printf("Leaf split = 1, oh dear.");
+        asm("trap;");
+    }
+    if(c->split > 1)
+    {
+        printf("Cell had split > 1\n");
+        asm("trap;");
+    }
     /* Find the subcell of c the leaf is in. */
     for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) {
         if(is_inside(leaf, cp)) break;
@@ -606,23 +636,23 @@ void cell_split(int c, struct qsched *s) {
     {
         if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[c].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
-        printf("tempxy = %p\n", temp_xy);
+        // printf("tempxy = %p\n", temp_xy);
         cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
-                sizeof(double2) * cell_pool[c].count, parts_pos_xy + cell_pool[c].parts);
+                sizeof(double2) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts);
     }
     if(cell_pool[c].resz == qsched_res_none)
     {
         if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[c].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
         cell_pool[c].resz = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_z,
-                sizeof(double) * cell_pool[c].count, parts_pos_z + cell_pool[c].parts);
+                sizeof(double) * cell_pool[c].count, parts_pos_z_temp + cell_pool[c].parts);
     }
     if(cell_pool[c].resm == qsched_res_none)
     {
         if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[c].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
         cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m,
-                sizeof(float4) * cell_pool[c].count, parts_a_m + cell_pool[c].parts);
+                sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts);
     }
     // error("Cell has no resource");*///TODO
@@ -731,15 +761,15 @@ void cell_split(int c, struct qsched *s) {
         if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
         cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy,
-                sizeof(double2) * cell_pool[progenitors[k]].count, parts_pos_xy + cell_pool[progenitors[k]].parts);
+                sizeof(double2) * cell_pool[progenitors[k]].count, parts_pos_xy_temp + cell_pool[progenitors[k]].parts);
         if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
         cell_pool[progenitors[k]].resz = qsched_addres(s, qsched_owner_none, cell->resz, temp_z,
-                sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z + cell_pool[progenitors[k]].parts);
+                sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z_temp + cell_pool[progenitors[k]].parts);
         if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
         cell_pool[progenitors[k]].resm = qsched_addres(s, qsched_owner_none, cell->resm, temp_a_m,
-                sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m + cell_pool[progenitors[k]].parts);
+                sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m_temp + cell_pool[progenitors[k]].parts);
     }
     /* Find the first non-empty progenitor */
@@ -784,6 +814,9 @@ void cell_split(int c, struct qsched *s) {
     int data[2] = {root, c};
     int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
                              2 * sizeof(int), 1);
+    qsched_addlock(s, tid, cell_pool[root].res);
+    qsched_addlock(s, tid, cell_pool[root].resz);
+    qsched_addlock(s, tid, cell_pool[root].resm);
     qsched_addlock(s, tid, cell_pool[c].res);
     qsched_addlock(s, tid, cell_pool[c].resz);
     qsched_addlock(s, tid, cell_pool[c].resm);
@@ -913,9 +946,7 @@ void test_bh(int N, int runs, char *fileName) {
     struct qsched s;
     ticks tic, toc_run, tot_setup = 0, tot_run = 0;
     int countMultipoles = 0, countPairs = 0, countCoMs = 0;
-    double2 *parts_pos_xy_temp;
-    double *parts_pos_z_temp;
-    float4 *parts_a_m_temp;
+    struct cell *gpu_ptr_cells;
     cudaFree(0);
     if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
@@ -934,10 +965,10 @@ void test_bh(int N, int runs, char *fileName) {
     if( cudaMalloc(&parts_pos_xy_temp, sizeof(double2) * N) != cudaSuccess)
         error("Failed to allocate device parts array");
-    printf("parts_pos_xy_temp = %p\n", parts_pos_xy_temp);
+    // printf("parts_pos_xy_temp = %p\n", parts_pos_xy_temp);
     if( cudaMemcpyToSymbol(parts_pos_xy, &parts_pos_xy_temp, sizeof(double2*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
         error("Failed to set device symbol for parts array");
-    printf("parts_pos_xy = %p\n", parts_pos_xy);
+    // printf("parts_pos_xy = %p\n", parts_pos_xy);
     if( cudaMalloc(&parts_pos_z_temp, sizeof(double) * N) != cudaSuccess)
         error("Failed to allocate device parts array");
     if( cudaMemcpyToSymbol(parts_pos_z, &parts_pos_z_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess)
@@ -1018,6 +1049,10 @@ void test_bh(int N, int runs, char *fileName) {
     message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
     message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
+    for(k = 0; k < num_cells; k++)
+        if(cell_pool[k].split > 1 )
+            printf("Split > 1\n");
     create_tasks(&s, root, NULL);
     message("total number of tasks: %i.", s.count);
@@ -1033,6 +1068,56 @@ void test_bh(int N, int runs, char *fileName) {
         parts_a_m_host[i].z = 0.0;
     }
+    /* Copy the cells to the device. */
+    if( cudaMalloc( &gpu_ptr_cells , sizeof(struct cell) * used_cells) != cudaSuccess)
+        error("Failed to allocate cells on the GPU");
+    if( cudaMemcpy( gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy cells to the GPU");
+    if( cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell*), 0, cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy cell pointer to the GPU");
+    double2 *com_temp;
+    double *comz_temp;
+    float *comm_temp;
+    if(cudaMalloc( &com_temp, sizeof(double2) * used_cells) != cudaSuccess)
+        error("Failed to allocate com on the GPU");
+    if( cudaMemcpy( com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy com to the GPU");
+    if( cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy com pointer to the GPU");
+    if(cudaMalloc( &comz_temp, sizeof(double) * used_cells) != cudaSuccess)
+        error("Failed to allocate com on the GPU");
+    if( cudaMemcpy( comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy com to the GPU");
+    if( cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy com pointer to the GPU");
+    if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess)
+        error("Failed to allocate com on the GPU");
+    if( cudaMemcpy( comm_temp, com_mass_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess )
+        error("Failed to copy com to the GPU");
+    if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy com pointer to the GPU");
+    /* for(i = 0; i < s.count; i++){
+        int *idata = (int*)&s.data[s.tasks[i].data];
+        if(s.tasks[i].type == task_type_self)
+            printf("Self task with data[0] = %i\n", idata[0]);
+        if(s.tasks[i].type == task_type_pair) {
+            printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
+        }
+        if(s.tasks[i].type == task_type_self_pc) {
+            printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
+        }
+    }*/
     //Run code.
     printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
     qsched_run_CUDA( &s , func );
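A note on the device-pointer pattern used throughout this hunk: a buffer is allocated with cudaMalloc, and its address is then published into a __device__ global via cudaMemcpyToSymbol so kernels can reach it by name. A minimal sketch of that pattern; the symbol name parts_z_dev and the size N are hypothetical stand-ins for the parts_pos_z symbol and particle count above:

#include <cuda_runtime.h>
#include <stdio.h>

__device__ double *parts_z_dev; /* hypothetical device-side global, analogous to parts_pos_z */

int main(void) {
    double *tmp; /* host-side handle to device memory, like parts_pos_z_temp */
    int N = 1024;
    if (cudaMalloc(&tmp, sizeof(double) * N) != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device parts array\n");
        return 1;
    }
    /* Publish the device address into the __device__ symbol so kernels
       can dereference parts_z_dev directly. */
    if (cudaMemcpyToSymbol(parts_z_dev, &tmp, sizeof(double *), 0,
                           cudaMemcpyHostToDevice) != cudaSuccess) {
        fprintf(stderr, "Failed to set device symbol for parts array\n");
        return 1;
    }
    cudaFree(tmp);
    return 0;
}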
/*
 * Not used.
 */
void qsched_prepare_deps( struct qsched *s )
{
    int **is_loaded;
    int **parents;
    int i, k, j, use, usek, usem;
    int *num_parents;
    is_loaded = (int**)malloc(sizeof(int*) * s->count);
    parents = (int**)malloc(sizeof(int*) * s->count);
    num_parents = (int*)malloc(sizeof(int) * s->count);
    bzero(num_parents, sizeof(int) * s->count);
    /* Bytes per task bitmask: one bit per resource, rounded up to whole
       ints (count_res/32 + 1 ints, matching the word loops below). */
    k = sizeof(int) * (s->count_res/32 + 1);
    for(i = 0; i < s->count; i++)
    {
        is_loaded[i] = (int*)malloc(k);
        bzero(is_loaded[i], k);
    }
    /* is_loaded[i][k] is a bitmask over resources: resource r maps to word
       r/32, bit 31 - r%32, and marks r as already loaded for task i. */
    /* Reset the waits to 0... */
    for( k = 0 ; k < s->count ; k++ )
    {
        s->tasks[k].wait = 0;
    }

    /* Run through the tasks and set the waits... */
    for ( k = 0 ; k < s->count ; k++ ) {
        struct task *t = &s->tasks[k];
        if ( !( t->flags & task_flag_skip ) )
            for ( j = 0 ; j < t->nr_unlocks ; j++ )
                s->tasks[ t->unlocks[j] ].wait += 1;
    }
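    /* After this pass each task's wait equals its in-degree: the number of
       unskipped tasks that must unlock it before it may run. */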
    /* Sort the tasks topologically. */
    int *tid = (int *)malloc( sizeof(int) * s->count );
    for ( j = 0 , k = 0 ; k < s->count ; k++ )
        if ( s->tasks[k].wait == 0 ) {
            tid[j] = k;
            j += 1;
        }
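    /* Breadth-first sweep (Kahn's algorithm): popping a task decrements the
       wait count of every task it unlocks; tasks that reach zero are
       appended, so tid ends up in topological order. */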
    for ( k = 0 ; k < j ; k++ ) {
        struct task *t = &s->tasks[ tid[k] ];
        for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ )
            if ( ( s->tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) {
                tid[j] = t->unlocks[kk];
                j += 1;
            }
    }
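    /* If the sweep visited fewer than count tasks, some task never reached
       wait == 0, i.e. the dependency graph contains a cycle. */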
    if ( k < s->count )
    {
        error( "Circular dependencies detected." );
    }
    /* Store the maximum number of parents of a task. */
    int max_parents = 0;

    /* Compute the number of parents for each task. */
    for(i = s->count-1; i >= 0; i--)
    {
        for(j = 0; j < s->tasks[i].nr_unlocks; j++)
        {
            num_parents[s->tasks[i].unlocks[j]]++;
        }
    }
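    /* num_parents[t] now holds the in-degree of task t: how many tasks
       unlock it. */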
    /* Allocate memory to store parents in. */
    for(i = 0; i < s->count; i++)
    {
        if(num_parents[i] > 0)
            parents[i] = (int*)calloc(num_parents[i], sizeof(int));
        else
            parents[i] = NULL;
        if(num_parents[i] > max_parents)
        {
            max_parents = num_parents[i];
        }
        num_parents[i] = 0;
    }
    /* This seems to be outdated and unused? */
    for(i = 0; i < s->count; i++)
    {
        if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload)
            continue;
        for(k = 0; k < s->tasks[i].nr_uses; k++)
        {
            use = s->tasks[i].uses[k];
            usek = use >> 5; // use / 32.
            usem = use & 31; // use % 32.
            if((is_loaded[i][usek] & (1 << (31-usem))) == 0 )
            {
                qsched_addunlock(s, s->res[use].task, i);
                printf("We actually did this?!");
                is_loaded[i][usek] |= (1 << (31-usem));
            }
        }
        for(k = 0; k < s->tasks[i].nr_unlocks; k++)
        {
            if(s->tasks[s->tasks[i].unlocks[k]].type == type_load ||
               s->tasks[s->tasks[i].unlocks[k]].type == type_unload)
                continue;
            for(j = 0; j < s->count_res/32 + 1; j++)
            {
                is_loaded[s->tasks[i].unlocks[k]][j] |= is_loaded[i][j];
            }
            parents[s->tasks[i].unlocks[k]][num_parents[s->tasks[i].unlocks[k]]] = i;
            num_parents[s->tasks[i].unlocks[k]] = num_parents[s->tasks[i].unlocks[k]] + 1;
        }
    }
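    /* parents[t] now lists the direct predecessors of task t, and each
       task's is_loaded mask has been OR-ed into the tasks it unlocks. */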
    max_parents = 0;
    /* Recompute the bitmask size in bytes; k was clobbered by the loops
       above and no longer holds it. */
    k = sizeof(int) * (s->count_res/32 + 1);
    for(i = 0; i < s->count; i++)
    {
        if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload)
            continue;
        if(num_parents[i] > max_parents)
        {
            max_parents = num_parents[i];
        }
        bzero(is_loaded[i], k);
    }
    for(i = s->count-1; i >= 0; i--)
    {
        if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload)
            continue;
        for(k = 0; k < s->tasks[i].nr_uses; k++)
        {
            use = s->tasks[i].uses[k];
            usek = use >> 5; // use / 32.
            usem = use & 31; // use % 32.
            if((is_loaded[i][usek] & (1 << (31-usem))) == 0 )
            {
                qsched_addunlock(s, i, s->res[use].utask);
                is_loaded[i][usek] |= (1 << (31-usem));
            }
        }
        for(k = 0; k < num_parents[i]; k++)
        {
            for(j = 0; j < s->count_res/32 + 1; j++)
            {
                is_loaded[parents[i][k]][j] |= is_loaded[i][j];
            }
        }
    }
}
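For readers tracing the bit arithmetic above: resource id use maps to word use >> 5 and bit 31 - (use & 31). A minimal self-contained sketch of that convention; the helper names bitmask_set and bitmask_test are hypothetical, not part of qsched:

#include <stdio.h>

/* Resource r lives in word r >> 5, bit 31 - (r & 31), matching the
   (1 << (31 - usem)) masks used in qsched_prepare_deps() above. */
static inline void bitmask_set(int *mask, int r) {
    mask[r >> 5] |= 1 << (31 - (r & 31));
}

static inline int bitmask_test(const int *mask, int r) {
    return (mask[r >> 5] & (1 << (31 - (r & 31)))) != 0;
}

int main(void) {
    int mask[2] = {0, 0};  /* room for 64 resources */
    bitmask_set(mask, 33); /* word 1, bit 30 */
    printf("%d %d\n", bitmask_test(mask, 33), bitmask_test(mask, 34)); /* prints "1 0" */
    return 0;
}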