diff --git a/src/CUDA/runner_cuda_main.cu b/src/CUDA/runner_cuda_main.cu index e2f331d83cdb5a0350608b53c4a27a33f9a2e7d2..8d333322ee63eda2ec2a6f49afc6e8533da5fe8a 100644 --- a/src/CUDA/runner_cuda_main.cu +++ b/src/CUDA/runner_cuda_main.cu @@ -79,7 +79,6 @@ struct particle_arrays { }; -__device__ int global_count; __device__ struct particle_arrays cuda_parts; __constant__ int cuda_nr_parts; @@ -392,8 +391,6 @@ __device__ void unload_cell(int cell_index) { /* Copy back the acceleration */ float3 local_a_hydro = cuda_parts.a_hydro[start + i]; current->a_hydro[0] = local_a_hydro.x; - if(cuda_parts.id[start+i] == 103498) - printf("Unloaded value is %e\n", current->a_hydro[0]); current->a_hydro[1] = local_a_hydro.y; current->a_hydro[2] = local_a_hydro.z; @@ -493,8 +490,6 @@ __device__ void doself_density(struct cell_cuda *ci) { /* Compute contribution to the density. */ rho += mj * w; rho_dh -= mj * (hydro_dimension * w + ui * dw_dx); - /*if(cuda_parts.id[pid] == 103498) - printf("%lli %lli %e %e %e %e %e\n", cuda_parts.id[pid], cuda_parts.id[pjd], temp - rho_dh, hydro_dimension, w, ui, dw_dx);*/ /* Compute contribution to the number of neighbours */ wcount += w; @@ -622,8 +617,6 @@ __device__ void dopair_density(struct cell_cuda *ci, struct cell_cuda *cj) { /* Compute contribution to the density. */ rho += mj * w; rho_dh -= mj * (hydro_dimension * w + ui * dw_dx); - /*if(cuda_parts.id[pid] == 103498) - printf("%lli %lli %e %e %e %e %e\n", cuda_parts.id[pid], cuda_parts.id[pjd], temp-rho_dh, hydro_dimension, w, ui, dw_dx);*/ /* Compute contribution to the number of neighbours */ wcount += w; @@ -774,21 +767,11 @@ __device__ void doself_force(struct cell_cuda *ci) { /* Compute the acceleration */ const float acc = visc_term + sph_term; -// if(cuda_parts.id[pid] == 79985 /*&& cuda_parts.id[pjd] == 103660*/){ - // printf("%lli %lli %e %e %e \n", cuda_parts.id[pid], cuda_parts.id[pjd], -mj*acc*dx[0],-mj*acc*dx[1],-mj*acc*dx[2]); -/* printf("%e %e %e ::: %e %e %e\n", cuda_parts.x_x[pid], cuda_parts.x_y[pid], cuda_parts.x_z[pid], cuda_parts.x_x[pjd], cuda_parts.x_y[pjd], cuda_parts.x_z[pjd]); - printf("%e %e %e %e \n", sph_term, visc_term, wi_dr, wj_dr); */ - //} /* Compute the force */ a_hydro.x -= mj * acc * dx[0]; a_hydro.y -= mj * acc * dx[1]; a_hydro.z -= mj * acc * dx[2]; -/* if(cuda_parts.id[pid] == 103498 && cuda_parts.id[pjd] == 103292) - printf("%e %e %e %e\n", f_i, f_j, cuda_parts.rho_dh[pid], cuda_parts.rho_dh[pjd]);*/ - - if(cuda_parts.id[pid] == 103498) - printf("%lli %lli %e %e %e %e\n", cuda_parts.id[pid], cuda_parts.id[pjd], -mj*acc*dx[0], -mj*acc*dx[1], -mj*acc*dx[2], r_inv ); /* Get the time derivative for h. */ h_dt -= mj * dvdr * r_inv / rhoj * wi_dr; @@ -804,8 +787,6 @@ __device__ void doself_force(struct cell_cuda *ci) { /* Flush to global stores.*/ atomicAdd(&cuda_parts.a_hydro[pid].x, a_hydro.x); - if(cuda_parts.id[pid] == 103498) - printf("global hydro = %e a_hydro = %e\n", cuda_parts.a_hydro[pid].x, a_hydro.x); atomicAdd(&cuda_parts.a_hydro[pid].y, a_hydro.y); atomicAdd(&cuda_parts.a_hydro[pid].z, a_hydro.z); atomicAdd(&cuda_parts.h_dt[pid], h_dt); @@ -960,10 +941,6 @@ __device__ void dopair_force(struct cell_cuda *ci, struct cell_cuda *cj) { a_hydro.y -= mj * acc * dx[1]; a_hydro.z -= mj * acc * dx[2]; -/* if(cuda_parts.id[pid] == 103498 && cuda_parts.id[pjd] == 103292) - printf("%e %e %e %e\n", f_i, f_j, cuda_parts.rho_dh[pid], cuda_parts.rho_dh[pjd]);*/ - if(cuda_parts.id[pid] == 103498) - printf("%lli %lli %e %e %e %e\n", cuda_parts.id[pid], cuda_parts.id[pjd], -mj*acc*dx[0], -mj*acc*dx[1], -mj*acc*dx[2], r_inv ); /* Get the time derivative for h. */ h_dt -= mj * dvdr * r_inv / rhoj * wi_dr; @@ -979,8 +956,6 @@ __device__ void dopair_force(struct cell_cuda *ci, struct cell_cuda *cj) { /* Flush to global stores.*/ atomicAdd(&cuda_parts.a_hydro[pid].x, a_hydro.x); - if(cuda_parts.id[pid] == 103498) - printf("global hydro = %e a_hydro = %e\n", cuda_parts.a_hydro[pid].x, a_hydro.x); atomicAdd(&cuda_parts.a_hydro[pid].y, a_hydro.y); atomicAdd(&cuda_parts.a_hydro[pid].z, a_hydro.z); atomicAdd(&cuda_parts.h_dt[pid], h_dt); @@ -1152,35 +1127,11 @@ __device__ void cuda_doself_subset_density(int pid, int cell) { /* Get r and 1/r */ const float r = sqrtf(r2); const float ri = 1.0f / r; -/* if(cuda_parts.id[pid] == 142801) - printf("neighbour = %lli, r=%e, h=%e\n", cuda_parts.id[j], r2, hi);*/ /* Compute the kernel function */ const float hi_inv = 1.0f / hi; const float ui = r * hi_inv; - if( 0 && cuda_parts.id[pid] == 79985 && cuda_parts.id[j] == 103498){ - printf("pos:: %e %e %e :::: %e %e %e\n",pix[0], pix[1], pix[2], cuda_parts.x_x[j], cuda_parts.x_y[j], cuda_parts.x_z[j]); - printf("dx %e %e %e\n", dx[0], dx[1], dx[2]); - printf("%e %e %e %e %e %e\n", r2, mj, r, ri, hi_inv, ui); - printf("kernel:\n x = %e ind = %i\n", ui*kernel_gamma_inv, (int)(ui*kernel_gamma_inv*kernel_ivals_f )); - const float x = ui*kernel_gamma_inv; - const int temp = (int)(x*kernel_ivals_f); - const int ind = temp > kernel_ivals ? kernel_ivals :temp; - const float *coeffs = &cuda_kernel_coeffs[ind*(kernel_degree+1)]; - w = coeffs[0]*x +coeffs[1]; - dw_dx = coeffs[0]; - printf("initial w=%e dw_dx = %e\n", w, dw_dx); - for(int k = 2; k <= kernel_degree; k++){ - printf("step %i coeffs[%i] = %e\n", k,k,coeffs[k]); - dw_dx = dw_dx * x + w; - w = x*w + coeffs[k]; - printf("step %i w= %e dw_dx = %e\n", k, w, dw_dx); - } - w = w * kernel_constant * kernel_gamma_inv_dim; - dw_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one; - printf("result w = %e dw_dx = %e\n", w, dw_dx); - } cuda_kernel_deval(ui, &w, &dw_dx); /* Compute contribution to the density */ @@ -1207,8 +1158,6 @@ __device__ void cuda_doself_subset_density(int pid, int cell) { curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2]; curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0]; - if(0 && cuda_parts.id[pid] == 79985 && cuda_parts.id[j] == 103498) - printf("%lli %lli %e %e %e %e %i\n", cuda_parts.id[pid], cuda_parts.id[j],w, r, hi_inv , kernel_gamma_inv, kernel_ivals); rot_v.x += fac * curlvr[0]; rot_v.y += fac * curlvr[1]; rot_v.z += fac * curlvr[2]; @@ -1218,8 +1167,6 @@ __device__ void cuda_doself_subset_density(int pid, int cell) { /* Write the data for particle pid */ atomicAdd(&cuda_parts.rho[pid], rho); atomicAdd(&cuda_parts.rho_dh[pid], rho_dh); - if(cuda_parts.id[pid] == 103498) - printf("Hello\n"); atomicAdd(&cuda_parts.wcount[pid], wcount); atomicAdd(&cuda_parts.wcount_dh[pid], wcount_dh); atomicAdd(&cuda_parts.div_v[pid], div_v); @@ -1277,34 +1224,10 @@ __device__ void cuda_dopair_subset_density(int pid, int cell, int pid_cell) { const float r = sqrtf(r2); const float ri = 1.0f / r; - /* if(cuda_parts.id[pid] == 142801) - printf("pair neighbour = %lli, r=%e, h=%e\n", cuda_parts.id[j], r2, hi);*/ /* Compute the kernel function */ const float hi_inv = 1.0f / hi; const float ui = r * hi_inv; - if(0 &&cuda_parts.id[pid] == 79985 && cuda_parts.id[j] == 103498){ - printf("pos:: %e %e %e :::: %e %e %e\n",cuda_parts.x_x[pid], cuda_parts.x_y[pid], cuda_parts.x_z[pid], cuda_parts.x_x[j], cuda_parts.x_y[j], cuda_parts.x_z[j]); - printf("dx %e %e %e\n", dx[0], dx[1], dx[2]); - printf("%e %e %e %e %e %e\n", r2, mj, r, ri, hi_inv, ui); - printf("kernel:\n x = %e ind = %i\n", ui*kernel_gamma_inv, (int)(ui*kernel_gamma_inv*kernel_ivals_f )); - const float x = ui*kernel_gamma_inv; - const int temp = (int)(x*kernel_ivals_f); - const int ind = temp > kernel_ivals ? kernel_ivals :temp; - const float *coeffs = &cuda_kernel_coeffs[ind*(kernel_degree+1)]; - w = coeffs[0]*x +coeffs[1]; - dw_dx = coeffs[0]; - printf("initial w=%e dw_dx = %e\n", w, dw_dx); - for(int k = 2; k <= kernel_degree; k++){ - printf("step %i coeffs[%i] = %e\n", k,k,coeffs[k]); - dw_dx = dw_dx * x + w; - w = x*w + coeffs[k]; - printf("step %i w= %e dw_dx = %e\n", k, w, dw_dx); - } - w = w * kernel_constant * kernel_gamma_inv_dim; - dw_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one; - printf("result w = %e dw_dx = %e\n", w, dw_dx); - } cuda_kernel_deval(ui, &w, &dw_dx); /* Compute contribution to the density */ @@ -1331,8 +1254,6 @@ __device__ void cuda_dopair_subset_density(int pid, int cell, int pid_cell) { curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2]; curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0]; - if(0 && cuda_parts.id[pid] == 79985 && cuda_parts.id[j] == 103498) - printf("%lli %lli %e %e %e %e %i\n", cuda_parts.id[pid], cuda_parts.id[j],w, r, hi_inv , kernel_gamma_inv, kernel_ivals); rot_v.x += fac * curlvr[0]; rot_v.y += fac * curlvr[1]; rot_v.z += fac * curlvr[2]; @@ -1342,8 +1263,6 @@ __device__ void cuda_dopair_subset_density(int pid, int cell, int pid_cell) { /* Write the data for particle pid */ atomicAdd(&cuda_parts.rho[pid], rho); atomicAdd(&cuda_parts.rho_dh[pid], rho_dh); - if(cuda_parts.id[pid] == 103498) - printf("Hello\n"); atomicAdd(&cuda_parts.wcount[pid], wcount); atomicAdd(&cuda_parts.wcount_dh[pid], wcount_dh); atomicAdd(&cuda_parts.div_v[pid], div_v); @@ -1381,8 +1300,6 @@ __device__ void cuda_hydro_fix_particle(int pid, struct cell_cuda *c) { cuda_doself_subset_density(pid, cell); else if (tasks[tid].type == task_type_pair || tasks[tid].type == task_type_sub_pair) { -/* if(cuda_parts.id[pid] == 142801) - printf("cell %lli Task %lli type %i ci %i cj %i \n", cell, (&tasks[tid]-tasks), tasks[tid].type, tasks[tid].ci, tasks[tid].cj);*/ if (tasks[tid].ci == cell) { cuda_dopair_subset_density(pid, tasks[tid].cj, cell); } else { @@ -1425,8 +1342,6 @@ __device__ void do_ghost(struct cell_cuda *c) { a++; redo = 0; float h_new; - if(cuda_parts.id[i]==103498) - printf("notrho = %.10e, a = %i dhdrho = %.10e\n", cuda_parts.rho[i], a, cuda_parts.rho_dh[i]); const float h_old = cuda_parts.h[i]; const float h_old_dim = cuda_pow_dimension(h_old);//h_old*h_old*h_old;//d); const float h_old_dim_minus_one = cuda_pow_dimension_minus_one(h_old); @@ -1436,8 +1351,6 @@ __device__ void do_ghost(struct cell_cuda *c) { /* Finish the density calculation. */ hydro_end_density(i); - if(cuda_parts.id[i]==103498) - printf("rho = %.10e, a = %i\n", cuda_parts.rho[i], a); /* Compute a step of the Newton-Raphson scheme */ const float n_sum = cuda_parts.wcount[i] * h_old_dim; const float n_target = hydro_eta_dim; @@ -1468,7 +1381,6 @@ __device__ void do_ghost(struct cell_cuda *c) { cuda_parts.rot_v[i].y = 0.f; cuda_parts.rot_v[i].z = 0.f; redo = 1; - atomicAdd(&global_count, 1); } else { /* The particle is a lost cause */ cuda_parts.h[i] = hydro_h_max; @@ -1483,8 +1395,6 @@ __device__ void do_ghost(struct cell_cuda *c) { } } - if(cuda_parts.id[i]==103498) - printf("rho = %.10e, a = %i\n, dh_drho=%e\n", cuda_parts.rho[i], a, cuda_parts.rho_dh[i]); /* We now have a particle whose smoothing length has convegered */ /* Time to set the force variables */ /* Compute variables required for the force loop */ @@ -1580,8 +1490,9 @@ __global__ void swift_device_kernel() { if (type == type_load) { load_cell(tasks[tid].ci); } else if (type == type_unload) { + unload_cell(tasks[tid].ci); - } else if (type == task_type_pair || type == task_type_sub_pair) { + }else if (type == task_type_pair || type == task_type_sub_pair) { if (subtype == task_subtype_density) { struct cell_cuda *ci = &cells_cuda[tasks[tid].ci]; struct cell_cuda *cj = &cells_cuda[tasks[tid].cj]; @@ -1734,12 +1645,6 @@ __device__ void test_27_unload_cell(int cell_index) { /* Host function to check cuda functions don't return errors */ __host__ inline void cudaErrCheck(cudaError_t status) { if (status != cudaSuccess) { - if(status == cudaErrorMemoryAllocation ) - { - int i = 32; - i = i - (7. * rand()); - printf("HELLO? %i\n", i); - } printf("cudaErrCheck: %s\n", cudaGetErrorString(status)); } } @@ -2294,7 +2199,8 @@ __host__ void create_tasks(struct engine *e) { tasks_host[k].ci = t->ci->cuda_ID; if(t->cj != NULL) tasks_host[k].cj = t->cj->cuda_ID; - + else + tasks_host[k].cj = -1; /* We have a double linked structure because its easier to create. */ tasks_host[k].task = t; t->cuda_task = k; @@ -2333,7 +2239,7 @@ __host__ void create_tasks(struct engine *e) { if (t->subtype == task_subtype_force) { deps++; /* If its a pair force task then it needs 2 unlocks. */ - if (t->type == task_type_pair) { + if (t->type == task_type_sub_pair || t->type == task_type_pair) { deps++; } } @@ -2357,7 +2263,7 @@ __host__ void create_tasks(struct engine *e) { /* If it is a force task then add the unload tasks.*/ if (t->subtype == task_subtype_force) { t->unlocks[t->nr_unlock_tasks++] = t->task->ci->unload_task; - if (t->type == task_type_pair) { + if (t->type == task_type_sub_pair || t->type == task_type_pair) { t->unlocks[t->nr_unlock_tasks++] = t->task->cj->unload_task; } } @@ -2383,7 +2289,7 @@ __host__ void create_tasks(struct engine *e) { tasks_host[t->task->ci->load_task] .unlocks[tasks_host[t->task->ci->load_task].nr_unlock_tasks++] = i; - if (t->type == task_type_pair) { + if (t->type == task_type_pair || t->type == task_type_sub_pair) { /* We may need to stretch the load task's unlocks */ if (tasks_host[t->task->cj->load_task].nr_unlock_tasks == tasks_host[t->task->cj->load_task].size_unlocks) { @@ -3238,11 +3144,6 @@ __host__ void run_cuda() { printTaskTimers(); #endif cudaErrCheck(cudaDeviceSynchronize()); - /*int friends; - cudaErrCheck(cudaMemcpyFromSymbol(&friends,number_dense_friends, sizeof(int))); - printf("!!!!!!!!!!!!!!!!!!!!\nFound %i friends!\n!!!!!!!!!!!!!!!!!!!!!!\n", friends); - cudaErrCheck(cudaMemcpyFromSymbol(&friends,number_force_friends, sizeof(int))); - printf("!!!!!!!!!!!!!!!!!!!!\nFound %i friends!\n!!!!!!!!!!!!!!!!!!!!!!\n", friends);*/ } #undef float #ifdef WITH_CUDA