diff --git a/src/CUDA/runner_cuda_main.cu b/src/CUDA/runner_cuda_main.cu index 1bfa2c8fe92d1f3d94e83715950006e1338e4750..d1e70eb6d5dceb1b58d181813d37d56522304470 100644 --- a/src/CUDA/runner_cuda_main.cu +++ b/src/CUDA/runner_cuda_main.cu @@ -19,9 +19,7 @@ extern "C" { #include "../cell.h" #include "../hydro_properties.h" #include "../engine.h" -extern "C" { #include "../scheduler.h" -} #include "../space.h" #include "../adiabatic_index.h" } @@ -77,7 +75,7 @@ struct particle_arrays { /* Device array containing time step length */ timebin_t *time_bin; }; - +__device__ int global_count; __device__ struct particle_arrays cuda_parts; __constant__ int cuda_nr_parts; @@ -160,6 +158,14 @@ __device__ void cuda_queue_puttask(struct queue_cuda *q, int tid) { /* Density kernel function */ __constant__ float cuda_kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]; +/* definition to expand macro then apply to pragma message */ +#define VALUE_TO_STRING(x) #x +#define VALUE(x) VALUE_TO_STRING(x) +#define VAR_NAME_VALUE(var) #var "=" VALUE(var) + +#pragma message(VAR_NAME_VALUE(kernel_degree)) +#pragma message(VAR_NAME_VALUE(kernel_ivals)) + #define cuda_kernel_root \ ((float)(cuda_kernel_coeffs[kernel_degree]) * \ kernel_constant *kernel_gamma_inv_dim) @@ -304,6 +310,8 @@ __device__ void load_cell(int cell_index) { local_entropy_dt = current->entropy_dt; cuda_parts.h[start + i] = local_h; + if(start+i == 0) + printf("%f\n", cuda_parts.h[start+i]); cuda_parts.mass[start + i] = local_mass; cuda_parts.rho[start + i] = local_rho; cuda_parts.entropy[start + i] = local_entropy; @@ -360,22 +368,10 @@ __device__ void unload_cell(int cell_index) { int i; /* Get the index to copy the data for the 0th particle in this cell to.*/ int start = cell->first_part; - + __syncthreads(); for (i = threadIdx.x; i < cell->part_count; i += blockDim.x) { struct part *current = &parts[i]; - /* Copy back the ID and position.*/ - current->id = cuda_parts.id[start + i]; - current->x[0] = cuda_parts.x_x[start + i]; - current->x[1] = cuda_parts.x_y[start + i]; - current->x[2] = cuda_parts.x_z[start + i]; - - /* Copy back the velocity*/ - float3 local_v = cuda_parts.v[start + i]; - current->v[0] = local_v.x; - current->v[1] = local_v.y; - current->v[2] = local_v.z; - /* Copy back the acceleration */ float3 local_a_hydro = cuda_parts.a_hydro[start + i]; current->a_hydro[0] = local_a_hydro.x; @@ -384,9 +380,7 @@ __device__ void unload_cell(int cell_index) { /* Copy back the cutoff, mass, density, entropy and entropy_dt*/ current->h = cuda_parts.h[start + i]; - current->mass = cuda_parts.mass[start + i]; current->rho = cuda_parts.rho[start + i]; - current->entropy = cuda_parts.entropy[start + i]; current->entropy_dt = cuda_parts.entropy_dt[start + i]; /* Copy back the force union.*/ @@ -397,14 +391,11 @@ __device__ void unload_cell(int cell_index) { current->force.v_sig = cuda_parts.v_sig[start + i]; current->force.h_dt = cuda_parts.h_dt[start + i]; - /* Copy back the timebin. */ - current->time_bin = cuda_parts.time_bin[start + i]; } } /* Task function to execute a self-density task. */ __device__ void doself_density(struct cell_cuda *ci) { - /* Is the cell active? */ if (!cuda_cell_is_active(ci)) { printf( @@ -504,6 +495,7 @@ __device__ void doself_density(struct cell_cuda *ci) { /* Write data for particle pid back to global stores. */ atomicAdd(&cuda_parts.rho[pid], rho); atomicAdd(&cuda_parts.rho_dh[pid], rho_dh); + atomicAdd(&cuda_parts.wcount[pid], wcount); atomicAdd(&cuda_parts.wcount_dh[pid], wcount_dh); atomicAdd(&cuda_parts.div_v[pid], div_v); @@ -934,6 +926,8 @@ __device__ void dopair_force(struct cell_cuda *ci, struct cell_cuda *cj) { /* Flush to global stores.*/ atomicAdd(&cuda_parts.a_hydro[pid].x, a_hydro.x); + if(cuda_parts.id[pid] == 11816) + printf("a_hydro.x = %f\n", cuda_parts.a_hydro[pid].x); atomicAdd(&cuda_parts.a_hydro[pid].y, a_hydro.y); atomicAdd(&cuda_parts.a_hydro[pid].z, a_hydro.z); atomicAdd(&cuda_parts.h_dt[pid], h_dt); @@ -969,10 +963,10 @@ __device__ void hydro_end_density(int pid) { cuda_parts.wcount[pid] += cuda_kernel_root; cuda_parts.wcount_dh[pid] -= hydro_dimension * cuda_kernel_root; - /* Finish the calculation by inser4ting the missing h-factors */ + /* Finish the calculation by inserting the missing h-factors */ cuda_parts.rho[pid] *= h_inv_dim; cuda_parts.rho_dh[pid] *= h_inv_dim_plus_one; - cuda_parts.wcount[pid] *= kernel_norm; + cuda_parts.wcount[pid] *= h_inv_dim; cuda_parts.wcount_dh[pid] *= h_inv_dim_plus_one; const float rho_inv = 1.0 / cuda_parts.rho[pid]; @@ -1126,8 +1120,8 @@ __device__ void cuda_doself_subset_density(int pid, int cell) { piv = cuda_parts.v[pid]; pjv = cuda_parts.v[j]; dv[0] = piv.x - pjv.x; - dx[1] = piv.y - pjv.y; - dx[2] = piv.z - pjv.z; + dv[1] = piv.y - pjv.y; + dv[2] = piv.z - pjv.z; const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2]; div_v -= fac * dvdr; @@ -1218,8 +1212,8 @@ __device__ void cuda_dopair_subset_density(int pid, int cell, int pid_cell) { piv = cuda_parts.v[pid]; pjv = cuda_parts.v[j]; dv[0] = piv.x - pjv.x; - dx[1] = piv.y - pjv.y; - dx[2] = piv.z - pjv.z; + dv[1] = piv.y - pjv.y; + dv[2] = piv.z - pjv.z; const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2]; div_v -= fac * dvdr; @@ -1259,7 +1253,7 @@ __device__ void cuda_hydro_fix_particle(int pid, struct cell_cuda *c) { cuda_parts.rot_v[pid].x = 0.f; cuda_parts.rot_v[pid].y = 0.f; cuda_parts.rot_v[pid].z = 0.f; - + atomicAdd(&global_count, 1); /* Climb up the cell hierarchy. */ struct cell_cuda *c2 = c; for (int cell = (c - cells_cuda); cell >= 0; cell = c2->parent) { @@ -1281,6 +1275,7 @@ __device__ void cuda_hydro_fix_particle(int pid, struct cell_cuda *c) { } } + /* During ghost work only this block will be accessing the thread, no need for * atomics. */ __device__ void do_ghost(struct cell_cuda *c) { @@ -1290,9 +1285,9 @@ __device__ void do_ghost(struct cell_cuda *c) { // const float target_wcount = target_neighbours; // const float max_wcount = target_wcount + delta_neighbours; // const float min_wcount = target_wcount - delta_neighbours; - const float hydro_eta_dim = cuda_eta_neighbours; + const float hydro_eta_dim = cuda_pow_dimension(cuda_eta_neighbours); const float eps = cuda_h_tolerance; - + int redo; /* Is the cell active? */ if (!cuda_cell_is_active(c)) return; @@ -1302,15 +1297,18 @@ __device__ void do_ghost(struct cell_cuda *c) { if (c->progeny[k] >= 0) do_ghost(&cells_cuda[k]); } } else { - /* Loop over the particles in the cell. */ for (int i = part_i + threadIdx.x; i < part_i + count_i; i+= blockDim.x) { + redo = 1; + int a = 0; + if (!cuda_part_is_active(i)) continue; + while( redo && a < 7 ){ + a++; + redo = 0; float h_new; const float h_old = cuda_parts.h[i]; - const float h_old_dim = cuda_pow_dimension(h_old); + const float h_old_dim = h_old*h_old*h_old;//cuda_pow_dimension(h_old); const float h_old_dim_minus_one = cuda_pow_dimension_minus_one(h_old); - - if (!cuda_part_is_active(i)) continue; if (cuda_parts.wcount[i] == 0.f) { h_new = 2.f * h_old; } else { @@ -1325,6 +1323,7 @@ __device__ void do_ghost(struct cell_cuda *c) { cuda_parts.wcount_dh[i] * h_old_dim + hydro_dimension * cuda_parts.wcount[i] * h_old_dim_minus_one; + h_new = h_old - f / f_prime; /* Safety check: truncate to the range [ h_old/2 , 2h_old ]. */ @@ -1337,8 +1336,17 @@ __device__ void do_ghost(struct cell_cuda *c) { // If below absolute max try again // else give up... - if (cuda_parts.h[i] < hydro_h_max) { - cuda_hydro_fix_particle(i, c); + if ( cuda_parts.h[i] < hydro_h_max) { + cuda_parts.rho[i] = 0.f; + cuda_parts.wcount[i] = 0.f; + cuda_parts.wcount_dh[i] = 0.f; + cuda_parts.rho_dh[i] = 0.f; + cuda_parts.div_v[i] = 0.f; + cuda_parts.rot_v[i].x = 0.f; + cuda_parts.rot_v[i].y = 0.f; + cuda_parts.rot_v[i].z = 0.f; + redo = 1; + atomicAdd(&global_count, 1); } else { /* The particle is a lost cause */ cuda_parts.h[i] = hydro_h_max; @@ -1348,7 +1356,11 @@ __device__ void do_ghost(struct cell_cuda *c) { } } /* correct number of neighbours */ - + if(redo) { + cuda_hydro_fix_particle(i,c); + } + + } /* We now have a particle whose smoothing length has convegered */ /* Time to set the force variables */ /* Compute variables required for the force loop */ @@ -1394,7 +1406,8 @@ __global__ void swift_device_kernel() { __shared__ volatile int tid; __shared__ volatile int done; int i; - + if(threadIdx.x == 0 && blockIdx.x == 0) + global_count = 0; /* Main loop */ while (1) { __syncthreads(); @@ -1496,7 +1509,8 @@ __global__ void swift_device_kernel() { } } // End of main loop - + if(threadIdx.x == 0 && blockIdx.x == 0) + printf("global_count = %i\n", global_count); /* Don't need to do any cleanup, all the dependencies and skips and queues are * set by CPU. */ } @@ -1616,12 +1630,12 @@ __host__ void create_transfer_tasks(struct cell *c, int *k, int parent_load_task, int parent_unload_task) { tasks_host[*k].unlocks = - (int *)malloc(sizeof(int) * 9); // For now we create CPU storage for + (int *)malloc(sizeof(int) * 11); // For now we create CPU storage for // these unlocks for each task. No task // should have more than 9 unlocks to // other transfer tasks. tasks_host[*k].nr_unlock_tasks = 0; - tasks_host[*k].size_unlocks = 9; + tasks_host[*k].size_unlocks = 11; if (c->split) { /* If the task is split we create implicit tasks to deal with dependencies. @@ -1669,6 +1683,8 @@ __host__ void create_transfer_tasks(struct cell *c, int *k, c->unload_task = *k; *k = *k + 1; + /* load task unlocks the unload task...*/ + tasks_host[(*k)-2].unlocks[tasks_host[(*k)-2].nr_unlock_tasks++] = (*k)-1; /* Recurse down the tree. */ int load = *k - 2; int unload = *k - 1; @@ -1716,6 +1732,8 @@ __host__ void create_transfer_tasks(struct cell *c, int *k, } c->unload_task = *k; *k = *k + 1; + /* load task unlocks the unload task...*/ + tasks_host[(*k)-2].unlocks[tasks_host[(*k)-2].nr_unlock_tasks++] = (*k)-1; } } @@ -2663,10 +2681,6 @@ __host__ void create_tasks(struct engine *e) { host_pointers = NULL; } -__host__ void run_cuda() { - swift_device_kernel << <num_blocks, num_cuda_threads>>> (); - cudaErrCheck(cudaDeviceSynchronize()); -} /* Make the tests! */ @@ -3066,6 +3080,14 @@ __host__ void printTaskTimers(){ #endif } +__host__ void run_cuda() { + swift_device_kernel << <num_blocks, num_cuda_threads>>> (); +#ifdef CUDA_TASK_TIMERS + printTaskTimers(); +#endif + cudaErrCheck(cudaDeviceSynchronize()); +} + #ifdef WITH_CUDA #undef static #undef restrict