From 7065af1a826443ba935dcdf00f29fcbad5e603f6 Mon Sep 17 00:00:00 2001
From: d74ksy <aidan.chalk@durham.ac.uk>
Date: Tue, 5 Sep 2017 11:20:38 +0100
Subject: [PATCH] Broke it somehow but the forces and densities are closer

---
 src/CUDA/runner_cuda_main.cu | 69 ++++++++++++++++++++++--------------
 src/engine.c                 |  2 ++
 2 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/src/CUDA/runner_cuda_main.cu b/src/CUDA/runner_cuda_main.cu
index d1e70eb6d5..0a62a60d41 100644
--- a/src/CUDA/runner_cuda_main.cu
+++ b/src/CUDA/runner_cuda_main.cu
@@ -14,6 +14,7 @@ extern "C" {
 #include "queue_cuda.h"
 #include "task_cuda.h"
 #include "cell_cuda.h"
+#include "../active.h"
 #include "../kernel_hydro.h"
 #include "../dimension.h"
 #include "../cell.h"
@@ -310,8 +311,6 @@ __device__ void load_cell(int cell_index) {
       local_entropy_dt = current->entropy_dt;
 
       cuda_parts.h[start + i] = local_h;
-      if(start+i == 0)
-        printf("%f\n", cuda_parts.h[start+i]);
       cuda_parts.mass[start + i] = local_mass;
       cuda_parts.rho[start + i] = local_rho;
       cuda_parts.entropy[start + i] = local_entropy;
@@ -363,8 +362,9 @@ __device__ void unload_cell(int cell_index) {
 
   /* Get the pointer to the relevant cells. */
   struct cell *cpu_cell = cpu_cells[cell_index];
-  const struct cell_cuda *cell = &cells_cuda[cell_index];
+  struct cell_cuda *cell = &cells_cuda[cell_index];
   struct part *parts = cpu_cell->parts;
+  if (!cuda_cell_is_active(cell)) return;
   int i;
   /* Get the index to copy the data for the 0th particle in this cell to.*/
   int start = cell->first_part;
@@ -398,8 +398,9 @@ __device__ void unload_cell(int cell_index) {
 __device__ void doself_density(struct cell_cuda *ci) {
   /* Is the cell active? */
   if (!cuda_cell_is_active(ci)) {
+    if(threadIdx.x ==0)
     printf(
-        "Cell isn't active..., ti_end_min=%i, ti_current=%i, "
+        "Cell isn't active..., ti_end_min=%lli, ti_current=%lli, "
         "max_active_bin=%i\n",
         ci->ti_end_min, ti_current, max_active_bin);
     return;
@@ -511,7 +512,7 @@ __device__ void doself_density(struct cell_cuda *ci) {
 __device__ void dopair_density(struct cell_cuda *ci, struct cell_cuda *cj) {
 
   /* Are these cells active? */
-  if (!cuda_cell_is_active(ci) && !cuda_cell_is_active(cj)) return;
+  if (!cuda_cell_is_active(ci)) return;
 
   const int count_i = ci->part_count;
   const int count_j = cj->part_count;
@@ -678,6 +679,10 @@ __device__ void doself_force(struct cell_cuda *ci) {
         float wi, wj, wi_dx, wj_dx;
         const float fac_mu = 1.f;
 
+/*        if(cuda_parts.id[pid] == 142801)
+          printf("self Part %i dx %e %e %e\n", cuda_parts.id[pjd], dx[0], dx[1], dx[2]);
+        if(cuda_parts.id[pid] == 142801)
+          printf("self Part %i r2 = %e, hi = %e, hig2 = %e, j = %e, hjg2 = %e\n", cuda_parts.id[pjd], r2, hi, hig2, hj, hj*hj*kernel_gamma2);*/
         const float r = sqrtf(r2);
         const float r_inv = 1.0f / r;
@@ -775,7 +780,6 @@ __device__ void doself_force(struct cell_cuda *ci) {
       if (v_sig_stor > global_vsig)
         old = atomicCAS(address_as_int, assumed, __float_as_int(v_sig_stor));
     } while (assumed != old && v_sig_stor > global_vsig);
-
   }  // Outer loop
 }
 
@@ -785,7 +789,7 @@ __device__ void dopair_force(struct cell_cuda *ci, struct cell_cuda *cj) {
 
   /* Are these cells active? */
-  if (!cuda_cell_is_active(ci) && !cuda_cell_is_active(cj)) return;
+  if (!cuda_cell_is_active(ci)) return;
 
   const int count_i = ci->part_count;
   const int count_j = cj->part_count;
@@ -842,7 +846,6 @@ __device__ void dopair_force(struct cell_cuda *ci, struct cell_cuda *cj) {
       if (r2 < hig2 || r2 < hj * hj * kernel_gamma2) {
         float wi, wj, wi_dx, wj_dx;
         const float fac_mu = 1.f;
-
         const float r = sqrtf(r2);
         const float r_inv = 1.0f / r;
@@ -926,8 +929,6 @@ __device__ void dopair_force(struct cell_cuda *ci, struct cell_cuda *cj) {
 
     /* Flush to global stores.*/
     atomicAdd(&cuda_parts.a_hydro[pid].x, a_hydro.x);
-    if(cuda_parts.id[pid] == 11816)
-      printf("a_hydro.x = %f\n", cuda_parts.a_hydro[pid].x);
     atomicAdd(&cuda_parts.a_hydro[pid].y, a_hydro.y);
     atomicAdd(&cuda_parts.a_hydro[pid].z, a_hydro.z);
     atomicAdd(&cuda_parts.h_dt[pid], h_dt);
@@ -1099,6 +1100,8 @@ __device__ void cuda_doself_subset_density(int pid, int cell) {
     /* Get r and 1/r */
     const float r = sqrtf(r2);
     const float ri = 1.0f / r;
+/*    if(cuda_parts.id[pid] == 142801)
+      printf("neighbour = %lli, r=%e, h=%e\n", cuda_parts.id[j], r2, hi);*/
 
     /* Compute the kernel function */
     const float hi_inv = 1.0f / hi;
@@ -1113,7 +1116,7 @@ __device__ void cuda_doself_subset_density(int pid, int cell) {
     /* Compute condtribution to the number of neighbours */
     wcount += w;
     wcount_dh -= (hydro_dimension * w + ui * dw_dx);
-    
+
     const float fac = mj * dw_dx * ri;
 
     float3 piv, pjv;
@@ -1192,6 +1195,8 @@ __device__ void cuda_dopair_subset_density(int pid, int cell, int pid_cell) {
     const float r = sqrtf(r2);
     const float ri = 1.0f / r;
 
+    /* if(cuda_parts.id[pid] == 142801)
+      printf("pair neighbour = %lli, r=%e, h=%e\n", cuda_parts.id[j], r2, hi);*/
     /* Compute the kernel function */
     const float hi_inv = 1.0f / hi;
     const float ui = r * hi_inv;
@@ -1253,22 +1258,25 @@ __device__ void cuda_hydro_fix_particle(int pid, struct cell_cuda *c) {
   cuda_parts.rot_v[pid].x = 0.f;
   cuda_parts.rot_v[pid].y = 0.f;
   cuda_parts.rot_v[pid].z = 0.f;
-  atomicAdd(&global_count, 1);
 
   /* Climb up the cell hierarchy. */
   struct cell_cuda *c2 = c;
+
   for (int cell = (c - cells_cuda); cell >= 0; cell = c2->parent) {
     c2 = &cells_cuda[cell];
-    for (int l = 0; l < c->nr_links; l++) {
-      if (tasks[l].type == task_type_self ||
-          tasks[l].type == task_type_sub_self)
+    for (int l = 0; l < c2->nr_links; l++) {
+      const int tid = c2->links[l];
+      if (tasks[tid].type == task_type_self ||
+          tasks[tid].type == task_type_sub_self)
         cuda_doself_subset_density(pid, cell);
-      else if (tasks[l].type == task_type_pair ||
-               tasks[l].type == task_type_sub_pair) {
-        if (tasks[l].ci == cell) {
-          cuda_dopair_subset_density(pid, tasks[l].cj, cell);
+      else if (tasks[tid].type == task_type_pair ||
+               tasks[tid].type == task_type_sub_pair) {
+/*        if(cuda_parts.id[pid] == 142801)
+          printf("cell %lli Task %lli type %i ci %i cj %i \n", cell, (&tasks[tid]-tasks), tasks[tid].type, tasks[tid].ci, tasks[tid].cj);*/
+        if (tasks[tid].ci == cell) {
+          cuda_dopair_subset_density(pid, tasks[tid].cj, cell);
         } else {
-          cuda_dopair_subset_density(pid, tasks[l].ci, cell);
+          cuda_dopair_subset_density(pid, tasks[tid].ci, cell);
         }
       }
     }
@@ -1302,12 +1310,12 @@ __device__ void do_ghost(struct cell_cuda *c) {
     redo = 1;
     int a = 0;
     if (!cuda_part_is_active(i)) continue;
-    while( redo && a < 7 ){
+    while( redo && a < 30 ){
      a++;
      redo = 0;
      float h_new;
      const float h_old = cuda_parts.h[i];
-      const float h_old_dim = h_old*h_old*h_old;//cuda_pow_dimension(h_old);
+      const float h_old_dim = cuda_pow_dimension(h_old);//h_old*h_old*h_old;//d);
      const float h_old_dim_minus_one = cuda_pow_dimension_minus_one(h_old);
      if (cuda_parts.wcount[i] == 0.f) {
        h_new = 2.f * h_old;
@@ -1360,7 +1368,7 @@ __device__ void do_ghost(struct cell_cuda *c) {
        cuda_hydro_fix_particle(i,c);
      }
-      }
+    }
 
    /* We now have a particle whose smoothing length has convegered */
    /* Time to set the force variables */
    /* Compute variables required for the force loop */
@@ -1407,7 +1415,6 @@ __global__ void swift_device_kernel() {
   __shared__ volatile int done;
   int i;
   if(threadIdx.x == 0 && blockIdx.x == 0)
-    global_count = 0;
   /* Main loop */
   while (1) {
     __syncthreads();
@@ -1463,10 +1470,12 @@ __global__ void swift_device_kernel() {
           struct cell_cuda *ci = &cells_cuda[tasks[tid].ci];
           struct cell_cuda *cj = &cells_cuda[tasks[tid].cj];
           dopair_density(ci, cj);
+          dopair_density(cj, ci);
         } else if (subtype == task_subtype_force) {
           struct cell_cuda *ci = &cells_cuda[tasks[tid].ci];
           struct cell_cuda *cj = &cells_cuda[tasks[tid].cj];
           dopair_force(ci, cj);
+          dopair_force(cj, ci);
         }
       } else if (type == task_type_self || type == task_type_sub_self) {
         if (subtype == task_subtype_density) {
@@ -1509,8 +1518,6 @@ __global__ void swift_device_kernel() {
     }
   }  // End of main loop
-  if(threadIdx.x == 0 && blockIdx.x == 0)
-    printf("global_count = %i\n", global_count);
 
   /* Don't need to do any cleanup, all the dependencies and skips and queues are
    * set by CPU. */
 }
@@ -1883,8 +1890,16 @@ __host__ void update_tasks(struct engine *e) {
   for (int i = 0; i < nr_gpu_tasks; i++) {
     // Update the skip flag and reset the wait to 0.
     host_tasks[i].wait = 0;
-    if (host_tasks[i].type > type_load)
+    if (host_tasks[i].type > type_load){
+      struct task *t = host_tasks[i].task;
+      if(!(cell_is_active(t->ci, e) || (t->cj != NULL && cell_is_active(t->cj, e))) && !t->skip){
+        printf("Inactive task isn't skipped\n");
+      }
+      if(t->type != host_tasks[i].type || t->subtype != host_tasks[i].subtype)
+        printf("Pointers broke\n");
       host_tasks[i].skip = host_tasks[i].task->skip;
+      host_tasks[i].task->skip = 1;
+    }
     else
       host_tasks[i].skip = 0;
   }
diff --git a/src/engine.c b/src/engine.c
index f889ab3062..a3227d9a0e 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -3633,6 +3633,8 @@ void engine_step(struct engine *e) {
     engine_launch(e);
     TIMER_TOC(timer_runners);
 
+    printParticle(e->s->parts, e->s->xparts, 142800, e->s->nr_parts);
+
 #ifdef SWIFT_GRAVITY_FORCE_CHECKS
   /* Check the accuracy of the gravity calculation */
   if (e->policy & engine_policy_self_gravity)
-- 
GitLab
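
Note on the pattern this patch moves to: dopair_density() and dopair_force() now write only to ci's particles, so they check only ci's activity, and swift_device_kernel() compensates by issuing every pair task in both directions. A minimal stand-alone sketch of that one-sided-plus-double-dispatch idea follows; toy_cell, pair_one_sided and dispatch are invented names, nothing here is SWIFT API.

#include <cstdio>

struct toy_cell {
  int first, count; /* slice of the particle arrays owned by this cell */
  int active;       /* stand-in for SWIFT's time-bin activity test */
};

#define NPART 8
__device__ float vals[NPART]; /* per-particle input, e.g. masses */
__device__ float acc[NPART];  /* per-particle accumulator */

/* One-sided pair interaction: reads cj, writes only ci, so checking
 * ci's activity alone is sufficient. */
__device__ void pair_one_sided(toy_cell ci, toy_cell cj) {
  if (!ci.active) return;
  for (int i = ci.first + threadIdx.x; i < ci.first + ci.count;
       i += blockDim.x)
    for (int j = cj.first; j < cj.first + cj.count; j++)
      acc[i] += vals[j];
}

/* Dispatcher: one symmetric pair task becomes two one-sided calls. */
__global__ void dispatch(toy_cell a, toy_cell b) {
  for (int i = threadIdx.x; i < NPART; i += blockDim.x) {
    vals[i] = 1.f;
    acc[i] = 0.f;
  }
  __syncthreads();
  pair_one_sided(a, b);
  __syncthreads();
  pair_one_sided(b, a);
}

int main() {
  toy_cell a = {0, 4, 1}, b = {4, 4, 1};
  dispatch<<<1, 32>>>(a, b);
  float h_acc[NPART];
  cudaMemcpyFromSymbol(h_acc, acc, sizeof(h_acc));
  printf("acc[0]=%.1f acc[4]=%.1f\n", h_acc[0], h_acc[4]); /* both 4.0 */
  return 0;
}

Each pair is traversed twice, but the particles of an inactive cell are never written, which is what makes the relaxed activity checks in the dopair hunks safe.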
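
The doself_force() hunk keeps the atomicCAS loop that raises a particle's signal velocity. CUDA offers no native atomicMax for float; the usual workaround, sketched stand-alone below (atomic_max_float is an illustrative name), reinterprets the float bits as an int, which preserves ordering for the non-negative values a signal velocity takes.

#include <cstdio>

/* Atomically raise *addr to val. Positive IEEE-754 floats compare the same
 * as their bit patterns reinterpreted as ints, so atomicCAS on the int view
 * implements a float max; the loop retries only while val still wins. */
__device__ void atomic_max_float(float *addr, float val) {
  int *addr_as_int = (int *)addr;
  int old = *addr_as_int;
  while (val > __int_as_float(old)) {
    const int assumed = old;
    old = atomicCAS(addr_as_int, assumed, __float_as_int(val));
    if (old == assumed) break; /* our value was written */
  }
}

__global__ void propose(float *max_out) {
  /* every thread proposes a candidate, e.g. its pairwise v_sig */
  atomic_max_float(max_out, 0.5f * threadIdx.x);
}

int main() {
  float *d_max, h_max = 0.f;
  cudaMalloc(&d_max, sizeof(float));
  cudaMemcpy(d_max, &h_max, sizeof(float), cudaMemcpyHostToDevice);
  propose<<<4, 256>>>(d_max);
  cudaMemcpy(&h_max, d_max, sizeof(float), cudaMemcpyDeviceToHost);
  printf("v_sig max = %.1f\n", h_max); /* 127.5 */
  cudaFree(d_max);
  return 0;
}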
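
The do_ghost() change from the hard-coded h_old*h_old*h_old to cuda_pow_dimension(h_old) restores dimension-generic behaviour: SWIFT fixes the spatial dimension at compile time, and x*x*x silently assumed 3D. A stand-alone sketch in the style of SWIFT's pow_dimension; the HYDRO_DIMENSION constant here is assumed for illustration.

#include <stdio.h>

#define HYDRO_DIMENSION 3 /* selected at configure time in SWIFT */

/* x^d for the compiled dimension; the h*h*h it replaces was 3D-only. */
static inline float pow_dimension(float x) {
#if HYDRO_DIMENSION == 3
  return x * x * x;
#elif HYDRO_DIMENSION == 2
  return x * x;
#else
  return x;
#endif
}

int main(void) {
  const float h_old = 0.5f;
  printf("h_old^d = %f\n", pow_dimension(h_old)); /* 0.125 in 3D */
  return 0;
}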