diff --git a/tests/testcuda_shared.cu b/tests/testcuda_shared.cu
index 8c7b5375058e193a78172917029c1f2170fae0e4..bbaea05387bd3969d3676a730677b62e6ac62da9 100644
--- a/tests/testcuda_shared.cu
+++ b/tests/testcuda_shared.cu
@@ -633,107 +633,142 @@ __global__ void doself_density_shared(struct cell_cuda *ci) {
   float *v_y  = shared + 6*count_i;
   float *v_z  = shared + 7*count_i;
 
-  // Load particle position in shared mem
-  // TODO: loop fission?
-  // TODO: thread memory access rule? (skip blockDim or half-warp???)
-  for (int pid = threadIdx.x; pid < count_i; pid += blockDim.x) {
-    int id = part_i + pid;
-    x_x[pid] = cuda_parts.x_x[id] - ci->loc[0];
-    x_y[pid] = cuda_parts.x_y[id] - ci->loc[1];
-    x_z[pid] = cuda_parts.x_z[id] - ci->loc[2];
-    mass[pid] = cuda_parts.mass[id];
-    h[pid] = cuda_parts.h[id];
-    v_x[pid] = cuda_parts.v[id].x;
-    v_y[pid] = cuda_parts.v[id].y;
-    v_z[pid] = cuda_parts.v[id].z;
-  }  
-  __syncthreads();
-
-//  for (int pid = part_i + threadIdx.x; pid < part_i + count_i;
-//       pid += blockDim.x) {
-  for (int pid = threadIdx.x; pid < count_i;
-       pid += blockDim.x) {
-    const float hi = h[pid];
-    const float hig2 = hi * hi * kernel_gamma2;
-    int pid_write = pid + part_i;
-    /* Reset local values. */
-    rho = 0.0f;
-    rho_dh = 0.0f;
-    div_v = 0.0f;
-    wcount = 0.0f;
-    wcount_dh = 0.0f;
-    rot_v.x = 0.0f;
-    rot_v.y = 0.0f;
-    rot_v.z = 0.0f;
-
-    /* If the particle isn't active skip it. */
-    if (!cuda_part_is_active(pid_write)) {
-      continue;
-    }
-    /* Search for the neighbours! */
-    for (int pjd = 0; pjd < count_i; pjd++) {
-      /* Particles don't interact with themselves */
-      if (pid == pjd && pjd != count_i - 1) pjd++;
-      float dx[3], r2 = 0.0f;
-      dx[0] = x_x[pid] - x_x[pjd];
-      r2 += dx[0] * dx[0];
-      dx[1] = x_y[pid] - x_y[pjd];
-      r2 += dx[1] * dx[1];
-      dx[2] = x_z[pid] - x_z[pjd];
-      r2 += dx[2] * dx[2];
-
-      /* If in range then interact. */
-      if (r2 < hig2) {
-        float w, dw_dx;
-        float dv[3], curlvr[3];
-        /* Load mass on particle pj. */
-        const float mj = mass[pjd];
-
-        /* Get r and 1/r */
-        const float r = sqrtf(r2);
-        const float ri = 1.0f / r;
 
-        /* Compute the kernel function */
-        const float hi_inv = 1.0f / hi;
-        const float ui = r * hi_inv;
-
-        cuda_kernel_deval(ui, &w, &dw_dx);
-        /* Compute contribution to the density. */
-        rho += mj * w;
-        rho_dh -= mj * (hydro_dimension * w + ui * dw_dx);
-
-        /* Compute contribution to the number of neighbours */
-        wcount += w;
-        wcount_dh -= (hydro_dimension * w + ui * dw_dx);
-
-        const float fac = mj * dw_dx * ri;
-
-        /* Compute dv dot r */
-        dv[0] = v_x[pid] - v_x[pjd];
-        dv[1] = v_y[pid] - v_y[pjd];
-        dv[2] = v_z[pid] - v_z[pjd];
-        const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
-
-        div_v -= fac * dvdr;
-
-        curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
-        curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
-        curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
-
-        rot_v.x += fac * curlvr[0];
-        rot_v.y += fac * curlvr[1];
-        rot_v.z += fac * curlvr[2];
-      }
-    }  // Loop over cj.
-    /* Write data for particle pid back to global stores. */
-    atomicAdd(&cuda_parts.rho[pid_write], rho);
-    atomicAdd(&cuda_parts.rho_dh[pid_write], rho_dh);
-    atomicAdd(&cuda_parts.wcount[pid_write], wcount);
-    atomicAdd(&cuda_parts.wcount_dh[pid_write], wcount_dh);
-    atomicAdd(&cuda_parts.div_v[pid_write], div_v);
-    atomicAdd(&cuda_parts.rot_v[pid_write].x, rot_v.x);
-    atomicAdd(&cuda_parts.rot_v[pid_write].y, rot_v.y);
-    atomicAdd(&cuda_parts.rot_v[pid_write].z, rot_v.z);
+  int tile_size = count_i/2; 
+  int num_full_tiles = (count_i-1)/tile_size + 1;
+  int residual_parts = count_i - num_full_tiles*tile_size;
+  int num_tiles = num_full_tiles;
+  if(residual_parts>0)
+	num_tiles++;	
+  int current_tile_size = tile_size;
+  int tile_offset = part_i;
+
+  for (int pid_loc = threadIdx.x; pid_loc < count_i; pid_loc += blockDim.x) {
+  	 
+	 // Load local particle
+     int id_loc = part_i + pid_loc;
+     float x_x_loc = cuda_parts.x_x[id_loc] - ci->loc[0];// TODO: register?
+     float x_y_loc = cuda_parts.x_y[id_loc] - ci->loc[1];
+     float x_z_loc = cuda_parts.x_z[id_loc] - ci->loc[2];
+     float h_loc = cuda_parts.h[id_loc];
+     float v_x_loc = cuda_parts.v[id_loc].x;
+     float v_y_loc = cuda_parts.v[id_loc].y;
+     float v_z_loc = cuda_parts.v[id_loc].z;
+
+	 const float hi = h_loc;// TODO: remove this line
+   	 const float hig2 = hi * hi * kernel_gamma2;
+   	 int pid_write = pid_loc;// TODO: remove this line
+
+	 /* Reset local values. */
+     rho = 0.0f;		
+     rho_dh = 0.0f;	
+	 div_v = 0.0f;
+     wcount = 0.0f;
+     wcount_dh = 0.0f;
+     rot_v.x = 0.0f;
+     rot_v.y = 0.0f;
+     rot_v.z = 0.0f;
+
+     /* If the particle isn't active skip it. */
+     if (!cuda_part_is_active(pid_write)) {
+     	continue;
+     }
+	 __syncthreads();// TODO: remove this
+
+	 // Loop over tiles
+	 for(int tileid=0;tileid<num_tiles;tileid++) {
+
+     	// Load particle position in shared mem
+ 		// TODO: loop fission?
+  	 	// TODO: thread memory access rule? (skip blockDim or half-warp???)
+	 	if(tileid == num_tiles - 1 && residual_parts>0)
+	    	current_tile_size = residual_parts;
+
+     	printf("tileid: %d, tile_size: %d, num_full_tiles: %d, residual_parts: %d, num_tiles:"
+     " %d, current_tile_size: %d, tileid, tile_offset: %d\n",tile_size,num_full_tiles,residual_parts,num_tiles,current_tile_size,tile_offset);
+
+       	for (int pjd = threadIdx.x; pjd < current_tile_size; pjd +=blockDim.x) {//  
+    	 	int jd = tile_offset + pjd;
+         	x_x[pjd] = cuda_parts.x_x[jd] - ci->loc[0];// TODO: register?
+    	 	x_y[pjd] = cuda_parts.x_y[jd] - ci->loc[1];
+    	 	x_z[pjd] = cuda_parts.x_z[jd] - ci->loc[2];
+    	 	mass[pjd] = cuda_parts.mass[jd];
+    	 	h[pjd] = cuda_parts.h[jd];
+    	 	v_x[pjd] = cuda_parts.v[jd].x;
+    	 	v_y[pjd] = cuda_parts.v[jd].y;
+    	 	v_z[pjd] = cuda_parts.v[jd].z;
+     	}  
+     	__syncthreads();
+	     
+		/* Search for the neighbours! */
+    	for (int pjd = 0; pjd < current_tile_size; pjd++) {
+    	 	int jd = tile_offset + pjd;			
+      		/* Particles don't interact with themselves */
+      		if (id_loc == jd && pjd != current_tile_size - 1) pjd++;
+      		float dx[3], r2 = 0.0f;
+      		dx[0] = x_x_loc - x_x[pjd];
+      		r2 += dx[0] * dx[0];
+      		dx[1] = x_y_loc - x_y[pjd];
+      		r2 += dx[1] * dx[1];
+      		dx[2] = x_z_loc - x_z[pjd];
+      		r2 += dx[2] * dx[2];
+
+      		/* If in range then interact. */
+      		if (r2 < hig2) {
+        	   float w, dw_dx;
+        	   float dv[3], curlvr[3];
+        	   /* Load mass on particle pj. */
+        	   const float mj = mass[pjd];
+
+       		   /* Get r and 1/r */
+        	   const float r = sqrtf(r2);
+        	   const float ri = 1.0f / r;
+
+			   /* Compute the kernel function */
+        	   const float hi_inv = 1.0f / hi;
+        	   const float ui = r * hi_inv;
+
+        	   cuda_kernel_deval(ui, &w, &dw_dx);
+        	   /* Compute contribution to the density. */
+        	   rho += mj * w;
+        	   rho_dh -= mj * (hydro_dimension * w + ui * dw_dx);
+
+        	   /* Compute contribution to the number of neighbours */
+        	   wcount += w;
+        	   wcount_dh -= (hydro_dimension * w + ui * dw_dx);
+
+        	   const float fac = mj * dw_dx * ri;
+
+        	   /* Compute dv dot r */
+        	   dv[0] = v_x_loc - v_x[pjd];
+        	   dv[1] = v_y_loc - v_y[pjd];
+        	   dv[2] = v_z_loc - v_z[pjd];
+        	   const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
+
+        	   div_v -= fac * dvdr;
+
+        	   curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
+        	   curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
+        	   curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];
+
+        	   rot_v.x += fac * curlvr[0];
+        	   rot_v.y += fac * curlvr[1];
+        	   rot_v.z += fac * curlvr[2];
+      		}
+    		
+		}
+
+		/* Write data for particle pid back to global stores. */
+    	atomicAdd(&cuda_parts.rho[pid_write], rho);
+    	atomicAdd(&cuda_parts.rho_dh[pid_write], rho_dh);
+    	atomicAdd(&cuda_parts.wcount[pid_write], wcount);
+    	atomicAdd(&cuda_parts.wcount_dh[pid_write], wcount_dh);
+    	atomicAdd(&cuda_parts.div_v[pid_write], div_v);
+    	atomicAdd(&cuda_parts.rot_v[pid_write].x, rot_v.x);
+    	atomicAdd(&cuda_parts.rot_v[pid_write].y, rot_v.y);
+    	atomicAdd(&cuda_parts.rot_v[pid_write].z, rot_v.z);
+		tile_offset += current_tile_size; 
+  	} 
   }
 }
 
@@ -1499,8 +1534,9 @@ int main(int argc, char *argv[]) {
     }
 
     //    do_test_self_density<<<volume/CUDA_THREADS + 1,CUDA_THREADS>>>(cuda_ci);
-    //doself_density<<<1,CUDA_THREADS>>>(cuda_ci);
-    size_t sum = 8*sizeof(float);
+//    doself_density<<<1,CUDA_THREADS>>>(cuda_ci);
+    
+	size_t sum = 8*sizeof(float);
     sum *= volume;
     doself_density_shared<<<1,CUDA_THREADS,sum>>>(cuda_ci);
     cudaErrCheck( cudaPeekAtLastError() );