Commit 4e0393b2 authored by James Willis
Browse files

Formatting.

parent 85b1b2cc
......@@ -23,22 +23,22 @@
#include "../config.h"
/* Local headers */
#include "vector.h"
#include "part.h"
#include "cell.h"
#include "error.h"
#include "part.h"
#include "vector.h"
#define NUM_VEC_PROC 2
/* Length, in floats, of each array in the secondary cache (struct c2_cache).
 * The expansion is wrapped in parentheses so that the macro behaves as a
 * single operand wherever it is used (e.g. `2 * C2_CACHE_SIZE`). */
#define C2_CACHE_SIZE \
  ((NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE))
/* Byte alignment requested for the secondary cache arrays: one vector of
 * floats. Parenthesised for the same reason as C2_CACHE_SIZE (otherwise
 * `n / C2_CACHE_ALIGN` would parse as `(n / sizeof(float)) * VEC_SIZE`). */
#define C2_CACHE_ALIGN (sizeof(float) * VEC_SIZE)
/* Cache struct to hold a local copy of a cell's particle
/* Cache struct to hold a local copy of a cell's particle
 * properties required for density/force calculations.*/
struct cache {
struct cache {
/* Particle x position. */
float *restrict x __attribute__((aligned(sizeof(float) * VEC_SIZE)));
float *restrict x __attribute__((aligned(sizeof(float) * VEC_SIZE)));
/* Particle y position. */
float *restrict y __attribute__((aligned(sizeof(float) * VEC_SIZE)));
......@@ -62,10 +62,10 @@ struct cache {
/* Cache size. */
int count;
};
/* Secondary cache struct to hold a list of interactions between two particles.*/
/* Secondary cache struct to hold a list of interactions between two
* particles.*/
struct c2_cache {
/* Separation between two particles squared. */
......@@ -81,11 +81,11 @@ struct c2_cache {
float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* Mass of particle pj. */
float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* x velocity of particle pj. */
float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* y velocity of particle pj. */
float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
......@@ -99,9 +99,11 @@ struct c2_cache {
* @param c The cache.
* @param count Number of particles to allocate space for.
*/
__attribute__((always_inline)) INLINE void cache_init(struct cache *c, size_t count) {
__attribute__((always_inline)) INLINE void cache_init(struct cache *c,
size_t count) {
/* Align cache on correct byte boundary and pad cache size to include 2 vector lengths for remainder operations. */
/* Align cache on correct byte boundary and pad cache size to include 2 vector
* lengths for remainder operations. */
unsigned long alignment = sizeof(float) * VEC_SIZE;
unsigned int sizeBytes = (count + (2 * VEC_SIZE)) * sizeof(float);
int error = 0;
......@@ -118,16 +120,17 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, size_t co
free(c->h);
}
error += posix_memalign((void **)&c->x, alignment,sizeBytes);
error += posix_memalign((void **)&c->y, alignment,sizeBytes);
error += posix_memalign((void **)&c->z, alignment,sizeBytes);
error += posix_memalign((void **)&c->m, alignment,sizeBytes);
error += posix_memalign((void **)&c->vx, alignment,sizeBytes);
error += posix_memalign((void **)&c->vy, alignment,sizeBytes);
error += posix_memalign((void **)&c->vz, alignment,sizeBytes);
error += posix_memalign((void **)&c->h, alignment,sizeBytes);
if (error !=0) error("Couldn't allocate cache, no. of particles: %d", (int)count);
error += posix_memalign((void **)&c->x, alignment, sizeBytes);
error += posix_memalign((void **)&c->y, alignment, sizeBytes);
error += posix_memalign((void **)&c->z, alignment, sizeBytes);
error += posix_memalign((void **)&c->m, alignment, sizeBytes);
error += posix_memalign((void **)&c->vx, alignment, sizeBytes);
error += posix_memalign((void **)&c->vy, alignment, sizeBytes);
error += posix_memalign((void **)&c->vz, alignment, sizeBytes);
error += posix_memalign((void **)&c->h, alignment, sizeBytes);
if (error != 0)
error("Couldn't allocate cache, no. of particles: %d", (int)count);
c->count = count;
}
......@@ -137,22 +140,22 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, size_t co
* @param ci The #cell.
* @param ci_cache The cache.
*/
__attribute__((always_inline)) INLINE void cache_read_particles(
    const struct cell *const ci, struct cache *const ci_cache) {

  /* NOTE(review): nothing here checks that the cache arrays can hold
   * ci->count entries; presumably the caller has sized the cache with
   * cache_init() beforehand -- confirm. */

  /* Shift the particles positions to a local frame so single precision can be
   * used instead of double precision. */
  for (int i = 0; i < ci->count; i++) {

    /* Positions are stored relative to the cell location ci->loc. */
    ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
    ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1];
    ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2];

    /* Smoothing length, mass and velocity components are copied unchanged. */
    ci_cache->h[i] = ci->parts[i].h;
    ci_cache->m[i] = ci->parts[i].mass;
    ci_cache->vx[i] = ci->parts[i].v[0];
    ci_cache->vy[i] = ci->parts[i].v[1];
    ci_cache->vz[i] = ci->parts[i].v[2];
  }
}
#endif /* SWIFT_CACHE_H */
......@@ -384,11 +384,16 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
#ifdef WITH_VECTORIZATION
/**
* @brief Density interaction computed using 2 interleaved vectors (non-symmetric vectorized version).
* @brief Density interaction computed using 2 interleaved vectors
* (non-symmetric vectorized version).
*/
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv,
vector vix, vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,vector *curlvySum, vector *curlvzSum, vector mask, vector mask2, int knlMask, int knlMask2) {
runner_iact_nonsym_2_vec_density(
float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix,
vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
vector mask, vector mask2, int knlMask, int knlMask2) {
vector r, ri, r2, xi, wi, wi_dx;
vector mj;
......@@ -431,7 +436,7 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, vec
xi2.v = vec_mul(r_2.v, hi_inv.v);
/* Calculate the kernel for two particles. */
kernel_deval_2_vec(&xi, &wi, &wi_dx,&xi2, &wi2, &wi_dx2);
kernel_deval_2_vec(&xi, &wi, &wi_dx, &xi2, &wi2, &wi_dx2);
/* Compute dv. */
dvx.v = vec_sub(vix.v, vjx.v);
......@@ -443,66 +448,106 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, vec
/* Compute dv dot r */
dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v)));
dvdr2.v = vec_fma(dvx2.v, dx2.v, vec_fma(dvy2.v, dy2.v, vec_mul(dvz2.v, dz2.v)));
dvdr2.v =
vec_fma(dvx2.v, dx2.v, vec_fma(dvy2.v, dy2.v, vec_mul(dvz2.v, dz2.v)));
dvdr.v = vec_mul(dvdr.v, ri.v);
dvdr2.v = vec_mul(dvdr2.v, ri2.v);
/* Compute dv cross r */
curlvrx.v = vec_fma(dvy.v, dz.v, vec_mul(vec_set1(-1.0f),vec_mul(dvz.v, dy.v)));
curlvrx2.v = vec_fma(dvy2.v, dz2.v, vec_mul(vec_set1(-1.0f),vec_mul(dvz2.v, dy2.v)));
curlvry.v = vec_fma(dvz.v, dx.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz.v)));
curlvry2.v = vec_fma(dvz2.v, dx2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx2.v, dz2.v)));
curlvrz.v = vec_fma(dvx.v, dy.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx.v)));
curlvrz2.v = vec_fma(dvx2.v, dy2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy2.v, dx2.v)));
curlvrx.v = vec_mul(curlvrx.v,ri.v);
curlvrx2.v = vec_mul(curlvrx2.v,ri2.v);
curlvry.v = vec_mul(curlvry.v,ri.v);
curlvry2.v = vec_mul(curlvry2.v,ri2.v);
curlvrz.v = vec_mul(curlvrz.v,ri.v);
curlvrz2.v = vec_mul(curlvrz2.v,ri2.v);
/* Mask updates to intermediate vector sums for particle pi. */
curlvrx.v =
vec_fma(dvy.v, dz.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy.v)));
curlvrx2.v =
vec_fma(dvy2.v, dz2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz2.v, dy2.v)));
curlvry.v =
vec_fma(dvz.v, dx.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz.v)));
curlvry2.v =
vec_fma(dvz2.v, dx2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx2.v, dz2.v)));
curlvrz.v =
vec_fma(dvx.v, dy.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx.v)));
curlvrz2.v =
vec_fma(dvx2.v, dy2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy2.v, dx2.v)));
curlvrx.v = vec_mul(curlvrx.v, ri.v);
curlvrx2.v = vec_mul(curlvrx2.v, ri2.v);
curlvry.v = vec_mul(curlvry.v, ri.v);
curlvry2.v = vec_mul(curlvry2.v, ri2.v);
curlvrz.v = vec_mul(curlvrz.v, ri.v);
curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);
/* Mask updates to intermediate vector sums for particle pi. */
#ifdef HAVE_AVX512_F
rhoSum->v = _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
rhoSum->v = _mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v);
rho_dhSum->v = _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(xi.v, wi_dx.v))));
rho_dhSum->v = _mm512_mask_sub_ps(rho_dhSum->v, knlMask2, rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(xi2.v, wi_dx2.v))));
rhoSum->v =
_mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
rhoSum->v =
_mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v);
rho_dhSum->v =
_mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))));
rho_dhSum->v = _mm512_mask_sub_ps(
rho_dhSum->v, knlMask2, rho_dhSum->v,
vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(xi2.v, wi_dx2.v))));
wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v);
wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask, wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));
wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask2, wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v));
div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), curlvxSum->v);
curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), curlvxSum->v);
curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), curlvySum->v);
curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), curlvySum->v);
curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), curlvzSum->v);
curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), curlvzSum->v);
wcountSum->v =
_mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v);
wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask,
wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));
wcount_dhSum->v = _mm512_mask_sub_ps(
wcount_dhSum->v, knlMask2, wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v));
div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v,
vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
curlvxSum->v);
curlvxSum->v = _mm512_mask_add_ps(
curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)),
curlvxSum->v);
curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
curlvySum->v);
curlvySum->v = _mm512_mask_add_ps(
curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)),
curlvySum->v);
curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
curlvzSum->v);
curlvzSum->v = _mm512_mask_add_ps(
curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)),
curlvzSum->v);
#else
rhoSum->v += vec_and(vec_mul(mj.v, wi.v),mask.v);
rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v),mask2.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(xi.v, wi_dx.v))),mask.v);
rho_dhSum->v -= vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(xi2.v, wi_dx2.v))),mask2.v);
wcountSum->v += vec_and(wi.v,mask.v);
wcountSum->v += vec_and(wi2.v,mask2.v);
wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v),mask.v);
wcount_dhSum->v -= vec_and(vec_mul(xi2.v, wi_dx2.v),mask2.v);
div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)),mask.v);
div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)),mask2.v);
curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),mask.v);
curlvxSum->v += vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)),mask2.v);
curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),mask.v);
curlvySum->v += vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)),mask2.v);
curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),mask.v);
curlvzSum->v += vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)),mask2.v);
rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v), mask2.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))),
mask.v);
rho_dhSum->v -=
vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(xi2.v, wi_dx2.v))),
mask2.v);
wcountSum->v += vec_and(wi.v, mask.v);
wcountSum->v += vec_and(wi2.v, mask2.v);
wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
wcount_dhSum->v -= vec_and(vec_mul(xi2.v, wi_dx2.v), mask2.v);
div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2.v);
curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
curlvxSum->v +=
vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2.v);
curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
curlvySum->v +=
vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2.v);
curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
curlvzSum->v +=
vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2.v);
#endif
}
#endif
......@@ -639,17 +684,19 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
grad_hi.v = vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f,
pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f);
grad_hj.v = vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f,
pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f);
pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
grad_hi.v =
vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f,
pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f);
grad_hj.v =
vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f,
pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f);
pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
pi[5]->rho, pi[6]->rho, pi[7]->rho);
pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
......@@ -682,11 +729,13 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
grad_hi.v = vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f);
grad_hj.v = vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f);
pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
grad_hi.v =
vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f);
grad_hj.v =
vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f);
pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
......@@ -748,7 +797,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
/* Now, convolve with the kernel */
visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v;
sph_term.v = (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * ri.v;
sph_term.v =
(grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) *
ri.v;
/* Eventually get the acceleration */
acc.v = visc_term.v + sph_term.v;
......@@ -913,17 +964,19 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
grad_hi.v = vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f,
pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f);
grad_hj.v = vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f,
pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f);
pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
grad_hi.v =
vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f,
pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f);
grad_hj.v =
vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f,
pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f);
pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
pi[5]->rho, pi[6]->rho, pi[7]->rho);
pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
......@@ -955,11 +1008,13 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
#elif VEC_SIZE == 4
mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
grad_hi.v = vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f);
grad_hj.v = vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f);
pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
grad_hi.v =
vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f);
grad_hj.v =
vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f);
pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
......@@ -1021,7 +1076,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
/* Now, convolve with the kernel */
visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v;
sph_term.v = (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * ri.v;
sph_term.v =
(grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) *
ri.v;
/* Eventually get the acceleration */
acc.v = visc_term.v + sph_term.v;
......
......@@ -373,19 +373,24 @@ static const vector c5 = FILL_VEC(1.f);
#endif
/**
* @brief Computes the kernel function and its derivative for two particles using interleaved vectors.
* @brief Computes the kernel function and its derivative for two particles
* using interleaved vectors.
*
* Return 0 if $u > \\gamma = H/h$
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
* @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
* @param u2 The ratio of the distance to the smoothing length $u = x/h$ for second particle.
* @param w2 (return) The value of the kernel function $W(x,h)$ for second particle.
* @param dw_dx2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for second particle.
* @param u2 The ratio of the distance to the smoothing length $u = x/h$ for
* second particle.
* @param w2 (return) The value of the kernel function $W(x,h)$ for second
* particle.
* @param dw_dx2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for
* second particle.
*/
__attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
vector *u, vector *w, vector *dw_dx, vector *u2, vector *w2, vector *dw_dx2) {
vector *u, vector *w, vector *dw_dx, vector *u2, vector *w2,
vector *dw_dx2) {
/* Go to the range [0,1[ from [0,H[ */
vector x, x2;
......@@ -414,17 +419,21 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
w->v = vec_fma(x.v, w->v, c4.v);
w2->v = vec_fma(x2.v, w2->v, c4.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
w->v = vec_fma(x.v, w->v, c5.v);
w2->v = vec_fma(x2.v, w2->v, c5.v);
/* Return everything */
w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
w2->v = vec_mul(w2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
w->v =
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
w2->v = vec_mul(w2->v,
vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
#else
/* Load x and get the interval id. */
......@@ -462,7 +471,6 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
dw_dx2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
#endif
}
#endif
......
......@@ -53,13 +53,13 @@
#include "hydro_properties.h"
#include "kick.h"
#include "minmax.h"
#include "runner_doiact_vec.h"
#include "scheduler.h"
#include "sourceterms.h"
#include "space.h"
#include "task.h"
#include "timers.h"
#include "timestep.h"
#include "runner_doiact_vec.h"
/**
* @brief Entry in a list of sorted indices.
......
......@@ -47,7 +47,7 @@ struct runner {
/*! The engine owning this runner. */
struct engine *e;
/*! The particle cache of this runner. */
struct cache par_cache;
};
......
This diff is collapsed.
......@@ -24,13 +24,13 @@
#include "../config.h"
/* Local headers */
#include "vector.h"
#include "part.h"
#include "cell.h"
#include "runner.h"
#include "engine.h"
#include "hydro.h"
#include "part.h"
#include "runner.h"
#include "timers.h"
#include "engine.h"
#include "vector.h"
/* Function prototypes. */
void runner_doself1_density_vec(struct runner *r, struct cell *restrict c);
......
......@@ -24,6 +24,7 @@
/* Local headers. */
#include "atomic.h"
#include "cache.h"
#include "cell.h"
#include "clocks.h"
#include "const.h"
......@@ -56,6 +57,5 @@
#include "tools.h"
#include "units.h"
#include "version.h"
#include "cache.h"