SWIFT / SWIFTsim · Commits · 028fb65b

Commit 028fb65b
authored May 21, 2017 by James Willis

    Formatting.

parent ae2120aa
8 changed files
src/cache.h
...
...
@@ -62,19 +62,19 @@ struct cache {

  /* Particle z velocity. */
  float *restrict vz __attribute__((aligned(CACHE_ALIGN)));

  /* Particle density. */
  float *restrict rho __attribute__((aligned(CACHE_ALIGN)));

  /* Particle smoothing length gradient. */
  float *restrict grad_h __attribute__((aligned(CACHE_ALIGN)));

  /* Pressure over density squared. */
  float *restrict pOrho2 __attribute__((aligned(CACHE_ALIGN)));

  /* Balsara switch. */
  float *restrict balsara __attribute__((aligned(CACHE_ALIGN)));

  /* Particle sound speed. */
  float *restrict soundspeed __attribute__((aligned(CACHE_ALIGN)));
...
...
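The fields above are flat float arrays rather than an array of particle structs: each pointer is `restrict` (no aliasing between cache fields) and its storage is aligned to `CACHE_ALIGN`, which lets the auto-vectorizer issue aligned vector loads. A minimal sketch of how one such buffer could be allocated is shown below; the `cache_alloc_field` helper and the value of `CACHE_ALIGN` are assumptions for illustration, not SWIFT's actual allocation code.

    #include <stdlib.h>

    #define CACHE_ALIGN 64 /* assumed alignment in bytes */

    /* Hypothetical helper: allocate an aligned array of 'count' floats. */
    static float *cache_alloc_field(size_t count) {
      void *buf = NULL;
      /* posix_memalign returns storage aligned to the requested boundary. */
      if (posix_memalign(&buf, CACHE_ALIGN, count * sizeof(float)) != 0)
        return NULL;
      return (float *)buf;
    }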
@@ -112,19 +112,19 @@ struct c2_cache {

  /* z velocity of particle pj. */
  float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));

  /* Density of particle pj. */
  float rhoq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));

  /* Smoothing length gradient of particle pj. */
  float grad_hq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));

  /* Pressure over density squared of particle pj. */
  float pOrho2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));

  /* Balsara switch of particle pj. */
  float balsaraq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));

  /* Sound speed of particle pj. */
  float soundspeedq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
...
...
@@ -213,7 +213,7 @@ __attribute__((always_inline)) INLINE void cache_read_particles(

    ci_cache->vx[i] = ci->parts[i].v[0];
    ci_cache->vy[i] = ci->parts[i].v[1];
    ci_cache->vz[i] = ci->parts[i].v[2];
    ci_cache->rho[i] = ci->parts[i].rho;
    ci_cache->grad_h[i] = ci->parts[i].force.f;
    ci_cache->pOrho2[i] = ci->parts[i].force.P_over_rho2;
...
...
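These assignments copy particle fields out of the array-of-structs `parts` storage into the struct-of-arrays cache, so the vectorized interaction loops that follow can read each quantity contiguously. A stripped-down sketch of the same gather pattern (the `part` and `part_cache` types here are illustrative stand-ins, not SWIFT's definitions):

    struct part { float v[3]; float rho; };           /* illustrative AoS particle */
    struct part_cache { float *vx, *vy, *vz, *rho; }; /* illustrative SoA cache */

    /* Gather AoS particle data into SoA arrays for vector-friendly access. */
    static void gather_into_cache(struct part_cache *c, const struct part *parts,
                                  int count) {
      for (int i = 0; i < count; i++) {
        c->vx[i] = parts[i].v[0];
        c->vy[i] = parts[i].v[1];
        c->vz[i] = parts[i].v[2];
        c->rho[i] = parts[i].rho;
      }
    }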
src/hydro/Gadget2/hydro_iact.h
...
...
@@ -434,14 +434,20 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,

  /* Mask updates to intermediate vector sums for particle pi. */
  rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
  rho_dhSum->v = vec_mask_sub(
      rho_dhSum->v,
      vec_mul(mj.v,
              vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(xi.v, wi_dx.v))),
      mask);
  wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
  wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
  div_vSum->v =
      vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
  curlvxSum->v = vec_mask_add(curlvxSum->v,
                              vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
  curlvySum->v = vec_mask_add(curlvySum->v,
                              vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
  curlvzSum->v = vec_mask_add(curlvzSum->v,
                              vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
}
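Each accumulator above is only updated in the vector lanes whose mask bit is set, which is how the density loop copes with a final chunk that is not a full vector of neighbours. In scalar terms, `vec_mask_add` / `vec_mask_sub` behave roughly as in the sketch below (the lane count and the per-lane mask array are illustrative assumptions, not the real `vector` / `mask_t` layout):

    #define VEC_SIZE 8 /* assumed lane count: 8 floats for AVX2 */

    /* Scalar model of vec_mask_add: acc[k] += val[k] only where the mask is set. */
    static void mask_add(float acc[VEC_SIZE], const float val[VEC_SIZE],
                         const int mask[VEC_SIZE]) {
      for (int k = 0; k < VEC_SIZE; k++)
        if (mask[k]) acc[k] += val[k];
    }

    /* Scalar model of vec_mask_sub: acc[k] -= val[k] only where the mask is set. */
    static void mask_sub(float acc[VEC_SIZE], const float val[VEC_SIZE],
                         const int mask[VEC_SIZE]) {
      for (int k = 0; k < VEC_SIZE; k++)
        if (mask[k]) acc[k] -= val[k];
    }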
/**
...
...
@@ -449,12 +455,14 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
 * (non-symmetric vectorized version).
 */
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_density(
    float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix,
    vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
    vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
    mask_t mask, mask_t mask2, short mask_cond) {

  vector r, ri, r2, xi, wi, wi_dx;
  vector mj;
...
...
@@ -534,35 +542,48 @@ runner_iact_nonsym_2_vec_density(
  curlvrz.v = vec_mul(curlvrz.v, ri.v);
  curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);

  /* Mask updates to intermediate vector sums for particle pi. */
  /* Mask only when needed. */
  if (mask_cond) {
    rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
    rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
    rho_dhSum->v = vec_mask_sub(
        rho_dhSum->v,
        vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
                              vec_mul(xi.v, wi_dx.v))),
        mask);
    rho_dhSum->v = vec_mask_sub(
        rho_dhSum->v,
        vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
                               vec_mul(xi2.v, wi_dx2.v))),
        mask2);
    wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
    wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
    wcount_dhSum->v =
        vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
    wcount_dhSum->v =
        vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2);
    div_vSum->v = vec_mask_sub(div_vSum->v,
                               vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
    div_vSum->v = vec_mask_sub(
        div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
    curlvxSum->v = vec_mask_add(
        curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
    curlvxSum->v = vec_mask_add(
        curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
    curlvySum->v = vec_mask_add(
        curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
    curlvySum->v = vec_mask_add(
        curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
    curlvzSum->v = vec_mask_add(
        curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
    curlvzSum->v = vec_mask_add(
        curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
  } else {
    rhoSum->v += vec_mul(mj.v, wi.v);
    rhoSum->v += vec_mul(mj2.v, wi2.v);
    rho_dhSum->v -= vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
                                          vec_mul(xi.v, wi_dx.v)));
    rho_dhSum->v -= vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
                                           vec_mul(xi2.v, wi_dx2.v)));
    wcountSum->v += wi.v;
    wcountSum->v += wi2.v;
    wcount_dhSum->v -= vec_mul(xi.v, wi_dx.v);
...
...
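The `mask_cond` flag in this two-vector variant lets the caller skip masking entirely when both chunks are known to hold only valid neighbours, so the common path pays no blending cost. A scalar sketch of that branch structure (lane count and per-lane masks are again illustrative assumptions, as in the earlier sketch):

    #define VEC_SIZE 8 /* assumed lane count: 8 floats for AVX2 */

    /* Accumulate a contribution with or without per-lane masking. */
    static void accumulate_rho(float rho_sum[VEC_SIZE], const float mj[VEC_SIZE],
                               const float wi[VEC_SIZE], const int mask[VEC_SIZE],
                               int mask_cond) {
      if (mask_cond) {
        /* Some lanes are padding: only fold in lanes flagged as valid. */
        for (int k = 0; k < VEC_SIZE; k++)
          if (mask[k]) rho_sum[k] += mj[k] * wi[k];
      } else {
        /* Every lane holds a real neighbour: accumulate unconditionally. */
        for (int k = 0; k < VEC_SIZE; k++) rho_sum[k] += mj[k] * wi[k];
      }
    }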
@@ -1140,9 +1161,15 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
}

#ifdef WITH_VECTORIZATION
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force(
    float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy,
    vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2,
    vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz,
    float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj,
    float *Mj, vector *hi_inv, float *Hj_inv, vector *a_hydro_xSum,
    vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum,
    vector *v_sigSum, vector *entropy_dtSum, mask_t mask) {
#ifdef WITH_VECTORIZATION
...
...
@@ -1164,7 +1191,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force
  dx.v = vec_load(Dx);
  dy.v = vec_load(Dy);
  dz.v = vec_load(Dz);

  vjx.v = vec_load(Vjx);
  vjy.v = vec_load(Vjy);
  vjz.v = vec_load(Vjz);
...
...
@@ -1179,7 +1206,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force
  fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */

  /* Load stuff. */
  balsara.v = balsara_i->v + balsara_j.v;

  /* Get the radius and inverse radius. */
...
...
@@ -1195,10 +1222,10 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force

  /* Get the kernel for hj. */
  hjd_inv = pow_dimension_plus_one_vec(hj_inv);
  xj.v = r.v * hj_inv.v;

  /* Calculate the kernel for two particles. */
  kernel_eval_dWdx_force_vec(&xj, &wj_dx);

  wj_dr.v = hjd_inv.v * wj_dx.v;

  /* Compute dv dot r. */
...
...
@@ -1223,7 +1250,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force
  sph_term.v =
      (grad_hi->v * piPOrho2->v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) *
      ri.v;

  /* Eventually get the acceleration */
  acc.v = visc_term.v + sph_term.v;
...
...
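For orientation, `sph_term` mirrors the standard Gadget-2-style SPH momentum equation: each neighbour j contributes an acceleration along the pair separation of roughly

    a_ij ~ m_j * [ f_i * (P_i / rho_i^2) * dW/dr(r, h_i)
                 + f_j * (P_j / rho_j^2) * dW/dr(r, h_j) ] * (dx / r),

where f is the grad-h correction factor cached in `grad_h` and P/rho^2 is the cached `pOrho2`; `visc_term` adds the artificial-viscosity contribution before the result is applied to the particle further down (outside this excerpt). The equation is quoted from the standard scheme for context, not taken verbatim from this diff.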
@@ -1255,9 +1282,16 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force
#endif
}

__attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force(
    float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy,
    vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2,
    vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz,
    float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj,
    float *Mj, vector *hi_inv, float *Hj_inv, vector *a_hydro_xSum,
    vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum,
    vector *v_sigSum, vector *entropy_dtSum, mask_t mask, mask_t mask_2,
    short mask_cond) {
#ifdef WITH_VECTORIZATION
...
...
@@ -1292,7 +1326,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
  dx.v = vec_load(Dx);
  dy.v = vec_load(Dy);
  dz.v = vec_load(Dz);

  vjx.v = vec_load(Vjx);
  vjy.v = vec_load(Vjy);
  vjz.v = vec_load(Vjz);
...
...
@@ -1311,7 +1345,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
  dx_2.v = vec_load(&Dx[VEC_SIZE]);
  dy_2.v = vec_load(&Dy[VEC_SIZE]);
  dz_2.v = vec_load(&Dz[VEC_SIZE]);

  vjx_2.v = vec_load(&Vjx[VEC_SIZE]);
  vjy_2.v = vec_load(&Vjy[VEC_SIZE]);
  vjz_2.v = vec_load(&Vjz[VEC_SIZE]);
...
...
@@ -1324,7 +1358,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
  cj_2.v = vec_load(&Cj[VEC_SIZE]);
  hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]);

  /* Load stuff. */
  balsara.v = balsara_i->v + balsara_j.v;
  balsara_2.v = balsara_i->v + balsara_j_2.v;
...
...
@@ -1348,11 +1382,11 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
  hjd_inv_2 = pow_dimension_plus_one_vec(hj_inv_2);
  xj.v = r.v * hj_inv.v;
  xj_2.v = r_2.v * hj_inv_2.v;

  /* Calculate the kernel for two particles. */
  kernel_eval_dWdx_force_vec(&xj, &wj_dx);
  kernel_eval_dWdx_force_vec(&xj_2, &wj_dx_2);

  wj_dr.v = hjd_inv.v * wj_dx.v;
  wj_dr_2.v = hjd_inv_2.v * wj_dx_2.v;
...
...
@@ -1360,13 +1394,13 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
  dvdr.v = ((vix->v - vjx.v) * dx.v) + ((viy->v - vjy.v) * dy.v) +
           ((viz->v - vjz.v) * dz.v);
  dvdr_2.v = ((vix->v - vjx_2.v) * dx_2.v) + ((viy->v - vjy_2.v) * dy_2.v) +
             ((viz->v - vjz_2.v) * dz_2.v);

  /* Compute the relative velocity. (This is 0 if the particles move away from
   * each other and negative otherwise) */
  omega_ij.v = vec_fmin(dvdr.v, vec_setzero());
  omega_ij_2.v = vec_fmin(dvdr_2.v, vec_setzero());

  mu_ij.v = fac_mu.v * ri.v * omega_ij.v;       /* This is 0 or negative */
  mu_ij_2.v = fac_mu.v * ri_2.v * omega_ij_2.v; /* This is 0 or negative */

  /* Compute signal velocity */
...
...
@@ -1379,7 +1413,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
  visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v *
           mu_ij.v * balsara.v / rho_ij.v;
  visc_2.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig_2.v *
             mu_ij_2.v * balsara_2.v / rho_ij_2.v;

  /* Now, convolve with the kernel */
  visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v;
...
...
@@ -1387,9 +1421,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
  sph_term.v =
      (grad_hi->v * piPOrho2->v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) *
      ri.v;
  sph_term_2.v = (grad_hi->v * piPOrho2->v * wi_dr_2.v +
                  grad_hj_2.v * pjPOrho2_2.v * wj_dr_2.v) *
                 ri_2.v;

  /* Eventually get the acceleration */
  acc.v = visc_term.v + sph_term.v;
...
...
@@ -1412,7 +1446,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
  entropy_dt_2.v = mj_2.v * visc_term_2.v * dvdr_2.v;

  /* Store the forces back on the particles. */
  if (mask_cond) {
    a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax.v, mask);
    a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax_2.v, mask_2);
    a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask);
...
...
@@ -1425,8 +1459,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
    v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig_2, mask_2));
    entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask);
    entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt_2.v, mask_2);
  } else {
    a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax.v);
    a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax_2.v);
    a_hydro_ySum->v = vec_sub(a_hydro_ySum->v, piay.v);
...
...
src/kernel_hydro.h
...
...
@@ -436,7 +436,7 @@ static const vector cond = FILL_VEC(0.5f);
/**
 * @brief Computes the kernel function and its derivative for two particles
 * using vectors. Does not return zero if $u > \\gamma = H/h$, should only
 * be called if particles are known to interact.
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
...
...
@@ -517,7 +517,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
/**
 * @brief Computes the kernel function and its derivative for two particles
 * using interleaved vectors. Does not return zero if $u > \\gamma = H/h$,
 * should only be called if particles are known to interact.
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
...
...
@@ -582,9 +583,9 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
  vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;

  /* Form a mask for each part of the kernel. */
  mask_reg1.v = vec_cmp_lt(x.v, cond.v);      /* 0 < x < 0.5 */
  mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v);  /* 0 < x < 0.5 */
  mask_reg2.v = vec_cmp_gte(x.v, cond.v);     /* 0.5 < x < 1 */
  mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */

  /* Work out w for both regions of the kernel and combine the results together
...
...
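Rather than branching per particle, the kernel evaluation computes both polynomial pieces for every lane and then blends them with the comparison masks formed above. A scalar sketch of the same select-by-mask idea for a two-piece kernel (the polynomials below are the standard un-normalised cubic-spline pieces, used purely for illustration):

    /* Branch-free evaluation of a two-piece kernel: compute both pieces for
     * every value, then keep only the piece whose region condition holds
     * (the scalar analogue of blending with mask_reg1 / mask_reg2). */
    static float cubic_spline_unnormalised(float x) {
      const float w_lo = 1.f - 6.f * x * x + 6.f * x * x * x;    /* 0   <= x < 0.5 */
      const float w_hi = 2.f * (1.f - x) * (1.f - x) * (1.f - x); /* 0.5 <= x < 1  */
      const float in_lo = (x < 0.5f) ? 1.f : 0.f;  /* mask_reg1 analogue */
      const float in_hi = (x >= 0.5f) ? 1.f : 0.f; /* mask_reg2 analogue */
      return in_lo * w_lo + in_hi * w_hi;
    }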
@@ -648,7 +649,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
/**
 * @brief Computes the kernel function for two particles
 * using vectors. Does not return zero if $u > \\gamma = H/h$, should only
 * be called if particles are known to interact.
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
...
...
@@ -709,7 +710,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
/**
 * @brief Computes the kernel function derivative for two particles
 * using vectors. Does not return zero if $u > \\gamma = H/h$, should only
 * be called if particles are known to interact.
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
...
...
@@ -825,9 +826,9 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec(

  /* Mask out result for particles that lie outside of the kernel function. */
  vector mask;
  mask.v = vec_cmp_lt(x.v, vec_set1(1.f));

  dw_dx->v = vec_and(dw_dx->v, mask.v);

  /* Return everything */
  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
...
...
@@ -854,7 +855,8 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
#ifdef WENDLAND_C2_KERNEL
  /* Init the iteration for Horner's scheme. */
  dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
  dw_dx_2->v =
      vec_fma(wendland_dwdx_const_c0.v, x_2.v, wendland_dwdx_const_c1.v);

  /* Calculate the polynomial interleaving vector operations */
  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
...
...
@@ -872,9 +874,9 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
  vector mask_reg1_2, mask_reg2_2;

  /* Form a mask for each part of the kernel. */
  mask_reg1.v = vec_cmp_lt(x.v, cond.v);      /* 0 < x < 0.5 */
  mask_reg1_2.v = vec_cmp_lt(x_2.v, cond.v);  /* 0 < x < 0.5 */
  mask_reg2.v = vec_cmp_gte(x.v, cond.v);     /* 0.5 < x < 1 */
  mask_reg2_2.v = vec_cmp_gte(x_2.v, cond.v); /* 0.5 < x < 1 */

  /* Work out w for both regions of the kernel and combine the results together
...
...
@@ -887,7 +889,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
  dw_dx2_2.v = vec_fma(cubic_2_dwdx_const_c0.v, x_2.v, cubic_2_dwdx_const_c1.v);

  /* Calculate the polynomial interleaving vector operations. */
  dw_dx->v = vec_mul(dw_dx->v, x.v);       /* cubic_1_dwdx_const_c2 is zero. */
  dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v); /* cubic_1_dwdx_const_c2 is zero. */
  dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
  dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v);
...
...
@@ -907,17 +909,18 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(

  /* Mask out result for particles that lie outside of the kernel function. */
  vector mask, mask_2;
  mask.v = vec_cmp_lt(x.v, vec_set1(1.f));
  mask_2.v = vec_cmp_lt(x_2.v, vec_set1(1.f));

  dw_dx->v = vec_and(dw_dx->v, mask.v);
  dw_dx_2->v = vec_and(dw_dx_2->v, mask_2.v);

  /* Return everything */
  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
                                       kernel_gamma_inv_dim_plus_one_vec.v));
  dw_dx_2->v = vec_mul(dw_dx_2->v, vec_mul(kernel_constant_vec.v,
                                           kernel_gamma_inv_dim_plus_one_vec.v));
}

#endif /* WITH_VECTORIZATION */
...
...
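Masking the result with `vec_and` works because a false comparison yields an all-zero lane and a true comparison an all-ones lane, so the bitwise AND either preserves the float's bit pattern or clears it to 0.0f. The scalar analogue is simply:

    /* Scalar analogue of dw_dx->v = vec_and(dw_dx->v, vec_cmp_lt(x.v, vec_set1(1.f))):
     * keep the value where x < 1, force it to zero otherwise. */
    static float mask_outside_kernel(float dw_dx, float x) {
      return (x < 1.f) ? dw_dx : 0.f;
    }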
src/runner.c
...
...
@@ -1793,8 +1793,7 @@ void *runner_main(void *data) {
#else
          runner_doself2_force(r, ci);
#endif
        } else if (t->subtype == task_subtype_grav)
          runner_doself_grav(r, ci, 1);
        else if (t->subtype == task_subtype_external_grav)
          runner_do_grav_external(r, ci, 1);
...
...
src/runner_doiact_vec.c
(diff collapsed in the original view; not shown)
src/vector.h
...
...
@@ -60,9 +60,9 @@
#define vec_dbl_set(a, b, c, d, e, f, g, h) \
_mm512_set_pd(h, g, f, e, d, c, b, a)
#define vec_add(a, b) _mm512_add_ps(a, b)
#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a)
#define vec_sub(a, b) _mm512_sub_ps(a, b)
#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b)
#define vec_mul(a, b) _mm512_mul_ps(a, b)
#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
#define vec_sqrt(a) _mm512_sqrt_ps(a)
...
...
@@ -80,12 +80,12 @@
#define vec_cmp_result(a) a
#define vec_form_int_mask(a) a
#define vec_and(a, b) _mm512_and_ps(a, b)
#define vec_mask_and(a, b) a &b
#define vec_and_mask(a, mask) _mm512_maskz_expand_ps(mask, a)
#define vec_init_mask(mask) mask = 0xFFFF
#define vec_zero_mask(mask) mask = 0
#define vec_create_mask(mask, cond) mask = cond
#define vec_pad_mask(mask, pad) mask = mask >> (pad)
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
...
...
@@ -127,7 +127,7 @@
}
#endif
/* Do nothing in the case of AVX-512 as there are already
* instructions for left-packing.*/
#define VEC_FORM_PACKED_MASK(mask, packed_mask)
...
...
@@ -155,9 +155,9 @@
#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
#define vec_add(a, b) _mm256_add_ps(a, b)
#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b, mask.v))
#define vec_sub(a, b) _mm256_sub_ps(a, b)
#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b, mask.v))
#define vec_mul(a, b) _mm256_mul_ps(a, b)
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
...
...
@@ -172,14 +172,15 @@
#define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
#define vec_cmp_result(a) _mm256_movemask_ps(a)
#define vec_form_int_mask(a) _mm256_movemask_ps(a.v)
#define vec_and(a, b) _mm256_and_ps(a, b)
#define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v)
#define vec_and_mask(a, mask) vec_mask_and(a, mask)
#define vec_init_mask(mask) mask.m = vec_setint1(0xFFFFFFFF)
#define vec_create_mask(mask, cond) mask.v = cond
#define vec_zero_mask(mask) mask.v = vec_setzero()
#define vec_pad_mask(mask, pad) \
  for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
...
...
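The AVX2 fallback implements masked accumulation without a dedicated mask register: the mask lives in an ordinary vector whose lanes are all-ones or all-zero, the addend is ANDed with it, and a plain add then leaves masked-off lanes untouched, exactly as the `vec_mask_add` macro above composes `vec_and` and `vec_add`. A small self-contained sketch with the underlying intrinsics (assuming an 8-lane __m256 and a mask built from a comparison):

    #include <immintrin.h>

    /* Sketch: add 'b' into 'acc' only in lanes where x < limit. */
    static __m256 masked_accumulate(__m256 acc, __m256 b, __m256 x, __m256 limit) {
      const __m256 mask = _mm256_cmp_ps(x, limit, _CMP_LT_OQ); /* all-ones where true */
      return _mm256_add_ps(acc, _mm256_and_ps(b, mask));       /* zeroed where false */
    }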
@@ -205,12 +206,12 @@
  a.v = _mm256_hadd_ps(a.v, a.v); \
  b += a.f[0] + a.f[4];

/* Performs a horizontal maximum on the vector and takes the maximum of the
 * result with a float, b. */
#define VEC_HMAX(a, b)                                     \
  {                                                        \
    for (int k = 0; k < VEC_SIZE; k++) b = max(b, a.f[k]); \
  }
/* Returns the lower 128-bits of the 256-bit vector. */
#define VEC_GET_LOW(a) _mm256_castps256_ps128(a)
...
...
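VEC_HMAX folds a vector into a running scalar maximum by walking the union's float lanes. Its scalar equivalent, written out as a function (the lane count of 8 is an assumption for AVX2):

    #define VEC_SIZE 8 /* assumed lane count: 8 floats for AVX2 */

    /* Scalar equivalent of VEC_HMAX(a, b): fold the lanes of 'a' into 'b'. */
    static float hmax(const float a[VEC_SIZE], float b) {
      for (int k = 0; k < VEC_SIZE; k++)
        if (a[k] > b) b = a[k];
      return b;
    }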
tests/test125cells.c
...
...
@@ -690,7 +690,7 @@ int main(int argc, char *argv[]) {
        struct cell *cj = cells[i * 25 + j * 5 + k];

        if (main_cell != cj) {

          const ticks sub_tic = getticks();

          runner_dopair2_force(&runner, main_cell, cj);
...
...
@@ -703,7 +703,7 @@ int main(int argc, char *argv[]) {
  }

#ifdef WITH_VECTORIZATION
  /* Initialise the cache. */
  runner.ci_cache.count = 0;
  cache_init(&runner.ci_cache, 512);
#endif
...
...
@@ -712,7 +712,7 @@ int main(int argc, char *argv[]) {

    /* And now the self-interaction for the main cell */
    DOSELF2(&runner, main_cell);

    timings[26] += getticks() - self_tic;
#endif
...
...
@@ -729,13 +729,12 @@ int main(int argc, char *argv[]) {
    }
  }

  for (size_t n = 0; n < 100 * runs; ++n) {

    ticks self_tic = getticks();

    DOSELF2(&runner, main_cell);

    self_force_time += getticks() - self_tic;
  }

  /* Output timing */
...
...
@@ -752,7 +751,8 @@ int main(int argc, char *argv[]) {