Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
SWIFT
SWIFTsim
Commits
32870962
Commit
32870962
authored
Feb 26, 2013
by
Pedro Gonnet
Browse files
make vectorized version of viscosity stuff.
Former-commit-id: a4c64268feee068a89e2794bef41b9ffa18af845
parent
005be63d
Changes
4
Hide whitespace changes
Inline
Side-by-side
src/engine.c
View file @
32870962
...
...
@@ -187,9 +187,6 @@ void engine_step ( struct engine *e , int sort_queues ) {
double
lang
[
3
],
ang
[
3
]
=
{
0
.
0
,
0
.
0
,
0
.
0
};
double
lent
,
ent
=
0
.
0
;
int
threadID
,
nthreads
,
count
=
0
,
lcount
;
// #ifdef __SSE2__
// VEC_MACRO(4,float) hdtv = _mm_set1_ps( hdt );
// #endif
/* Get the maximum dt. */
dt_step
=
2
.
0
f
*
dt
;
...
...
@@ -212,14 +209,9 @@ void engine_step ( struct engine *e , int sort_queues ) {
xp
=
p
->
xtras
;
/* Step and store the velocity and internal energy. */
// #ifdef __SSE__
// _mm_store_ps( &v_bar[4*k] , _mm_load_ps( &p->v[0] ) + hdtv * _mm_load_ps( &p->a[0] ) );
// #else
xp
->
v_old
[
0
]
=
p
->
v
[
0
]
+
hdt
*
p
->
a
[
0
];
xp
->
v_old
[
1
]
=
p
->
v
[
1
]
+
hdt
*
p
->
a
[
1
];
xp
->
v_old
[
2
]
=
p
->
v
[
2
]
+
hdt
*
p
->
a
[
2
];
// #endif
// xp->u_old = fmaxf( p->u + hdt * p->u_dt , FLT_EPSILON );
xp
->
v_old
[
0
]
=
p
->
v
[
0
]
+
hdt
*
p
->
a
[
0
];
xp
->
v_old
[
1
]
=
p
->
v
[
1
]
+
hdt
*
p
->
a
[
1
];
xp
->
v_old
[
2
]
=
p
->
v
[
2
]
+
hdt
*
p
->
a
[
2
];
xp
->
u_old
=
p
->
u
+
hdt
*
p
->
u_dt
;
/* Move the particles with the velocitie at the half-step. */
...
...
@@ -306,14 +298,9 @@ void engine_step ( struct engine *e , int sort_queues ) {
p
->
dt
=
const_cfl
*
p
->
h
/
(
p
->
v_sig
);
/* Update positions and energies at the half-step. */
// #ifdef __SSE__
// _mm_store_ps( &p->v[0] , _mm_load_ps( &v_bar[4*k] ) + hdtv * _mm_load_ps( &p->a[0] ) );
// #else
p
->
v
[
0
]
=
xp
->
v_old
[
0
]
+
hdt
*
p
->
a
[
0
];
p
->
v
[
1
]
=
xp
->
v_old
[
1
]
+
hdt
*
p
->
a
[
1
];
p
->
v
[
2
]
=
xp
->
v_old
[
2
]
+
hdt
*
p
->
a
[
2
];
// #endif
// p->u = fmaxf( xp->u_old + hdt * p->u_dt , FLT_EPSILON );
p
->
v
[
0
]
=
xp
->
v_old
[
0
]
+
hdt
*
p
->
a
[
0
];
p
->
v
[
1
]
=
xp
->
v_old
[
1
]
+
hdt
*
p
->
a
[
1
];
p
->
v
[
2
]
=
xp
->
v_old
[
2
]
+
hdt
*
p
->
a
[
2
];
p
->
u
=
xp
->
u_old
+
hdt
*
p
->
u_dt
;
/* Get the smallest/largest dt. */
...
...
src/part.h
View file @
32870962
...
...
@@ -73,7 +73,7 @@ struct part {
float
curl_v
[
3
]
__attribute__
((
aligned
(
16
)));
/* Balsara switch */
float
B
alsara
;
float
b
alsara
;
/* Particle pressure. */
// float P;
...
...
src/runner.c
View file @
32870962
...
...
@@ -365,8 +365,8 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
/* Is this part within the timestep? */
if
(
cp
->
dt
<=
dt_step
)
{
/* Some smoothing length multiples*/
ihg
=
kernel_igamma
/
p
->
h
;
/* Some smoothing length multiples*/
ihg
=
kernel_igamma
/
p
->
h
;
ihg2
=
ihg
*
ihg
;
/* Adjust the computed weighted number of neighbours. */
...
...
@@ -394,13 +394,13 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
p
->
wcount_dh
=
0
.
0
;
p
->
rho
=
0
.
0
;
p
->
rho_dh
=
0
.
0
;
p
->
div_v
=
0
.
0
;
for
(
k
=
0
;
k
<
3
;
k
++
)
p
->
curl_v
[
k
]
=
0
.
0
;
p
->
div_v
=
0
.
0
;
for
(
k
=
0
;
k
<
3
;
k
++
)
p
->
curl_v
[
k
]
=
0
.
0
;
continue
;
}
/* Final operation on the density */
/* Final operation on the density */
p
->
rho
=
ihg
*
ihg2
*
(
p
->
rho
+
p
->
mass
*
kernel_root
);
p
->
rho_dh
*=
ihg2
*
ihg2
;
...
...
@@ -410,20 +410,20 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
/* Compute the P/Omega/rho2. */
p
->
POrho2
=
p
->
u
*
(
const_gamma
-
1
.
0
f
)
/
(
p
->
rho
+
p
->
h
*
p
->
rho_dh
/
3
.
0
f
);
/* Final operation on the velocity divergence */
p
->
div_v
/=
p
->
rho
;
p
->
div_v
*=
ihg2
*
ihg2
;
/* Final operation on the velocity divergence */
p
->
div_v
/=
p
->
rho
;
p
->
div_v
*=
ihg2
*
ihg2
;
/* Final operation on the velocity curl */
for
(
k
=
0
;
k
<
3
;
k
++
){
p
->
curl_v
[
k
]
/=
-
p
->
rho
;
p
->
curl_v
[
k
]
*=
ihg2
*
ihg2
;
}
/* Final operation on the velocity curl */
for
(
k
=
0
;
k
<
3
;
k
++
){
p
->
curl_v
[
k
]
/=
-
p
->
rho
;
p
->
curl_v
[
k
]
*=
ihg2
*
ihg2
;
}
/* Balsara switch */
normDiv_v
=
fabs
(
p
->
div_v
);
normCurl_v
=
sqrtf
(
p
->
curl_v
[
0
]
*
p
->
curl_v
[
0
]
+
p
->
curl_v
[
1
]
*
p
->
curl_v
[
1
]
+
p
->
curl_v
[
2
]
*
p
->
curl_v
[
2
]
);
p
->
B
alsara
=
normCurl_v
/
(
normDiv_v
+
normCurl_v
+
0
.
0001
f
*
p
->
c
/
p
->
h
);
/* Balsara switch */
normDiv_v
=
fabs
(
p
->
div_v
);
normCurl_v
=
sqrtf
(
p
->
curl_v
[
0
]
*
p
->
curl_v
[
0
]
+
p
->
curl_v
[
1
]
*
p
->
curl_v
[
1
]
+
p
->
curl_v
[
2
]
*
p
->
curl_v
[
2
]
);
p
->
b
alsara
=
normCurl_v
/
(
normDiv_v
+
normCurl_v
+
0
.
0001
f
*
p
->
c
/
p
->
h
);
/* Reset the acceleration. */
for
(
k
=
0
;
k
<
3
;
k
++
)
...
...
src/runner_iact.h
View file @
32870962
...
...
@@ -72,9 +72,10 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r
pi
->
wcount_dh
-=
xi
*
h_inv
*
wi_dx
*
(
4
.
0
f
*
M_PI
/
3
.
0
f
*
kernel_igamma3
);
// pi->icount += 1;
pi
->
div_v
+=
pj
->
mass
*
dvdr
*
wi_dx
;
for
(
k
=
0
;
k
<
3
;
k
++
)
pi
->
curl_v
[
k
]
+=
pj
->
mass
*
curlvr
[
k
]
*
wi_dx
;
pi
->
div_v
+=
pj
->
mass
*
dvdr
*
wi_dx
;
for
(
k
=
0
;
k
<
3
;
k
++
)
pi
->
curl_v
[
k
]
+=
pj
->
mass
*
curlvr
[
k
]
*
wi_dx
;
}
if
(
r2
<
hj
*
hj
&&
pj
!=
NULL
)
{
...
...
@@ -90,9 +91,10 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r
pj
->
wcount_dh
-=
xj
*
h_inv
*
wj_dx
*
(
4
.
0
f
*
M_PI
/
3
.
0
f
*
kernel_igamma3
);
// pj->icount += 1;
pj
->
div_v
=
pi
->
mass
*
dvdr
*
wj_dx
;
for
(
k
=
0
;
k
<
3
;
k
++
)
pj
->
curl_v
[
k
]
+=
pi
->
mass
*
curlvr
[
k
]
*
wj_dx
;
pj
->
div_v
=
pi
->
mass
*
dvdr
*
wj_dx
;
for
(
k
=
0
;
k
<
3
;
k
++
)
pj
->
curl_v
[
k
]
+=
pi
->
mass
*
curlvr
[
k
]
*
wj_dx
;
}
#ifdef HIST
...
...
@@ -192,17 +194,16 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo
pi
[
k
]
->
rho_dh
-=
rhoi_dh
.
f
[
k
];
pi
[
k
]
->
wcount
+=
wcounti
.
f
[
k
];
pi
[
k
]
->
wcount_dh
-=
wcounti_dh
.
f
[
k
];
pi
[
k
]
->
div_v
+=
div_vi
.
f
[
k
];
for
(
j
=
0
;
j
<
3
;
j
++
)
pi
[
k
]
->
curl_v
[
j
]
+=
curl_vi
[
j
].
f
[
k
];
pi
[
k
]
->
div_v
+=
div_vi
.
f
[
k
];
for
(
j
=
0
;
j
<
3
;
j
++
)
pi
[
k
]
->
curl_v
[
j
]
+=
curl_vi
[
j
].
f
[
k
];
pj
[
k
]
->
rho
+=
rhoj
.
f
[
k
];
pj
[
k
]
->
rho_dh
-=
rhoj_dh
.
f
[
k
];
pj
[
k
]
->
wcount
+=
wcountj
.
f
[
k
];
pj
[
k
]
->
wcount_dh
-=
wcountj_dh
.
f
[
k
];
pj
[
k
]
->
div_v
+=
div_vj
.
f
[
k
];
for
(
j
=
0
;
j
<
3
;
j
++
)
pj
[
k
]
->
curl_v
[
j
]
+=
curl_vj
[
j
].
f
[
k
];
pj
[
k
]
->
div_v
+=
div_vj
.
f
[
k
];
for
(
j
=
0
;
j
<
3
;
j
++
)
pj
[
k
]
->
curl_v
[
j
]
+=
curl_vj
[
j
].
f
[
k
];
}
#else
...
...
@@ -330,10 +331,9 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit
pi
[
k
]
->
rho_dh
-=
rhoi_dh
.
f
[
k
];
pi
[
k
]
->
wcount
+=
wcounti
.
f
[
k
];
pi
[
k
]
->
wcount_dh
-=
wcounti_dh
.
f
[
k
];
pi
[
k
]
->
div_v
+=
div_vi
.
f
[
k
];
for
(
j
=
0
;
j
<
3
;
j
++
)
pi
[
k
]
->
curl_v
[
j
]
+=
curl_vi
[
j
].
f
[
k
];
pi
[
k
]
->
div_v
+=
div_vi
.
f
[
k
];
for
(
j
=
0
;
j
<
3
;
j
++
)
pi
[
k
]
->
curl_v
[
j
]
+=
curl_vi
[
j
].
f
[
k
];
}
#else
...
...
@@ -388,8 +388,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2
/* Compute viscosity tensor */
Pi_ij
=
-
const_viscosity_alpha
*
v_sig
*
omega_ij
/
(
pi
->
rho
+
pj
->
rho
);
/* Apply
B
alsara switch */
Pi_ij
*=
(
pi
->
B
alsara
+
pj
->
B
alsara
);
/* Apply
b
alsara switch */
Pi_ij
*=
(
pi
->
b
alsara
+
pj
->
b
alsara
);
/* Get the common factor out. */
w
=
ri
*
(
(
pi
->
POrho2
*
wi_dr
+
pj
->
POrho2
*
wj_dr
)
-
0
.
25
f
*
Pi_ij
*
(
wi_dr
+
wj_dr
)
);
...
...
@@ -446,7 +446,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
vector
pia
[
3
],
pja
[
3
];
vector
piu_dt
,
pju_dt
;
vector
pih_dt
,
pjh_dt
;
vector
ci
,
cj
,
vi_sig
,
vj_sig
;
vector
ci
,
cj
,
v_sig
,
vi_sig
,
vj_sig
;
vector
omega_ij
,
Pi_ij
,
balsara
;
int
j
,
k
;
/* Load stuff. */
...
...
@@ -467,6 +468,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
}
for
(
k
=
0
;
k
<
3
;
k
++
)
dx
[
k
].
v
=
_mm256_set_ps
(
Dx
[
21
+
k
]
,
Dx
[
18
+
k
]
,
Dx
[
15
+
k
]
,
Dx
[
12
+
k
]
,
Dx
[
9
+
k
]
,
Dx
[
6
+
k
]
,
Dx
[
3
+
k
]
,
Dx
[
0
+
k
]
);
balsara
.
v
=
_mm256_set_ps
(
pi
[
7
]
->
balsara
,
pi
[
6
]
->
balsara
,
pi
[
5
]
->
balsara
,
pi
[
4
]
->
balsara
,
pi
[
3
]
->
balsara
,
pi
[
2
]
->
balsara
,
pi
[
1
]
->
balsara
,
pi
[
0
]
->
balsara
)
+
_mm256_set_ps
(
pj
[
7
]
->
balsara
,
pj
[
6
]
->
balsara
,
pj
[
5
]
->
balsara
,
pj
[
4
]
->
balsara
,
pj
[
3
]
->
balsara
,
pj
[
2
]
->
balsara
,
pj
[
1
]
->
balsara
,
pj
[
0
]
->
balsara
);
#elif VEC_SIZE==4
mi
.
v
=
_mm_set_ps
(
pi
[
3
]
->
mass
,
pi
[
2
]
->
mass
,
pi
[
1
]
->
mass
,
pi
[
0
]
->
mass
);
mj
.
v
=
_mm_set_ps
(
pj
[
3
]
->
mass
,
pj
[
2
]
->
mass
,
pj
[
1
]
->
mass
,
pj
[
0
]
->
mass
);
...
...
@@ -484,12 +487,12 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
}
for
(
k
=
0
;
k
<
3
;
k
++
)
dx
[
k
].
v
=
_mm_set_ps
(
Dx
[
9
+
k
]
,
Dx
[
6
+
k
]
,
Dx
[
3
+
k
]
,
Dx
[
0
+
k
]
);
balsara
.
v
=
_mm_set_ps
(
pi
[
3
]
->
balsara
,
pi
[
2
]
->
balsara
,
pi
[
1
]
->
balsara
,
pi
[
0
]
->
balsara
)
+
_mm_set_ps
(
pj
[
3
]
->
balsara
,
pj
[
2
]
->
balsara
,
pj
[
1
]
->
balsara
,
pj
[
0
]
->
balsara
);
#else
#error
#endif
#error Matthieu: Not yet implemented the visocisty switch in the vectorized version. Use scalar version instead !
/* Get the radius and inverse radius. */
r2
.
v
=
vec_load
(
R2
);
ri
.
v
=
vec_rsqrt
(
r2
.
v
);
...
...
@@ -516,9 +519,27 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
kernel_deval_vec
(
&
xj
,
&
wj
,
&
wj_dx
);
wj_dr
.
v
=
hj2_inv
.
v
*
hj2_inv
.
v
*
wj_dx
.
v
;
/* Get the common factor out. */
w
.
v
=
ri
.
v
*
(
piPOrho2
.
v
*
wi_dr
.
v
+
pjPOrho2
.
v
*
wj_dr
.
v
);
/* Compute dv dot r. */
dvdr
.
v
=
(
(
vi
[
0
].
v
-
vj
[
0
].
v
)
*
dx
[
0
].
v
)
+
(
(
vi
[
1
].
v
-
vj
[
1
].
v
)
*
dx
[
1
].
v
)
+
(
(
vi
[
2
].
v
-
vj
[
2
].
v
)
*
dx
[
2
].
v
);
dvdr
.
v
=
dvdr
.
v
*
ri
.
v
;
/* Get the time derivative for h. */
pih_dt
.
v
=
mj
.
v
/
pjrho
.
v
*
dvdr
.
v
*
wi_dr
.
v
;
pjh_dt
.
v
=
mi
.
v
/
pirho
.
v
*
dvdr
.
v
*
wj_dr
.
v
;
/* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */
omega_ij
.
v
=
vec_fmin
(
dvdr
.
v
,
vec_set1
(
0
.
0
f
)
);
/* Compute signal velocity */
v_sig
.
v
=
ci
.
v
+
cj
.
v
-
vec_set1
(
3
.
0
f
)
*
omega_ij
.
v
;
/* Compute viscosity tensor */
Pi_ij
.
v
=
-
balsara
.
v
*
vec_set1
(
const_viscosity_alpha
)
*
v_sig
.
v
*
omega_ij
.
v
/
(
pirho
.
v
+
pjrho
.
v
);
Pi_ij
.
v
*=
(
wi_dr
.
v
+
wj_dr
.
v
);
/* Get the common factor out. */
w
.
v
=
ri
.
v
*
(
(
piPOrho2
.
v
*
wi_dr
.
v
+
pjPOrho2
.
v
*
wj_dr
.
v
)
-
vec_set1
(
0
.
25
f
)
*
Pi_ij
.
v
);
/* Use the force, Luke! */
for
(
k
=
0
;
k
<
3
;
k
++
)
{
f
.
v
=
dx
[
k
].
v
*
w
.
v
;
...
...
@@ -526,21 +547,13 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
pja
[
k
].
v
=
mi
.
v
*
f
.
v
;
}
/* Compute dv dot r. */
dvdr
.
v
=
(
(
vi
[
0
].
v
-
vj
[
0
].
v
)
*
dx
[
0
].
v
)
+
(
(
vi
[
1
].
v
-
vj
[
1
].
v
)
*
dx
[
1
].
v
)
+
(
(
vi
[
2
].
v
-
vj
[
2
].
v
)
*
dx
[
2
].
v
);
dvdr
.
v
=
dvdr
.
v
*
ri
.
v
;
/* Get the time derivative for u. */
piu_dt
.
v
=
mj
.
v
*
dvdr
.
v
*
wi_dr
.
v
;
pju_dt
.
v
=
mi
.
v
*
dvdr
.
v
*
wj_dr
.
v
;
/* Get the time derivative for h. */
pih_dt
.
v
=
mj
.
v
/
pjrho
.
v
*
dvdr
.
v
*
wi_dr
.
v
;
pjh_dt
.
v
=
mi
.
v
/
pirho
.
v
*
dvdr
.
v
*
wj_dr
.
v
;
piu_dt
.
v
=
mj
.
v
*
dvdr
.
v
*
(
piPOrho2
.
v
*
wi_dr
.
v
+
vec_set1
(
0
.
125
f
)
*
Pi_ij
.
v
);
pju_dt
.
v
=
mi
.
v
*
dvdr
.
v
*
(
pjPOrho2
.
v
*
wj_dr
.
v
+
vec_set1
(
0
.
125
f
)
*
Pi_ij
.
v
);
/* compute the signal velocity (this is always symmetrical). */
vi_sig
.
v
=
vec_fmax
(
vi_sig
.
v
,
cj
.
v
-
vec_set1
(
3
.
0
f
)
*
dvdr
.
v
);
vj_sig
.
v
=
vec_fmax
(
vj_sig
.
v
,
ci
.
v
-
vec_set1
(
3
.
0
f
)
*
dvdr
.
v
);
vi_sig
.
v
=
vec_fmax
(
vi_sig
.
v
,
v_sig
.
v
);
vj_sig
.
v
=
vec_fmax
(
vj_sig
.
v
,
v_sig
.
v
);
/* Store the forces back on the particles. */
for
(
k
=
0
;
k
<
VEC_SIZE
;
k
++
)
{
...
...
@@ -608,8 +621,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( fl
/* Compute viscosity tensor */
Pi_ij
=
-
const_viscosity_alpha
*
v_sig
*
omega_ij
/
(
pi
->
rho
+
pj
->
rho
);
/* Apply
B
alsara switch */
Pi_ij
*=
(
pi
->
B
alsara
+
pj
->
B
alsara
);
/* Apply
b
alsara switch */
Pi_ij
*=
(
pi
->
b
alsara
+
pj
->
b
alsara
);
/* Get the common factor out. */
w
=
ri
*
(
(
pi
->
POrho2
*
wi_dr
+
pj
->
POrho2
*
wj_dr
)
-
0
.
25
f
*
Pi_ij
*
(
wi_dr
+
wj_dr
)
);
...
...
@@ -629,6 +642,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( fl
/* Update the signal velocity. */
pi
->
v_sig
=
fmaxf
(
pi
->
v_sig
,
v_sig
);
pj
->
v_sig
=
fmaxf
(
pj
->
v_sig
,
v_sig
);
}
...
...
@@ -648,7 +662,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
vector
hjg_inv
,
hj2_inv
;
vector
wi
,
wj
,
wi_dx
,
wj_dx
,
wi_dr
,
wj_dr
,
dvdr
;
vector
w
;
vector
piPOrho2
,
pjPOrho2
,
pjrho
;
vector
piPOrho2
,
pjPOrho2
,
pirho
,
pjrho
;
vector
mj
;
vector
f
;
vector
dx
[
3
];
...
...
@@ -656,7 +670,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
vector
pia
[
3
];
vector
piu_dt
;
vector
pih_dt
;
vector
ci
,
cj
,
vi_sig
,
vj_sig
;
vector
ci
,
cj
,
v_sig
,
vi_sig
,
vj_sig
;
vector
omega_ij
,
Pi_ij
,
balsara
;
int
j
,
k
;
/* Load stuff. */
...
...
@@ -664,6 +679,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
mj
.
v
=
_mm256_set_ps
(
pj
[
7
]
->
mass
,
pj
[
6
]
->
mass
,
pj
[
5
]
->
mass
,
pj
[
4
]
->
mass
,
pj
[
3
]
->
mass
,
pj
[
2
]
->
mass
,
pj
[
1
]
->
mass
,
pj
[
0
]
->
mass
);
piPOrho2
.
v
=
_mm256_set_ps
(
pi
[
7
]
->
POrho2
,
pi
[
6
]
->
POrho2
,
pi
[
5
]
->
POrho2
,
pi
[
4
]
->
POrho2
,
pi
[
3
]
->
POrho2
,
pi
[
2
]
->
POrho2
,
pi
[
1
]
->
POrho2
,
pi
[
0
]
->
POrho2
);
pjPOrho2
.
v
=
_mm256_set_ps
(
pj
[
7
]
->
POrho2
,
pj
[
6
]
->
POrho2
,
pj
[
5
]
->
POrho2
,
pj
[
4
]
->
POrho2
,
pj
[
3
]
->
POrho2
,
pj
[
2
]
->
POrho2
,
pj
[
1
]
->
POrho2
,
pj
[
0
]
->
POrho2
);
pirho
.
v
=
_mm256_set_ps
(
pi
[
7
]
->
rho
,
pi
[
6
]
->
rho
,
pi
[
5
]
->
rho
,
pi
[
4
]
->
rho
,
pi
[
3
]
->
rho
,
pi
[
2
]
->
rho
,
pi
[
1
]
->
rho
,
pi
[
0
]
->
rho
);
pjrho
.
v
=
_mm256_set_ps
(
pj
[
7
]
->
rho
,
pj
[
6
]
->
rho
,
pj
[
5
]
->
rho
,
pj
[
4
]
->
rho
,
pj
[
3
]
->
rho
,
pj
[
2
]
->
rho
,
pj
[
1
]
->
rho
,
pj
[
0
]
->
rho
);
ci
.
v
=
_mm256_set_ps
(
pi
[
7
]
->
c
,
pi
[
6
]
->
c
,
pi
[
5
]
->
c
,
pi
[
4
]
->
c
,
pi
[
3
]
->
c
,
pi
[
2
]
->
c
,
pi
[
1
]
->
c
,
pi
[
0
]
->
c
);
cj
.
v
=
_mm256_set_ps
(
pj
[
7
]
->
c
,
pj
[
6
]
->
c
,
pj
[
5
]
->
c
,
pj
[
4
]
->
c
,
pj
[
3
]
->
c
,
pj
[
2
]
->
c
,
pj
[
1
]
->
c
,
pj
[
0
]
->
c
);
...
...
@@ -675,10 +691,13 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
}
for
(
k
=
0
;
k
<
3
;
k
++
)
dx
[
k
].
v
=
_mm256_set_ps
(
Dx
[
21
+
k
]
,
Dx
[
18
+
k
]
,
Dx
[
15
+
k
]
,
Dx
[
12
+
k
]
,
Dx
[
9
+
k
]
,
Dx
[
6
+
k
]
,
Dx
[
3
+
k
]
,
Dx
[
0
+
k
]
);
balsara
.
v
=
_mm256_set_ps
(
pi
[
7
]
->
balsara
,
pi
[
6
]
->
balsara
,
pi
[
5
]
->
balsara
,
pi
[
4
]
->
balsara
,
pi
[
3
]
->
balsara
,
pi
[
2
]
->
balsara
,
pi
[
1
]
->
balsara
,
pi
[
0
]
->
balsara
)
+
_mm256_set_ps
(
pj
[
7
]
->
balsara
,
pj
[
6
]
->
balsara
,
pj
[
5
]
->
balsara
,
pj
[
4
]
->
balsara
,
pj
[
3
]
->
balsara
,
pj
[
2
]
->
balsara
,
pj
[
1
]
->
balsara
,
pj
[
0
]
->
balsara
);
#elif VEC_SIZE==4
mj
.
v
=
_mm_set_ps
(
pj
[
3
]
->
mass
,
pj
[
2
]
->
mass
,
pj
[
1
]
->
mass
,
pj
[
0
]
->
mass
);
piPOrho2
.
v
=
_mm_set_ps
(
pi
[
3
]
->
POrho2
,
pi
[
2
]
->
POrho2
,
pi
[
1
]
->
POrho2
,
pi
[
0
]
->
POrho2
);
pjPOrho2
.
v
=
_mm_set_ps
(
pj
[
3
]
->
POrho2
,
pj
[
2
]
->
POrho2
,
pj
[
1
]
->
POrho2
,
pj
[
0
]
->
POrho2
);
pirho
.
v
=
_mm_set_ps
(
pi
[
3
]
->
rho
,
pi
[
2
]
->
rho
,
pi
[
1
]
->
rho
,
pi
[
0
]
->
rho
);
pjrho
.
v
=
_mm_set_ps
(
pj
[
3
]
->
rho
,
pj
[
2
]
->
rho
,
pj
[
1
]
->
rho
,
pj
[
0
]
->
rho
);
ci
.
v
=
_mm_set_ps
(
pi
[
3
]
->
c
,
pi
[
2
]
->
c
,
pi
[
1
]
->
c
,
pi
[
0
]
->
c
);
cj
.
v
=
_mm_set_ps
(
pj
[
3
]
->
c
,
pj
[
2
]
->
c
,
pj
[
1
]
->
c
,
pj
[
0
]
->
c
);
...
...
@@ -690,12 +709,12 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
}
for
(
k
=
0
;
k
<
3
;
k
++
)
dx
[
k
].
v
=
_mm_set_ps
(
Dx
[
9
+
k
]
,
Dx
[
6
+
k
]
,
Dx
[
3
+
k
]
,
Dx
[
0
+
k
]
);
balsara
.
v
=
_mm_set_ps
(
pi
[
3
]
->
balsara
,
pi
[
2
]
->
balsara
,
pi
[
1
]
->
balsara
,
pi
[
0
]
->
balsara
)
+
_mm_set_ps
(
pj
[
3
]
->
balsara
,
pj
[
2
]
->
balsara
,
pj
[
1
]
->
balsara
,
pj
[
0
]
->
balsara
);
#else
#error
#endif
#error Matthieu: Not yet implemented the visocisty switch in the vectorized version. Use scalar version instead !
/* Get the radius and inverse radius. */
r2
.
v
=
vec_load
(
R2
);
ri
.
v
=
vec_rsqrt
(
r2
.
v
);
...
...
@@ -722,28 +741,38 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
kernel_deval_vec
(
&
xj
,
&
wj
,
&
wj_dx
);
wj_dr
.
v
=
hj2_inv
.
v
*
hj2_inv
.
v
*
wj_dx
.
v
;
/* Get the common factor out. */
w
.
v
=
ri
.
v
*
(
piPOrho2
.
v
*
wi_dr
.
v
+
pjPOrho2
.
v
*
wj_dr
.
v
);
/* Compute dv dot r. */
dvdr
.
v
=
(
(
vi
[
0
].
v
-
vj
[
0
].
v
)
*
dx
[
0
].
v
)
+
(
(
vi
[
1
].
v
-
vj
[
1
].
v
)
*
dx
[
1
].
v
)
+
(
(
vi
[
2
].
v
-
vj
[
2
].
v
)
*
dx
[
2
].
v
);
dvdr
.
v
=
dvdr
.
v
*
ri
.
v
;
/* Get the time derivative for h. */
pih_dt
.
v
=
mj
.
v
/
pjrho
.
v
*
dvdr
.
v
*
wi_dr
.
v
;
/* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */
omega_ij
.
v
=
vec_fmin
(
dvdr
.
v
,
vec_set1
(
0
.
0
f
)
);
/* Compute signal velocity */
v_sig
.
v
=
ci
.
v
+
cj
.
v
-
vec_set1
(
3
.
0
f
)
*
omega_ij
.
v
;
/* Compute viscosity tensor */
Pi_ij
.
v
=
-
balsara
.
v
*
vec_set1
(
const_viscosity_alpha
)
*
v_sig
.
v
*
omega_ij
.
v
/
(
pirho
.
v
+
pjrho
.
v
);
Pi_ij
.
v
*=
(
wi_dr
.
v
+
wj_dr
.
v
);
/* Get the common factor out. */
w
.
v
=
ri
.
v
*
(
(
piPOrho2
.
v
*
wi_dr
.
v
+
pjPOrho2
.
v
*
wj_dr
.
v
)
-
vec_set1
(
0
.
25
f
)
*
Pi_ij
.
v
);
/* Use the force, Luke! */
for
(
k
=
0
;
k
<
3
;
k
++
)
{
f
.
v
=
dx
[
k
].
v
*
w
.
v
;
pia
[
k
].
v
=
mj
.
v
*
f
.
v
;
}
/* Compute dv dot r. */
dvdr
.
v
=
(
(
vi
[
0
].
v
-
vj
[
0
].
v
)
*
dx
[
0
].
v
)
+
(
(
vi
[
1
].
v
-
vj
[
1
].
v
)
*
dx
[
1
].
v
)
+
(
(
vi
[
2
].
v
-
vj
[
2
].
v
)
*
dx
[
2
].
v
);
dvdr
.
v
=
dvdr
.
v
*
ri
.
v
;
/* Get the time derivative for u. */
piu_dt
.
v
=
mj
.
v
*
dvdr
.
v
*
wi_dr
.
v
;
/* Get the time derivative for h. */
pih_dt
.
v
=
mj
.
v
/
pjrho
.
v
*
dvdr
.
v
*
wi_dr
.
v
;
piu_dt
.
v
=
mj
.
v
*
dvdr
.
v
*
(
piPOrho2
.
v
*
wi_dr
.
v
+
vec_set1
(
0
.
125
f
)
*
Pi_ij
.
v
);
/* compute the signal velocity (this is always symmetrical). */
vi_sig
.
v
=
vec_fmax
(
vi_sig
.
v
,
cj
.
v
-
vec_set1
(
3
.
0
f
)
*
dvdr
.
v
);
vj_sig
.
v
=
vec_fmax
(
vj_sig
.
v
,
ci
.
v
-
vec_set1
(
3
.
0
f
)
*
dvdr
.
v
);
vi_sig
.
v
=
vec_fmax
(
vi_sig
.
v
,
v_sig
.
v
);
vj_sig
.
v
=
vec_fmax
(
vj_sig
.
v
,
v_sig
.
v
);
/* Store the forces back on the particles. */
for
(
k
=
0
;
k
<
VEC_SIZE
;
k
++
)
{
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment