Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
SWIFT
SWIFTsim
Commits
840de9bb
Commit
840de9bb
authored
Oct 29, 2017
by
Matthieu Schaller
Browse files
Merge branch 'master' into pthread_barrier
parents
c5eec7f5
b0b0b877
Changes
29
Expand all
Hide whitespace changes
Inline
Side-by-side
.gitignore
View file @
840de9bb
...
...
@@ -55,7 +55,9 @@ tests/brute_force_125_standard.dat
tests/swift_dopair_125_standard.dat
tests/brute_force_125_perturbed.dat
tests/swift_dopair_125_perturbed.dat
tests/brute_force_active.dat
tests/brute_force_pair_active.dat
tests/brute_force_dopair2_active.dat
tests/swift_dopair2_force_active.dat
tests/brute_force_periodic_BC_perturbed.dat
tests/swift_dopair_active.dat
tests/test_nonsym_density_serial.dat
...
...
README
View file @
840de9bb
...
...
@@ -20,7 +20,7 @@ Valid options are:
-C Run with cooling.
-d Dry run. Read the parameter file, allocate memory but does not read
the particles from ICs and exit before the start of time integration.
Allows user to check validy of parameter and IC files as well as memory limits.
Allows user to check valid
it
y of parameter and IC files as well as memory limits.
-D Always drift all particles even the ones far from active particles. This emulates
Gadget-[23] and GIZMO's default behaviours.
-e Enable floating-point exceptions (debugging mode).
...
...
src/cache.h
View file @
840de9bb
...
...
@@ -198,10 +198,7 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
swift_declare_aligned_ptr
(
float
,
vz
,
ci_cache
->
vz
,
SWIFT_CACHE_ALIGNMENT
);
const
struct
part
*
restrict
parts
=
ci
->
parts
;
double
loc
[
3
];
loc
[
0
]
=
ci
->
loc
[
0
];
loc
[
1
]
=
ci
->
loc
[
1
];
loc
[
2
]
=
ci
->
loc
[
2
];
const
double
loc
[
3
]
=
{
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
]};
/* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */
...
...
@@ -210,7 +207,6 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
y
[
i
]
=
(
float
)(
parts
[
i
].
x
[
1
]
-
loc
[
1
]);
z
[
i
]
=
(
float
)(
parts
[
i
].
x
[
2
]
-
loc
[
2
]);
h
[
i
]
=
parts
[
i
].
h
;
m
[
i
]
=
parts
[
i
].
mass
;
vx
[
i
]
=
parts
[
i
].
v
[
0
];
vy
[
i
]
=
parts
[
i
].
v
[
1
];
...
...
@@ -254,10 +250,7 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
SWIFT_CACHE_ALIGNMENT
);
const
struct
part
*
restrict
parts
=
ci
->
parts
;
double
loc
[
3
];
loc
[
0
]
=
ci
->
loc
[
0
];
loc
[
1
]
=
ci
->
loc
[
1
];
loc
[
2
]
=
ci
->
loc
[
2
];
const
double
loc
[
3
]
=
{
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
]};
/* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */
...
...
@@ -266,12 +259,10 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
y
[
i
]
=
(
float
)(
parts
[
i
].
x
[
1
]
-
loc
[
1
]);
z
[
i
]
=
(
float
)(
parts
[
i
].
x
[
2
]
-
loc
[
2
]);
h
[
i
]
=
parts
[
i
].
h
;
m
[
i
]
=
parts
[
i
].
mass
;
vx
[
i
]
=
parts
[
i
].
v
[
0
];
vy
[
i
]
=
parts
[
i
].
v
[
1
];
vz
[
i
]
=
parts
[
i
].
v
[
2
];
rho
[
i
]
=
parts
[
i
].
rho
;
grad_h
[
i
]
=
parts
[
i
].
force
.
f
;
pOrho2
[
i
]
=
parts
[
i
].
force
.
P_over_rho2
;
...
...
@@ -296,7 +287,6 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
* @param shift The amount to shift the particle positions to account for BCs
* @param first_pi The first particle in cell ci that is in range.
* @param last_pj The last particle in cell cj that is in range.
* @param num_vec_proc Number of vectors that will be used to process the
* interaction.
*/
__attribute__
((
always_inline
))
INLINE
void
cache_read_two_partial_cells_sorted
(
...
...
@@ -304,38 +294,39 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
struct
cache
*
restrict
const
ci_cache
,
struct
cache
*
restrict
const
cj_cache
,
const
struct
entry
*
restrict
sort_i
,
const
struct
entry
*
restrict
sort_j
,
const
double
*
restrict
const
shift
,
int
*
first_pi
,
int
*
last_pj
,
const
int
num_vec_proc
)
{
int
*
first_pi
,
int
*
last_pj
)
{
/* Make the number of particles to be read a multiple of the vector size.
* This eliminates serial remainder loops where possible when populating the
* cache. */
int
idx
;
/* Pad number of particles read to the vector size. */
int
rem
=
(
ci
->
count
-
*
first_pi
)
%
(
num_vec_proc
*
VEC_SIZE
);
/* Is the number of particles to read a multiple of the vector size? */
int
rem
=
(
ci
->
count
-
*
first_pi
)
%
VEC_SIZE
;
if
(
rem
!=
0
)
{
int
pad
=
(
num_vec_proc
*
VEC_SIZE
)
-
rem
;
int
pad
=
VEC_SIZE
-
rem
;
/* Decrease first_pi if there are particles in the cell left to read. */
if
(
*
first_pi
-
pad
>=
0
)
*
first_pi
-=
pad
;
}
rem
=
*
last_pj
%
(
num_vec_proc
*
VEC_SIZE
)
;
rem
=
(
*
last_pj
+
1
)
%
VEC_SIZE
;
if
(
rem
!=
0
)
{
int
pad
=
(
num_vec_proc
*
VEC_SIZE
)
-
rem
;
int
pad
=
VEC_SIZE
-
rem
;
/* Increase last_pj if there are particles in the cell left to read. */
if
(
*
last_pj
+
pad
<
cj
->
count
)
*
last_pj
+=
pad
;
}
int
first_pi_align
=
*
first_pi
;
int
last_pj_align
=
*
last_pj
;
/* Get some local pointers */
const
int
first_pi_align
=
*
first_pi
;
const
int
last_pj_align
=
*
last_pj
;
const
struct
part
*
restrict
parts_i
=
ci
->
parts
;
const
struct
part
*
restrict
parts_j
=
cj
->
parts
;
double
loc
[
3
];
loc
[
0
]
=
cj
->
loc
[
0
];
loc
[
1
]
=
cj
->
loc
[
1
];
loc
[
2
]
=
cj
->
loc
[
2
];
/* Shift ci particles for boundary conditions and location of cell.*/
double
total_ci_shift
[
3
];
total_ci_shift
[
0
]
=
loc
[
0
]
+
shift
[
0
];
total_ci_shift
[
1
]
=
loc
[
1
]
+
shift
[
1
];
total_ci_shift
[
2
]
=
loc
[
2
]
+
shift
[
2
];
/* Shift particles to the local frame and account for boundary conditions.*/
const
double
total_ci_shift
[
3
]
=
{
cj
->
loc
[
0
]
+
shift
[
0
],
cj
->
loc
[
1
]
+
shift
[
1
],
cj
->
loc
[
2
]
+
shift
[
2
]};
const
double
total_cj_shift
[
3
]
=
{
cj
->
loc
[
0
],
cj
->
loc
[
1
],
cj
->
loc
[
2
]};
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
...
...
@@ -349,19 +340,15 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
swift_declare_aligned_ptr
(
float
,
vz
,
ci_cache
->
vz
,
SWIFT_CACHE_ALIGNMENT
);
int
ci_cache_count
=
ci
->
count
-
first_pi_align
;
/* Shift the particles positions to a local frame (ci frame) so single
* precision
* can be
* used instead of double precision. Also shift the cell ci, particles
* positions
* due to BCs but leave cell cj. */
* precision can be used instead of double precision. */
for
(
int
i
=
0
;
i
<
ci_cache_count
;
i
++
)
{
idx
=
sort_i
[
i
+
first_pi_align
].
i
;
const
int
idx
=
sort_i
[
i
+
first_pi_align
].
i
;
x
[
i
]
=
(
float
)(
parts_i
[
idx
].
x
[
0
]
-
total_ci_shift
[
0
]);
y
[
i
]
=
(
float
)(
parts_i
[
idx
].
x
[
1
]
-
total_ci_shift
[
1
]);
z
[
i
]
=
(
float
)(
parts_i
[
idx
].
x
[
2
]
-
total_ci_shift
[
2
]);
h
[
i
]
=
parts_i
[
idx
].
h
;
m
[
i
]
=
parts_i
[
idx
].
mass
;
vx
[
i
]
=
parts_i
[
idx
].
v
[
0
];
vy
[
i
]
=
parts_i
[
idx
].
v
[
1
];
...
...
@@ -384,36 +371,42 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
"is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. x=%f, ci->width[0]=%f"
,
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
],
loc
[
0
],
loc
[
1
],
loc
[
2
],
i
,
x
[
i
],
ci
->
width
[
0
]);
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
],
cj
->
loc
[
0
],
cj
->
loc
[
1
],
cj
->
loc
[
2
],
i
,
x
[
i
],
ci
->
width
[
0
]);
if
(
y
[
i
]
>
shift_threshold_y
||
y
[
i
]
<
-
shift_threshold_y
)
error
(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d y pos "
"is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. y=%f, ci->width[1]=%f"
,
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
],
loc
[
0
],
loc
[
1
],
loc
[
2
],
i
,
y
[
i
],
ci
->
width
[
1
]);
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
],
cj
->
loc
[
0
],
cj
->
loc
[
1
],
cj
->
loc
[
2
],
i
,
y
[
i
],
ci
->
width
[
1
]);
if
(
z
[
i
]
>
shift_threshold_z
||
z
[
i
]
<
-
shift_threshold_z
)
error
(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d z pos "
"is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. z=%f, ci->width[2]=%f"
,
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
],
loc
[
0
],
loc
[
1
],
loc
[
2
],
i
,
z
[
i
],
ci
->
width
[
2
]);
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
],
cj
->
loc
[
0
],
cj
->
loc
[
1
],
cj
->
loc
[
2
],
i
,
z
[
i
],
ci
->
width
[
2
]);
}
#endif
/* Pad cache with fake particles that exist outside the cell so will not
* interact.*/
float
fake_pix
=
2
.
0
f
*
parts_i
[
sort_i
[
ci
->
count
-
1
].
i
].
x
[
0
];
* interact. We use values of the same magnitude (but negative!) as the real
* particles to avoid overflow problems. */
const
double
max_dx
=
max
(
ci
->
dx_max_part
,
cj
->
dx_max_part
);
const
float
pos_padded
[
3
]
=
{
-
(
2
.
*
ci
->
width
[
0
]
+
max_dx
),
-
(
2
.
*
ci
->
width
[
1
]
+
max_dx
),
-
(
2
.
*
ci
->
width
[
2
]
+
max_dx
)};
const
float
h_padded
=
ci
->
parts
[
0
].
h
;
for
(
int
i
=
ci
->
count
-
first_pi_align
;
i
<
ci
->
count
-
first_pi_align
+
VEC_SIZE
;
i
++
)
{
x
[
i
]
=
fake_pix
;
y
[
i
]
=
1
.
f
;
z
[
i
]
=
1
.
f
;
h
[
i
]
=
1
.
f
;
x
[
i
]
=
pos_padded
[
0
]
;
y
[
i
]
=
pos_padded
[
1
]
;
z
[
i
]
=
pos_padded
[
2
]
;
h
[
i
]
=
h_padded
;
m
[
i
]
=
1
.
f
;
vx
[
i
]
=
1
.
f
;
...
...
@@ -433,12 +426,11 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
swift_declare_aligned_ptr
(
float
,
vzj
,
cj_cache
->
vz
,
SWIFT_CACHE_ALIGNMENT
);
for
(
int
i
=
0
;
i
<=
last_pj_align
;
i
++
)
{
idx
=
sort_j
[
i
].
i
;
xj
[
i
]
=
(
float
)(
parts_j
[
idx
].
x
[
0
]
-
loc
[
0
]);
yj
[
i
]
=
(
float
)(
parts_j
[
idx
].
x
[
1
]
-
loc
[
1
]);
zj
[
i
]
=
(
float
)(
parts_j
[
idx
].
x
[
2
]
-
loc
[
2
]);
const
int
idx
=
sort_j
[
i
].
i
;
xj
[
i
]
=
(
float
)(
parts_j
[
idx
].
x
[
0
]
-
total_cj_shift
[
0
]);
yj
[
i
]
=
(
float
)(
parts_j
[
idx
].
x
[
1
]
-
total_cj_shift
[
1
]);
zj
[
i
]
=
(
float
)(
parts_j
[
idx
].
x
[
2
]
-
total_cj_shift
[
2
]);
hj
[
i
]
=
parts_j
[
idx
].
h
;
mj
[
i
]
=
parts_j
[
idx
].
mass
;
vxj
[
i
]
=
parts_j
[
idx
].
v
[
0
];
vyj
[
i
]
=
parts_j
[
idx
].
v
[
1
];
...
...
@@ -454,40 +446,228 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
"pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. xj=%f, ci->width[0]=%f"
,
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
],
loc
[
0
],
loc
[
1
],
loc
[
2
],
i
,
xj
[
i
],
ci
->
width
[
0
]);
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
],
cj
->
loc
[
0
],
cj
->
loc
[
1
],
cj
->
loc
[
2
],
i
,
xj
[
i
],
ci
->
width
[
0
]);
if
(
yj
[
i
]
>
shift_threshold_y
||
yj
[
i
]
<
-
shift_threshold_y
)
error
(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d yj "
"pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. yj=%f, ci->width[1]=%f"
,
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
],
loc
[
0
],
loc
[
1
],
loc
[
2
],
i
,
yj
[
i
],
ci
->
width
[
1
]);
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
],
cj
->
loc
[
0
],
cj
->
loc
[
1
],
cj
->
loc
[
2
],
i
,
yj
[
i
],
ci
->
width
[
1
]);
if
(
zj
[
i
]
>
shift_threshold_z
||
zj
[
i
]
<
-
shift_threshold_z
)
error
(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d zj "
"pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. zj=%f, ci->width[2]=%f"
,
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
],
loc
[
0
],
loc
[
1
],
loc
[
2
],
i
,
zj
[
i
],
ci
->
width
[
2
]);
ci
->
loc
[
0
],
ci
->
loc
[
1
],
ci
->
loc
[
2
],
cj
->
loc
[
0
],
cj
->
loc
[
1
],
cj
->
loc
[
2
],
i
,
zj
[
i
],
ci
->
width
[
2
]);
}
#endif
/* Pad cache with fake particles that exist outside the cell so will not
* interact.*/
float
fake_pjx
=
2
.
0
f
*
cj
->
parts
[
sort_j
[
cj
->
count
-
1
].
i
].
x
[
0
];
* interact. We use values of the same magnitude (but negative!) as the real
* particles to avoid overflow problems. */
const
float
pos_padded_j
[
3
]
=
{
-
(
2
.
*
cj
->
width
[
0
]
+
max_dx
),
-
(
2
.
*
cj
->
width
[
1
]
+
max_dx
),
-
(
2
.
*
cj
->
width
[
2
]
+
max_dx
)};
const
float
h_padded_j
=
cj
->
parts
[
0
].
h
;
for
(
int
i
=
last_pj_align
+
1
;
i
<
last_pj_align
+
1
+
VEC_SIZE
;
i
++
)
{
xj
[
i
]
=
fake_pjx
;
yj
[
i
]
=
1
.
f
;
zj
[
i
]
=
1
.
f
;
hj
[
i
]
=
1
.
f
;
xj
[
i
]
=
pos_padded_j
[
0
];
yj
[
i
]
=
pos_padded_j
[
1
];
zj
[
i
]
=
pos_padded_j
[
2
];
hj
[
i
]
=
h_padded_j
;
mj
[
i
]
=
1
.
f
;
vxj
[
i
]
=
1
.
f
;
vyj
[
i
]
=
1
.
f
;
vzj
[
i
]
=
1
.
f
;
}
}
/**
* @brief Populate caches by only reading particles that are within range of
* each other within the adjoining cell.Also read the particles into the cache
* in sorted order.
*
* @param ci The i #cell.
* @param cj The j #cell.
* @param ci_cache The #cache for cell ci.
* @param cj_cache The #cache for cell cj.
* @param sort_i The array of sorted particle indices for cell ci.
* @param sort_j The array of sorted particle indices for cell ci.
* @param shift The amount to shift the particle positions to account for BCs
* @param first_pi The first particle in cell ci that is in range.
* @param last_pj The last particle in cell cj that is in range.
* interaction.
*/
__attribute__
((
always_inline
))
INLINE
void
cache_read_two_partial_cells_sorted_force
(
const
struct
cell
*
const
ci
,
const
struct
cell
*
const
cj
,
struct
cache
*
const
ci_cache
,
struct
cache
*
const
cj_cache
,
const
struct
entry
*
restrict
sort_i
,
const
struct
entry
*
restrict
sort_j
,
const
double
*
const
shift
,
int
*
first_pi
,
int
*
last_pj
)
{
/* Make the number of particles to be read a multiple of the vector size.
* This eliminates serial remainder loops where possible when populating the
* cache. */
/* Is the number of particles to read a multiple of the vector size? */
int
rem
=
(
ci
->
count
-
*
first_pi
)
%
VEC_SIZE
;
if
(
rem
!=
0
)
{
int
pad
=
VEC_SIZE
-
rem
;
/* Decrease first_pi if there are particles in the cell left to read. */
if
(
*
first_pi
-
pad
>=
0
)
*
first_pi
-=
pad
;
}
rem
=
(
*
last_pj
+
1
)
%
VEC_SIZE
;
if
(
rem
!=
0
)
{
int
pad
=
VEC_SIZE
-
rem
;
/* Increase last_pj if there are particles in the cell left to read. */
if
(
*
last_pj
+
pad
<
cj
->
count
)
*
last_pj
+=
pad
;
}
/* Get some local pointers */
const
int
first_pi_align
=
*
first_pi
;
const
int
last_pj_align
=
*
last_pj
;
const
struct
part
*
restrict
parts_i
=
ci
->
parts
;
const
struct
part
*
restrict
parts_j
=
cj
->
parts
;
/* Shift particles to the local frame and account for boundary conditions.*/
const
double
total_ci_shift
[
3
]
=
{
cj
->
loc
[
0
]
+
shift
[
0
],
cj
->
loc
[
1
]
+
shift
[
1
],
cj
->
loc
[
2
]
+
shift
[
2
]};
const
double
total_cj_shift
[
3
]
=
{
cj
->
loc
[
0
],
cj
->
loc
[
1
],
cj
->
loc
[
2
]};
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
swift_declare_aligned_ptr
(
float
,
x
,
ci_cache
->
x
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
y
,
ci_cache
->
y
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
z
,
ci_cache
->
z
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
h
,
ci_cache
->
h
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
m
,
ci_cache
->
m
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
vx
,
ci_cache
->
vx
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
vy
,
ci_cache
->
vy
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
vz
,
ci_cache
->
vz
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
rho
,
ci_cache
->
rho
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
grad_h
,
ci_cache
->
grad_h
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
pOrho2
,
ci_cache
->
pOrho2
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
balsara
,
ci_cache
->
balsara
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
soundspeed
,
ci_cache
->
soundspeed
,
SWIFT_CACHE_ALIGNMENT
);
int
ci_cache_count
=
ci
->
count
-
first_pi_align
;
/* Shift the particles positions to a local frame (ci frame) so single
* precision can be used instead of double precision. */
for
(
int
i
=
0
;
i
<
ci_cache_count
;
i
++
)
{
const
int
idx
=
sort_i
[
i
+
first_pi_align
].
i
;
x
[
i
]
=
(
float
)(
parts_i
[
idx
].
x
[
0
]
-
total_ci_shift
[
0
]);
y
[
i
]
=
(
float
)(
parts_i
[
idx
].
x
[
1
]
-
total_ci_shift
[
1
]);
z
[
i
]
=
(
float
)(
parts_i
[
idx
].
x
[
2
]
-
total_ci_shift
[
2
]);
h
[
i
]
=
parts_i
[
idx
].
h
;
m
[
i
]
=
parts_i
[
idx
].
mass
;
vx
[
i
]
=
parts_i
[
idx
].
v
[
0
];
vy
[
i
]
=
parts_i
[
idx
].
v
[
1
];
vz
[
i
]
=
parts_i
[
idx
].
v
[
2
];
rho
[
i
]
=
parts_i
[
idx
].
rho
;
grad_h
[
i
]
=
parts_i
[
idx
].
force
.
f
;
pOrho2
[
i
]
=
parts_i
[
idx
].
force
.
P_over_rho2
;
balsara
[
i
]
=
parts_i
[
idx
].
force
.
balsara
;
soundspeed
[
i
]
=
parts_i
[
idx
].
force
.
soundspeed
;
}
/* Pad cache with fake particles that exist outside the cell so will not
* interact. We use values of the same magnitude (but negative!) as the real
* particles to avoid overflow problems. */
const
double
max_dx
=
max
(
ci
->
dx_max_part
,
cj
->
dx_max_part
);
const
float
pos_padded
[
3
]
=
{
-
(
2
.
*
ci
->
width
[
0
]
+
max_dx
),
-
(
2
.
*
ci
->
width
[
1
]
+
max_dx
),
-
(
2
.
*
ci
->
width
[
2
]
+
max_dx
)};
const
float
h_padded
=
ci
->
parts
[
0
].
h
;
for
(
int
i
=
ci
->
count
-
first_pi_align
;
i
<
ci
->
count
-
first_pi_align
+
VEC_SIZE
;
i
++
)
{
x
[
i
]
=
pos_padded
[
0
];
y
[
i
]
=
pos_padded
[
1
];
z
[
i
]
=
pos_padded
[
2
];
h
[
i
]
=
h_padded
;
m
[
i
]
=
1
.
f
;
vx
[
i
]
=
1
.
f
;
vy
[
i
]
=
1
.
f
;
vz
[
i
]
=
1
.
f
;
rho
[
i
]
=
1
.
f
;
grad_h
[
i
]
=
1
.
f
;
pOrho2
[
i
]
=
1
.
f
;
balsara
[
i
]
=
1
.
f
;
soundspeed
[
i
]
=
1
.
f
;
}
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
swift_declare_aligned_ptr
(
float
,
xj
,
cj_cache
->
x
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
yj
,
cj_cache
->
y
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
zj
,
cj_cache
->
z
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
hj
,
cj_cache
->
h
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
mj
,
cj_cache
->
m
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
vxj
,
cj_cache
->
vx
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
vyj
,
cj_cache
->
vy
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
vzj
,
cj_cache
->
vz
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
rhoj
,
cj_cache
->
rho
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
grad_hj
,
cj_cache
->
grad_h
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
pOrho2j
,
cj_cache
->
pOrho2
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
balsaraj
,
cj_cache
->
balsara
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
soundspeedj
,
cj_cache
->
soundspeed
,
SWIFT_CACHE_ALIGNMENT
);
for
(
int
i
=
0
;
i
<=
last_pj_align
;
i
++
)
{
const
int
idx
=
sort_j
[
i
].
i
;
xj
[
i
]
=
(
float
)(
parts_j
[
idx
].
x
[
0
]
-
total_cj_shift
[
0
]);
yj
[
i
]
=
(
float
)(
parts_j
[
idx
].
x
[
1
]
-
total_cj_shift
[
1
]);
zj
[
i
]
=
(
float
)(
parts_j
[
idx
].
x
[
2
]
-
total_cj_shift
[
2
]);
hj
[
i
]
=
parts_j
[
idx
].
h
;
mj
[
i
]
=
parts_j
[
idx
].
mass
;
vxj
[
i
]
=
parts_j
[
idx
].
v
[
0
];
vyj
[
i
]
=
parts_j
[
idx
].
v
[
1
];
vzj
[
i
]
=
parts_j
[
idx
].
v
[
2
];
rhoj
[
i
]
=
parts_j
[
idx
].
rho
;
grad_hj
[
i
]
=
parts_j
[
idx
].
force
.
f
;
pOrho2j
[
i
]
=
parts_j
[
idx
].
force
.
P_over_rho2
;
balsaraj
[
i
]
=
parts_j
[
idx
].
force
.
balsara
;
soundspeedj
[
i
]
=
parts_j
[
idx
].
force
.
soundspeed
;
}
/* Pad cache with fake particles that exist outside the cell so will not
* interact. We use values of the same magnitude (but negative!) as the real
* particles to avoid overflow problems. */
const
float
pos_padded_j
[
3
]
=
{
-
(
2
.
*
cj
->
width
[
0
]
+
max_dx
),
-
(
2
.
*
cj
->
width
[
1
]
+
max_dx
),
-
(
2
.
*
cj
->
width
[
2
]
+
max_dx
)};
const
float
h_padded_j
=
cj
->
parts
[
0
].
h
;
for
(
int
i
=
last_pj_align
+
1
;
i
<
last_pj_align
+
1
+
VEC_SIZE
;
i
++
)
{
xj
[
i
]
=
pos_padded_j
[
0
];
yj
[
i
]
=
pos_padded_j
[
1
];
zj
[
i
]
=
pos_padded_j
[
2
];
hj
[
i
]
=
h_padded_j
;
mj
[
i
]
=
1
.
f
;
vxj
[
i
]
=
1
.
f
;
vyj
[
i
]
=
1
.
f
;
vzj
[
i
]
=
1
.
f
;
rhoj
[
i
]
=
1
.
f
;
grad_hj
[
i
]
=
1
.
f
;
pOrho2j
[
i
]
=
1
.
f
;
balsaraj
[
i
]
=
1
.
f
;
soundspeedj
[
i
]
=
1
.
f
;
}
}
...
...
@@ -506,6 +686,11 @@ static INLINE void cache_clean(struct cache *c) {
free
(
c
->
vz
);
free
(
c
->
h
);
free
(
c
->
max_index
);
free
(
c
->
rho
);
free
(
c
->
grad_h
);
free
(
c
->
pOrho2
);
free
(
c
->
balsara
);
free
(
c
->
soundspeed
);
}
}
...
...
src/hydro/Gadget2/hydro_iact.h
View file @
840de9bb
...
...
@@ -170,17 +170,15 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
mask_t
mask
)
{
vector
r
,
ri
,
ui
,
wi
,
wi_dx
;
vector
mj
;
vector
dvx
,
dvy
,
dvz
;
vector
vjx
,
vjy
,
vjz
;
vector
dvdr
;
vector
curlvrx
,
curlvry
,
curlvrz
;
/* Fill the vectors. */
mj
.
v
=
vec_load
(
Mj
);
vjx
.
v
=
vec_load
(
Vjx
);
vjy
.
v
=
vec_load
(
Vjy
);
vjz
.
v
=
vec_load
(
Vjz
);
const
vector
mj
=
vec
tor
_load
(
Mj
);
const
vector
vjx
=
vec
tor
_load
(
Vjx
);
const
vector
vjy
=
vec
tor
_load
(
Vjy
);
const
vector
vjz
=
vec
tor
_load
(
Vjz
);
/* Get the radius and inverse radius. */
ri
=
vec_reciprocal_sqrt
(
*
r2
);
...
...
@@ -245,38 +243,34 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz,
vector
*
curlvySum
,
vector
*
curlvzSum
,
mask_t
mask
,
mask_t
mask2
,
short
mask_cond
)
{
vector
r
,
ri
,
r2
,
ui
,
wi
,
wi_dx
;
vector
mj
;
vector
dx
,
dy
,
dz
,
dvx
,
dvy
,
dvz
;
vector
vjx
,
vjy
,
vjz
;
vector
r
,
ri
,
ui
,
wi
,
wi_dx
;
vector
dvx
,
dvy
,
dvz
;
vector
dvdr
;
vector
curlvrx
,
curlvry
,
curlvrz
;
vector
r_2
,
ri2
,
r2_2
,
ui2
,
wi2
,
wi_dx2
;
vector
mj2
;
vector
dx2
,
dy2
,
dz2
,
dvx2
,
dvy2
,
dvz2
;
vector
vjx2
,
vjy2
,
vjz2
;
vector
r_2
,
ri2
,
ui2
,
wi2
,
wi_dx2
;
vector
dvx2
,
dvy2
,
dvz2
;
vector
dvdr2
;
vector
curlvrx2
,
curlvry2
,
curlvrz2
;
/* Fill the vectors. */
mj
.
v
=
vec_load
(
Mj
);
mj2
.
v
=
vec_load
(
&
Mj
[
VEC_SIZE
]);
vjx
.
v
=
vec_load
(
Vjx
);
vjx2
.
v
=
vec_load
(
&
Vjx
[
VEC_SIZE
]);
vjy
.
v
=
vec_load
(
Vjy
);
vjy2
.
v
=
vec_load
(
&
Vjy
[
VEC_SIZE
]);
vjz
.
v
=
vec_load
(
Vjz
);
vjz2
.
v
=
vec_load
(
&
Vjz
[
VEC_SIZE
]);
dx
.
v
=
vec_load
(
Dx
);
dx2
.
v
=
vec_load
(
&
Dx
[
VEC_SIZE
]);
dy
.
v
=
vec_load
(
Dy
);
dy2
.
v
=
vec_load
(
&
Dy
[
VEC_SIZE
]);
dz
.
v
=
vec_load
(
Dz
);
dz2
.
v
=
vec_load
(
&
Dz
[
VEC_SIZE
]);
const
vector
mj
=
vec
tor
_load
(
Mj
);
const
vector
mj2
=
vec
tor
_load
(
&
Mj
[
VEC_SIZE
]);
const
vector
vjx
=
vec
tor
_load
(
Vjx
);
const
vector
vjx2
=
vec
tor
_load
(
&
Vjx
[
VEC_SIZE
]);
const
vector
vjy
=
vec
tor
_load
(
Vjy
);
const
vector
vjy2
=
vec
tor
_load
(
&
Vjy
[
VEC_SIZE
]);
const
vector
vjz
=
vec
tor
_load
(
Vjz
);
const
vector
vjz2
=
vec
tor
_load
(
&
Vjz
[
VEC_SIZE
]);
const
vector
dx
=
vec
tor
_load
(
Dx
);
const
vector
dx2
=
vec
tor
_load
(
&
Dx
[
VEC_SIZE
]);
const
vector
dy
=
vec
tor
_load
(
Dy
);
const
vector
dy2
=
vec
tor
_load
(
&
Dy
[
VEC_SIZE
]);
const
vector
dz
=
vec
tor
_load
(
Dz
);
const
vector
dz2
=
vec
tor
_load
(
&
Dz
[
VEC_SIZE
]);
/* Get the radius and inverse radius. */
r2
.
v
=
vec_load
(
R2
);
r2_2
.
v
=
vec_load
(
&
R2
[
VEC_SIZE
]);
const
vector
r2
=
vec
tor
_load
(
R2
);
const
vector
r2_2
=
vec
tor
_load
(
&
R2
[
VEC_SIZE
]);
ri
=
vec_reciprocal_sqrt
(
r2
);
ri2
=
vec_reciprocal_sqrt
(
r2_2
);
r
.
v
=
vec_mul
(
r2
.
v
,
ri
.
v
);
...
...
@@ -592,30 +586,29 @@ runner_iact_nonsym_1_vec_force(
#ifdef WITH_VECTORIZATION
vector
r
,
ri
;
vector
vjx
,
vjy
,
vjz
,
dvx
,
dvy
,
dvz
;
vector
pjrho
,
grad_hj
,
pjPOrho2
,
balsara_j
,
cj
,
mj
;
vector
dvx
,
dvy
,
dvz
;
vector
xi
,
xj
;
vector
hid_inv
,
hjd_inv
;
vector
wi_dx
,
wj_dx
,
wi_dr
,
wj_dr
,
dvdr
;
vector
piax
,
piay
,
piaz
;
vector
pih_dt
;
vector
v_sig
;
vector
omega_ij
,
mu_ij
,
fac_mu
,
balsara
;
vector
omega_ij
,
mu_ij
,
balsara
;
vector
rho_ij
,
visc
,
visc_term
,
sph_term
,
acc
,
entropy_dt
;
/* Fill vectors. */
vjx
.
v
=
vec_load
(
Vjx
);
vjy
.
v
=
vec_load
(
Vjy
);
vjz
.
v
=
vec_load
(
Vjz
);
mj
.
v
=
vec_load
(
Mj
);
pjrho
.
v
=
vec_load
(
Pjrho
);
grad_hj
.
v
=
vec_load
(
Grad_hj
);
pjPOrho2
.
v
=
vec_load
(
PjPOrho2
);