Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
SWIFTsim
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Model registry
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
SWIFT
SWIFTsim
Commits
dcbe6d03
Commit
dcbe6d03
authored
7 years ago
by
James Willis
Browse files
Options
Downloads
Patches
Plain Diff
Help the compiler auto-vectorise the reading of cache for force interactions with hints.
parent
b759fd04
No related branches found
No related tags found
1 merge request
!440
Dopair2 vectorisation
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/cache.h
+104
-72
104 additions, 72 deletions
src/cache.h
with
104 additions
and
72 deletions
src/cache.h
+
104
−
72
View file @
dcbe6d03
...
@@ -442,7 +442,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_f
...
@@ -442,7 +442,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_f
const
double
*
const
shift
,
int
*
first_pi
,
int
*
last_pj
,
const
double
*
const
shift
,
int
*
first_pi
,
int
*
last_pj
,
const
int
num_vec_proc
)
{
const
int
num_vec_proc
)
{
int
idx
,
ci_cache_
idx
;
int
idx
;
/* Pad number of particles read to the vector size. */
/* Pad number of particles read to the vector size. */
int
rem
=
(
ci
->
count
-
*
first_pi
)
%
(
num_vec_proc
*
VEC_SIZE
);
int
rem
=
(
ci
->
count
-
*
first_pi
)
%
(
num_vec_proc
*
VEC_SIZE
);
if
(
rem
!=
0
)
{
if
(
rem
!=
0
)
{
...
@@ -460,33 +460,53 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_f
...
@@ -460,33 +460,53 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_f
int
first_pi_align
=
*
first_pi
;
int
first_pi_align
=
*
first_pi
;
int
last_pj_align
=
*
last_pj
;
int
last_pj_align
=
*
last_pj
;
const
struct
part
*
restrict
parts_i
=
ci
->
parts
;
const
struct
part
*
restrict
parts_j
=
cj
->
parts
;
double
loc
[
3
];
loc
[
0
]
=
ci
->
loc
[
0
];
loc
[
1
]
=
ci
->
loc
[
1
];
loc
[
2
]
=
ci
->
loc
[
2
];
/* Shift the particles positions to a local frame (ci frame) so single precision
/* Let the compiler know that the data is aligned and create pointers to the
* can be
* arrays inside the cache. */
* used instead of double precision. Also shift the cell ci, particles positions
swift_declare_aligned_ptr
(
float
,
x
,
ci_cache
->
x
,
SWIFT_CACHE_ALIGNMENT
);
* due to BCs but leave cell cj. */
swift_declare_aligned_ptr
(
float
,
y
,
ci_cache
->
y
,
SWIFT_CACHE_ALIGNMENT
);
#if defined(WITH_VECTORIZATION) && defined(__ICC)
swift_declare_aligned_ptr
(
float
,
z
,
ci_cache
->
z
,
SWIFT_CACHE_ALIGNMENT
);
#pragma vector aligned
swift_declare_aligned_ptr
(
float
,
h
,
ci_cache
->
h
,
SWIFT_CACHE_ALIGNMENT
);
#endif
swift_declare_aligned_ptr
(
float
,
m
,
ci_cache
->
m
,
SWIFT_CACHE_ALIGNMENT
);
for
(
int
i
=
first_pi_align
;
i
<
ci
->
count
;
i
++
)
{
swift_declare_aligned_ptr
(
float
,
vx
,
ci_cache
->
vx
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
vy
,
ci_cache
->
vy
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
vz
,
ci_cache
->
vz
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
rho
,
ci_cache
->
rho
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
grad_h
,
ci_cache
->
grad_h
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
pOrho2
,
ci_cache
->
pOrho2
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
balsara
,
ci_cache
->
balsara
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
soundspeed
,
ci_cache
->
soundspeed
,
SWIFT_CACHE_ALIGNMENT
);
int
ci_cache_count
=
ci
->
count
-
first_pi_align
;
/* Shift the particles positions to a local frame (ci frame) so single precision
* can be
* used instead of double precision. Also shift the cell ci, particles positions
* due to BCs but leave cell cj. */
for
(
int
i
=
0
;
i
<
ci_cache_count
;
i
++
)
{
/* Make sure ci_cache is filled from the first element. */
/* Make sure ci_cache is filled from the first element. */
ci_cache_idx
=
i
-
first_pi_align
;
idx
=
sort_i
[
i
].
i
;
ci_cache
->
x
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
x
[
0
]
-
ci
->
loc
[
0
]
-
shift
[
0
];
ci_cache
->
y
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
x
[
1
]
-
ci
->
loc
[
1
]
-
shift
[
1
];
ci_cache
->
z
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
x
[
2
]
-
ci
->
loc
[
2
]
-
shift
[
2
];
ci_cache
->
h
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
h
;
ci_cache
->
m
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
mass
;
ci_cache
->
vx
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
v
[
0
];
ci_cache
->
vy
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
v
[
1
];
ci_cache
->
vz
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
v
[
2
];
ci_cache
->
rho
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
rho
;
idx
=
sort_i
[
i
+
first_pi_align
].
i
;
ci_cache
->
grad_h
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
force
.
f
;
x
[
i
]
=
(
float
)(
parts_i
[
idx
].
x
[
0
]
-
loc
[
0
]
-
shift
[
0
]);
ci_cache
->
pOrho2
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
force
.
P_over_rho2
;
y
[
i
]
=
(
float
)(
parts_i
[
idx
].
x
[
1
]
-
loc
[
1
]
-
shift
[
1
]);
ci_cache
->
balsara
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
force
.
balsara
;
z
[
i
]
=
(
float
)(
parts_i
[
idx
].
x
[
2
]
-
loc
[
2
]
-
shift
[
2
]);
ci_cache
->
soundspeed
[
ci_cache_idx
]
=
ci
->
parts
[
idx
].
force
.
soundspeed
;
h
[
i
]
=
parts_i
[
idx
].
h
;
m
[
i
]
=
parts_i
[
idx
].
mass
;
vx
[
i
]
=
parts_i
[
idx
].
v
[
0
];
vy
[
i
]
=
parts_i
[
idx
].
v
[
1
];
vz
[
i
]
=
parts_i
[
idx
].
v
[
2
];
rho
[
i
]
=
parts_i
[
idx
].
rho
;
grad_h
[
i
]
=
parts_i
[
idx
].
force
.
f
;
pOrho2
[
i
]
=
parts_i
[
idx
].
force
.
P_over_rho2
;
balsara
[
i
]
=
parts_i
[
idx
].
force
.
balsara
;
soundspeed
[
i
]
=
parts_i
[
idx
].
force
.
soundspeed
;
}
}
/* Pad cache with fake particles that exist outside the cell so will not
/* Pad cache with fake particles that exist outside the cell so will not
...
@@ -494,65 +514,77 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_f
...
@@ -494,65 +514,77 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_f
float
fake_pix
=
2
.
0
f
*
ci
->
parts
[
sort_i
[
ci
->
count
-
1
].
i
].
x
[
0
];
float
fake_pix
=
2
.
0
f
*
ci
->
parts
[
sort_i
[
ci
->
count
-
1
].
i
].
x
[
0
];
for
(
int
i
=
ci
->
count
-
first_pi_align
;
for
(
int
i
=
ci
->
count
-
first_pi_align
;
i
<
ci
->
count
-
first_pi_align
+
VEC_SIZE
;
i
++
)
{
i
<
ci
->
count
-
first_pi_align
+
VEC_SIZE
;
i
++
)
{
ci_cache
->
x
[
i
]
=
fake_pix
;
x
[
i
]
=
fake_pix
;
ci_cache
->
y
[
i
]
=
1
.
f
;
y
[
i
]
=
1
.
f
;
ci_cache
->
z
[
i
]
=
1
.
f
;
z
[
i
]
=
1
.
f
;
ci_cache
->
h
[
i
]
=
1
.
f
;
h
[
i
]
=
1
.
f
;
ci_cache
->
m
[
i
]
=
1
.
f
;
m
[
i
]
=
1
.
f
;
ci_cache
->
vx
[
i
]
=
1
.
f
;
vx
[
i
]
=
1
.
f
;
ci_cache
->
vy
[
i
]
=
1
.
f
;
vy
[
i
]
=
1
.
f
;
ci_cache
->
vz
[
i
]
=
1
.
f
;
vz
[
i
]
=
1
.
f
;
ci_cache
->
rho
[
i
]
=
1
.
f
;
rho
[
i
]
=
1
.
f
;
ci_cache
->
grad_h
[
i
]
=
1
.
f
;
grad_h
[
i
]
=
1
.
f
;
ci_cache
->
pOrho2
[
i
]
=
1
.
f
;
pOrho2
[
i
]
=
1
.
f
;
ci_cache
->
balsara
[
i
]
=
1
.
f
;
balsara
[
i
]
=
1
.
f
;
ci_cache
->
soundspeed
[
i
]
=
1
.
f
;
soundspeed
[
i
]
=
1
.
f
;
}
}
#if defined(WITH_VECTORIZATION) && defined(__ICC)
/* Let the compiler know that the data is aligned and create pointers to the
#pragma vector aligned
* arrays inside the cache. */
#endif
swift_declare_aligned_ptr
(
float
,
xj
,
cj_cache
->
x
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
yj
,
cj_cache
->
y
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
zj
,
cj_cache
->
z
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
hj
,
cj_cache
->
h
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
mj
,
cj_cache
->
m
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
vxj
,
cj_cache
->
vx
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
vyj
,
cj_cache
->
vy
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
vzj
,
cj_cache
->
vz
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
rhoj
,
cj_cache
->
rho
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
grad_hj
,
cj_cache
->
grad_h
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
pOrho2j
,
cj_cache
->
pOrho2
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
balsaraj
,
cj_cache
->
balsara
,
SWIFT_CACHE_ALIGNMENT
);
swift_declare_aligned_ptr
(
float
,
soundspeedj
,
cj_cache
->
soundspeed
,
SWIFT_CACHE_ALIGNMENT
);
for
(
int
i
=
0
;
i
<=
last_pj_align
;
i
++
)
{
for
(
int
i
=
0
;
i
<=
last_pj_align
;
i
++
)
{
idx
=
sort_j
[
i
].
i
;
idx
=
sort_j
[
i
].
i
;
cj_cache
->
x
[
i
]
=
cj
->
parts
[
idx
].
x
[
0
]
-
ci
->
loc
[
0
];
xj
[
i
]
=
(
float
)(
parts
_j
[
idx
].
x
[
0
]
-
loc
[
0
]
)
;
cj_cache
->
y
[
i
]
=
cj
->
parts
[
idx
].
x
[
1
]
-
ci
->
loc
[
1
];
yj
[
i
]
=
(
float
)(
parts
_j
[
idx
].
x
[
1
]
-
loc
[
1
]
)
;
cj_cache
->
z
[
i
]
=
cj
->
parts
[
idx
].
x
[
2
]
-
ci
->
loc
[
2
];
zj
[
i
]
=
(
float
)(
parts
_j
[
idx
].
x
[
2
]
-
loc
[
2
]
)
;
cj_cache
->
h
[
i
]
=
cj
->
parts
[
idx
].
h
;
h
j
[
i
]
=
parts
_j
[
idx
].
h
;
cj_cache
->
m
[
i
]
=
cj
->
parts
[
idx
].
mass
;
m
j
[
i
]
=
parts
_j
[
idx
].
mass
;
cj_cache
->
vx
[
i
]
=
cj
->
parts
[
idx
].
v
[
0
];
vx
j
[
i
]
=
parts
_j
[
idx
].
v
[
0
];
cj_cache
->
vy
[
i
]
=
cj
->
parts
[
idx
].
v
[
1
];
vy
j
[
i
]
=
parts
_j
[
idx
].
v
[
1
];
cj_cache
->
vz
[
i
]
=
cj
->
parts
[
idx
].
v
[
2
];
vz
j
[
i
]
=
parts
_j
[
idx
].
v
[
2
];
cj_cache
->
rho
[
i
]
=
cj
->
parts
[
idx
].
rho
;
rho
j
[
i
]
=
parts
_j
[
idx
].
rho
;
cj_cache
->
grad_h
[
i
]
=
cj
->
parts
[
idx
].
force
.
f
;
grad_h
j
[
i
]
=
parts
_j
[
idx
].
force
.
f
;
cj_cache
->
pOrho2
[
i
]
=
cj
->
parts
[
idx
].
force
.
P_over_rho2
;
pOrho2
j
[
i
]
=
parts
_j
[
idx
].
force
.
P_over_rho2
;
cj_cache
->
balsara
[
i
]
=
cj
->
parts
[
idx
].
force
.
balsara
;
balsara
j
[
i
]
=
parts
_j
[
idx
].
force
.
balsara
;
cj_cache
->
soundspeed
[
i
]
=
cj
->
parts
[
idx
].
force
.
soundspeed
;
soundspeed
j
[
i
]
=
parts
_j
[
idx
].
force
.
soundspeed
;
}
}
/* Pad cache with fake particles that exist outside the cell so will not
/* Pad cache with fake particles that exist outside the cell so will not
* interact.*/
* interact.*/
float
fake_pjx
=
2
.
0
f
*
cj
->
parts
[
sort_j
[
cj
->
count
-
1
].
i
].
x
[
0
];
float
fake_pjx
=
2
.
0
f
*
cj
->
parts
[
sort_j
[
cj
->
count
-
1
].
i
].
x
[
0
];
for
(
int
i
=
last_pj_align
+
1
;
i
<
last_pj_align
+
1
+
VEC_SIZE
;
i
++
)
{
for
(
int
i
=
last_pj_align
+
1
;
i
<
last_pj_align
+
1
+
VEC_SIZE
;
i
++
)
{
cj_cache
->
x
[
i
]
=
fake_pjx
;
xj
[
i
]
=
fake_pjx
;
cj_cache
->
y
[
i
]
=
1
.
f
;
yj
[
i
]
=
1
.
f
;
cj_cache
->
z
[
i
]
=
1
.
f
;
zj
[
i
]
=
1
.
f
;
cj_cache
->
h
[
i
]
=
1
.
f
;
hj
[
i
]
=
1
.
f
;
cj_cache
->
m
[
i
]
=
1
.
f
;
cj_cache
->
vx
[
i
]
=
1
.
f
;
cj_cache
->
vy
[
i
]
=
1
.
f
;
cj_cache
->
vz
[
i
]
=
1
.
f
;
cj_cache
->
rho
[
i
]
=
1
.
f
;
cj_cache
->
grad_h
[
i
]
=
1
.
f
;
cj_cache
->
pOrho2
[
i
]
=
1
.
f
;
cj_cache
->
balsara
[
i
]
=
1
.
f
;
cj_cache
->
soundspeed
[
i
]
=
1
.
f
;
mj
[
i
]
=
1
.
f
;
vxj
[
i
]
=
1
.
f
;
vyj
[
i
]
=
1
.
f
;
vzj
[
i
]
=
1
.
f
;
rhoj
[
i
]
=
1
.
f
;
grad_hj
[
i
]
=
1
.
f
;
pOrho2j
[
i
]
=
1
.
f
;
balsaraj
[
i
]
=
1
.
f
;
soundspeedj
[
i
]
=
1
.
f
;
}
}
}
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment