Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
SWIFT
SWIFTsim
Commits
6f6f3f54
Commit
6f6f3f54
authored
Feb 12, 2013
by
Pedro Gonnet
Browse files
several fixes to dopair2 and doself2 for multiple time-stepping.
Former-commit-id: fa77a1b6804434b679dc88ffed39f6b96ceaa5f4
parent
3f62f44a
Changes
10
Hide whitespace changes
Inline
Side-by-side
src/Makefile.am
View file @
6f6f3f54
...
...
@@ -20,11 +20,11 @@
AUTOMAKE_OPTIONS
=
gnu
# Add the debug flag to the whole thing
# AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
# -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
# -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
AM_CFLAGS
=
-Wall
-Werror
$(OPENMP_CFLAGS)
\
AM_CFLAGS
=
-g
-O3
-Wall
-Werror
-ffast-math
-fstrict-aliasing
-ftree-vectorize
\
-funroll-loops
$(SIMD_FLAGS)
$(OPENMP_CFLAGS)
\
-DTIMER
-DCOUNTER
-DCPU_TPS
=
2.67e9
# AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \
# -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
# Assign a "safe" version number
AM_LDFLAGS
=
$(LAPACK_LIBS)
$(BLAS_LIBS)
$(HDF5_LDFLAGS)
-version-info
0:0:0
...
...
src/debug.c
View file @
6f6f3f54
/*******************************************************************************
* This file is part of SWIFT.
* Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
* Coypright (c) 2013 Matthieu Schaller (matthieu.schaller@durham.ac.uk),
* Pedro Gonnet (pedro.gonnet@durham.ac.uk).
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
...
...
@@ -29,7 +30,7 @@ void printParticle ( struct part *parts , long long int id ) {
/* Look for the particle. */
for
(
i
=
0
;
parts
[
i
].
id
!=
id
;
i
++
);
printf
(
"## Particle[%d]: id=%lld, x=
(
%f,%f,%f
)
, v=
(
%f,%f,%f
)
, a=
(
%f,%f,%f
)
, h=%
f
, h_dt=%
f
, wcount=%f, m=%
f
, rho=%f, u=%f, dudt=%f, dt=%.3e
\n
"
,
printf
(
"## Particle[%d]: id=%lld, x=
[
%f,%f,%f
]
, v=
[
%f,%f,%f
]
, a=
[
%f,%f,%f
]
, h=%
.3e
, h_dt=%
.3e
, wcount=%f, m=%
.3e
, rho=%f, u=%f, dudt=%f, dt=%.3e
\n
"
,
i
,
parts
[
i
].
id
,
parts
[
i
].
x
[
0
],
parts
[
i
].
x
[
1
],
parts
[
i
].
x
[
2
],
...
...
src/engine.c
View file @
6f6f3f54
...
...
@@ -66,7 +66,7 @@ void engine_prepare ( struct engine *e ) {
int
j
,
k
,
qid
;
struct
space
*
s
=
e
->
s
;
struct
queue
*
q
;
float
dt_
max
=
e
->
dt_
max
;
float
dt_
step
=
e
->
dt_
step
;
TIMER_TIC
...
...
@@ -95,7 +95,7 @@ void engine_prepare ( struct engine *e ) {
// tic = getticks();
#pragma omp parallel for schedule(static)
for
(
k
=
0
;
k
<
s
->
nr_parts
;
k
++
)
if
(
s
->
parts
[
k
].
dt
<=
dt_
max
)
{
if
(
s
->
parts
[
k
].
dt
<=
dt_
step
)
{
s
->
parts
[
k
].
wcount
=
0
.
0
f
;
s
->
parts
[
k
].
wcount_dh
=
0
.
0
f
;
s
->
parts
[
k
].
rho
=
0
.
0
f
;
...
...
@@ -179,71 +179,70 @@ void engine_step ( struct engine *e , int sort_queues ) {
int
k
,
nr_parts
=
e
->
s
->
nr_parts
;
struct
part
*
restrict
parts
=
e
->
s
->
parts
,
*
restrict
p
;
floa
t
*
restrict
v_bar
;
float
dt
=
e
->
dt
,
hdt
=
0
.
5
*
dt
,
dt_max
,
dt_min
,
ldt_min
,
ldt_max
;
double
e
t
ot
=
0
.
0
,
le
t
ot
,
lmom
[
3
],
mom
[
3
]
=
{
0
.
0
,
0
.
0
,
0
.
0
};
int
threadID
,
nthreads
;
struct
xpar
t
*
restrict
xp
;
float
dt
=
e
->
dt
,
hdt
=
0
.
5
*
dt
,
dt_step
,
dt_max
,
dt_min
,
ldt_min
,
ldt_max
;
double
e
p
ot
=
0
.
0
,
ekin
=
0
.
0
,
le
p
ot
,
lekin
,
lmom
[
3
],
mom
[
3
]
=
{
0
.
0
,
0
.
0
,
0
.
0
};
int
threadID
,
nthreads
,
count
=
0
,
lcount
;
// #ifdef __SSE2__
// VEC_MACRO(4,float) hdtv = _mm_set1_ps( hdt );
// #endif
/* Get the maximum dt. */
dt_
max
=
2
.
0
f
*
dt
;
dt_
step
=
2
.
0
f
*
dt
;
for
(
k
=
0
;
k
<
32
&&
(
e
->
step
&
(
1
<<
k
))
==
0
;
k
++
)
dt_
max
*=
2
;
dt_max
=
1
;
dt_
step
*=
2
;
// dt_step = FLT_MAX
;
/* Set the maximum dt. */
e
->
dt_
max
=
dt_
max
;
e
->
s
->
dt_
max
=
dt_
max
;
printf
(
"engine_step: dt_
max
set to %.3e.
\n
"
,
dt_
max
);
fflush
(
stdout
);
e
->
dt_
step
=
dt_
step
;
e
->
s
->
dt_
step
=
dt_
step
;
printf
(
"engine_step: dt_
step
set to %.3e.
\n
"
,
dt_
step
);
fflush
(
stdout
);
/* Allocate a buffer for the old velocities. */
if
(
posix_memalign
(
(
void
**
)
&
v_bar
,
16
,
sizeof
(
float
)
*
nr_parts
*
4
)
!=
0
)
error
(
"Failed to allocate v_old buffer."
);
/* First kick. */
TIMER_TIC
//
#pragma omp parallel for schedule(static) private(p)
#pragma omp parallel for schedule(static) private(p
,xp
)
for
(
k
=
0
;
k
<
nr_parts
;
k
++
)
{
/* Get a handle on the part. */
p
=
&
parts
[
k
];
xp
=
p
->
xtras
;
/* Step and store the velocity and internal energy. */
// #ifdef __SSE__
// _mm_store_ps( &v_bar[4*k] ,
_mm_add_ps(
_mm_load_ps( &p->v[0] )
, _mm_mul_ps(
hdtv
,
_mm_load_ps( &p->a[0] )
) )
);
// _mm_store_ps( &v_bar[4*k] , _mm_load_ps( &p->v[0] )
+
hdtv
*
_mm_load_ps( &p->a[0] ) );
// #else
v_bar
[
4
*
k
+
0
]
=
p
->
v
[
0
]
+
hdt
*
p
->
a
[
0
];
v_bar
[
4
*
k
+
1
]
=
p
->
v
[
1
]
+
hdt
*
p
->
a
[
1
];
v_bar
[
4
*
k
+
2
]
=
p
->
v
[
2
]
+
hdt
*
p
->
a
[
2
];
xp
->
v_old
[
0
]
=
p
->
v
[
0
]
+
hdt
*
p
->
a
[
0
];
xp
->
v_old
[
1
]
=
p
->
v
[
1
]
+
hdt
*
p
->
a
[
1
];
xp
->
v_old
[
2
]
=
p
->
v
[
2
]
+
hdt
*
p
->
a
[
2
];
// #endif
v_bar
[
4
*
k
+
3
]
=
p
->
u
+
hdt
*
p
->
u_dt
;
// xp->u_old = fmaxf( p->u + hdt * p->u_dt , FLT_EPSILON );
xp
->
u_old
=
p
->
u
+
hdt
*
p
->
u_dt
;
/* Move the particles with the velocitie at the half-step. */
p
->
x
[
0
]
+=
dt
*
v_bar
[
4
*
k
+
0
];
p
->
x
[
1
]
+=
dt
*
v_bar
[
4
*
k
+
1
];
p
->
x
[
2
]
+=
dt
*
v_bar
[
4
*
k
+
2
];
p
->
x
[
0
]
+=
dt
*
xp
->
v_old
[
0
];
p
->
x
[
1
]
+=
dt
*
xp
->
v_old
[
1
];
p
->
x
[
2
]
+=
dt
*
xp
->
v_old
[
2
];
/* Update positions and energies at the half-step. */
p
->
v
[
0
]
+=
dt
*
p
->
a
[
0
];
p
->
v
[
1
]
+=
dt
*
p
->
a
[
1
];
p
->
v
[
2
]
+=
dt
*
p
->
a
[
2
];
p
->
u
*=
expf
(
p
->
u_dt
/
p
->
u
*
dt
);
//
p->h *= expf( p->h_dt / p->h * dt );
p
->
h
*=
expf
(
p
->
h_dt
/
p
->
h
*
dt
);
/* Integrate other values if this particle will not be updated. */
//
if ( p->dt > dt_
max
) {
//
p->rho *= expf( -3.0f * p->h_dt / p->h * dt );
//
p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f );
//
}
if
(
p
->
dt
>
dt_
step
)
{
p
->
rho
*=
expf
(
-
3
.
0
f
*
p
->
h_dt
/
p
->
h
*
dt
);
p
->
POrho2
=
p
->
u
*
(
const_gamma
-
1
.
0
f
)
/
(
p
->
rho
+
p
->
h
*
p
->
rho_dh
/
3
.
0
f
);
}
}
TIMER_TOC
(
timer_kick1
);
// for(k=0; k<10; ++k)
// printParticle(parts, k);
// printParticle( parts , 494849 );
/* Prepare the space. */
engine_prepare
(
e
);
...
...
@@ -274,44 +273,52 @@ void engine_step ( struct engine *e , int sort_queues ) {
// for(k=0; k<10; ++k)
// printParticle(parts, k);
// printParticle( parts , 494849 );
/* Second kick. */
TIMER_TIC_ND
dt_min
=
FLT_MAX
;
dt_max
=
0
.
0
f
;
#pragma omp parallel private(p,k,ldt_min,ldt_max,lmom,le
t
ot,threadID,nthreads)
#pragma omp parallel private(p,
xp,
k,ldt_min,
lcount,
ldt_max,lmom,le
kin,lep
ot,threadID,nthreads)
{
threadID
=
omp_get_thread_num
();
nthreads
=
omp_get_num_threads
();
ldt_min
=
FLT_MAX
;
ldt_max
=
0
.
0
f
;
lmom
[
0
]
=
0
.
0
;
lmom
[
1
]
=
0
.
0
;
lmom
[
2
]
=
0
.
0
;
le
tot
=
0
.
0
;
le
kin
=
0
.
0
;
lepot
=
0
.
0
;
lcount
=
0
;
for
(
k
=
nr_parts
*
threadID
/
nthreads
;
k
<
nr_parts
*
(
threadID
+
1
)
/
nthreads
;
k
++
)
{
/* Get a handle on the part. */
p
=
&
parts
[
k
];
xp
=
p
->
xtras
;
/* Scale the derivatives if they're freshly computed. */
if
(
p
->
dt
<=
dt_
max
)
{
if
(
p
->
dt
<=
dt_
step
)
{
p
->
u_dt
*=
p
->
POrho2
;
p
->
h_dt
*=
p
->
h
*
0
.
333333333
f
;
lcount
+=
1
;
}
/* Update the particle's time step. */
p
->
dt
=
const_cfl
*
p
->
h
/
(
p
->
c
+
p
->
v_sig
);
/* Update positions and energies at the half-step. */
// #ifdef __SSE__
// _mm_store_ps( &p->v[0] ,
_mm_add_ps(
_mm_load_ps( &v_bar[4*k] )
, _mm_mul_ps(
hdtv
,
_mm_load_ps( &p->a[0] )
) )
);
// _mm_store_ps( &p->v[0] , _mm_load_ps( &v_bar[4*k] )
+
hdtv
*
_mm_load_ps( &p->a[0] ) );
// #else
p
->
v
[
0
]
=
v_bar
[
4
*
k
+
0
]
+
hdt
*
p
->
a
[
0
];
p
->
v
[
1
]
=
v_bar
[
4
*
k
+
1
]
+
hdt
*
p
->
a
[
1
];
p
->
v
[
2
]
=
v_bar
[
4
*
k
+
2
]
+
hdt
*
p
->
a
[
2
];
p
->
v
[
0
]
=
xp
->
v_old
[
0
]
+
hdt
*
p
->
a
[
0
];
p
->
v
[
1
]
=
xp
->
v_old
[
1
]
+
hdt
*
p
->
a
[
1
];
p
->
v
[
2
]
=
xp
->
v_old
[
2
]
+
hdt
*
p
->
a
[
2
];
// #endif
p
->
u
=
v_bar
[
4
*
k
+
3
]
+
hdt
*
p
->
u_dt
;
// p->u = fmaxf( xp->u_old + hdt * p->u_dt , FLT_EPSILON );
p
->
u
=
xp
->
u_old
+
hdt
*
p
->
u_dt
;
/* Get the smallest/largest dt. */
ldt_min
=
fminf
(
ldt_min
,
p
->
dt
);
ldt_max
=
fmaxf
(
ldt_max
,
p
->
dt
);
/* Collect total energy. */
letot
+=
0
.
5
*
p
->
mass
*
(
p
->
v
[
0
]
*
p
->
v
[
0
]
+
p
->
v
[
1
]
*
p
->
v
[
1
]
+
p
->
v
[
2
]
*
p
->
v
[
2
]
)
+
p
->
mass
*
p
->
u
;
lekin
+=
0
.
5
*
p
->
mass
*
(
p
->
v
[
0
]
*
p
->
v
[
0
]
+
p
->
v
[
1
]
*
p
->
v
[
1
]
+
p
->
v
[
2
]
*
p
->
v
[
2
]
);
lepot
+=
p
->
mass
*
p
->
u
;
/* Collect momentum */
lmom
[
0
]
+=
p
->
mass
*
p
->
v
[
0
];
...
...
@@ -326,31 +333,33 @@ void engine_step ( struct engine *e , int sort_queues ) {
mom
[
0
]
+=
lmom
[
0
];
mom
[
1
]
+=
lmom
[
1
];
mom
[
2
]
+=
lmom
[
2
];
etot
+=
letot
;
epot
+=
lepot
;
ekin
+=
lekin
;
count
+=
lcount
;
}
}
TIMER_TOC
(
timer_kick2
);
e
->
dt_min
=
dt_min
;
printf
(
"engine_step: dt_min/dt_max is %e/%e.
\n
"
,
dt_min
,
dt_max
);
fflush
(
stdout
);
printf
(
"engine_step: etot is %e
.
\n
"
,
et
ot
);
fflush
(
stdout
);
printf
(
"engine_step: etot is %e
(ekin=%e, epot=%e).
\n
"
,
ekin
+
epot
,
ekin
,
ep
ot
);
fflush
(
stdout
);
printf
(
"engine_step: total momentum is [ %e , %e , %e ].
\n
"
,
mom
[
0
]
,
mom
[
1
]
,
mom
[
2
]
);
fflush
(
stdout
);
printf
(
"engine_step: updated %i parts (dt_step=%.3e).
\n
"
,
count
,
dt_step
);
fflush
(
stdout
);
/* Increase the step counter. */
e
->
step
+=
1
;
/* Does the time step need adjusting? */
if
(
dt_min
<
e
->
dt
)
{
while
(
dt_min
<
e
->
dt
)
{
e
->
dt
*=
0
.
5
;
e
->
step
*=
2
;
printf
(
"engine_step: dt_min dropped below time step, adjusting to dt=%e.
\n
"
,
e
->
dt
);
}
else
if
(
dt_min
>
2
*
e
->
dt
)
{
while
(
dt_min
>
2
*
e
->
dt
&&
(
e
->
step
&
1
)
==
0
)
{
e
->
dt
*=
2
.
0
;
e
->
step
/=
2
;
printf
(
"engine_step: dt_min is larger than twice the time step, adjusting to dt=%e.
\n
"
,
e
->
dt
);
}
/* Clean up. */
free
(
v_bar
);
/* Increase the step counter. */
e
->
step
+=
1
;
}
...
...
src/engine.h
View file @
6f6f3f54
...
...
@@ -50,8 +50,10 @@ struct engine {
/* The queues. */
struct
queue
*
queues
;
/* The maximum dt to step. */
float
dt_max
;
/* The maximum dt to step (current). */
float
dt_step
;
/* The minimum dt over all particles in the system. */
float
dt_min
;
/* The system time step. */
...
...
src/part.h
View file @
6f6f3f54
...
...
@@ -38,6 +38,21 @@ struct cpart {
}
__attribute__
((
aligned
(
32
)));
/* Extra particle data not needed during the computation. */
struct
xpart
{
/* Old position, at last tree rebuild. */
double
x_old
[
3
];
/* Old velocity. */
float
v_old
[
3
];
/* Old entropy. */
float
u_old
;
}
__attribute__
((
aligned
(
32
)));
/* Data of a single particle. */
struct
part
{
...
...
@@ -66,6 +81,9 @@ struct part {
/* Particle acceleration. */
float
a
[
3
]
__attribute__
((
aligned
(
16
)));
/* Maximum neighbouring u. */
float
c
,
v_sig
;
/* Derivative of the density with respect to this particle's smoothing length. */
float
rho_dh
;
...
...
@@ -80,8 +98,8 @@ struct part {
/* Particle ID. */
unsigned
long
long
id
;
/*
Old position, at last tree rebuild
. */
double
x_old
[
3
]
;
/*
Pointer to extra particle data
. */
struct
xpart
*
xtras
;
/* Particle position. */
double
x
[
3
];
...
...
src/runner.c
View file @
6f6f3f54
...
...
@@ -329,8 +329,8 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
struct
cell
*
finger
;
int
i
,
k
,
redo
,
count
=
c
->
count
;
int
*
pid
;
float
ihg
,
ihg2
;
float
dt_
max
=
r
->
e
->
dt_
max
;
float
ihg
,
ihg2
,
h_corr
;
float
dt_
step
=
r
->
e
->
dt_
step
;
TIMER_TIC
/* Recurse? */
...
...
@@ -361,7 +361,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
cp
=
&
c
->
cparts
[
pid
[
i
]
];
/* Is this part within the timestep? */
if
(
cp
->
dt
<=
dt_
max
)
{
if
(
cp
->
dt
<=
dt_
step
)
{
/* Adjust the computed rho. */
ihg
=
kernel_igamma
/
p
->
h
;
...
...
@@ -370,8 +370,15 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
p
->
rho_dh
*=
ihg2
*
ihg2
;
p
->
wcount
+=
kernel_wroot
;
/* Update the smoothing length. */
p
->
h
-=
(
p
->
wcount
-
const_nwneigh
)
/
p
->
wcount_dh
;
/* Compute the smoothing length update (Newton step). */
h_corr
=
(
const_nwneigh
-
p
->
wcount
)
/
p
->
wcount_dh
;
/* Truncate to the range [ -p->h/2 , p->h ]. */
h_corr
=
fminf
(
h_corr
,
p
->
h
);
h_corr
=
fmaxf
(
h_corr
,
-
p
->
h
/
2
);
/* Apply the correction to p->h. */
p
->
h
+=
h_corr
;
cp
->
h
=
p
->
h
;
/* Did we get the right number density? */
...
...
@@ -389,8 +396,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
}
/* Compute this particle's time step. */
p
->
dt
=
const_cfl
*
p
->
h
/
sqrtf
(
const_gamma
*
(
const_gamma
-
1
.
0
f
)
*
p
->
u
);
cp
->
dt
=
p
->
dt
;
p
->
c
=
sqrtf
(
const_gamma
*
(
const_gamma
-
1
.
0
f
)
*
p
->
u
);
/* Compute the pressure. */
// p->P = p->rho * p->u * ( const_gamma - 1.0f );
...
...
@@ -405,6 +411,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
/* Reset the time derivatives. */
p
->
u_dt
=
0
.
0
f
;
p
->
h_dt
=
0
.
0
f
;
p
->
v_sig
=
0
.
0
f
;
}
...
...
src/runner_doiact.h
View file @
6f6f3f54
...
...
@@ -106,7 +106,7 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r
struct
cpart
*
restrict
cpi
,
*
restrict
cparts_i
=
ci
->
cparts
;
double
pix
[
3
];
float
dx
[
3
],
hi
,
hi2
,
r2
;
float
dt_
max
=
e
->
dt_
max
;
float
dt_
step
=
e
->
dt_
step
;
#ifdef VECTORIZE
int
icount
=
0
;
float
r2q
[
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
...
...
@@ -118,7 +118,7 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r
TIMER_TIC
/* Anything to do here? */
if
(
ci
->
dt_min
>
dt_
max
&&
cj
->
dt_min
>
dt_
max
)
if
(
ci
->
dt_min
>
dt_
step
&&
cj
->
dt_min
>
dt_
step
)
return
;
/* Get the relative distance between the pairs, wrapping. */
...
...
@@ -215,7 +215,7 @@ void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) {
struct
cpart
*
restrict
cpi
,
*
restrict
cpj
,
*
restrict
cparts
=
c
->
cparts
;
double
pix
[
3
];
float
dx
[
3
],
hi
,
hi2
,
r2
;
float
dt_
max
=
r
->
e
->
dt_
max
;
float
dt_
step
=
r
->
e
->
dt_
step
;
#ifdef VECTORIZE
int
icount
=
0
;
float
r2q
[
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
...
...
@@ -227,7 +227,7 @@ void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) {
TIMER_TIC
/* Anything to do here? */
if
(
c
->
dt_min
>
dt_
max
)
if
(
c
->
dt_min
>
dt_
step
)
return
;
/* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
...
...
@@ -638,7 +638,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
double
hi_max
,
hj_max
;
double
di_max
,
dj_min
;
int
count_i
,
count_j
;
float
dt_
max
=
e
->
dt_
max
;
float
dt_
step
=
e
->
dt_
step
;
#ifdef VECTORIZE
int
icount
=
0
;
float
r2q
[
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
...
...
@@ -650,7 +650,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
TIMER_TIC
/* Anything to do here? */
if
(
ci
->
dt_min
>
dt_
max
&&
cj
->
dt_min
>
dt_
max
)
if
(
ci
->
dt_min
>
dt_
step
&&
cj
->
dt_min
>
dt_
step
)
return
;
/* Get the sort ID. */
...
...
@@ -722,7 +722,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
/* Get a hold of the ith part in ci. */
pi
=
&
parts_i
[
sort_i
[
pid
].
i
];
cpi
=
&
cparts_i
[
sort_i
[
pid
].
i
];
if
(
cpi
->
dt
>
dt_
max
)
if
(
cpi
->
dt
>
dt_
step
)
continue
;
hi
=
cpi
->
h
;
di
=
sort_i
[
pid
].
d
+
hi
-
rshift
;
...
...
@@ -789,7 +789,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
/* Get a hold of the jth part in cj. */
pj
=
&
parts_j
[
sort_j
[
pjd
].
i
];
cpj
=
&
cparts_j
[
sort_j
[
pjd
].
i
];
if
(
cpj
->
dt
>
dt_
max
)
if
(
cpj
->
dt
>
dt_
step
)
continue
;
hj
=
cpj
->
h
;
dj
=
sort_j
[
pjd
].
d
-
hj
-
rshift
;
...
...
@@ -879,19 +879,25 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
double
hi_max
,
hj_max
;
double
di_max
,
dj_min
;
int
count_i
,
count_j
;
float
dt_
max
=
e
->
dt_
max
;
float
dt_
step
=
e
->
dt_
step
;
#ifdef VECTORIZE
int
icount
=
0
;
float
r2q
[
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
float
hiq
[
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
float
hjq
[
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
float
dxq
[
3
*
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
struct
part
*
piq
[
VEC_SIZE
],
*
pjq
[
VEC_SIZE
];
int
icount1
=
0
;
float
r2q1
[
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
float
hiq1
[
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
float
hjq1
[
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
float
dxq1
[
3
*
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
struct
part
*
piq1
[
VEC_SIZE
],
*
pjq1
[
VEC_SIZE
];
int
icount2
=
0
;
float
r2q2
[
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
float
hiq2
[
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
float
hjq2
[
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
float
dxq2
[
3
*
VEC_SIZE
]
__attribute__
((
aligned
(
16
)));
struct
part
*
piq2
[
VEC_SIZE
],
*
pjq2
[
VEC_SIZE
];
#endif
TIMER_TIC
/* Anything to do here? */
if
(
ci
->
dt_min
>
dt_
max
&&
cj
->
dt_min
>
dt_
max
)
if
(
ci
->
dt_min
>
dt_
step
&&
cj
->
dt_min
>
dt_
step
)
return
;
/* Get the shift ID. */
...
...
@@ -931,28 +937,28 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
dj_min
=
sort_j
[
0
].
d
;
/* Collect the number of parts left and right below dt. */
if
(
c
j
->
dt_m
in
>
dt_max
)
{
if
(
c
i
->
dt_m
ax
<=
dt_step
)
{
sortdt_i
=
sort_i
;
countdt_i
=
count_i
;
}
else
if
(
c
j
->
dt_m
ax
>
dt_max
)
{
else
if
(
c
i
->
dt_m
in
<=
dt_step
)
{
if
(
(
sortdt_i
=
(
struct
entry
*
)
alloca
(
sizeof
(
struct
entry
)
*
count_i
)
)
==
NULL
)
error
(
"Failed to allocate dt sortlists."
);
for
(
k
=
0
;
k
<
count_i
;
k
++
)
if
(
cparts_i
[
sort_i
[
k
].
i
].
dt
<=
dt_
max
)
{
if
(
cparts_i
[
sort_i
[
k
].
i
].
dt
<=
dt_
step
)
{
sortdt_i
[
countdt_i
]
=
sort_i
[
k
];
countdt_i
+=
1
;
}
}
if
(
c
i
->
dt_m
in
>
dt_max
)
{
if
(
c
j
->
dt_m
ax
<=
dt_step
)
{
sortdt_j
=
sort_j
;
countdt_j
=
count_j
;
}
else
if
(
c
i
->
dt_m
ax
>
dt_max
)
{
else
if
(
c
j
->
dt_m
in
<=
dt_step
)
{
if
(
(
sortdt_j
=
(
struct
entry
*
)
alloca
(
sizeof
(
struct
entry
)
*
count_j
)
)
==
NULL
)
error
(
"Failed to allocate dt sortlists."
);
for
(
k
=
0
;
k
<
count_j
;
k
++
)
if
(
cparts_j
[
sort_j
[
k
].
i
].
dt
<=
dt_
max
)
{
if
(
cparts_j
[
sort_j
[
k
].
i
].
dt
<=
dt_
step
)
{
sortdt_j
[
countdt_j
]
=
sort_j
[
k
];
countdt_j
+=
1
;
}
...
...
@@ -977,7 +983,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
pix
[
k
]
=
cpi
->
x
[
k
]
-
shift
[
k
];
/* Look at valid dt parts only? */
if
(
cpi
->
dt
>
dt_
max
)
{
if
(
cpi
->
dt
>
dt_
step
)
{
/* Loop over the parts in cj within dt. */
for
(
pjd
=
0
;
pjd
<
countdt_j
&&
sortdt_j
[
pjd
].
d
<
di
;
pjd
++
)
{
...
...
@@ -988,7 +994,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
/* Compute the pairwise distance. */
r2
=
0
.
0
f
;
for
(
k
=
0
;
k
<
3
;
k
++
)
{
dx
[
k
]
=
pi
x
[
k
]
-
cpj
->
x
[
k
];
dx
[
k
]
=
cpj
->
x
[
k
]
-
pi
x
[
k
];
r2
+=
dx
[
k
]
*
dx
[
k
];
}
...
...
@@ -997,25 +1003,25 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
#ifndef VECTORIZE
IACT
(
r2
,
dx
,
hi
,
cpj
->
h
,
p
i
,
&
parts_j
[
sortdt_j
[
pjd
].
i
]
);
IACT
_NONSYM
(
r2
,
dx
,
cpj
->
h
,
h
i
,
&
parts_j
[
sortdt_j
[
pjd
].
i
]
,
pi
);
#else
/* Add this interaction to the queue. */
r2q
[
icount
]
=
r2
;
dxq
[
3
*
icount
+
0
]
=
dx
[
0
];
dxq
[
3
*
icount
+
1
]
=
dx
[
1
];
dxq
[
3
*
icount
+
2
]
=
dx
[
2
];
hiq
[
icount
]
=
h
i
;
hjq
[
icount
]
=
cpj
->
h
;
piq
[
icount
]
=
pi
;
pjq
[
icount
]
=
&
parts_j
[
sortdt_j
[
pjd
].
i
]
;
icount
+=
1
;
r2q
1
[
icount
1
]
=
r2
;
dxq
1
[
3
*
icount
1
+
0
]
=
dx
[
0
];
dxq
1
[
3
*
icount
1
+
1
]
=
dx
[
1
];
dxq
1
[
3
*
icount
1
+
2
]
=
dx
[
2
];
hiq
1
[
icount
1
]
=
cpj
->
h
;
hjq
1
[
icount
1
]
=
h
i
;
piq
1
[
icount
1
]
=
&
parts_j
[
sortdt_j
[
pjd
].
i
]
;
pjq
1
[
icount
1
]
=
pi
;
icount
1
+=
1
;
/* Flush? */
if
(
icount
==
VEC_SIZE
)
{
IACT_VEC
(
r2q
,
dxq
,
hiq
,
hjq
,
piq
,
pjq
);
icount
=
0
;
if
(
icount
1
==
VEC_SIZE
)
{
IACT_
NONSYM_
VEC
(
r2q
1
,
dxq
1
,
hiq
1
,
hjq
1
,
piq
1
,
pjq
1
);
icount
1
=
0
;
}
#endif
...
...
@@ -1047,25 +1053,55 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
#ifndef VECTORIZE
IACT
(
r2
,
dx
,
hi
,
cpj
->
h
,
pi
,
&
parts_j
[
sort_j
[
pjd
].
i
]
);
/* Does pj need to be updated too? */
if
(
cpj
->
dt
<=
dt_step
)
IACT
(
r2
,
dx
,
hi
,
cpj
->
h
,
pi
,
&
parts_j
[
sort_j
[
pjd
].
i
]
);
else
IACT_NONSYM
(
r2
,
dx
,
hi
,
cpj
->
h
,
pi
,
&
parts_j
[
sort_j
[
pjd
].
i
]
);
#else
/* Add this interaction to the queue. */
r2q
[
icount
]
=
r2
;
dxq
[
3
*
icount
+
0
]
=
dx
[
0
];
dxq
[
3
*
icount
+
1
]
=
dx
[
1
];
dxq
[
3
*
icount
+
2
]
=
dx
[
2
];
hiq
[
icount
]
=
hi
;
hjq
[
icount
]
=
cpj
->
h
;
piq
[
icount
]
=
pi
;
pjq
[
icount
]
=
&
parts_j
[
sort_j
[
pjd
].
i
];
icount
+=
1
;
/* Flush? */
if
(
icount
==
VEC_SIZE
)
{
IACT_VEC
(
r2q
,
dxq
,
hiq
,
hjq
,
piq
,
pjq
);
icount
=
0
;
/* Does pj need to be updated too? */
if
(
cpj
->
dt
<=
dt_step
)
{
/* Add this interaction to the symmetric queue. */
r2q2
[
icount2
]
=
r2
;
dxq2
[
3
*
icount2
+
0
]
=
dx
[
0
];
dxq2
[
3
*
icount2
+
1
]
=
dx
[
1
];
dxq2
[
3
*
icount2
+
2
]
=
dx
[
2
];
hiq2
[
icount2
]
=
hi
;
hjq2
[
icount2
]
=
cpj
->
h
;