Commit 50a466ff authored by Pedro Gonnet's avatar Pedro Gonnet
Browse files

several fixes to the particle tree and task construction.


Former-commit-id: c6a968263d1d47644bfac82ba3459a44e618890e
parent 920b17ef
......@@ -20,18 +20,19 @@
AUTOMAKE_OPTIONS=gnu
# Add the debug flag to the whole thing
# AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
# -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
# -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \
AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
-funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
-DTIMER -DCOUNTER -DCPU_TPS=2.67e9
# AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \
# -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
# Assign a "safe" version number
AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0
# Build the libswiftsim library
lib_LTLIBRARIES = libswiftsim.la
libswiftsim_la_SOURCES = space.c runner.c queue.c task.c cell.c engine.c input.c output.c timers.c debug.c
libswiftsim_la_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
input.c output.c timers.c debug.c
# List required headers
include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
......
......@@ -80,6 +80,7 @@ struct cell {
/* The tasks computing this cell's sorts. */
struct task *sorts[13];
int sortsize;
/* The tasks computing this cell's density. */
struct task *density[27];
......
......@@ -35,6 +35,7 @@
#include "cycle.h"
#include "timers.h"
#include "const.h"
#include "vector.h"
#include "lock.h"
#include "task.h"
#include "part.h"
......@@ -175,8 +176,11 @@ void engine_step ( struct engine *e , int sort_queues ) {
int k, nr_parts = e->s->nr_parts;
struct part *restrict parts = e->s->parts, *restrict p;
float *v_bar, *u_bar;
float *restrict v_bar;
float dt = e->dt, hdt = 0.5*dt, dt_max;
#ifdef __SSE2__
VEC_MACRO(4,float) hdtv = _mm_set1_ps( hdt );
#endif
/* Get the maximum dt. */
dt_max = dt;
......@@ -189,12 +193,11 @@ void engine_step ( struct engine *e , int sort_queues ) {
printf( "engine_step: dt_max set to %.3e.\n" , dt_max ); fflush(stdout);
/* Allocate a buffer for the old velocities. */
if ( ( v_bar = (float *)malloc( sizeof(float) * nr_parts * 3 ) ) == NULL )
error( "Failed to allocate v_old buffer." );
if ( ( u_bar = (float *)malloc( sizeof(float) * nr_parts ) ) == NULL )
if ( posix_memalign( (void **)&v_bar , 16 , sizeof(float) * nr_parts * 4 ) != 0 )
error( "Failed to allocate v_old buffer." );
/* First kick. */
TIMER_TIC
#pragma omp parallel for schedule(static) private(p)
for ( k = 0 ; k < nr_parts ; k++ ) {
......@@ -202,10 +205,14 @@ void engine_step ( struct engine *e , int sort_queues ) {
p = &parts[k];
/* Step and store the velocity and internal energy. */
v_bar[3*k+0] = p->v[0] + hdt * p->a[0];
v_bar[3*k+1] = p->v[1] + hdt * p->a[1];
v_bar[3*k+2] = p->v[2] + hdt * p->a[2];
u_bar[k] = p->u + hdt * p->u_dt;
#ifdef __SSE__
_mm_store_ps( &v_bar[4*k] , _mm_add_ps( _mm_load_ps( &p->v[0] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) );
#else
v_bar[4*k+0] = p->v[0] + hdt * p->a[0];
v_bar[4*k+1] = p->v[1] + hdt * p->a[1];
v_bar[4*k+2] = p->v[2] + hdt * p->a[2];
#endif
v_bar[4*k+3] = p->u + hdt * p->u_dt;
/* Move the particles with the velocitie at the half-step. */
// p->x[0] += dt * v_bar[3*k+0];
......@@ -226,6 +233,7 @@ void engine_step ( struct engine *e , int sort_queues ) {
}
}
TIMER_TOC( timer_kick1 );
/* Prepare the space. */
engine_prepare( e );
......@@ -240,7 +248,7 @@ void engine_step ( struct engine *e , int sort_queues ) {
}
/* Start the clock. */
TIMER_TIC
TIMER_TIC_ND
/* Cry havoc and let loose the dogs of war. */
e->barrier_count = -e->barrier_count;
......@@ -256,6 +264,7 @@ void engine_step ( struct engine *e , int sort_queues ) {
TIMER_TOC(timer_step);
/* Second kick. */
TIMER_TIC_ND
e->dt_min = FLT_MAX;
#pragma omp parallel private(p,k)
{
......@@ -272,10 +281,14 @@ void engine_step ( struct engine *e , int sort_queues ) {
p->h_dt *= p->h * 0.333333333f;
/* Update positions and energies at the half-step. */
p->v[0] = v_bar[3*k+0] + hdt * p->a[0];
p->v[1] = v_bar[3*k+0] + hdt * p->a[1];
p->v[2] = v_bar[3*k+0] + hdt * p->a[2];
// p->u = u_bar[k] + hdt * p->u_dt;
#ifdef __SSE__
_mm_store_ps( &p->v[0] , _mm_add_ps( _mm_load_ps( &v_bar[4*k] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) );
#else
p->v[0] = v_bar[4*k+0] + hdt * p->a[0];
p->v[1] = v_bar[4*k+1] + hdt * p->a[1];
p->v[2] = v_bar[4*k+2] + hdt * p->a[2];
#endif
// p->u = v_bar[4*k+3] + hdt * p->u_dt;
/* Get the smallest dt. */
dt_min = fminf( dt_min , p->dt );
......@@ -284,11 +297,11 @@ void engine_step ( struct engine *e , int sort_queues ) {
#pragma omp critical
e->dt_min = fminf( e->dt_min , dt_min );
}
TIMER_TOC( timer_kick2 );
printf( "engine_step: dt_min is %e.\n" , e->dt_min ); fflush(stdout);
/* Clean up. */
free( v_bar );
free( u_bar );
/* Increase the step counter. */
e->step += 1;
......
......@@ -130,7 +130,7 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim
printf("readArray: Reading %s '%s' array...\n", importance == COMPULSORY ? "compulsory": "optional ", name);
/* Open data space */
h_data = H5Dopen(grp, name);
h_data = H5Dopen1(grp, name);
if(h_data < 0)
{
char buf[100];
......@@ -224,7 +224,7 @@ void read_ic ( char* fileName, double dim[3], struct part **parts, int* N, int*
/* Open header to read simulation properties */
printf("read_ic: Reading runtime parameters...\n");
h_grp = H5Gopen(h_file, "/RuntimePars");
h_grp = H5Gopen1(h_file, "/RuntimePars");
if(h_grp < 0)
error("Error while opening runtime parameters\n");
......@@ -236,7 +236,7 @@ void read_ic ( char* fileName, double dim[3], struct part **parts, int* N, int*
/* Open header to read simulation properties */
printf("read_ic: Reading file header...\n");
h_grp = H5Gopen(h_file, "/Header");
h_grp = H5Gopen1(h_file, "/Header");
if(h_grp < 0)
error("Error while opening file header\n");
......@@ -262,7 +262,7 @@ void read_ic ( char* fileName, double dim[3], struct part **parts, int* N, int*
/* Open SPH particles group */
printf("read_ic: Reading particle arrays...\n");
h_grp = H5Gopen(h_file, "/PartType0");
h_grp = H5Gopen1(h_file, "/PartType0");
if(h_grp < 0)
error( "Error while opening particle group.\n");
......
......@@ -77,7 +77,7 @@ void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data, int
error(buf);
}
h_attr = H5Acreate(grp, name, hdf5Type(type), h_space, H5P_DEFAULT);
h_attr = H5Acreate1(grp, name, hdf5Type(type), h_space, H5P_DEFAULT);
if(h_attr < 0)
{
char buf[100];
......@@ -169,7 +169,7 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu
}
/* Create dataset */
h_data = H5Dcreate(grp, name, hdf5Type(type), h_space, H5P_DEFAULT);
h_data = H5Dcreate1(grp, name, hdf5Type(type), h_space, H5P_DEFAULT);
if(h_data < 0)
{
char buf[100];
......@@ -248,7 +248,7 @@ void write_output ( char* fileName, double dim[3], struct part *parts, int N, i
/* Open header to read simulation properties */
printf("write_output: Writing runtime parameters...\n");
h_grp = H5Gcreate(h_file, "/RuntimePars", 0);
h_grp = H5Gcreate1(h_file, "/RuntimePars", 0);
if(h_grp < 0)
error("Error while creating runtime parameters group\n");
......@@ -262,7 +262,7 @@ void write_output ( char* fileName, double dim[3], struct part *parts, int N, i
/* Open header to write simulation properties */
printf("write_output: Writing file header...\n");
h_grp = H5Gcreate(h_file, "/Header", 0);
h_grp = H5Gcreate1(h_file, "/Header", 0);
if(h_grp < 0)
error("Error while creating file header\n");
......@@ -275,7 +275,7 @@ void write_output ( char* fileName, double dim[3], struct part *parts, int N, i
/* Create SPH particles group */
printf("write_output: Writing particle arrays...\n");
h_grp = H5Gcreate(h_file, "/PartType0", 0);
h_grp = H5Gcreate1(h_file, "/PartType0", 0);
if(h_grp < 0)
error( "Error while creating particle group.\n");
......
......@@ -42,12 +42,12 @@ struct cpart {
/* Data of a single particle. */
struct part {
/* Particle velocity. */
float v[3] __attribute__((aligned (16)));
/* Particle mass. */
float mass;
/* Particle velocity. */
float v[3];
/* Particle density. */
float rho;
......@@ -63,12 +63,12 @@ struct part {
/* Change in smoothing length over time. */
float h_dt;
/* Particle acceleration. */
float a[3] __attribute__((aligned (16)));
/* Derivative of the density with respect to this particle's smoothing length. */
float rho_dh;
/* Particle acceleration. */
float a[3];
/* Particle number density. */
// int icount;
float wcount;
......
......@@ -180,9 +180,13 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags ) {
/* start by allocating the entry arrays. */
if ( lock_lock( &c->lock ) != 0 )
error( "Failed to lock cell." );
if ( c->sort == NULL )
if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->count + 1) * 13 ) ) == NULL )
if ( c->sort == NULL || c->sortsize < c->count ) {
if ( c->sort != NULL )
free( c->sort );
c->sortsize = c->count * 1.1;
if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->sortsize + 1) * 13 ) ) == NULL )
error( "Failed to allocate sort memory." );
}
if ( lock_unlock( &c->lock ) != 0 )
error( "Failed to unlock cell." );
......
......@@ -19,43 +19,44 @@
/* Load the vector stuff. */
#ifdef __SSE2__
#define VECTORIZE
#include <immintrin.h>
#ifdef __AVX__
typedef union {
__m256 v;
__m256i m;
float f[8];
int i[8];
} vector;
#define VEC_SIZE 8
#define vec_load(a) _mm256_load_ps(a)
#define vec_set1(a) _mm256_set1_ps(a)
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
#define vec_rsqrt(a) _mm256_rsqrt_ps(a)
#define vec_ftoi(a) _mm256_cvttps_epi32(a)
#define vec_fmin(a,b) _mm256_min_ps(a,b)
#else
typedef union {
__m128 v;
__m128i m;
float f[4];
int i[4];
} vector;
#define VEC_SIZE 4
#define vec_load(a) _mm_load_ps(a)
#define vec_set1(a) _mm_set1_ps(a)
#define vec_sqrt(a) _mm_sqrt_ps(a)
#define vec_rcp(a) _mm_rcp_ps(a)
#define vec_rsqrt(a) _mm_rsqrt_ps(a)
#define vec_ftoi(a) _mm_cvttps_epi32(a)
#define vec_fmin(a,b) _mm_min_ps(a,b)
#endif
#else
#define VEC_SIZE 4
#endif
// #ifdef __SSE2__
// #define VECTORIZE
// #include <immintrin.h>
// #ifdef __AVX__
// typedef union {
// __m256 v;
// __m256i m;
// float f[8];
// int i[8];
// } vector;
// #define VEC_SIZE 8
// #define vec_load(a) _mm256_load_ps(a)
// #define vec_set1(a) _mm256_set1_ps(a)
// #define vec_sqrt(a) _mm256_sqrt_ps(a)
// #define vec_rcp(a) _mm256_rcp_ps(a)
// #define vec_rsqrt(a) _mm256_rsqrt_ps(a)
// #define vec_ftoi(a) _mm256_cvttps_epi32(a)
// #define vec_fmin(a,b) _mm256_min_ps(a,b)
// #else
// typedef union {
// __m128 v;
// __m128i m;
// float f[4];
// int i[4];
// } vector;
// #define VEC_SIZE 4
// #define vec_load(a) _mm_load_ps(a)
// #define vec_set1(a) _mm_set1_ps(a)
// #define vec_sqrt(a) _mm_sqrt_ps(a)
// #define vec_rcp(a) _mm_rcp_ps(a)
// #define vec_rsqrt(a) _mm_rsqrt_ps(a)
// #define vec_ftoi(a) _mm_cvttps_epi32(a)
// #define vec_fmin(a,b) _mm_min_ps(a,b)
// #endif
// #else
// #define VEC_SIZE 4
// #endif
#include "vector.h"
/* Coefficients for the kernel. */
#define kernel_degree 3
......@@ -164,7 +165,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r
kernel_deval( xj , &wj , &wj_dx );
pj->rho += pi->mass * wj;
pj->rho_dh += -pi->mass * ( 3.0*wj + xj*wj_dx );
pj->rho_dh -= pi->mass * ( 3.0*wj + xj*wj_dx );
pj->wcount += wj * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
pj->wcount_dh -= xj * h_inv * wj_dx * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
// pj->icount += 1;
......@@ -266,7 +267,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_density (
kernel_deval( xi , &wi , &wi_dx );
pi->rho += pj->mass * wi;
pi->rho_dh += -pj->mass * ( 3.0*wi + xi*wi_dx );
pi->rho_dh -= pj->mass * ( 3.0*wi + xi*wi_dx );
pi->wcount += wi * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
pi->wcount_dh -= xi * h_inv * wi_dx * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
// pi->icount += 1;
......
......@@ -475,6 +475,9 @@ void space_rebuild ( struct space *s , double cell_max ) {
int *ind;
// ticks tic;
/* Be verbose about this. */
printf( "space_rebuild: (re)building space...\n" ); fflush(stdout);
/* Run through the parts and get the current h_max. */
// tic = getticks();
for ( k = 0 ; k < s->nr_parts ; k++ ) {
......
......@@ -23,6 +23,8 @@
enum {
timer_none = 0,
timer_prepare,
timer_kick1,
timer_kick2,
timer_dosort,
timer_doself_density,
timer_doself_force,
......@@ -53,6 +55,8 @@ extern ticks timers[ timer_count ];
#endif
#endif
#ifdef TIMER
#define TIMER_TIC_ND tic = getticks();
#define TIMER_TIC2_ND ticks tic2 = getticks();
#define TIMER_TIC ticks tic = getticks();
#define TIMER_TOC(t) timers_toc( t , tic )
#define TIMER_TIC2 ticks tic2 = getticks();
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment