Commit c969365b authored by Pedro Gonnet's avatar Pedro Gonnet
Browse files

several bug fixes using the new test cases.


Former-commit-id: 36567220a6b658f9dbff8091561e53ed45c4bf7a
parent cfe714f1
......@@ -32,7 +32,7 @@ L = 50 # Number of particles along one axis
rho = 1. # Density
P = 1. # Pressure
gamma = 5./3. # Gas adiabatic index
pert = 0.1 # Perturbation scale (in units of the interparticle separation)
pert = 0.01 # Perturbation scale (in units of the interparticle separation)
fileName = "perturbedBox.hdf5"
......
......@@ -28,7 +28,7 @@ from numpy import *
periodic= 1 # 1 For periodic box
boxSize = 1.
L = 50 # Number of particles along one axis
rho = 1. # Density
rho = 2. # Density
P = 1. # Pressure
gamma = 5./3. # Gas adiabatic index
fileName = "uniformBox.hdf5"
......
......@@ -28,6 +28,7 @@
#include <string.h>
#include <pthread.h>
#include <math.h>
#include <fenv.h>
#include <omp.h>
/* Conditional headers. */
......@@ -193,6 +194,24 @@ void map_wcount_max ( struct part *p , struct cell *c , void *data ) {
}
void map_h_min ( struct part *p , struct cell *c , void *data ) {
struct part **p2 = (struct part **)data;
if ( p->h < (*p2)->h )
*p2 = p;
}
void map_h_max ( struct part *p , struct cell *c , void *data ) {
struct part **p2 = (struct part **)data;
if ( p->h > (*p2)->h )
*p2 = p;
}
/**
* @brief Mapping function for neighbour count.
......@@ -689,6 +708,9 @@ int main ( int argc , char *argv[] ) {
float dt_max = 0.0f;
ticks tic;
/* Choke on FP-exceptions. */
feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );
/* Init the space. */
bzero( &s , sizeof(struct space) );
......@@ -862,6 +884,7 @@ int main ( int argc , char *argv[] ) {
parts[k].x[2] += shift[2];
}
/* Dump the first few particles. */
for(k=0; k<10; ++k)
printParticle(parts, k);
......@@ -910,12 +933,6 @@ int main ( int argc , char *argv[] ) {
/* Dump the particle positions. */
// space_map_parts( &s , &map_dump , shift );
/* Dump the acceleration of the first particle. */
for ( k = 0 ; k < 3 ; k++ ) {
printf( "main: parts[%lli].a is [ %.16e %.16e %.16e ].\n" , s.parts[k].id , s.parts[k].a[0] , s.parts[k].a[1] , s.parts[k].a[2] );
printf( "main: parts[%lli].a has h=%e, rho=%e, wcount=%.3f.\n" , s.parts[k].id , s.parts[k].h , s.parts[k].rho , s.parts[k].wcount + 32.0/3 );
}
/* Initialize the runner with this space. */
tic = getticks();
engine_init( &e , &s , nr_threads , nr_queues , engine_policy_steal | engine_policy_keep );
......@@ -943,6 +960,23 @@ int main ( int argc , char *argv[] ) {
/* Take a step. */
engine_step( &e , 0 );
/* Dump the first few particles. */
for(k=0; k<10; ++k)
printParticle(parts, k);
printParticle( parts , 113531 );
/* Get the particle with the lowest h. */
p = &s.parts[0];
space_map_parts( &s , &map_h_min , &p );
printf( "main: particle %lli/%i at [ %e %e %e ] has minimum h=%.3e (h_dt=%.3e).\n" ,
p->id , (int)(p - s.parts) , p->x[0] , p->x[1] , p->x[2] , p->h , p->h_dt );
/* Get the particle with the highest h. */
p = &s.parts[0];
space_map_parts( &s , &map_h_max , &p );
printf( "main: particle %lli/%i at [ %e %e %e ] has maximum h=%.3e (h_dt=%.3e).\n" ,
p->id , (int)(p - s.parts) , p->x[0] , p->x[1] , p->x[2] , p->h , p->h_dt );
/* Output. */
#ifdef TIMER
printf( "main: runner timers are [ %.3f" , timers[0]/CPU_TPS*1000 );
......@@ -1010,11 +1044,9 @@ int main ( int argc , char *argv[] ) {
// space_map_parts( &s , &map_icount , &icount );
// printf( "main: average neighbours per particle is %.3f.\n" , (double)icount / s.nr_parts );
/* Dump the acceleration of the first particle. */
for ( k = 0 ; k < 3 ; k++ ) {
printf( "main: parts[%lli].a is [ %.16e %.16e %.16e ].\n" , s.parts[k].id , s.parts[k].a[0] , s.parts[k].a[1] , s.parts[k].a[2] );
printf( "main: parts[%lli].a has h=%e, rho=%e, wcount=%.3f.\n" , s.parts[k].id , s.parts[k].h , s.parts[k].rho , s.parts[k].wcount );
}
/* Dump the first few particles. */
for(k=0; k<10; ++k)
printParticle(parts, k);
/* Get all the cells of a certain depth. */
// icount = 1;
......
......@@ -20,11 +20,11 @@
AUTOMAKE_OPTIONS=gnu
# Add the debug flag to the whole thing
AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
-funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
-DTIMER -DCOUNTER -DCPU_TPS=2.67e9
# AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \
# AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
# -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
# -DTIMER -DCOUNTER -DCPU_TPS=2.67e9
AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \
-DTIMER -DCOUNTER -DCPU_TPS=2.67e9
# Assign a "safe" version number
AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0
......
......@@ -22,17 +22,26 @@
#include "part.h"
void printParticle(struct part *parts, int i)
{
printf("## Particle[%d]: id= %lld x=( %f, %f, %f) v=( %f, %f, %f) h= %f m= %f rho= %f u= %f dt= %f\n",
void printParticle ( struct part *parts , long long int id ) {
int i;
/* Look for the particle. */
for ( i = 0 ; parts[i].id != id ; i++ );
printf("## Particle[%d]: id=%lld, x=(%f,%f,%f), v=(%f,%f,%f), a=(%f,%f,%f), h=%f, h_dt=%f, wcount=%f, m=%f, rho=%f, u=%f, dudt=%f, dt=%.3e\n",
i,
parts[i].id,
parts[i].x[0], parts[i].x[1], parts[i].x[2],
parts[i].v[0], parts[i].v[1], parts[i].v[2],
parts[i].a[0], parts[i].a[1], parts[i].a[2],
parts[i].h,
parts[i].h_dt,
parts[i].wcount,
parts[i].mass,
parts[i].rho,
parts[i].u,
parts[i].u_dt,
parts[i].dt
);
}
......
......@@ -20,4 +20,4 @@
void printParticle(struct part *parts, int i);
void printParticle(struct part *parts, long long int i);
......@@ -39,6 +39,7 @@
#include "lock.h"
#include "task.h"
#include "part.h"
#include "debug.h"
#include "cell.h"
#include "space.h"
#include "queue.h"
......@@ -65,6 +66,7 @@ void engine_prepare ( struct engine *e ) {
int j, k, qid;
struct space *s = e->s;
struct queue *q;
float dt_max = e->dt_max;
TIMER_TIC
......@@ -92,12 +94,13 @@ void engine_prepare ( struct engine *e ) {
/* Re-set the particle data. */
// tic = getticks();
#pragma omp parallel for schedule(static)
for ( k = 0 ; k < s->nr_parts ; k++ ) {
s->parts[k].wcount = 0.0f;
s->parts[k].wcount_dh = 0.0f;
s->parts[k].rho = 0.0f;
s->parts[k].rho_dh = 0.0f;
}
for ( k = 0 ; k < s->nr_parts ; k++ )
if ( s->parts[k].dt <= dt_max ) {
s->parts[k].wcount = 0.0f;
s->parts[k].wcount_dh = 0.0f;
s->parts[k].rho = 0.0f;
s->parts[k].rho_dh = 0.0f;
}
// printf( "engine_prepare: re-setting particle data took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
/* Run throught the tasks and get all the waits right. */
......@@ -177,15 +180,18 @@ void engine_step ( struct engine *e , int sort_queues ) {
int k, nr_parts = e->s->nr_parts;
struct part *restrict parts = e->s->parts, *restrict p;
float *restrict v_bar;
float dt = e->dt, hdt = 0.5*dt, dt_max;
#ifdef __SSE2__
VEC_MACRO(4,float) hdtv = _mm_set1_ps( hdt );
#endif
float dt = e->dt, hdt = 0.5*dt, dt_max, dt_min, ldt_min, ldt_max;
double etot = 0.0, letot, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 };
int threadID, nthreads;
// #ifdef __SSE2__
// VEC_MACRO(4,float) hdtv = _mm_set1_ps( hdt );
// #endif
/* Get the maximum dt. */
dt_max = dt;
dt_max = 2.0f*dt;
for ( k = 0 ; k < 32 && (e->step & (1 << k)) == 0 ; k++ )
dt_max *= 2;
dt_max = 1;
/* Set the maximum dt. */
e->dt_max = dt_max;
......@@ -198,43 +204,46 @@ void engine_step ( struct engine *e , int sort_queues ) {
/* First kick. */
TIMER_TIC
#pragma omp parallel for schedule(static) private(p)
// #pragma omp parallel for schedule(static) private(p)
for ( k = 0 ; k < nr_parts ; k++ ) {
/* Get a handle on the part. */
p = &parts[k];
/* Step and store the velocity and internal energy. */
#ifdef __SSE__
_mm_store_ps( &v_bar[4*k] , _mm_add_ps( _mm_load_ps( &p->v[0] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) );
#else
// #ifdef __SSE__
// _mm_store_ps( &v_bar[4*k] , _mm_add_ps( _mm_load_ps( &p->v[0] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) );
// #else
v_bar[4*k+0] = p->v[0] + hdt * p->a[0];
v_bar[4*k+1] = p->v[1] + hdt * p->a[1];
v_bar[4*k+2] = p->v[2] + hdt * p->a[2];
#endif
// #endif
v_bar[4*k+3] = p->u + hdt * p->u_dt;
/* Move the particles with the velocitie at the half-step. */
// p->x[0] += dt * v_bar[3*k+0];
// p->x[1] += dt * v_bar[3*k+1];
// p->x[2] += dt * v_bar[3*k+2];
p->x[0] += dt * v_bar[4*k+0];
p->x[1] += dt * v_bar[4*k+1];
p->x[2] += dt * v_bar[4*k+2];
/* Update positions and energies at the half-step. */
// p->v[0] += dt * p->a[0];
// p->v[1] += dt * p->a[1];
// p->v[2] += dt * p->a[2];
// p->u *= expf( p->u_dt / p->u * dt );
// p->h *= expf( -1.0f * p->h_dt / p->h * dt );
p->v[0] += dt * p->a[0];
p->v[1] += dt * p->a[1];
p->v[2] += dt * p->a[2];
p->u *= expf( p->u_dt / p->u * dt );
// p->h *= expf( p->h_dt / p->h * dt );
/* Integrate other values if this particle will not be updated. */
if ( p->dt > dt_max ) {
p->rho *= expf( -3.0f * p->h_dt / p->h * dt );
p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f );
}
// if ( p->dt > dt_max ) {
// p->rho *= expf( -3.0f * p->h_dt / p->h * dt );
// p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f );
// }
}
TIMER_TOC( timer_kick1 );
// for(k=0; k<10; ++k)
// printParticle(parts, k);
/* Prepare the space. */
engine_prepare( e );
......@@ -263,43 +272,79 @@ void engine_step ( struct engine *e , int sort_queues ) {
/* Stop the clock. */
TIMER_TOC(timer_step);
// for(k=0; k<10; ++k)
// printParticle(parts, k);
/* Second kick. */
TIMER_TIC_ND
e->dt_min = FLT_MAX;
#pragma omp parallel private(p,k)
dt_min = FLT_MAX; dt_max = 0.0f;
#pragma omp parallel private(p,k,ldt_min,ldt_max,lmom,letot,threadID,nthreads)
{
int threadID = omp_get_thread_num();
int nthreads = omp_get_num_threads();
float dt_min = FLT_MAX;
threadID = omp_get_thread_num();
nthreads = omp_get_num_threads();
ldt_min = FLT_MAX; ldt_max = 0.0f;
lmom[0] = 0.0; lmom[1] = 0.0; lmom[2] = 0.0;
letot = 0.0;
for ( k = nr_parts * threadID / nthreads ; k < nr_parts * (threadID + 1) / nthreads ; k++ ) {
/* Get a handle on the part. */
p = &parts[k];
/* Scale the derivatives. */
p->u_dt *= p->POrho2;
p->h_dt *= p->h * 0.333333333f;
/* Scale the derivatives if they're freshly computed. */
if ( p->dt <= dt_max ) {
p->u_dt *= p->POrho2;
p->h_dt *= p->h * 0.333333333f;
}
/* Update positions and energies at the half-step. */
#ifdef __SSE__
_mm_store_ps( &p->v[0] , _mm_add_ps( _mm_load_ps( &v_bar[4*k] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) );
#else
// #ifdef __SSE__
// _mm_store_ps( &p->v[0] , _mm_add_ps( _mm_load_ps( &v_bar[4*k] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) );
// #else
p->v[0] = v_bar[4*k+0] + hdt * p->a[0];
p->v[1] = v_bar[4*k+1] + hdt * p->a[1];
p->v[2] = v_bar[4*k+2] + hdt * p->a[2];
#endif
// p->u = v_bar[4*k+3] + hdt * p->u_dt;
// #endif
p->u = v_bar[4*k+3] + hdt * p->u_dt;
/* Get the smallest dt. */
dt_min = fminf( dt_min , p->dt );
/* Get the smallest/largest dt. */
ldt_min = fminf( ldt_min , p->dt );
ldt_max = fmaxf( ldt_max , p->dt );
/* Collect total energy. */
letot += 0.5 * p->mass * ( p->v[0]*p->v[0] + p->v[1]*p->v[1] + p->v[2]*p->v[2] ) + p->mass * p->u;
/* Collect momentum */
lmom[0] += p->mass * p->v[0];
lmom[1] += p->mass * p->v[1];
lmom[2] += p->mass * p->v[2];
}
#pragma omp critical
e->dt_min = fminf( e->dt_min , dt_min );
{
dt_min = fminf( dt_min , ldt_min );
dt_max = fmaxf( dt_max , ldt_max );
mom[0] += lmom[0];
mom[1] += lmom[1];
mom[2] += lmom[2];
etot += letot;
}
}
TIMER_TOC( timer_kick2 );
printf( "engine_step: dt_min is %e.\n" , e->dt_min ); fflush(stdout);
e->dt_min = dt_min;
printf( "engine_step: dt_min/dt_max is %e/%e.\n" , dt_min , dt_max ); fflush(stdout);
printf( "engine_step: etot is %e.\n" , etot ); fflush(stdout);
printf( "engine_step: total momentum is [ %e , %e , %e ].\n" , mom[0] , mom[1] , mom[2] ); fflush(stdout);
/* Does the time step need adjusting? */
if ( dt_min < e->dt ) {
e->dt *= 0.5;
printf( "engine_step: dt_min dropped below time step, adjusting to dt=%e.\n" , e->dt );
}
else if ( dt_min > 2*e->dt ) {
e->dt *= 2.0;
printf( "engine_step: dt_min is larger than twice the time step, adjusting to dt=%e.\n" , e->dt );
}
/* Clean up. */
free( v_bar );
......
......@@ -325,6 +325,7 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags ) {
void runner_doghost ( struct runner *r , struct cell *c ) {
struct part *p;
struct cpart *cp;
struct cell *finger;
int i, k, redo, count = c->count;
int *pid;
......@@ -357,9 +358,10 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
/* Get a direct pointer on the part. */
p = &c->parts[ pid[i] ];
cp = &c->cparts[ pid[i] ];
/* Is this part within the timestep? */
if ( p->dt <= dt_max ) {
if ( cp->dt <= dt_max ) {
/* Adjust the computed rho. */
ihg = kernel_igamma / p->h;
......@@ -370,11 +372,12 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
/* Update the smoothing length. */
p->h -= ( p->wcount - const_nwneigh ) / p->wcount_dh;
cp->h = p->h;
/* Did we get the right number density? */
if ( p->wcount > const_nwneigh + 1 ||
p->wcount < const_nwneigh - 1 ) {
printf( "runner_doghost: particle %lli (h=%e,depth=%i) has bad wcount=%f.\n" , p->id , p->h , c->depth , p->wcount ); fflush(stdout);
// printf( "runner_doghost: particle %lli (h=%e,h_dt=%e,depth=%i) has bad wcount=%.3f.\n" , p->id , p->h , p->h_dt , c->depth , p->wcount ); fflush(stdout);
// p->h += ( p->wcount + kernel_root - const_nwneigh ) / p->wcount_dh;
pid[redo] = pid[i];
redo += 1;
......@@ -387,6 +390,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
/* Compute this particle's time step. */
p->dt = const_cfl * p->h / sqrtf( const_gamma * ( const_gamma - 1.0f ) * p->u );
cp->dt = p->dt;
/* Compute the pressure. */
// p->P = p->rho * p->u * ( const_gamma - 1.0f );
......@@ -394,15 +398,15 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
/* Compute the P/Omega/rho2. */
p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f );
}
/* Reset the acceleration. */
for ( k = 0 ; k < 3 ; k++ )
p->a[k] = 0.0f;
/* Reset the acceleration. */
for ( k = 0 ; k < 3 ; k++ )
p->a[k] = 0.0f;
/* Reset the time derivatives. */
p->u_dt = 0.0f;
p->h_dt = 0.0f;
/* Reset the time derivatives. */
p->u_dt = 0.0f;
p->h_dt = 0.0f;
}
}
......
......@@ -368,7 +368,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2
pj->u_dt += pi->mass * dvdr * wj_dr;
/* Get the time derivative for h. */
pi->h_dt += pj->mass / pj->rho * dvdr * wi_dr;
pi->h_dt -= pj->mass / pj->rho * dvdr * wi_dr;
pj->h_dt -= pi->mass / pi->rho * dvdr * wj_dr;
#ifdef HIST
......@@ -485,8 +485,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float
for ( k = 0 ; k < VEC_SIZE ; k++ ) {
pi[k]->u_dt += piu_dt.f[k];
pj[k]->u_dt += pju_dt.f[k];
pi[k]->h_dt += pih_dt.f[k];
pj[k]->h_dt += pjh_dt.f[k];
pi[k]->h_dt -= pih_dt.f[k];
pj[k]->h_dt -= pjh_dt.f[k];
for ( j = 0 ; j < 3 ; j++ ) {
pi[k]->a[j] -= pia[j].f[k];
pj[k]->a[j] += pja[j].f[k];
......@@ -545,7 +545,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( fl
pi->u_dt += pj->mass * dvdr * wi_dr;
/* Get the time derivative for h. */
pi->h_dt += pj->mass / pj->rho * dvdr * wi_dr;
pi->h_dt -= pj->mass / pj->rho * dvdr * wi_dr;
}
......@@ -646,7 +646,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force
/* Store the forces back on the particles. */
for ( k = 0 ; k < VEC_SIZE ; k++ ) {
pi[k]->u_dt += piu_dt.f[k];
pi[k]->h_dt += pih_dt.f[k];
pi[k]->h_dt -= pih_dt.f[k];
for ( j = 0 ; j < 3 ; j++ )
pi[k]->a[j] -= pia[j].f[k];
}
......
......@@ -161,12 +161,17 @@ void space_prepare ( struct space *s ) {
int k;
struct task *t;
float dt_max = s->dt_max;
float dt_max = s->dt_max, dx_max = 0.0f;
int counts[ task_type_count + 1 ];
/* Traverse the cells and set their dt_min and dt_max. */
space_map_cells_post( s , 1 , &space_map_prepare , NULL );
/* Get the maximum displacement in the whole system. */
for ( k = 0 ; k < s->nr_cells ; k++ )
dx_max = fmaxf( dx_max , s->cells[k].dx_max );
printf( "space_prepare: dx_max is %e.\n" , dx_max );
/* Run through the tasks and mark as skip or not. */
for ( k = 0 ; k < s->nr_tasks ; k++ ) {
t = &s->tasks[k];
......@@ -192,15 +197,6 @@ void space_prepare ( struct space *s ) {
/* Traverse the cells and set their dt_min and dt_max. */
space_map_cells_post( s , 1 , &space_map_prepare , NULL );
/* Run through the tasks and mark as skip or not. */
for ( k = 0 ; k < s->nr_tasks ; k++ ) {
t = &s->tasks[k];
if ( t->type == task_type_sort || t->type == task_type_self || t->type == task_type_ghost )
t->skip = ( t->ci->dt_min > dt_max );
else if ( t->type == task_type_pair )
t->skip = ( t->ci->dt_min > dt_max && t->cj->dt_min > dt_max );
}
}
/* Store the condensed particle data. */
......@@ -262,6 +258,10 @@ void space_ranktasks ( struct space *s ) {
temp = tid[j]; tid[j] = tid[k]; tid[k] = temp;
j += 1;
}
/* Did we get anything? */
if ( j == left )
error( "Unsatisfiable task dependencies detected." );
/* Traverse the task tree and add tasks with no weight. */
for ( i = left ; i < j ; i++ ) {
......@@ -488,6 +488,7 @@ void space_rebuild ( struct space *s , double cell_max ) {
}
s->h_min = h_min;
s->h_max = h_max;
printf( "space_rebuild: h_min/h_max is %.3e/%.3e.\n" , h_min , h_max );
// printf( "space_rebuild: getting h_min and h_max took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
/* Get the new putative cell dimensions. */
......@@ -537,6 +538,9 @@ void space_rebuild ( struct space *s , double cell_max ) {
c->dmin = dmin;
c->depth = 0;
}
/* Be verbose about the change. */
printf( "space_rebuild: set cell dimensions to [ %i %i %i ].\n" , cdim[0] , cdim[1] , cdim[2] ); fflush(stdout);
} /* re-build upper-level cells? */
// printf( "space_rebuild: rebuilding upper-level cells took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
......@@ -844,7 +848,7 @@ struct task *space_addtask ( struct space *s , int type , int subtype , int flag
t->ci = ci;
t->cj = cj;
t->skip = 0;
t->tight = 0;
t->tight = tight;
t->nr_unlock_tasks = 0;
t->nr_unlock_cells = 0;
......@@ -1398,9 +1402,14 @@ void space_maketasks ( struct space *s , int do_sort ) {
if ( t->skip )
continue;
if ( t->type == task_type_sort && t->ci->split )
for ( j = 0 ; j < 8 ; j++ )
if ( t->ci->progeny[j] != NULL && t->ci->progeny[j]->sorts[0] != NULL )
task_addunlock( t->ci->progeny[j]->sorts[0] , t );
for ( j = 0 ; j < 8 ; j++ ) {
if ( t->ci->progeny[j] == NULL )
continue;
if ( t->ci->progeny[j]->sorts[0] == NULL )
t->ci->progeny[j]->sorts[0] = space_addtask( s , task_type_sort , task_subtype_none , 0 /* t->flags? */ , 0 , t->ci , NULL , 0 );
t->ci->progeny[j]->sorts[0]->skip = 0;
task_addunlock( t->ci->progeny[j]->sorts[0] , t );
}
}
/* Count the number of tasks associated with each cell and
......
......@@ -27,8 +27,11 @@
#define VEC_MACRO(elcount, type) __attribute__((vector_size((elcount)*sizeof(type)))) type
/* So what will the vector size be? */
#ifdef __AVX__
#ifdef NO__AVX__
#define VECTORIZE
#define VEC_SIZE 8
#define VEC_FLOAT __m256
#define VEC_INT __m256i
#define vec_load(a) _mm256_load_ps(a)
#define vec_set1(a) _mm256_set1_ps(a)
#define vec_sqrt(a) _mm256_sqrt_ps(a)
......@@ -36,8 +39,11 @@
#define vec_rsqrt(a) _mm256_rsqrt_ps(a)
#define vec_ftoi(a) _mm256_cvttps_epi32(a)
#define vec_fmin(a,b) _mm256_min_ps(a,b)
#else
#elif defined( NO__SSE2__ )
#define VECTORIZE
#define VEC_SIZE 4
#define VEC_FLOAT __m128
#define VEC_INT __m128i
#define vec_load(a) _mm_load_ps(a)
#define vec_set1(a) _mm_set1_ps(a)
#define vec_sqrt(a) _mm_sqrt_ps(a)
......@@ -45,6 +51,8 @@
#define vec_rsqrt(a) _mm_rsqrt_ps(a)
#define vec_ftoi(a) _mm_cvttps_epi32(a)