Commit decf122a authored by Pedro Gonnet's avatar Pedro Gonnet
Browse files

Add a number of system sanity checks that run between the individual stages of engine_step.


Former-commit-id: 70bc02755eb2c0d36163afa582ef8338bb9b042e
parent f02d42e2
......@@ -773,7 +773,7 @@ int main ( int argc , char *argv[] ) {
/* Initialize the engine with this space. */
tic = getticks();
message( "nr_nodes is %i." , nr_nodes );
engine_init( &e , &s , dt_max , nr_threads , nr_queues , nr_nodes , myrank , ENGINE_POLICY | engine_policy_steal );
engine_init( &e , &s , dt_max , nr_threads , nr_queues , nr_nodes , myrank , ENGINE_POLICY | engine_policy_steal | engine_policy_paranoid );
if ( myrank == 0 )
message( "engine_init took %.3f ms." , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); fflush(stdout);
......
......@@ -23,8 +23,26 @@
#include "const.h"
#include "part.h"
#include "lock.h"
#include "multipole.h"
#include "space.h"
#include "cell.h"
/**
 * @brief Print a one-line summary of a cell to the standard output.
 *
 * The line contains the cell's address, its location (loc), its extent (h),
 * its depth, its split flag and its maxdepth.
 *
 * @param c The #cell to print; it is dereferenced unconditionally.
 */
void print_cell(struct cell *c) {
  /* The cell's address serves as its identifier in the dump. */
  const size_t addr = (size_t)c;

  printf(
      "## Cell 0x%0zx: loc=[%.3e,%.3e,%.3e], h=[%.3e,%.3e,%.3e], depth=%i, split=%i, maxdepth=%i.\n",
      addr, c->loc[0], c->loc[1], c->loc[2], c->h[0], c->h[1], c->h[2],
      c->depth, c->split, c->maxdepth);
}
/**
* @brief Looks for the particle with the given id and prints its information to the standard output.
*
......
......@@ -19,7 +19,7 @@
/* Dump the information pertaining to the given cell. */
void print_cell(struct cell *c);
/* Particle printing helpers.
 * NOTE(review): the (parts, i, N) variants presumably search an array of N
 * entries for the particle with id i -- confirm against their definitions. */
void printParticle(struct part *parts, long long int i, int N);
void printgParticle(struct gpart *parts, long long int i, int N);
void printParticle_single ( struct part *p );
......@@ -78,6 +78,81 @@
int engine_rank;
/**
 * @brief Check if a single particle is OK.
 *
 * A particle is rejected when
 *  - the pointer itself is NULL,
 *  - its mass or its h field is exactly zero, or
 *  - all three components of its position x are exactly zero.
 *
 * Every rejection is reported through message(); the particle is also
 * printed whenever there is an actual particle to print.
 *
 * NOTE(review): the all-zero position test presumably targets zeroed-out
 * (uninitialised) entries; a legitimate particle sitting exactly at the
 * origin would be flagged as well -- confirm this is acceptable.
 *
 * @param p The #part to check, may be NULL.
 *
 * @return Zero if all checks passed, non-zero otherwise.
 */
int engine_check_part(struct part *p) {

  /* A NULL pointer must not reach printParticle_single(), so it is
     reported on its own, without trying to print any fields. */
  if (p == NULL) {
    message("Bad particle data (NULL pointer).");
    return 1;
  }

  /* Zero mass or zero h. */
  if (p->mass == 0.0f || p->h == 0.0f) {
    message("Bad particle data.");
    printParticle_single(p);
    return 1;
  }

  /* Position exactly at the origin. */
  if (p->x[0] == 0.0 && p->x[1] == 0.0 && p->x[2] == 0.0) {
    message("Bad particle location.");
    printParticle_single(p);
    return 1;
  }

  return 0;
}
/**
 * @brief Sanity-check a cell and the particles it holds.
 *
 * Three checks are run in order: the cell must hold at least one particle,
 * every one of its particles must pass engine_check_part(), and, if the
 * cell is split, the particle counts of its non-NULL progeny must add up
 * to the cell's own count. On the first failed check the cell is printed
 * and the failure is reported through error().
 *
 * The function returns nothing; the (cell, data) signature matches the
 * callback passed to space_map_cells_post().
 *
 * @param c The #cell to check.
 * @param data Unused.
 */
void engine_check_cell(struct cell *c, void *data) {

  (void)data;

  /* The cell has to contain something. */
  if (c->count == 0) {
    print_cell(c);
    error("Empty cell.");
  }

  /* Every particle in the cell has to be sane. */
  int idx = 0;
  while (idx < c->count) {
    if (engine_check_part(&c->parts[idx]) != 0) {
      print_cell(c);
      error("Bad particle in cell.");
    }
    idx++;
  }

  /* Only split cells have progeny to compare against. */
  if (!c->split) return;

  /* Add up the particles held by the existing progeny. */
  int progeny_total = 0;
  for (int j = 0; j < 8; j++) {
    if (c->progeny[j] == NULL) continue;
    progeny_total += c->progeny[j]->count;
  }

  if (progeny_total != c->count) {
    print_cell(c);
    error("Progeny cell counts don't add up.");
  }
}
/**
* @brief Runs a series of checks to make sure we have no bad particles.
*/
void engine_check(struct engine *e) {
/* Check all particles directly. */
struct space *s = e->s;
for (int k = 0; k < s->nr_parts; k++) {
if (engine_check_part(&s->parts[k])) {
error("Bad particle s->parts[%i], aborting.", k);
}
}
/* Check each cell in the space. */
space_map_cells_post(s, 1, &engine_check_cell, NULL);
}
/**
* @brief Link a density/force task to a cell.
*
......@@ -235,12 +310,11 @@ void engine_redistribute ( struct engine *e ) {
int res;
if ( ( res = MPI_Waitall( 4*nr_nodes , reqs , stats ) ) != MPI_SUCCESS ) {
for ( k = 0 ; k < 4*nr_nodes ; k++ ) {
char buff[ MPI_MAX_ERROR_STRING ];
int res;
MPI_Error_string( stats[k].MPI_ERROR , buff , &res );
message( "request %i has error '%s'." , k , buff );
}
message( "counts is [ %i %i %i %i ]." , counts[0] , counts[1] , counts[2] , counts[3] );
char buff[ MPI_MAX_ERROR_STRING ];
int res;
MPI_Error_string( stats[k].MPI_ERROR , buff , &res );
message( "request %i has error '%s'." , k , buff );
}
error( "Failed during waitall for part data." );
}
......@@ -1721,6 +1795,11 @@ void engine_step ( struct engine *e ) {
struct space *s = e->s;
TIMER_TIC2
if (e->policy & engine_policy_paranoid) {
message("Checking system sanity...");
engine_check(e);
}
/* Get the maximum dt. */
if ( e->policy & engine_policy_multistep ) {
......@@ -1755,10 +1834,20 @@ void engine_step ( struct engine *e ) {
// printParticle(parts, k);
// printParticle( e->s->parts , 3392063069037 , e->s->nr_parts );
if (e->policy & engine_policy_paranoid) {
message("Checking system sanity...");
engine_check(e);
}
/* Re-distribute the particles amongst the nodes? */
if ( e->forcerepart )
engine_repartition( e );
if (e->policy & engine_policy_paranoid) {
message("Checking system sanity...");
engine_check(e);
}
/* Prepare the space. */
engine_prepare( e );
......@@ -1779,6 +1868,12 @@ void engine_step ( struct engine *e ) {
(1 << task_type_grav_up) |
(1 << task_type_grav_down) |
(1 << task_type_link) );
if (e->policy & engine_policy_paranoid) {
message("Checking system sanity...");
engine_check(e);
}
TIMER_TOC(timer_runners);
// engine_single_force( e->s->dim , 8328423931905 , e->s->parts , e->s->nr_parts , e->s->periodic );
......
......@@ -30,6 +30,7 @@
/* Engine policy flags. Each one is a distinct bit, so several can be OR-ed
   into a single policy mask and tested with `policy & flag`. */
#define engine_policy_cputight 64
#define engine_policy_mpi 128
#define engine_policy_setaffinity 256
/* When set, engine_step() runs engine_check() between its stages. */
#define engine_policy_paranoid 512
/* NOTE(review): tuning constants; their exact use is not visible here. */
#define engine_queue_scale 1.2
#define engine_maxtaskspercell 128
......
......@@ -870,7 +870,7 @@ void space_map_parts ( struct space *s , void (*fun)( struct part *p , struct ce
/**
* @brief Map a function to all particles in a aspace.
* @brief Map a function to all particles in a space.
*
* @param s The #space we are working in.
* @param full Map to all cells, including cells with sub-cells.
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment