Commit e777228d authored by Peter W. Draper

Merge branch 'mpi_fixes' into 'master'

MPI fixes

This should fix issue #28.

Tested with the SodShock example; I was able to reproduce the bug and catch its exact location using the address sanitizer. I have not yet tested it in a "real" parallel environment, e.g. a cosma run, only on my laptop.
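
For context on the fix itself: the crash appears to come from particle positions drifting marginally outside the (periodic) box, so the cell index computed from `x * ih` falls outside the cell grid and the `cells[cid]` lookup reads out of bounds. The diff below wraps positions back into the box before computing the index. Here is a minimal standalone sketch of that idea; the box size, cell counts and the row-major index are illustrative assumptions, not SWIFT's actual `cell_getid()`:

```c
/* Minimal standalone sketch (not SWIFT code) of why the wrap matters:
 * a particle that drifts just outside the periodic box would otherwise
 * map to a cell index outside the grid, and the cells[cid] lookup
 * would read out of bounds. All values below are hypothetical. */
#include <stdio.h>

int main(void) {
    const double dim[3] = {1.0, 1.0, 1.0}; /* hypothetical box size */
    const double ih[3] = {8.0, 8.0, 8.0};  /* 1 / cell width */
    const int cdim[3] = {8, 8, 8};         /* cells per dimension */

    /* A particle that has drifted slightly past the box edges. */
    double x[3] = {1.0000001, 0.5, -0.0000002};

    for (int j = 0; j < 3; j++) {
        /* Same wrap as added in engine_redistribute()/space_init():
         * without it, (int)(x[j]*ih[j]) can equal cdim[j] or go negative. */
        if (x[j] < 0.0) x[j] += dim[j];
        else if (x[j] >= dim[j]) x[j] -= dim[j];
    }

    /* Row-major cell id, standing in for what cell_getid() computes. */
    const int cid = ((int)(x[0] * ih[0]) * cdim[1] + (int)(x[1] * ih[1])) * cdim[2]
                    + (int)(x[2] * ih[2]);
    printf("wrapped position (%g, %g, %g) -> cell %d\n", x[0], x[1], x[2], cid);
    return 0;
}
```

Reproducing the crash under the address sanitizer (e.g. adding `-fsanitize=address` to CFLAGS and LDFLAGS with gcc or clang) points straight at the offending out-of-bounds access.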

Could you test this with the CosmoVolume and verify that it indeed no longer crashes?

See merge request !17


Former-commit-id: 79fbeba71a8f2e5e6e3986f09302bc304880d947
parents 1505a1d6 81312e57
@@ -169,13 +169,18 @@ void engine_redistribute ( struct engine *e ) {
        getting the counts. */
     int *counts, *dest;
     struct part *parts = s->parts;
-    double ih[3];
+    double ih[3], dim[3];
     ih[0] = s->ih[0]; ih[1] = s->ih[1]; ih[2] = s->ih[2];
+    dim[0] = s->dim[0]; dim[1] = s->dim[1]; dim[2] = s->dim[2];
     if ( ( counts = (int *)malloc( sizeof(int) * nr_nodes * nr_nodes ) ) == NULL ||
          ( dest = (int *)malloc( sizeof(int) * s->nr_parts ) ) == NULL )
         error( "Failed to allocate count and dest buffers." );
     bzero( counts , sizeof(int) * nr_nodes * nr_nodes );
     for ( k = 0 ; k < s->nr_parts ; k++ ) {
+        for ( j = 0 ; j < 3 ; j++ ) {
+            if ( parts[k].x[j] < 0.0 ) parts[k].x[j] += dim[j];
+            else if ( parts[k].x[j] >= dim[j] ) parts[k].x[j] -= dim[j];
+            }
         cid = cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] );
         dest[k] = cells[ cid ].nodeID;
         counts[ nodeID*nr_nodes + dest[k] ] += 1;
@@ -292,7 +297,9 @@ void engine_repartition ( struct engine *e ) {
     struct task *t, *tasks = e->sched.tasks;
     struct cell *ci, *cj;
     int nr_nodes = e->nr_nodes, nodeID = e->nodeID;
-    float wscale = 0.0001, vscale = 0.001;
+    float wscale = 1.0, vscale = 1e-3, wscale_buff;
+    idx_t wtot = 0;
+    const idx_t wmax = 1e9 / e->nr_nodes;
     /* Clear the repartition flag. */
     e->forcerepart = 0;
@@ -332,12 +339,8 @@ void engine_repartition ( struct engine *e ) {
         }
     /* Init the weights arrays. */
-    /* bzero( weights_e , sizeof(idx_t) * 26*nr_cells );
-    bzero( weights_v , sizeof(idx_t) * nr_cells ); */
-    for ( k = 0 ; k < 26*nr_cells ; k++ )
-        weights_e[k] = 1;
-    for ( k = 0 ; k < nr_cells ; k++ )
-        weights_v[k] = 1;
+    bzero( weights_e , sizeof(idx_t) * 26*nr_cells );
+    bzero( weights_v , sizeof(idx_t) * nr_cells );
     /* Loop over the tasks... */
     for ( j = 0 ; j < e->sched.nr_tasks ; j++ ) {
@@ -358,6 +361,15 @@ void engine_repartition ( struct engine *e ) {
             w = ( t->toc - t->tic ) * wscale;
             if ( w < 0 )
                 error( "Bad task weight (%i)." , w );
+            /* Do we need to re-scale? */
+            wtot += w;
+            if (wtot > wmax) {
+                wscale /= 2;
+                wtot /= 2;
+                for (k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5;
+                for (k = 0; k < nr_cells; k++) weights_v[k] *= 0.5;
+                }
             /* Get the top-level cells involved. */
             for ( ci = t->ci ; ci->parent != NULL ; ci = ci->parent );
@@ -424,6 +436,18 @@ void engine_repartition ( struct engine *e ) {
         }
+    /* Get the minimum scaling and re-scale if necessary. */
+    if ( ( res = MPI_Allreduce( &wscale , &wscale_buff , 1 , MPI_FLOAT , MPI_MIN , MPI_COMM_WORLD ) ) != MPI_SUCCESS ) {
+        char buff[ MPI_MAX_ERROR_STRING ];
+        MPI_Error_string( res , buff , &i );
+        error( "Failed to allreduce the weight scales (%s)." , buff );
+        }
+    if (wscale_buff != wscale) {
+        float scale = wscale / wscale_buff;
+        for (k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale;
+        for (k = 0; k < nr_cells; k++) weights_v[k] *= scale;
+        }
     /* Merge the weights arrays accross all nodes. */
 #if IDXTYPEWIDTH==32
     if ( ( res = MPI_Reduce( ( nodeID == 0 ) ? MPI_IN_PLACE : weights_v , weights_v , nr_cells , MPI_INT , MPI_SUM , 0 , MPI_COMM_WORLD ) ) != MPI_SUCCESS ) {
@@ -508,10 +532,10 @@ void engine_repartition ( struct engine *e ) {
         error( "Call to METIS_PartGraphKway failed." );
     /* Dump the 3d array of cell IDs. */
-    printf( "engine_repartition: nodeIDs = reshape( [" );
+    /* printf( "engine_repartition: nodeIDs = reshape( [" );
     for ( i = 0 ; i < cdim[0]*cdim[1]*cdim[2] ; i++ )
         printf( "%i " , (int)nodeIDs[ i ] );
-    printf("] ,%i,%i,%i);\n",cdim[0],cdim[1],cdim[2]);
+    printf("] ,%i,%i,%i);\n",cdim[0],cdim[1],cdim[2]); */
     }
......
@@ -21,7 +21,8 @@
 #include "inline.h"
-#ifdef PTHREAD_LOCK
+#ifdef PTHREAD_SPINLOCK
     #include <pthread.h>
     #define lock_type pthread_spinlock_t
     #define lock_init( l ) ( pthread_spin_init( l , PTHREAD_PROCESS_PRIVATE ) != 0 )
     #define lock_destroy( l ) ( pthread_spin_destroy( l ) != 0 )
@@ -29,9 +30,18 @@
     #define lock_trylock( l ) ( pthread_spin_lock( l ) != 0 )
     #define lock_unlock( l ) ( pthread_spin_unlock( l ) != 0 )
     #define lock_unlock_blind( l ) pthread_spin_unlock( l )
+#elif defined(PTHREAD_LOCK)
+    #include <pthread.h>
+    #define lock_type pthread_mutex_t
+    #define lock_init( l ) ( pthread_mutex_init( l , NULL ) != 0 )
+    #define lock_destroy( l ) ( pthread_mutex_destroy( l ) != 0 )
+    #define lock_lock( l ) ( pthread_mutex_lock( l ) != 0 )
+    #define lock_trylock( l ) ( pthread_mutex_trylock( l ) != 0 )
+    #define lock_unlock( l ) ( pthread_mutex_unlock( l ) != 0 )
+    #define lock_unlock_blind( l ) pthread_mutex_unlock( l )
 #else
     #define lock_type volatile int
-    #define lock_init( l ) ( *l = 0 )
+    #define lock_init( l ) ( *(l) = 0 )
     #define lock_destroy( l ) 0
     INLINE static int lock_lock ( volatile int *l ) {
         while ( __sync_val_compare_and_swap( l , 0 , 1 ) != 0 );
......
@@ -1159,6 +1159,21 @@ void space_init ( struct space *s , double dim[3] , struct part *parts , int N ,
     s->nr_queues = 1;
     s->size_parts_foreign = 0;
+    /* Check that all the particle positions are reasonable, wrap if periodic. */
+    if ( periodic ) {
+        for ( int k = 0 ; k < N ; k++ )
+            for ( int j = 0 ; j < 3 ; j++ ) {
+                while ( parts[k].x[j] < 0 ) parts[k].x[j] += dim[j];
+                while ( parts[k].x[j] >= dim[j] ) parts[k].x[j] -= dim[j];
+                }
+        }
+    else {
+        for ( int k = 0 ; k < N ; k++ )
+            for ( int j = 0 ; j < 3 ; j++ )
+                if ( parts[k].x[j] < 0 || parts[k].x[j] >= dim[j] )
+                    error( "Not all particles are within the specified domain." );
+        }
     /* Allocate the xtra parts array. */
     if ( posix_memalign( (void *)&s->xparts , 32 , N * sizeof(struct xpart) ) != 0 )
         error( "Failed to allocate xparts." );
......