engine.c 21.2 KB
Newer Older
Pedro Gonnet's avatar
Pedro Gonnet committed
1
/*******************************************************************************
2
 * This file is part of SWIFT.
Pedro Gonnet's avatar
Pedro Gonnet committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
 * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 ******************************************************************************/

/* Config parameters. */
#include "../config.h"

/* Some standard headers. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <omp.h>
#include <sched.h>

/* Local headers. */
#include "cycle.h"
36
#include "timers.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
37
#include "const.h"
38
#include "vector.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
39
40
41
#include "lock.h"
#include "task.h"
#include "part.h"
42
#include "debug.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
43
44
45
46
47
48
#include "cell.h"
#include "space.h"
#include "queue.h"
#include "engine.h"
#include "runner.h"
#include "runner_iact.h"
49
#include "error.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
50
51
52
53
54

/* Convert cell location to ID. */
#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) )


55
56
57
58
59
60
/**
 * @brief Prepare the #engine by re-building the cells and tasks.
 *
 * @param e The #engine to prepare.
 */
 
61
void engine_prepare ( struct engine *e ) {
62

63
    int j, k, qid;
64
    struct space *s = e->s;
65
66
67
    struct queue *q;
    
    TIMER_TIC
68
69

    /* Rebuild the space. */
70
    // tic = getticks();
71
    space_prepare( e->s );
72
    // printf( "engine_prepare: space_prepare took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
73
    
74
    // tic = getticks();
75
76
77
78
79
80
81
82
83
84
85
86
    /* Init the queues (round-robin). */
    for ( qid = 0 ; qid < e->nr_queues ; qid++ )
        queue_init( &e->queues[qid] , s->nr_tasks , s->tasks );

    /* Fill the queues (round-robin). */
    for ( qid = 0 , k = 0 ; k < s->nr_tasks ; k++ ) {
        if ( s->tasks[ s->tasks_ind[k] ].skip )
            continue;
        q = &e->queues[qid];
        qid = ( qid + 1 ) % e->nr_queues;
        q->tid[ q->count ] = s->tasks_ind[k];
        q->count += 1;
87
        }
88
    // printf( "engine_prepare: re-filling queues took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
89

90
    /* Run throught the tasks and get all the waits right. */
91
92
    // tic = getticks();
    #pragma omp parallel for schedule(static) private(j)
93
    for ( k = 0 ; k < s->nr_tasks ; k++ ) {
94
95
        if ( s->tasks[k].skip )
            continue;
96
97
98
        for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ )
            __sync_add_and_fetch( &s->tasks[k].unlock_tasks[j]->wait , 1 );
        }
99
    // printf( "engine_prepare: preparing task dependencies took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
100
101
102
103
    
    /* Re-set the queues.*/
    for ( k = 0 ; k < e->nr_queues ; k++ )
        e->queues[k].next = 0;
Pedro Gonnet's avatar
Pedro Gonnet committed
104
        
105
106
    TIMER_TOC( timer_prepare );
    
Pedro Gonnet's avatar
Pedro Gonnet committed
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
    }


/**
 * @brief Implements a barrier for the #runner threads.
 *
 * @param e The #engine.
 */
 
void engine_barrier( struct engine *e ) {

    /* First, get the barrier mutex. */
    if ( pthread_mutex_lock( &e->barrier_mutex ) != 0 )
        error( "Failed to get barrier mutex." );
        
    /* Wait for the barrier to close. */
    while ( e->barrier_count < 0 )
        if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 )
            error( "Eror waiting for barrier to close." );
        
    /* Once I'm in, increase the barrier count. */
    e->barrier_count += 1;
    
    /* If all threads are in, send a signal... */
    if ( e->barrier_count == e->nr_threads )
        if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 )
            error( "Failed to broadcast barrier full condition." );
        
    /* Wait for barrier to be released. */
    while ( e->barrier_count > 0 )
        if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 )
            error( "Error waiting for barrier to be released." );
            
    /* Decrease the counter before leaving... */
    e->barrier_count += 1;
    
    /* If I'm the last one out, signal the condition again. */
    if ( e->barrier_count == 0 )
        if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 )
            error( "Failed to broadcast empty barrier condition." );
            
    /* Last but not least, release the mutex. */
    if ( pthread_mutex_unlock( &e->barrier_mutex ) != 0 )
        error( "Failed to get unlock the barrier mutex." );

    }
    
    
155
156
157
158
159
160
161
162
163
/**
 * @brief Mapping function to set dt_min and dt_max, do the first
 * kick.
 */

void engine_map_kick_first ( struct cell *c , void *data ) {

    int j, k;
    struct engine *e = (struct engine *)data;
164
    float pdt, dt_step = e->dt_step, dt = e->dt, hdt = 0.5f*dt;
165
    float dt_min, dt_max, h_max, dx, h2dx_max, dx_max, a[3], v[3], u, u_dt, h, h_dt, rho, v_old[3];
166
    double x[3], x_old[3];
167
168
169
170
171
172
173
174
175
176
177
178
    struct part *restrict p;
    struct xpart *restrict xp;
    struct cpart *restrict cp;

    /* No children? */
    if ( !c->split ) {
    
        /* Init the min/max counters. */
        dt_min = FLT_MAX;
        dt_max = 0.0f;
        h_max = 0.0f;
        dx_max = 0.0f;
179
        h2dx_max = 0.0f;
180
181
182
183
184
185
186
187
188
    
        /* Loop over parts. */
        for ( k = 0 ; k < c->count ; k++ ) {
            
            /* Get a handle on the kth particle. */
            p = &c->parts[k];
            xp = p->xtras;
            cp = &c->cparts[k];
            
189
190
191
192
193
194
195
196
197
198
199
            /* Load the data locally. */
            a[0] = p->a[0]; a[1] = p->a[1]; a[2] = p->a[2];
            v[0] = p->v[0]; v[1] = p->v[1]; v[2] = p->v[2];
            x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
            x_old[0] = xp->x_old[0]; x_old[1] = xp->x_old[1]; x_old[2] = xp->x_old[2];
            h = p->h;
            u = p->u;
            h_dt = p->force.h_dt;
            u_dt = p->force.u_dt;
            pdt = p->dt;
            
200
            /* Store the min/max dt. */
201
202
            dt_min = fminf( dt_min , pdt );
            dt_max = fmaxf( dt_max , pdt );
203
204
            
            /* Step and store the velocity and internal energy. */
205
206
207
            xp->v_old[0] = v_old[0] = v[0] + hdt * a[0];
            xp->v_old[1] = v_old[1] = v[1] + hdt * a[1];
            xp->v_old[2] = v_old[2] = v[2] + hdt * a[2];
208
209
210
            xp->u_old = p->u + hdt * p->force.u_dt;

            /* Move the particles with the velocitie at the half-step. */
211
212
213
            p->x[0] = x[0] += dt * v_old[0];
            p->x[1] = x[1] += dt * v_old[1];
            p->x[2] = x[2] += dt * v_old[2];
214
215
216
217
218
            dx = sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) +
                        (x[1] - x_old[1])*(x[1] - x_old[1]) +
                        (x[2] - x_old[2])*(x[2] - x_old[2]) );
            dx_max = fmaxf( dx_max , dx );
            h2dx_max = fmaxf( h2dx_max , dx*2 + h );
219
220

            /* Update positions and energies at the half-step. */
221
222
223
224
225
226
            p->v[0] = v[0] += dt * a[0];
            p->v[1] = v[1] += dt * a[1];
            p->v[2] = v[2] += dt * a[2];
            p->u = u += u_dt * dt;
            p->h = h += h_dt * dt;
            h_max = fmaxf( h_max , h );
227
228
229

        
            /* Fill the cpart. */
230
231
232
233
234
            cp->x[0] = x[0];
            cp->x[1] = x[1];
            cp->x[2] = x[2];
            cp->h = h;
            cp->dt = pdt;
235
            
236
            /* Integrate other values if this particle will not be updated. */
237
            /* Init fields for density calculation. */
238
            if ( pdt > dt_step ) {
239
240
241
                // rho = p->rho *= expf( -3.0f * h_dt / h * dt );
                float w = -3.0f * h_dt / h * dt;
                rho = p->rho *= 1.0f + w*( -1.0f + w*( 0.5f - 1.0f/6.0f*w ) );
242
243
244
                p->force.POrho2 = u * ( const_gamma - 1.0f ) / ( rho + h * p->rho_dh / 3.0f );
                }
            else {
245
                p->density.wcount = 0.0f;
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
                p->density.wcount_dh = 0.0f;
                p->rho = 0.0f;
                p->rho_dh = 0.0f;
	            p->density.div_v = 0.0f;
	            for ( j = 0 ; j < 3 ; ++j)
	                p->density.curl_v[j] = 0.0f;
                }
                
            }
            
        }
        
    /* Otherwise, agregate data from children. */
    else {
    
        /* Init with the first non-null child. */
262
        for ( k = 0 ; c->progeny[k] == NULL ; k++ );
263
264
265
266
        dt_min = c->progeny[k]->dt_min;
        dt_max = c->progeny[k]->dt_max;
        h_max = c->progeny[k]->h_max;
        dx_max = c->progeny[k]->dx_max;
267
        h2dx_max = c->progeny[k]->h2dx_max;
268
269
270
271
272
273
274
275
        
        /* Loop over the remaining progeny. */
        for ( k += 1 ; k < 8 ; k++ )
            if ( c->progeny[k] != NULL ) {
                dt_min = fminf( dt_min , c->progeny[k]->dt_min );
                dt_max = fmaxf( dt_max , c->progeny[k]->dt_max );
                h_max = fmaxf( h_max , c->progeny[k]->h_max );
                dx_max = fmaxf( dx_max , c->progeny[k]->dx_max );
276
                h2dx_max = fmaxf( h2dx_max , c->progeny[k]->h2dx_max );
277
278
279
280
281
282
283
284
285
                }
    
        }

    /* Store the values. */
    c->dt_min = dt_min;
    c->dt_max = dt_max;
    c->h_max = h_max;
    c->dx_max = dx_max;
286
    c->h2dx_max = h2dx_max;
287
288
289
290
    
    }


291
292
293
294
295
/**
 * @brief Compute the force on a single particle brute-force.
 */

void engine_single ( double *dim , long long int pid , struct part *__restrict__ parts , int N , int periodic ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341

    int i, k;
    double r2, dx[3];
    float fdx[3], ihg;
    struct part p, p2;
    
    /* Find "our" part. */
    for ( k = 0 ; k < N && parts[k].id != pid ; k++ );
    if ( k == N )
        error( "Part not found." );
    p = parts[k];
    
    /* Clear accumulators. */
    ihg = kernel_igamma / p.h;
    p.a[0] = 0.0f; p.a[1] = 0.0f; p.a[2] = 0.0f;
    p.force.u_dt = 0.0f; p.force.h_dt = 0.0f; p.force.v_sig = 0.0f;
            
    /* Loop over all particle pairs (force). */
    for ( k = 0 ; k < N ; k++ ) {
        if ( parts[k].id == p.id )
            continue;
        for ( i = 0 ; i < 3 ; i++ ) {
            dx[i] = p.x[i] - parts[k].x[i];
            if ( periodic ) {
                if ( dx[i] < -dim[i]/2 )
                    dx[i] += dim[i];
                else if ( dx[i] > dim[i]/2 )
                    dx[i] -= dim[i];
                }
            fdx[i] = dx[i];
            }
        r2 = fdx[0]*fdx[0] + fdx[1]*fdx[1] + fdx[2]*fdx[2];
        if ( r2 < p.h*p.h || r2 < parts[k].h*parts[k].h ) {
            p.a[0] = 0.0f; p.a[1] = 0.0f; p.a[2] = 0.0f;
            p.force.u_dt = 0.0f; p.force.h_dt = 0.0f; p.force.v_sig = 0.0f;
            p2 = parts[k];
            runner_iact_nonsym_force( r2 , fdx , p.h , p2.h , &p , &p2 );
            printf( "pairs_simple: interacting particles %lli and %lli (rho=%.3e,r=%e): a=[%.3e,%.3e,%.3e], u_dt=%.3e, h_dt=%.3e, v_sig=%.3e.\n" ,
                pid , p2.id , p2.rho , sqrtf(r2) ,
                p.a[0] , p.a[1] , p.a[2] ,
                p.force.u_dt , p.force.h_dt , p.force.v_sig );
            }
        }
        
    /* Dump the result. */
    ihg = kernel_igamma / p.h;
342
    printf( "pairs_single: part %lli (h=%e) has wcount=%f, rho=%f.\n" , p.id , p.h , p.density.wcount + 32.0/3 , ihg * ihg * ihg * ( p.rho + p.mass*kernel_root ) );
Pedro Gonnet's avatar
Pedro Gonnet committed
343
344
345
346
347
    fflush(stdout);
    
    }


Pedro Gonnet's avatar
Pedro Gonnet committed
348
349
350
351
352
353
354
/**
 * @brief Let the #engine loose to compute the forces.
 *
 * @param e The #engine.
 * @param sort_queues Flag to try to sort the queues topologically.
 */
 
355
void engine_step ( struct engine *e , int sort_queues ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
356

357
358
    int k, nr_parts = e->s->nr_parts;
    struct part *restrict parts = e->s->parts, *restrict p;
359
360
    struct xpart *restrict xp;
    float dt = e->dt, hdt = 0.5*dt, dt_step, dt_max, dt_min, ldt_min, ldt_max;
361
362
    float pdt, h, m, v[3], u;
    double x[3];
363
    double epot = 0.0, ekin = 0.0, lepot, lekin, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 };
364
    double lang[3], ang[3] = { 0.0 , 0.0 , 0.0 };
365
    // double lent, ent = 0.0;
366
    int threadID, nthreads, count = 0, lcount;
367
368
    
    TIMER_TIC2
369

370
    /* Get the maximum dt. */
371
    dt_step = 2.0f*dt;
372
    for ( k = 0 ; k < 32 && (e->step & (1 << k)) == 0 ; k++ )
373
374
        dt_step *= 2;
    // dt_step = FLT_MAX;
375
376
        
    /* Set the maximum dt. */
377
378
    e->dt_step = dt_step;
    e->s->dt_step = dt_step;
379
    printf( "engine_step: dt_step set to %.3e (dt=%.3e).\n" , dt_step , e->dt ); fflush(stdout);
380
    
Pedro Gonnet's avatar
Pedro Gonnet committed
381
382
    // printParticle( parts , 432626 );
    
383
    /* First kick. */
384
    TIMER_TIC
385
    space_map_cells_post( e->s , 1 , &engine_map_kick_first , e );
386
    TIMER_TOC( timer_kick1 );
387
        
388
389
    // for(k=0; k<10; ++k)
    //   printParticle(parts, k);
Pedro Gonnet's avatar
Pedro Gonnet committed
390
391
    // printParticle( parts , 432626 );
    // printParticle( parts , 432628 );
392
 
393
394
395
396
    /* Prepare the space. */
    engine_prepare( e );
    
    /* Sort the queues?*/
Pedro Gonnet's avatar
Pedro Gonnet committed
397
398
399
400
401
402
403
    if ( sort_queues ) {
        #pragma omp parallel for default(none), shared(e)
        for ( k = 0 ; k < e->nr_queues ; k++ ) {
            queue_sort( &e->queues[k] );
            e->queues[k].next = 0;
            }
        }
404
        
405
    /* Start the clock. */
406
    TIMER_TIC_ND
Pedro Gonnet's avatar
Pedro Gonnet committed
407
408
409
410
411
412
413
414
415
416
    
    /* Cry havoc and let loose the dogs of war. */
    e->barrier_count = -e->barrier_count;
    if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 )
        error( "Failed to broadcast barrier open condition." );
        
    /* Sit back and wait for the runners to come home. */
    while ( e->barrier_count < e->nr_threads )
        if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 )
            error( "Error while waiting for barrier." );
417
418
            
    /* Stop the clock. */
419
    TIMER_TOC(timer_runners);
420
    
421
422
    // for(k=0; k<10; ++k)
    //   printParticle(parts, k);
423
    // engine_single( e->s->dim , 432626 , e->s->parts , e->s->nr_parts , e->s->periodic );
Pedro Gonnet's avatar
Pedro Gonnet committed
424
425
    // printParticle( parts , 432626 );
    // printParticle( parts , 432628 );
426

427
    /* Second kick. */
428
    TIMER_TIC_ND
429
    dt_min = FLT_MAX; dt_max = 0.0f;
430
    #pragma omp parallel private(p,xp,k,ldt_min,lcount,ldt_max,lmom,lang,lekin,lepot,threadID,nthreads,pdt,v,h,m,x,u)
431
    {
432
433
434
435
        threadID = omp_get_thread_num();
        nthreads = omp_get_num_threads();
        ldt_min = FLT_MAX; ldt_max = 0.0f;
        lmom[0] = 0.0; lmom[1] = 0.0; lmom[2] = 0.0;
436
        lang[0] = 0.0; lang[1] = 0.0; lang[2] = 0.0;
437
        lekin = 0.0; lepot = 0.0; /* lent=0.0; */ lcount = 0;
438
439
440
441
        for ( k = nr_parts * threadID / nthreads ; k < nr_parts * (threadID + 1) / nthreads ; k++ ) {

            /* Get a handle on the part. */
            p = &parts[k];
442
            xp = p->xtras;
443
444
445
446
447
448
            
            /* Get local copies of particle data. */
            pdt = p->dt;
            h = p->h;
            m = p->mass;
            x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
449

450
            /* Scale the derivatives if they're freshly computed. */
451
452
            if ( pdt <= dt_step ) {
                p->force.h_dt *= h * 0.333333333f;
453
                lcount += 1;
454
                }
455
456
                
            /* Update the particle's time step. */
457
            p->dt = pdt = const_cfl * h / ( p->force.v_sig );
458
459

            /* Update positions and energies at the half-step. */
460
461
462
463
            p->v[0] = v[0] = xp->v_old[0] + hdt * p->a[0];
            p->v[1] = v[1] = xp->v_old[1] + hdt * p->a[1];
            p->v[2] = v[2] = xp->v_old[2] + hdt * p->a[2];
            p->u = u = xp->u_old + hdt * p->force.u_dt;
464
            
465
            /* Get the smallest/largest dt. */
466
467
            ldt_min = fminf( ldt_min , pdt );
            ldt_max = fmaxf( ldt_max , pdt );
468
469
            
            /* Collect total energy. */
470
471
            lekin += 0.5 * m * ( v[0]*v[0] + v[1]*v[1] + v[2]*v[2] );
            lepot += m * u;
472
473

            /* Collect momentum */
474
475
476
            lmom[0] += m * p->v[0];
            lmom[1] += m * p->v[1];
            lmom[2] += m * p->v[2];
477
	    
Pedro Gonnet's avatar
Pedro Gonnet committed
478
	        /* Collect angular momentum */
479
480
481
	        lang[0] += m * ( x[1]*v[2] - v[2]*x[1] );
	        lang[1] += m * ( x[2]*v[0] - v[0]*x[2] );
	        lang[2] += m * ( x[0]*v[1] - v[1]*x[0] );
482

Pedro Gonnet's avatar
Pedro Gonnet committed
483
	        /* Collect entropic function */
484
	        // lent += u * pow( p->rho, 1.f-const_gamma );
485
486
487

            }
        #pragma omp critical
488
489
490
491
492
493
        {
            dt_min = fminf( dt_min , ldt_min );
            dt_max = fmaxf( dt_max , ldt_max );
            mom[0] += lmom[0];
            mom[1] += lmom[1];
            mom[2] += lmom[2];
494
495
496
            ang[0] += lang[0];
            ang[1] += lang[1];
            ang[2] += lang[2];
497
	        // ent += (const_gamma -1.) * lent;
498
499
500
            epot += lepot;
            ekin += lekin;
            count += lcount;
501
        }
502
    }
503
    TIMER_TOC( timer_kick2 );
504
    e->dt_min = dt_min;
Pedro Gonnet's avatar
Pedro Gonnet committed
505
    // printParticle( parts , 432626 );
506
    printf( "engine_step: dt_min/dt_max is %e/%e.\n" , dt_min , dt_max ); fflush(stdout);
507
    printf( "engine_step: etot is %e (ekin=%e, epot=%e).\n" , ekin+epot , ekin , epot ); fflush(stdout);
508
    printf( "engine_step: total momentum is [ %e , %e , %e ].\n" , mom[0] , mom[1] , mom[2] ); fflush(stdout);
509
    printf( "engine_step: total angular momentum is [ %e , %e , %e ].\n" , ang[0] , ang[1] , ang[2] ); fflush(stdout);
510
    // printf( "engine_step: total entropic function is %e .\n", ent ); fflush(stdout);
511
    printf( "engine_step: updated %i parts (dt_step=%.3e).\n" , count , dt_step ); fflush(stdout);
512
        
513
514
515
    /* Increase the step counter. */
    e->step += 1;
    
516
    /* Does the time step need adjusting? */
517
    if ( e->dt == 0 ) {
518
        e->dt = e->dt_orig;
519
520
521
522
523
        while ( dt_min < e->dt )
            e->dt *= 0.5;
        while ( dt_min > 2*e->dt )
            e->dt *= 2.0;
        printf( "engine_step: dt_min=%.3e, adjusting time step to dt=%e.\n" , dt_min , e->dt );
524
        }
525
526
527
528
529
530
531
532
533
534
535
    else {
        while ( dt_min < e->dt ) {
            e->dt *= 0.5;
            e->step *= 2;
            printf( "engine_step: dt_min dropped below time step, adjusting to dt=%e.\n" , e->dt );
            }
        while ( dt_min > 2*e->dt && (e->step & 1) == 0 ) {
            e->dt *= 2.0;
            e->step /= 2;
            printf( "engine_step: dt_min is larger than twice the time step, adjusting to dt=%e.\n" , e->dt );
            }
536
        }
537
538
539
    
    /* Set the system time. */
    e->time = e->dt * e->step;
540
541
        
    TIMER_TOC2(timer_step);
542
    
Pedro Gonnet's avatar
Pedro Gonnet committed
543
544
545
546
547
548
549
550
551
    }
    
    
/**
 * @brief init an engine with the given number of threads, queues, and
 *      the given policy.
 *
 * @param e The #engine.
 * @param s The #space in which this #runner will run.
552
 * @param dt The initial time step to use.
Pedro Gonnet's avatar
Pedro Gonnet committed
553
554
555
556
557
 * @param nr_threads The number of threads to spawn.
 * @param nr_queues The number of task queues to create.
 * @param policy The queueing policy to use.
 */
 
558
void engine_init ( struct engine *e , struct space *s , float dt , int nr_threads , int nr_queues , int policy ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
559
560
561
562

    #if defined(HAVE_SETAFFINITY)
        cpu_set_t cpuset;
    #endif
563
    int k;
564
    float dt_min = dt;
Pedro Gonnet's avatar
Pedro Gonnet committed
565
566
567
568
569
570
    
    /* Store the values. */
    e->s = s;
    e->nr_threads = nr_threads;
    e->nr_queues = nr_queues;
    e->policy = policy;
571
    e->step = 0;
Pedro Gonnet's avatar
Pedro Gonnet committed
572
573
574
575
576
577
578
579
580
581
    
    /* First of all, init the barrier and lock it. */
    if ( pthread_mutex_init( &e->barrier_mutex , NULL ) != 0 )
        error( "Failed to initialize barrier mutex." );
    if ( pthread_cond_init( &e->barrier_cond , NULL ) != 0 )
        error( "Failed to initialize barrier condition variable." );
    if ( pthread_mutex_lock( &e->barrier_mutex ) != 0 )
        error( "Failed to lock barrier mutex." );
    e->barrier_count = 0;
    
582
    /* Run through the parts and get the minimum time step. */
583
    e->dt_orig = dt;
584
    for ( k = 0 ; k < s->nr_parts ; k++ )
585
586
587
588
589
590
591
592
        if ( s->parts[k].dt < dt_min )
            dt_min = s->parts[k].dt;
    if ( dt_min == 0.0f )
        dt = 0.0f;
    else
        while ( dt > dt_min )
            dt *= 0.5f;
    e->dt = dt;
593
    
Pedro Gonnet's avatar
Pedro Gonnet committed
594
595
596
597
598
599
    /* Allocate the queues. */
    if ( posix_memalign( (void *)(&e->queues) , 64 , nr_queues * sizeof(struct queue) ) != 0 )
        error( "Failed to allocate queues." );
    bzero( e->queues , nr_queues * sizeof(struct queue) );
        
    /* Sort the queues topologically. */
600
601
    // for ( k = 0 ; k < nr_queues ; k++ )
    //     queue_sort( &e->queues[k] );
Pedro Gonnet's avatar
Pedro Gonnet committed
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
        
    /* Allocate and init the threads. */
    if ( ( e->runners = (struct runner *)malloc( sizeof(struct runner) * nr_threads ) ) == NULL )
        error( "Failed to allocate threads array." );
    for ( k = 0 ; k < nr_threads ; k++ ) {
        e->runners[k].id = k;
        e->runners[k].e = e;
        if ( pthread_create( &e->runners[k].thread , NULL , &runner_main , &e->runners[k] ) != 0 )
            error( "Failed to create runner thread." );
        #if defined(HAVE_SETAFFINITY)
            /* Set the cpu mask to zero | e->id. */
            CPU_ZERO( &cpuset );
            CPU_SET( e->runners[k].id , &cpuset );

            /* Apply this mask to the runner's pthread. */
            if ( pthread_setaffinity_np( e->runners[k].thread , sizeof(cpu_set_t) , &cpuset ) != 0 )
                error( "Failed to set thread affinity." );
        #endif
        }
        
    /* Wait for the runner threads to be in place. */
    while ( e->barrier_count != e->nr_threads )
        if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 )
            error( "Error while waiting for runner threads to get in place." );
    
    }