runner.c 41.8 KB
Newer Older
1
/*******************************************************************************
2
 * This file is part of SWIFT.
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
 * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 ******************************************************************************/
Pedro Gonnet's avatar
Pedro Gonnet committed
19

Pedro Gonnet's avatar
Pedro Gonnet committed
20
21
/* Config parameters. */
#include "../config.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
22
23
24
25
26
27
28
29
30
31
32

/* Some standard headers. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <omp.h>

33
34
35
36
37
/* MPI headers. */
#ifdef WITH_MPI
    #include <mpi.h>
#endif

Pedro Gonnet's avatar
Pedro Gonnet committed
38
/* Local headers. */
39
#include "const.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
40
#include "cycle.h"
41
#include "atomic.h"
42
#include "timers.h"
43
#include "const.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
44
#include "lock.h"
45
46
#include "task.h"
#include "part.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
47
#include "space.h"
48
#include "cell.h"
49
#include "queue.h"
50
#include "scheduler.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
51
#include "engine.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
52
#include "runner.h"
53
#include "error.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
54

55
56
57
58
59
60
61
/* Include the right variant of the SPH interactions */
#ifdef LEGACY_GADGET2_SPH
#include "runner_iact_legacy.h"
#else
#include "runner_iact.h"
#endif

Pedro Gonnet's avatar
Pedro Gonnet committed
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/* Convert cell location to ID. */
#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) )

/* The counters. */
int runner_counter[ runner_counter_count ];

        

const float runner_shift[13*3] = {
     5.773502691896258e-01 ,  5.773502691896258e-01 ,  5.773502691896258e-01 ,
     7.071067811865475e-01 ,  7.071067811865475e-01 ,  0.0                   ,
     5.773502691896258e-01 ,  5.773502691896258e-01 , -5.773502691896258e-01 ,
     7.071067811865475e-01 ,  0.0                   ,  7.071067811865475e-01 ,
     1.0                   ,  0.0                   ,  0.0                   ,
     7.071067811865475e-01 ,  0.0                   , -7.071067811865475e-01 ,
     5.773502691896258e-01 , -5.773502691896258e-01 ,  5.773502691896258e-01 ,
     7.071067811865475e-01 , -7.071067811865475e-01 ,  0.0                   ,
     5.773502691896258e-01 , -5.773502691896258e-01 , -5.773502691896258e-01 ,
     0.0                   ,  7.071067811865475e-01 ,  7.071067811865475e-01 ,
     0.0                   ,  1.0                   ,  0.0                   ,
     0.0                   ,  7.071067811865475e-01 , -7.071067811865475e-01 ,
     0.0                   ,  0.0                   ,  1.0                   ,
    };
const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 ,
                               0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; 


89
90
91
92
93
94
95
96
97
/* Import the density functions. */
#define FUNCTION density
#include "runner_doiact.h"

#undef FUNCTION
#define FUNCTION force
#include "runner_doiact.h"


98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/**
 * @brief Send a local cell's particle data to another node.
 *
 * @param r The #runner.
 * @param c The #cell.
 * @param nodeID The destination node's ID.
 * @param tagbit bit to distinguish between xv and rho sends.
 */
 
void runner_dosend ( struct runner *r , struct cell *c , int nodeID , int tag ) {

#ifdef WITH_MPI

    MPI_Request req;
    
    /* First check if all the density tasks have been run. */
    if ( tag & 1 )
        if ( c->parts[0].rho == 0.0 )
            error( "Attempting to send rhos before ghost task completed." );
    
    /* Emit the isend. */
    if ( MPI_Isend( c->parts , sizeof(struct part) * c->count , MPI_BYTE , nodeID , tag , MPI_COMM_WORLD , &req ) != MPI_SUCCESS )
        error( "Failed to isend particle data." );
        
    message( "sending %i parts with tag=%i from %i to %i." ,
        c->count , tag , r->e->nodeID , nodeID ); fflush(stdout);
    
    /* Free the request handler as we don't care what happens next. */
    MPI_Request_free( &req );

#else
    error( "SWIFT was not compiled with MPI support." );
#endif

    }
    

Pedro Gonnet's avatar
Pedro Gonnet committed
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
/**
 * @brief Sort the entries in ascending order using QuickSort.
 *
 * @param sort The entries
 * @param N The number of entries.
 */
 
void runner_dosort_ascending ( struct entry *sort , int N ) {

    struct {
        short int lo, hi;
        } qstack[10];
    int qpos, i, j, lo, hi, imin;
    struct entry temp;
    float pivot;
        
    /* Sort parts in cell_i in decreasing order with quicksort */
    qstack[0].lo = 0; qstack[0].hi = N - 1; qpos = 0;
    while ( qpos >= 0 ) {
        lo = qstack[qpos].lo; hi = qstack[qpos].hi;
        qpos -= 1;
        if ( hi - lo < 15 ) {
            for ( i = lo ; i < hi ; i++ ) {
                imin = i;
                for ( j = i+1 ; j <= hi ; j++ )
                    if ( sort[j].d < sort[imin].d )
                        imin = j;
                if ( imin != i ) {
                    temp = sort[imin]; sort[imin] = sort[i]; sort[i] = temp;
                    }
                }
            }
        else {
            pivot = sort[ ( lo + hi ) / 2 ].d;
            i = lo; j = hi;
            while ( i <= j ) {
                while ( sort[i].d < pivot ) i++;
                while ( sort[j].d > pivot ) j--;
                if ( i <= j ) {
                    if ( i < j ) {
                        temp = sort[i]; sort[i] = sort[j]; sort[j] = temp;
                        }
                    i += 1; j -= 1;
                    }
                }
            if ( j > ( lo + hi ) / 2 ) {
                if ( lo < j ) {
                    qpos += 1;
                    qstack[qpos].lo = lo;
                    qstack[qpos].hi = j;
                    }
                if ( i < hi ) {
                    qpos += 1;
                    qstack[qpos].lo = i;
                    qstack[qpos].hi = hi;
                    }
                }
            else {
                if ( i < hi ) {
                    qpos += 1;
                    qstack[qpos].lo = i;
                    qstack[qpos].hi = hi;
                    }
                if ( lo < j ) {
                    qpos += 1;
                    qstack[qpos].lo = lo;
                    qstack[qpos].hi = j;
                    }
                }
            }
        }
                
    }
    
    
/**
 * @brief Sort the particles in the given cell along all cardinal directions.
 *
 * @param r The #runner.
 * @param c The #cell.
215
 * @param flags Cell flag.
216
217
 * @param clock Flag indicating whether to record the timing or not, needed
 *      for recursive calls.
Pedro Gonnet's avatar
Pedro Gonnet committed
218
219
 */
 
220
void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
221
222
223

    struct entry *finger;
    struct entry *fingers[8];
Pedro Gonnet's avatar
Pedro Gonnet committed
224
    struct part *parts = c->parts;
Pedro Gonnet's avatar
Pedro Gonnet committed
225
    int j, k, count = c->count;
Pedro Gonnet's avatar
Pedro Gonnet committed
226
    int i, ind, off[8], inds[8], temp_i, missing;
227
    // float shift[3];
Pedro Gonnet's avatar
Pedro Gonnet committed
228
    float buff[8], px[3];
229
    
Pedro Gonnet's avatar
Pedro Gonnet committed
230
231
    TIMER_TIC
    
232
233
234
235
236
    /* Clean-up the flags, i.e. filter out what's already been sorted. */
    flags &= ~c->sorted;
    if ( flags == 0 )
        return;
    
Pedro Gonnet's avatar
Pedro Gonnet committed
237
    /* start by allocating the entry arrays. */
238
239
240
241
242
    if ( c->sort == NULL || c->sortsize < c->count ) {
        if ( c->sort != NULL )
            free( c->sort );
        c->sortsize = c->count * 1.1;
        if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->sortsize + 1) * 13 ) ) == NULL )
Pedro Gonnet's avatar
Pedro Gonnet committed
243
            error( "Failed to allocate sort memory." );
244
        }
Pedro Gonnet's avatar
Pedro Gonnet committed
245
246
247
248
        
    /* Does this cell have any progeny? */
    if ( c->split ) {
    
Pedro Gonnet's avatar
Pedro Gonnet committed
249
250
251
252
        /* Fill in the gaps within the progeny. */
        for ( k = 0 ; k < 8 ; k++ ) {
            if ( c->progeny[k] == NULL )
                continue;
253
            missing = flags & ~c->progeny[k]->sorted;
Pedro Gonnet's avatar
Pedro Gonnet committed
254
            if ( missing )
255
                runner_dosort( r , c->progeny[k] , missing , 0 );
Pedro Gonnet's avatar
Pedro Gonnet committed
256
257
258
259
            }
    
        /* Loop over the 13 different sort arrays. */
        for ( j = 0 ; j < 13 ; j++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
260
261
262
263
264
        
            /* Has this sort array been flagged? */
            if ( !( flags & (1 << j) ) )
                continue;
                
265
266
267
268
269
270
271
272
273
274
275
276
277
278
            /* Init the particle index offsets. */
            for ( off[0] = 0 , k = 1 ; k < 8 ; k++ )
                if ( c->progeny[k-1] != NULL )
                    off[k] = off[k-1] + c->progeny[k-1]->count;
                else
                    off[k] = off[k-1];

            /* Init the entries and indices. */
            for ( k = 0 ; k < 8 ; k++ ) {
                inds[k] = k;
                if ( c->progeny[k] != NULL && c->progeny[k]->count > 0 ) {
                    fingers[k] = &c->progeny[k]->sort[ j*(c->progeny[k]->count + 1) ];
                    buff[k] = fingers[k]->d;
                    off[k] = off[k];
Pedro Gonnet's avatar
Pedro Gonnet committed
279
                    }
280
281
                else
                    buff[k] = FLT_MAX;
Pedro Gonnet's avatar
Pedro Gonnet committed
282
283
                }

284
285
286
287
288
289
            /* Sort the buffer. */
            for ( i = 0 ; i < 7 ; i++ )
                for ( k = i+1 ; k < 8 ; k++ )
                    if ( buff[ inds[k] ] < buff[ inds[i] ] ) {
                        temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i;
                        }
Pedro Gonnet's avatar
Pedro Gonnet committed
290

291
292
293
            /* For each entry in the new sort list. */
            finger = &c->sort[ j*(count + 1) ];
            for ( ind = 0 ; ind < count ; ind++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
294

295
296
297
                /* Copy the minimum into the new sort array. */
                finger[ind].d = buff[inds[0]];
                finger[ind].i = fingers[inds[0]]->i + off[inds[0]];
Pedro Gonnet's avatar
Pedro Gonnet committed
298

299
300
301
                /* Update the buffer. */
                fingers[inds[0]] += 1;
                buff[inds[0]] = fingers[inds[0]]->d;
Pedro Gonnet's avatar
Pedro Gonnet committed
302

303
304
305
306
                /* Find the smallest entry. */
                for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) {
                    temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i;
                    }
Pedro Gonnet's avatar
Pedro Gonnet committed
307

308
                } /* Merge. */
Pedro Gonnet's avatar
Pedro Gonnet committed
309
310
311
312
313
            
            /* Add a sentinel. */
            c->sort[ j*(c->count + 1) + c->count ].d = FLT_MAX;
            c->sort[ j*(c->count + 1) + c->count ].i = 0;
            
314
315
316
            /* Mark as sorted. */
            c->sorted |= ( 1 << j );
            
Pedro Gonnet's avatar
Pedro Gonnet committed
317
318
319
320
321
322
323
            } /* loop over sort arrays. */
    
        } /* progeny? */
        
    /* Otherwise, just sort. */
    else {
    
324
        /* Fill the sort array. */
Pedro Gonnet's avatar
Pedro Gonnet committed
325
        for ( k = 0 ; k < count ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
326
327
328
            px[0] = parts[k].x[0];
            px[1] = parts[k].x[1];
            px[2] = parts[k].x[2];
329
330
331
            for ( j = 0 ; j < 13 ; j++ )
                if ( flags & (1 << j) ) {
                    c->sort[ j*(count + 1) + k].i = k;
Pedro Gonnet's avatar
Pedro Gonnet committed
332
                    c->sort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ];
333
                    }
Pedro Gonnet's avatar
Pedro Gonnet committed
334
            }
335
336

        /* Add the sentinel and sort. */
Pedro Gonnet's avatar
Pedro Gonnet committed
337
        for ( j = 0 ; j < 13 ; j++ )
338
339
340
341
            if ( flags & (1 << j) ) {
                c->sort[ j*(count + 1) + c->count ].d = FLT_MAX;
                c->sort[ j*(count + 1) + c->count ].i = 0;
                runner_dosort_ascending( &c->sort[ j*(count + 1) ] , c->count );
342
                c->sorted |= ( 1 << j );
Pedro Gonnet's avatar
Pedro Gonnet committed
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
                }
            
        }
        
    /* Verify the sorting. */
    /* for ( j = 0 ; j < 13 ; j++ ) {
        if ( !( flags & (1 << j) ) )
            continue;
        finger = &c->sort[ j*(c->count + 1) ];
        for ( k = 1 ; k < c->count ; k++ ) {
            if ( finger[k].d < finger[k-1].d )
                error( "Sorting failed, ascending array." );
            if ( finger[k].i >= c->count )
                error( "Sorting failed, indices borked." );
            }
        } */
359
        
Pedro Gonnet's avatar
Pedro Gonnet committed
360
    #ifdef TIMER_VERBOSE
361
        message( "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms." ,
Pedro Gonnet's avatar
Pedro Gonnet committed
362
            r->id , c->count , c->depth ,
Pedro Gonnet's avatar
Pedro Gonnet committed
363
            (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , 
364
            ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout);
Pedro Gonnet's avatar
Pedro Gonnet committed
365
    #else
366
367
        if ( clock )
            TIMER_TOC(timer_dosort);
Pedro Gonnet's avatar
Pedro Gonnet committed
368
369
370
    #endif

    }
371
372
373
374
375
    
    
/**
 * @brief Intermediate task between density and force
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
376
 * @param r The runner thread.
377
 * @param c The cell.
378
379
 */
 
Pedro Gonnet's avatar
Pedro Gonnet committed
380
void runner_doghost ( struct runner *r , struct cell *c ) {
381

382
    struct part *p, *parts = c->parts;
383
    struct cell *finger;
Pedro Gonnet's avatar
Pedro Gonnet committed
384
385
    int i, k, redo, count = c->count;
    int *pid;
386
    float h, ih, ih2, ih4, h_corr, rho, wcount, rho_dh, wcount_dh, u, fc;
387
    float normDiv_v, normCurl_v;
388
389
390
#ifndef LEGACY_GADGET2_SPH
    float alpha_dot, tau, S;   
#endif
391
    float dt_step = r->e->dt_step;
392
393
    TIMER_TIC
    
Pedro Gonnet's avatar
Pedro Gonnet committed
394
395
396
397
398
399
400
    /* Recurse? */
    if ( c->split ) {
        for ( k = 0 ; k < 8 ; k++ )
            if ( c->progeny[k] != NULL )
                runner_doghost( r , c->progeny[k] );
        return;
        }
401
        
Pedro Gonnet's avatar
Pedro Gonnet committed
402
403
404
405
406
407
408
409
    /* Init the IDs that have to be updated. */
    if ( ( pid = (int *)alloca( sizeof(int) * count ) ) == NULL )
        error( "Call to alloca failed." );
    for ( k = 0 ; k < count ; k++ )
        pid[k] = k;
        
    /* While there are particles that need to be updated... */
    while ( count > 0 ) {
410
    
Pedro Gonnet's avatar
Pedro Gonnet committed
411
412
        /* Reset the redo-count. */
        redo = 0;
413
    
Pedro Gonnet's avatar
Pedro Gonnet committed
414
        /* Loop over the parts in this cell. */
415
416
417
418
419
420
        __builtin_prefetch( &parts[ pid[0] ] , 0 , 1 );
        __builtin_prefetch( &parts[ pid[0] ].rho_dh , 0 , 1 );
        __builtin_prefetch( &parts[ pid[1] ] , 0 , 1 );
        __builtin_prefetch( &parts[ pid[1] ].rho_dh , 0 , 1 );
        __builtin_prefetch( &parts[ pid[2] ] , 0 , 1 );
        __builtin_prefetch( &parts[ pid[2] ].rho_dh , 0 , 1 );
Pedro Gonnet's avatar
Pedro Gonnet committed
421
422
423
        for ( i = 0 ; i < count ; i++ ) {

            /* Get a direct pointer on the part. */
424
425
426
            __builtin_prefetch( &parts[ pid[i+3] ] , 0 , 1 );
            __builtin_prefetch( &parts[ pid[i+3] ].rho_dh , 0 , 1 );
            p = &parts[ pid[i] ];
427
            
428
            /* Is this part within the timestep? */
Pedro Gonnet's avatar
Pedro Gonnet committed
429
            if ( p->dt <= dt_step ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
430
            
431
432
	            /* Some smoothing length multiples. */
	            h = p->h;
433
434
435
                ih = 1.0f / h;
                ih2 = ih * ih;
                ih4 = ih2 * ih2;
436

437
		        /* Final operation on the density. */
438
                p->rho = rho = ih * ih2 * ( p->rho + p->mass*kernel_root );
Pedro Gonnet's avatar
Pedro Gonnet committed
439
                p->rho_dh = rho_dh = ( p->rho_dh - 3.0f*p->mass*kernel_root ) * ih4;
440
441
                wcount = ( p->density.wcount + kernel_root ) * ( 4.0f / 3.0 * M_PI * kernel_gamma3 );
                wcount_dh = p->density.wcount_dh * ih * ( 4.0f / 3.0 * M_PI * kernel_gamma3 );
Pedro Gonnet's avatar
Pedro Gonnet committed
442
                    
Pedro Gonnet's avatar
Pedro Gonnet committed
443
                /* If no derivative, double the smoothing length. */
444
                if ( wcount_dh == 0.0f )
445
                    h_corr = p->h;
Pedro Gonnet's avatar
Pedro Gonnet committed
446
447
                    
                /* Otherwise, compute the smoothing length update (Newton step). */
448
                else {
449
                    h_corr = ( kernel_nwneigh - wcount ) / wcount_dh;
450
451
452

                    /* Truncate to the range [ -p->h/2 , p->h ]. */
                    h_corr = fminf( h_corr , h );
453
                    h_corr = fmaxf( h_corr , -h/2.f );
Pedro Gonnet's avatar
Pedro Gonnet committed
454
                    
455
                    }
456
                
Pedro Gonnet's avatar
Pedro Gonnet committed
457
                /* Apply the correction to p->h and to the compact part. */
458
                p->h += h_corr;
459
460

                /* Did we get the right number density? */
461
462
                if ( wcount > kernel_nwneigh + const_delta_nwneigh ||
                     wcount < kernel_nwneigh - const_delta_nwneigh ) {
463
                    // message( "particle %lli (h=%e,depth=%i) has bad wcount=%.3f." , p->id , p->h , c->depth , wcount ); fflush(stdout);
464
                    // p->h += ( p->density.wcount + kernel_root - kernel_nwneigh ) / p->density.wcount_dh;
465
466
                    pid[redo] = pid[i];
                    redo += 1;
467
                    p->density.wcount = 0.0;
468
                    p->density.wcount_dh = 0.0;
469
470
                    p->rho = 0.0;
                    p->rho_dh = 0.0;
471
472
473
		            p->density.div_v = 0.0;
		            for ( k=0 ; k < 3 ; k++)
		                p->density.curl_v[k] = 0.0;
474
475
                    continue;
                    }
476

477
                /* Pre-compute some stuff for the balsara switch. */
478
479
		        normDiv_v = fabs( p->density.div_v / rho * ih4 );
		        normCurl_v = sqrtf( p->density.curl_v[0] * p->density.curl_v[0] + p->density.curl_v[1] * p->density.curl_v[1] + p->density.curl_v[2] * p->density.curl_v[2] ) / rho * ih4;
480
                
Pedro Gonnet's avatar
Pedro Gonnet committed
481
482
483
                /* As of here, particle force variables will be set. Do _NOT_
                   try to read any particle density variables! */
                
484
                /* Compute this particle's sound speed. */
485
                u = p->u;
486
                p->force.c = fc = sqrtf( const_hydro_gamma * ( const_hydro_gamma - 1.0f ) * u );
487
488

                /* Compute the P/Omega/rho2. */
489
                p->force.POrho2 = u * ( const_hydro_gamma - 1.0f ) / ( rho + h * rho_dh / 3.0f );
490

491
492
		        /* Balsara switch */
		        p->force.balsara = normDiv_v / ( normDiv_v + normCurl_v + 0.0001f * fc * ih );
493

494
495
496
                #ifndef LEGACY_GADGET2_SPH
		            /* Viscosity parameter decay time */
		            tau = h / ( 2.f * const_viscosity_length * p->force.c );
497

498
499
		            /* Viscosity source term */
		            S = fmaxf( -normDiv_v, 0.f );
500

501
502
		            /* Compute the particle's viscosity parameter time derivative */
		            alpha_dot = ( const_viscosity_alpha_min - p->alpha ) / tau + ( const_viscosity_alpha_max - p->alpha ) * S;
503

504
505
506
		            /* Update particle's viscosity paramter */
		            p->alpha += alpha_dot * p->dt; 
                #endif                
507

508
509
510
                /* Reset the acceleration. */
                for ( k = 0 ; k < 3 ; k++ )
                    p->a[k] = 0.0f;
Pedro Gonnet's avatar
Pedro Gonnet committed
511

512
                /* Reset the time derivatives. */
513
514
515
                p->force.u_dt = 0.0f;
                p->force.h_dt = 0.0f;
                p->force.v_sig = 0.0f;
Pedro Gonnet's avatar
Pedro Gonnet committed
516

517
                }
Pedro Gonnet's avatar
Pedro Gonnet committed
518
519

            }
520
            
Pedro Gonnet's avatar
Pedro Gonnet committed
521
522
        /* Re-set the counter for the next loop (potentially). */
        count = redo;
523
524
525
526
527
528
529
530
        if ( count > 0 ) {
        
            // error( "Bad smoothing length, fixing this isn't implemented yet." );
            
            /* Climb up the cell hierarchy. */
            for ( finger = c ; finger != NULL ; finger = finger->parent ) {
            
                /* Run through this cell's density interactions. */
531
                for ( struct link *l = finger->density ; l != NULL ; l = l->next ) {
532
533
                
                    /* Self-interaction? */
534
                    if ( l->t->type == task_type_self )
535
                        runner_doself_subset_density( r , finger , parts , pid , count );
536
537
                        
                    /* Otherwise, pair interaction? */
538
                    else if ( l->t->type == task_type_pair ) {
539
540
                    
                        /* Left or right? */
541
542
                        if ( l->t->ci == finger )
                            runner_dopair_subset_density( r , finger , parts , pid , count , l->t->cj );
543
                        else
544
                            runner_dopair_subset_density( r , finger , parts , pid , count , l->t->ci );
545
546
547
548
                        
                        }
                
                    /* Otherwise, sub interaction? */
549
                    else if ( l->t->type == task_type_sub ) {
550
551
                    
                        /* Left or right? */
552
553
                        if ( l->t->ci == finger )
                            runner_dosub_subset_density( r , finger , parts , pid , count , l->t->cj , -1 , 1 );
554
                        else
555
                            runner_dosub_subset_density( r , finger , parts , pid , count , l->t->ci , -1 , 1 );
556
557
                        
                        }
558
559
                
                    }
Pedro Gonnet's avatar
Pedro Gonnet committed
560
                    
561
562
563
                }
        
            }
564
565
566
567
            
        }

    #ifdef TIMER_VERBOSE
568
        message( "runner %02i: %i parts at depth %i took %.3f ms." ,
Pedro Gonnet's avatar
Pedro Gonnet committed
569
            r->id , c->count , c->depth ,
570
            ((double)TIMER_TOC(timer_doghost)) / CPU_TPS * 1000 ); fflush(stdout);
571
    #else
572
        TIMER_TOC(timer_doghost);
573
574
575
    #endif
    
    }
576
577
578
579
580
581
582
583
584
585
586
    
    
/**
 * @brief Compute the second kick of the given cell.
 *
 * @param r The runner thread.
 * @param c The cell.
 */
 
void runner_dokick2 ( struct runner *r , struct cell *c ) {

587
588
    int j, k, count = 0, nr_parts = c->count;
    float dt_curr, hdt_curr, dt_min = FLT_MAX, dt_max = 0.0f;
Pedro Gonnet's avatar
Pedro Gonnet committed
589
    double ekin = 0.0, epot = 0.0;
590
    float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f };
591
592
    float x[3], v_hdt[3], u_hdt, h, pdt, m;
    float dt_step = r->e->dt_step, dt = r->e->dt, idt;
593
594
    float dt_cfl, dt_h_change, dt_u_change, dt_new;
    float h_dt, u_dt;
595
    struct part *restrict p, *restrict parts = c->parts;
596
    struct xpart *restrict xp, *restrict xparts = c->xparts;
597
598
599
    
    TIMER_TIC
    
600
601
602
    /* Init idt to avoid compiler stupidity. */
    idt = ( dt > 0 ) ? 1.0f / dt : 0.0f;
    
603
    /* Loop over the particles and kick them. */
Pedro Gonnet's avatar
Pedro Gonnet committed
604
605
606
607
608
609
610
611
612
    __builtin_prefetch( &parts[0] , 0 , 1 );
    __builtin_prefetch( &parts[0].rho_dh , 0 , 1 );
    __builtin_prefetch( &xparts[0] , 0 , 1 );
    __builtin_prefetch( &parts[1] , 0 , 1 );
    __builtin_prefetch( &parts[1].rho_dh , 0 , 1 );
    __builtin_prefetch( &xparts[1] , 0 , 1 );
    __builtin_prefetch( &parts[2] , 0 , 1 );
    __builtin_prefetch( &parts[2].rho_dh , 0 , 1 );
    __builtin_prefetch( &xparts[2] , 0 , 1 );
613
614
615
    for ( k = 0 ; k < nr_parts ; k++ ) {

        /* Get a handle on the part. */
Pedro Gonnet's avatar
Pedro Gonnet committed
616
617
618
        __builtin_prefetch( &parts[k+3] , 0 , 1 );
        __builtin_prefetch( &parts[k+3].rho_dh , 0 , 1 );
        __builtin_prefetch( &xparts[k+3] , 0 , 1 );
619
        p = &parts[k];
620
        xp = &xparts[k];
621
622
623
624
625

        /* Get local copies of particle data. */
        pdt = p->dt;
        m = p->mass;
        x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
626
627
        v_hdt[0] = xp->v_hdt[0]; v_hdt[1] = xp->v_hdt[1]; v_hdt[2] = xp->v_hdt[2];
        u_hdt = xp->u_hdt;
628

629
        /* Update the particle's data (if active). */
630
        if ( pdt <= dt_step ) {
631
632
            
            /* Increase the number of particles updated. */
633
            count += 1;
634
635
636
637
            
            /* Scale the derivatives as they're freshly computed. */
            h = p->h;
            h_dt = p->force.h_dt *= h * 0.333333333f;
Pedro Gonnet's avatar
Pedro Gonnet committed
638
            xp->omega = 1.0f + h * p->rho_dh / p->rho * 0.3333333333f;
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
            
            /* Compute the new time step. */
            u_dt = p->force.u_dt;
            dt_cfl = const_cfl * h / p->force.v_sig;
            dt_h_change = ( h_dt != 0.0f ) ? fabsf( const_ln_max_h_change * h / h_dt ) : FLT_MAX;
            dt_u_change = ( u_dt != 0.0f ) ? fabsf( const_max_u_change * p->u / u_dt ) : FLT_MAX;
            dt_new = fminf( dt_cfl , fminf( dt_h_change , dt_u_change ) );
            if ( pdt == 0.0f )
                p->dt = pdt = dt_new;
            else
                p->dt = pdt = fminf( dt_new , 2.0f*pdt );
                
            /* Get the particle-specific time step. */
            dt_curr = xp->dt_curr;
            hdt_curr = 0.5f * dt_curr;
            
            /* Update positions and energies at the full step. */
            p->v[0] = v_hdt[0] + hdt_curr * p->a[0];
            p->v[1] = v_hdt[1] + hdt_curr * p->a[1];
            p->v[2] = v_hdt[2] + hdt_curr * p->a[2];
            p->u = u_hdt + hdt_curr * u_dt;
            xp->v_hdt[0] = ( v_hdt[0] += dt_curr * p->a[0] );
            xp->v_hdt[1] = ( v_hdt[1] += dt_curr * p->a[1] );
            xp->v_hdt[2] = ( v_hdt[2] += dt_curr * p->a[2] );
            xp->u_hdt = ( u_hdt += dt_curr * u_dt );
            
            /* Set the new particle-specific time step. */
            if ( dt > 0.0f ) {
                dt_curr = dt;
                j = (int)( pdt * idt );
                while ( j > 1 ) {
                    dt_curr *= 2.0f;
                    j >>= 1;
                    }
                xp->dt_curr = dt_curr;
                }
            
676
677
678
679
680
681
682
            }

        /* Get the smallest/largest dt. */
        dt_min = fminf( dt_min , pdt );
        dt_max = fmaxf( dt_max , pdt );

        /* Collect total energy. */
683
684
        ekin += 0.5 * m * ( v_hdt[0]*v_hdt[0] + v_hdt[1]*v_hdt[1] + v_hdt[2]*v_hdt[2] );
        epot += m * u_hdt;
685
686

        /* Collect momentum */
687
688
689
        mom[0] += m * v_hdt[0];
        mom[1] += m * v_hdt[1];
        mom[2] += m * v_hdt[2];
690
691

	    /* Collect angular momentum */
692
693
694
	    ang[0] += m * ( x[1]*v_hdt[2] - x[2]*v_hdt[1] );
	    ang[1] += m * ( x[2]*v_hdt[0] - x[0]*v_hdt[2] );
	    ang[2] += m * ( x[0]*v_hdt[1] - x[1]*v_hdt[0] );
695
696
697
698
699
700
701

	    /* Collect entropic function */
	    // lent += u * pow( p->rho, 1.f-const_gamma );

        }

    #ifdef TIMER_VERBOSE
702
        message( "runner %02i: %i parts at depth %i took %.3f ms." ,
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
            r->id , c->count , c->depth ,
            ((double)TIMER_TOC(timer_kick2)) / CPU_TPS * 1000 ); fflush(stdout);
    #else
        TIMER_TOC(timer_kick2);
    #endif
        
    /* Store the computed values in the cell. */
    c->dt_min = dt_min;
    c->dt_max = dt_max;
    c->updated = count;
    c->ekin = ekin;
    c->epot = epot;
    c->mom[0] = mom[0]; c->mom[1] = mom[1]; c->mom[2] = mom[2];
    c->ang[0] = ang[0]; c->ang[1] = ang[1]; c->ang[2] = ang[2];
        
    }
Pedro Gonnet's avatar
Pedro Gonnet committed
719
720


721
722
723
724
725
726
727
728
729
/**
 * @brief Mapping function to set dt_min and dt_max, do the first
 * kick.
 */

void runner_dokick1 ( struct runner *r , struct cell *c ) {

    int j, k;
    struct engine *e = r->e;
730
    float pdt, dt_step = e->dt_step, dt = e->dt;
731
    float dt_min, dt_max, h_max, dx, dx_max;
732
    float a[3], v[3], u, u_dt, h, h_dt, w, rho;
733
    double x[3], x_old[3];
734
    struct part *restrict p, *restrict parts = c->parts;
735
    struct xpart *restrict xp, *restrict xparts = c->xparts;
736
737
738
739
740
741
742
743
744
745
746

    /* No children? */
    if ( !c->split ) {
    
        /* Init the min/max counters. */
        dt_min = FLT_MAX;
        dt_max = 0.0f;
        h_max = 0.0f;
        dx_max = 0.0f;
    
        /* Loop over parts. */
Pedro Gonnet's avatar
Pedro Gonnet committed
747
748
749
750
751
752
753
754
755
        __builtin_prefetch( &parts[0] , 0 , 1 );
        __builtin_prefetch( &parts[0].rho_dh , 0 , 1 );
        __builtin_prefetch( &xparts[0] , 0 , 1 );
        __builtin_prefetch( &parts[1] , 0 , 1 );
        __builtin_prefetch( &parts[1].rho_dh , 0 , 1 );
        __builtin_prefetch( &xparts[1] , 0 , 1 );
        __builtin_prefetch( &parts[2] , 0 , 1 );
        __builtin_prefetch( &parts[2].rho_dh , 0 , 1 );
        __builtin_prefetch( &xparts[2] , 0 , 1 );
756
757
758
        for ( k = 0 ; k < c->count ; k++ ) {
            
            /* Get a handle on the kth particle. */
Pedro Gonnet's avatar
Pedro Gonnet committed
759
760
761
            __builtin_prefetch( &parts[k+3] , 0 , 1 );
            __builtin_prefetch( &parts[k+3].rho_dh , 0 , 1 );
            __builtin_prefetch( &xparts[k+3] , 0 , 1 );
762
763
            p = &parts[k];
            xp = &xparts[k];
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
            
            /* Load the data locally. */
            a[0] = p->a[0]; a[1] = p->a[1]; a[2] = p->a[2];
            v[0] = p->v[0]; v[1] = p->v[1]; v[2] = p->v[2];
            x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
            x_old[0] = xp->x_old[0]; x_old[1] = xp->x_old[1]; x_old[2] = xp->x_old[2];
            h = p->h;
            u = p->u;
            h_dt = p->force.h_dt;
            u_dt = p->force.u_dt;
            pdt = p->dt;
            
            /* Store the min/max dt. */
            dt_min = fminf( dt_min , pdt );
            dt_max = fmaxf( dt_max , pdt );
            
780
781
782
783
            /* Move the particles with the velocities at the half-step. */
            p->x[0] = x[0] += dt * xp->v_hdt[0];
            p->x[1] = x[1] += dt * xp->v_hdt[1];
            p->x[2] = x[2] += dt * xp->v_hdt[2];
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
            dx = sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) +
                        (x[1] - x_old[1])*(x[1] - x_old[1]) +
                        (x[2] - x_old[2])*(x[2] - x_old[2]) );
            dx_max = fmaxf( dx_max , dx );

            /* Update positions and energies at the half-step. */
            p->v[0] = v[0] += dt * a[0];
            p->v[1] = v[1] += dt * a[1];
            p->v[2] = v[2] += dt * a[2];
            w = u_dt / u * dt;
            if ( fabsf( w ) < 0.01f )
                p->u = u *= 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) );
            else
                p->u = u *= expf( w );
            w = h_dt / h * dt;
            if ( fabsf( w ) < 0.01f )
                p->h = h *= 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) );
            else
                p->h = h *= expf( w );
            h_max = fmaxf( h_max , h );

        
            /* Integrate other values if this particle will not be updated. */
            /* Init fields for density calculation. */
            if ( pdt > dt_step ) {
                float w = -3.0f * h_dt / h * dt;
                if ( fabsf( w ) < 0.1f )
                    rho = p->rho *= 1.0f + w*( 1.0f + w*( 0.5f + w*(1.0f/6.0f + 1.0f/24.0f*w ) ) );
                else
                    rho = p->rho *= expf( w );
                p->force.POrho2 = u * ( const_hydro_gamma - 1.0f ) / ( rho * xp->omega );
                }
            else {
                p->density.wcount = 0.0f;
                p->density.wcount_dh = 0.0f;
                p->rho = 0.0f;
                p->rho_dh = 0.0f;
	            p->density.div_v = 0.0f;
	            for ( j = 0 ; j < 3 ; ++j)
	                p->density.curl_v[j] = 0.0f;
                }
                
            }
            
        }
        
    /* Otherwise, agregate data from children. */
    else {
    
        /* Init with the first non-null child. */
        dt_min = FLT_MAX;
        dt_max = 0.0f;
        h_max = 0.0f;
        dx_max = 0.0f;
        
        /* Loop over the progeny. */
        for ( k = 0 ; k < 8 ; k++ )
            if ( c->progeny[k] != NULL ) {
842
843
                if ( c->count < space_subsize )
                    runner_dokick1( r , c->progeny[k] );
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
                dt_min = fminf( dt_min , c->progeny[k]->dt_min );
                dt_max = fmaxf( dt_max , c->progeny[k]->dt_max );
                h_max = fmaxf( h_max , c->progeny[k]->h_max );
                dx_max = fmaxf( dx_max , c->progeny[k]->dx_max );
                }
    
        }

    /* Store the values. */
    c->dt_min = dt_min;
    c->dt_max = dt_max;
    c->h_max = h_max;
    c->dx_max = dx_max;
    
    }


861
862
863
864
865
866
867
868
869
870
871
872
873
874
/**
 * @brief Combined second and first kick for fixed dt.
 *
 * @param r The runner thread.
 * @param c The cell.
 */
 
void runner_dokick ( struct runner *r , struct cell *c , int timer ) {

    int k, count = 0, nr_parts = c->count, updated;
    float dt_min = FLT_MAX, dt_max = 0.0f;
    float h_max, dx, dx_max;
    double ekin = 0.0, epot = 0.0;
    float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f };
875
    float x[3], x_old[3], v_hdt[3], a[3], u, u_hdt, h, pdt, m, w;
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
    float dt = r->e->dt, hdt = 0.5f*dt;
    float dt_cfl, dt_h_change, dt_u_change, dt_new;
    float h_dt, u_dt;
    struct part *restrict p, *restrict parts = c->parts;
    struct xpart *restrict xp, *restrict xparts = c->xparts;
    
    TIMER_TIC
    
    /* No children? */
    if ( !c->split ) {
    
        /* Init the min/max counters. */
        dt_min = FLT_MAX;
        dt_max = 0.0f;
        h_max = 0.0f;
        dx_max = 0.0f;
    
        /* Loop over the particles and kick them. */
        __builtin_prefetch( &parts[0] , 0 , 1 );
        __builtin_prefetch( &parts[0].rho_dh , 0 , 1 );
        __builtin_prefetch( &xparts[0] , 0 , 1 );
        __builtin_prefetch( &parts[1] , 0 , 1 );
        __builtin_prefetch( &parts[1].rho_dh , 0 , 1 );
        __builtin_prefetch( &xparts[1] , 0 , 1 );
        __builtin_prefetch( &parts[2] , 0 , 1 );
        __builtin_prefetch( &parts[2].rho_dh , 0 , 1 );
        __builtin_prefetch( &xparts[2] , 0 , 1 );
        for ( k = 0 ; k < nr_parts ; k++ ) {

            /* Get a handle on the part. */
            __builtin_prefetch( &parts[k+3] , 0 , 1 );
            __builtin_prefetch( &parts[k+3].rho_dh , 0 , 1 );
            __builtin_prefetch( &xparts[k+3] , 0 , 1 );
            p = &parts[k];
            xp = &xparts[k];

            /* Get local copies of particle data. */
            pdt = p->dt;
            u_dt = p->force.u_dt;
            h = p->h;
            m = p->mass;
            x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
            a[0] = p->a[0]; a[1] = p->a[1]; a[2] = p->a[2];
            x_old[0] = xp->x_old[0]; x_old[1] = xp->x_old[1]; x_old[2] = xp->x_old[2];
920
921
            v_hdt[0] = xp->v_hdt[0]; v_hdt[1] = xp->v_hdt[1]; v_hdt[2] = xp->v_hdt[2];
            u_hdt = xp->u_hdt;
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942

            /* Scale the derivatives if they're freshly computed. */
            h_dt = p->force.h_dt *= h * 0.333333333f;
            count += 1;
            xp->omega = 1.0f + h * p->rho_dh / p->rho * 0.3333333333f;

            /* Update the particle's time step. */
            dt_cfl = const_cfl * h / p->force.v_sig;
            dt_h_change = ( h_dt != 0.0f ) ? fabsf( const_ln_max_h_change * h / h_dt ) : FLT_MAX;
            dt_u_change = ( u_dt != 0.0f ) ? fabsf( const_max_u_change * p->u / u_dt ) : FLT_MAX;
            dt_new = fminf( dt_cfl , fminf( dt_h_change , dt_u_change ) );
            if ( pdt == 0.0f )
                p->dt = pdt = dt_new;
            else
                p->dt = pdt = fminf( dt_new , 2.0f*pdt );

            /* Get the smallest/largest dt. */
            dt_min = fminf( dt_min , pdt );
            dt_max = fmaxf( dt_max , pdt );

            /* Step and store the velocity and internal energy. */
943
944
945
946
            xp->v_hdt[0] = ( v_hdt[0] += dt * a[0] );
            xp->v_hdt[1] = ( v_hdt[1] += dt * a[1] );
            xp->v_hdt[2] = ( v_hdt[2] += dt * a[2] );
            xp->u_hdt = ( u_hdt += dt * u_dt );
947
948

            /* Move the particles with the velocitie at the half-step. */
949
950
951
            p->x[0] = x[0] += dt * v_hdt[0];
            p->x[1] = x[1] += dt * v_hdt[1];
            p->x[2] = x[2] += dt * v_hdt[2];
952
953
954
955
956
            dx = sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) +
                        (x[1] - x_old[1])*(x[1] - x_old[1]) +
                        (x[2] - x_old[2])*(x[2] - x_old[2]) );
            dx_max = fmaxf( dx_max , dx );

957
958
959
960
961
            /* Update positions and energies at the next full step. */
            p->v[0] = v_hdt[0] + hdt * a[0];
            p->v[1] = v_hdt[1] + hdt * a[1];
            p->v[2] = v_hdt[2] + hdt * a[2];
            w = u_dt / u_hdt * hdt;
962
            if ( fabsf( w ) < 0.01f )
963
                p->u = u = u_hdt * ( 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) ) );
964
            else
965
                p->u = u = u_hdt * expf( w );
966
967
            w = h_dt / h * dt;
            if ( fabsf( w ) < 0.01f )
968
                p->h = h *= ( 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) ) );
969
            else
970
                p->h = h *= expf( w );
971
972
            h_max = fmaxf( h_max , h );

973
974
975
976
977
978
979
980
981
982
983
984
985
986
            /* Collect momentum */
            mom[0] += m * v_hdt[0];
            mom[1] += m * v_hdt[1];
            mom[2] += m * v_hdt[2];

	        /* Collect angular momentum */
	        ang[0] += m * ( x[1]*v_hdt[2] - x[2]*v_hdt[1] );
	        ang[1] += m * ( x[2]*v_hdt[0] - x[0]*v_hdt[2] );
	        ang[2] += m * ( x[0]*v_hdt[1] - x[1]*v_hdt[0] );

            /* Collect total energy. */
            ekin += 0.5 * m * ( v_hdt[0]*v_hdt[0] + v_hdt[1]*v_hdt[1] + v_hdt[2]*v_hdt[2] );
            epot += m * u_hdt;

987
988
989
990
991
992
993
994
995
996
997
998
999
1000
            /* Init fields for density calculation. */
            p->density.wcount = 0.0f;
            p->density.wcount_dh = 0.0f;
            p->rho = 0.0f;
            p->rho_dh = 0.0f;
	        p->density.div_v = 0.0f;
            p->density.curl_v[0] = 0.0f;
            p->density.curl_v[1] = 0.0f;
            p->density.curl_v[2] = 0.0f;
                
            }
            
        }
        
For faster browsing, not all history is shown. View entire blame