runner.c 47.2 KB
Newer Older
1
/*******************************************************************************
2
 * This file is part of SWIFT.
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
 * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 ******************************************************************************/
Pedro Gonnet's avatar
Pedro Gonnet committed
19

Pedro Gonnet's avatar
Pedro Gonnet committed
20
21
/* Config parameters. */
#include "../config.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
22
23
24
25
26
27
28
29
30
31
32

/* Some standard headers. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <omp.h>

33
34
35
36
37
/* MPI headers. */
#ifdef WITH_MPI
    #include <mpi.h>
#endif

Pedro Gonnet's avatar
Pedro Gonnet committed
38
/* Local headers. */
39
#include "const.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
40
#include "cycle.h"
41
#include "atomic.h"
42
#include "timers.h"
43
#include "const.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
44
#include "lock.h"
45
46
#include "task.h"
#include "part.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
47
#include "space.h"
48
#include "multipole.h"
49
#include "cell.h"
50
#include "queue.h"
51
#include "scheduler.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
52
#include "engine.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
53
#include "runner.h"
54
#include "error.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
55

56
57
58
59
60
61
/* Include the right variant of the SPH interactions */
#ifdef LEGACY_GADGET2_SPH
#include "runner_iact_legacy.h"
#else
#include "runner_iact.h"
#endif
62
#include "runner_iact_grav.h"
63

Pedro Gonnet's avatar
Pedro Gonnet committed
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/* Convert cell location to ID. */
#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) )

/* The counters. */
int runner_counter[ runner_counter_count ];

        

const float runner_shift[13*3] = {
     5.773502691896258e-01 ,  5.773502691896258e-01 ,  5.773502691896258e-01 ,
     7.071067811865475e-01 ,  7.071067811865475e-01 ,  0.0                   ,
     5.773502691896258e-01 ,  5.773502691896258e-01 , -5.773502691896258e-01 ,
     7.071067811865475e-01 ,  0.0                   ,  7.071067811865475e-01 ,
     1.0                   ,  0.0                   ,  0.0                   ,
     7.071067811865475e-01 ,  0.0                   , -7.071067811865475e-01 ,
     5.773502691896258e-01 , -5.773502691896258e-01 ,  5.773502691896258e-01 ,
     7.071067811865475e-01 , -7.071067811865475e-01 ,  0.0                   ,
     5.773502691896258e-01 , -5.773502691896258e-01 , -5.773502691896258e-01 ,
     0.0                   ,  7.071067811865475e-01 ,  7.071067811865475e-01 ,
     0.0                   ,  1.0                   ,  0.0                   ,
     0.0                   ,  7.071067811865475e-01 , -7.071067811865475e-01 ,
     0.0                   ,  0.0                   ,  1.0                   ,
    };
const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 ,
                               0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; 


91
/* Import the density loop functions. */
92
93
94
#define FUNCTION density
#include "runner_doiact.h"

95
/* Import the force loop functions. */
96
97
98
99
#undef FUNCTION
#define FUNCTION force
#include "runner_doiact.h"

100
101
102
/* Import the gravity loop functions. */
#include "runner_doiact_grav.h"

103

Pedro Gonnet's avatar
Pedro Gonnet committed
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
/**
 * @brief Sort the entries in ascending order using QuickSort.
 *
 * @param sort The entries
 * @param N The number of entries.
 */
 
void runner_dosort_ascending ( struct entry *sort , int N ) {

    struct {
        short int lo, hi;
        } qstack[10];
    int qpos, i, j, lo, hi, imin;
    struct entry temp;
    float pivot;
        
    /* Sort parts in cell_i in decreasing order with quicksort */
    qstack[0].lo = 0; qstack[0].hi = N - 1; qpos = 0;
    while ( qpos >= 0 ) {
        lo = qstack[qpos].lo; hi = qstack[qpos].hi;
        qpos -= 1;
        if ( hi - lo < 15 ) {
            for ( i = lo ; i < hi ; i++ ) {
                imin = i;
                for ( j = i+1 ; j <= hi ; j++ )
                    if ( sort[j].d < sort[imin].d )
                        imin = j;
                if ( imin != i ) {
                    temp = sort[imin]; sort[imin] = sort[i]; sort[i] = temp;
                    }
                }
            }
        else {
            pivot = sort[ ( lo + hi ) / 2 ].d;
            i = lo; j = hi;
            while ( i <= j ) {
                while ( sort[i].d < pivot ) i++;
                while ( sort[j].d > pivot ) j--;
                if ( i <= j ) {
                    if ( i < j ) {
                        temp = sort[i]; sort[i] = sort[j]; sort[j] = temp;
                        }
                    i += 1; j -= 1;
                    }
                }
            if ( j > ( lo + hi ) / 2 ) {
                if ( lo < j ) {
                    qpos += 1;
                    qstack[qpos].lo = lo;
                    qstack[qpos].hi = j;
                    }
                if ( i < hi ) {
                    qpos += 1;
                    qstack[qpos].lo = i;
                    qstack[qpos].hi = hi;
                    }
                }
            else {
                if ( i < hi ) {
                    qpos += 1;
                    qstack[qpos].lo = i;
                    qstack[qpos].hi = hi;
                    }
                if ( lo < j ) {
                    qpos += 1;
                    qstack[qpos].lo = lo;
                    qstack[qpos].hi = j;
                    }
                }
            }
        }
                
    }
    
    
/**
 * @brief Sort the particles in the given cell along all cardinal directions.
 *
 * @param r The #runner.
 * @param c The #cell.
184
 * @param flags Cell flag.
185
186
 * @param clock Flag indicating whether to record the timing or not, needed
 *      for recursive calls.
Pedro Gonnet's avatar
Pedro Gonnet committed
187
188
 */
 
189
void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
190
191
192

    struct entry *finger;
    struct entry *fingers[8];
Pedro Gonnet's avatar
Pedro Gonnet committed
193
    struct part *parts = c->parts;
194
    struct entry *sort;
Pedro Gonnet's avatar
Pedro Gonnet committed
195
    int j, k, count = c->count;
Pedro Gonnet's avatar
Pedro Gonnet committed
196
    int i, ind, off[8], inds[8], temp_i, missing;
197
    // float shift[3];
Pedro Gonnet's avatar
Pedro Gonnet committed
198
    float buff[8], px[3];
199
    
Pedro Gonnet's avatar
Pedro Gonnet committed
200
201
    TIMER_TIC
    
202
203
204
205
206
    /* Clean-up the flags, i.e. filter out what's already been sorted. */
    flags &= ~c->sorted;
    if ( flags == 0 )
        return;
    
Pedro Gonnet's avatar
Pedro Gonnet committed
207
    /* start by allocating the entry arrays. */
208
    if ( c->sort == NULL || c->sortsize < count ) {
209
210
        if ( c->sort != NULL )
            free( c->sort );
211
        c->sortsize = count * 1.1;
212
        if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->sortsize + 1) * 13 ) ) == NULL )
Pedro Gonnet's avatar
Pedro Gonnet committed
213
            error( "Failed to allocate sort memory." );
214
        }
215
    sort = c->sort;
Pedro Gonnet's avatar
Pedro Gonnet committed
216
217
218
219
        
    /* Does this cell have any progeny? */
    if ( c->split ) {
    
Pedro Gonnet's avatar
Pedro Gonnet committed
220
221
222
223
        /* Fill in the gaps within the progeny. */
        for ( k = 0 ; k < 8 ; k++ ) {
            if ( c->progeny[k] == NULL )
                continue;
224
            missing = flags & ~c->progeny[k]->sorted;
Pedro Gonnet's avatar
Pedro Gonnet committed
225
            if ( missing )
226
                runner_dosort( r , c->progeny[k] , missing , 0 );
Pedro Gonnet's avatar
Pedro Gonnet committed
227
228
229
230
            }
    
        /* Loop over the 13 different sort arrays. */
        for ( j = 0 ; j < 13 ; j++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
231
232
233
234
235
        
            /* Has this sort array been flagged? */
            if ( !( flags & (1 << j) ) )
                continue;
                
236
237
238
239
240
241
242
243
244
245
246
247
248
249
            /* Init the particle index offsets. */
            for ( off[0] = 0 , k = 1 ; k < 8 ; k++ )
                if ( c->progeny[k-1] != NULL )
                    off[k] = off[k-1] + c->progeny[k-1]->count;
                else
                    off[k] = off[k-1];

            /* Init the entries and indices. */
            for ( k = 0 ; k < 8 ; k++ ) {
                inds[k] = k;
                if ( c->progeny[k] != NULL && c->progeny[k]->count > 0 ) {
                    fingers[k] = &c->progeny[k]->sort[ j*(c->progeny[k]->count + 1) ];
                    buff[k] = fingers[k]->d;
                    off[k] = off[k];
Pedro Gonnet's avatar
Pedro Gonnet committed
250
                    }
251
252
                else
                    buff[k] = FLT_MAX;
Pedro Gonnet's avatar
Pedro Gonnet committed
253
254
                }

255
256
257
258
259
260
            /* Sort the buffer. */
            for ( i = 0 ; i < 7 ; i++ )
                for ( k = i+1 ; k < 8 ; k++ )
                    if ( buff[ inds[k] ] < buff[ inds[i] ] ) {
                        temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i;
                        }
Pedro Gonnet's avatar
Pedro Gonnet committed
261

262
            /* For each entry in the new sort list. */
263
            finger = &sort[ j*(count + 1) ];
264
            for ( ind = 0 ; ind < count ; ind++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
265

266
267
268
                /* Copy the minimum into the new sort array. */
                finger[ind].d = buff[inds[0]];
                finger[ind].i = fingers[inds[0]]->i + off[inds[0]];
Pedro Gonnet's avatar
Pedro Gonnet committed
269

270
271
272
                /* Update the buffer. */
                fingers[inds[0]] += 1;
                buff[inds[0]] = fingers[inds[0]]->d;
Pedro Gonnet's avatar
Pedro Gonnet committed
273

274
275
276
277
                /* Find the smallest entry. */
                for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) {
                    temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i;
                    }
Pedro Gonnet's avatar
Pedro Gonnet committed
278

279
                } /* Merge. */
Pedro Gonnet's avatar
Pedro Gonnet committed
280
281
            
            /* Add a sentinel. */
282
283
            sort[ j*(count + 1) + count ].d = FLT_MAX;
            sort[ j*(count + 1) + count ].i = 0;
Pedro Gonnet's avatar
Pedro Gonnet committed
284
            
285
286
287
            /* Mark as sorted. */
            c->sorted |= ( 1 << j );
            
Pedro Gonnet's avatar
Pedro Gonnet committed
288
289
290
291
292
293
294
            } /* loop over sort arrays. */
    
        } /* progeny? */
        
    /* Otherwise, just sort. */
    else {
    
295
        /* Fill the sort array. */
Pedro Gonnet's avatar
Pedro Gonnet committed
296
        for ( k = 0 ; k < count ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
297
298
299
            px[0] = parts[k].x[0];
            px[1] = parts[k].x[1];
            px[2] = parts[k].x[2];
300
301
            for ( j = 0 ; j < 13 ; j++ )
                if ( flags & (1 << j) ) {
302
303
                    sort[ j*(count + 1) + k].i = k;
                    sort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ];
304
                    }
Pedro Gonnet's avatar
Pedro Gonnet committed
305
            }
306
307

        /* Add the sentinel and sort. */
Pedro Gonnet's avatar
Pedro Gonnet committed
308
        for ( j = 0 ; j < 13 ; j++ )
309
            if ( flags & (1 << j) ) {
310
311
312
                sort[ j*(count + 1) + count ].d = FLT_MAX;
                sort[ j*(count + 1) + count ].i = 0;
                runner_dosort_ascending( &sort[ j*(count + 1) ] , count );
313
                c->sorted |= ( 1 << j );
Pedro Gonnet's avatar
Pedro Gonnet committed
314
315
316
317
318
319
320
321
                }
            
        }
        
    /* Verify the sorting. */
    /* for ( j = 0 ; j < 13 ; j++ ) {
        if ( !( flags & (1 << j) ) )
            continue;
322
323
        finger = &sort[ j*(count + 1) ];
        for ( k = 1 ; k < count ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
324
325
            if ( finger[k].d < finger[k-1].d )
                error( "Sorting failed, ascending array." );
326
            if ( finger[k].i >= count )
Pedro Gonnet's avatar
Pedro Gonnet committed
327
328
329
                error( "Sorting failed, indices borked." );
            }
        } */
330
        
Pedro Gonnet's avatar
Pedro Gonnet committed
331
    #ifdef TIMER_VERBOSE
332
        message( "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms." ,
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
            r->id , count , c->depth ,
            (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , 
            ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout);
    #else
        if ( clock )
            TIMER_TOC(timer_dosort);
    #endif

    }
    
    
void runner_dogsort ( struct runner *r , struct cell *c , int flags , int clock ) {

    struct entry *finger;
    struct entry *fingers[8];
    struct gpart *gparts = c->gparts;
    struct entry *gsort;
    int j, k, count = c->gcount;
    int i, ind, off[8], inds[8], temp_i, missing;
    // float shift[3];
    float buff[8], px[3];
    
    TIMER_TIC
    
    /* Clean-up the flags, i.e. filter out what's already been sorted. */
    flags &= ~c->gsorted;
    if ( flags == 0 )
        return;
    
    /* start by allocating the entry arrays. */
    if ( c->gsort == NULL || c->gsortsize < count ) {
        if ( c->gsort != NULL )
            free( c->gsort );
        c->gsortsize = count * 1.1;
        if ( ( c->gsort = (struct entry *)malloc( sizeof(struct entry) * (c->gsortsize + 1) * 13 ) ) == NULL )
            error( "Failed to allocate sort memory." );
        }
    gsort = c->gsort;
        
    /* Does this cell have any progeny? */
    if ( c->split ) {
    
        /* Fill in the gaps within the progeny. */
        for ( k = 0 ; k < 8 ; k++ ) {
            if ( c->progeny[k] == NULL )
                continue;
            missing = flags & ~c->progeny[k]->gsorted;
            if ( missing )
                runner_dogsort( r , c->progeny[k] , missing , 0 );
            }
    
        /* Loop over the 13 different sort arrays. */
        for ( j = 0 ; j < 13 ; j++ ) {
        
            /* Has this sort array been flagged? */
            if ( !( flags & (1 << j) ) )
                continue;
                
            /* Init the particle index offsets. */
            for ( off[0] = 0 , k = 1 ; k < 8 ; k++ )
                if ( c->progeny[k-1] != NULL )
                    off[k] = off[k-1] + c->progeny[k-1]->gcount;
                else
                    off[k] = off[k-1];

            /* Init the entries and indices. */
            for ( k = 0 ; k < 8 ; k++ ) {
                inds[k] = k;
                if ( c->progeny[k] != NULL && c->progeny[k]->gcount > 0 ) {
                    fingers[k] = &c->progeny[k]->gsort[ j*(c->progeny[k]->gcount + 1) ];
                    buff[k] = fingers[k]->d;
                    off[k] = off[k];
                    }
                else
                    buff[k] = FLT_MAX;
                }

            /* Sort the buffer. */
            for ( i = 0 ; i < 7 ; i++ )
                for ( k = i+1 ; k < 8 ; k++ )
                    if ( buff[ inds[k] ] < buff[ inds[i] ] ) {
                        temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i;
                        }

            /* For each entry in the new sort list. */
            finger = &gsort[ j*(count + 1) ];
            for ( ind = 0 ; ind < count ; ind++ ) {

                /* Copy the minimum into the new sort array. */
                finger[ind].d = buff[inds[0]];
                finger[ind].i = fingers[inds[0]]->i + off[inds[0]];

                /* Update the buffer. */
                fingers[inds[0]] += 1;
                buff[inds[0]] = fingers[inds[0]]->d;

                /* Find the smallest entry. */
                for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) {
                    temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i;
                    }

                } /* Merge. */
            
            /* Add a sentinel. */
            gsort[ j*(count + 1) + count ].d = FLT_MAX;
            gsort[ j*(count + 1) + count ].i = 0;
            
            /* Mark as sorted. */
            c->gsorted |= ( 1 << j );
            
            } /* loop over sort arrays. */
    
        } /* progeny? */
        
    /* Otherwise, just sort. */
    else {
    
        /* Fill the sort array. */
        for ( k = 0 ; k < count ; k++ ) {
            px[0] = gparts[k].x[0];
            px[1] = gparts[k].x[1];
            px[2] = gparts[k].x[2];
            for ( j = 0 ; j < 13 ; j++ )
                if ( flags & (1 << j) ) {
                    gsort[ j*(count + 1) + k].i = k;
                    gsort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ];
                    }
            }

        /* Add the sentinel and sort. */
        for ( j = 0 ; j < 13 ; j++ )
            if ( flags & (1 << j) ) {
                gsort[ j*(count + 1) + count ].d = FLT_MAX;
                gsort[ j*(count + 1) + count ].i = 0;
                runner_dosort_ascending( &gsort[ j*(count + 1) ] , count );
                c->gsorted |= ( 1 << j );
                }
            
        }
        
    /* Verify the sorting. */
    /* for ( j = 0 ; j < 13 ; j++ ) {
        if ( !( flags & (1 << j) ) )
            continue;
        finger = &c->gsort[ j*(count + 1) ];
        for ( k = 1 ; k < count ; k++ ) {
            if ( finger[k].d < finger[k-1].d )
                error( "Sorting failed, ascending array." );
            if ( finger[k].i < 0 || finger[k].i >= count )
                error( "Sorting failed, indices borked." );
            }
        } */
        
    #ifdef TIMER_VERBOSE
        message( "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms." ,
            r->id , count , c->depth ,
Pedro Gonnet's avatar
Pedro Gonnet committed
489
            (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , 
490
            ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout);
Pedro Gonnet's avatar
Pedro Gonnet committed
491
    #else
492
493
        if ( clock )
            TIMER_TOC(timer_dosort);
Pedro Gonnet's avatar
Pedro Gonnet committed
494
495
496
    #endif

    }
497
498
499
500
501
    
    
/**
 * @brief Intermediate task between density and force
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
502
 * @param r The runner thread.
503
 * @param c The cell.
504
505
 */
 
Pedro Gonnet's avatar
Pedro Gonnet committed
506
void runner_doghost ( struct runner *r , struct cell *c ) {
507

508
    struct part *p, *parts = c->parts;
509
    struct cell *finger;
Pedro Gonnet's avatar
Pedro Gonnet committed
510
511
    int i, k, redo, count = c->count;
    int *pid;
512
    float h, ih, ih2, ih4, h_corr, rho, wcount, rho_dh, wcount_dh, u, fc;
513
    float normDiv_v, normCurl_v;
514
515
516
#ifndef LEGACY_GADGET2_SPH
    float alpha_dot, tau, S;   
#endif
517
    float dt_step = r->e->dt_step;
518
519
    TIMER_TIC
    
Pedro Gonnet's avatar
Pedro Gonnet committed
520
521
522
523
524
525
526
    /* Recurse? */
    if ( c->split ) {
        for ( k = 0 ; k < 8 ; k++ )
            if ( c->progeny[k] != NULL )
                runner_doghost( r , c->progeny[k] );
        return;
        }
527
        
Pedro Gonnet's avatar
Pedro Gonnet committed
528
529
530
531
532
533
534
535
    /* Init the IDs that have to be updated. */
    if ( ( pid = (int *)alloca( sizeof(int) * count ) ) == NULL )
        error( "Call to alloca failed." );
    for ( k = 0 ; k < count ; k++ )
        pid[k] = k;
        
    /* While there are particles that need to be updated... */
    while ( count > 0 ) {
536
    
Pedro Gonnet's avatar
Pedro Gonnet committed
537
538
        /* Reset the redo-count. */
        redo = 0;
539
    
Pedro Gonnet's avatar
Pedro Gonnet committed
540
        /* Loop over the parts in this cell. */
541
542
543
544
545
546
        __builtin_prefetch( &parts[ pid[0] ] , 0 , 1 );
        __builtin_prefetch( &parts[ pid[0] ].rho_dh , 0 , 1 );
        __builtin_prefetch( &parts[ pid[1] ] , 0 , 1 );
        __builtin_prefetch( &parts[ pid[1] ].rho_dh , 0 , 1 );
        __builtin_prefetch( &parts[ pid[2] ] , 0 , 1 );
        __builtin_prefetch( &parts[ pid[2] ].rho_dh , 0 , 1 );
Pedro Gonnet's avatar
Pedro Gonnet committed
547
548
549
        for ( i = 0 ; i < count ; i++ ) {

            /* Get a direct pointer on the part. */
550
551
552
            __builtin_prefetch( &parts[ pid[i+3] ] , 0 , 1 );
            __builtin_prefetch( &parts[ pid[i+3] ].rho_dh , 0 , 1 );
            p = &parts[ pid[i] ];
553
            
554
            /* Is this part within the timestep? */
Pedro Gonnet's avatar
Pedro Gonnet committed
555
            if ( p->dt <= dt_step ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
556
            
557
558
	            /* Some smoothing length multiples. */
	            h = p->h;
559
560
561
                ih = 1.0f / h;
                ih2 = ih * ih;
                ih4 = ih2 * ih2;
562

563
		        /* Final operation on the density. */
564
                p->rho = rho = ih * ih2 * ( p->rho + p->mass*kernel_root );
Pedro Gonnet's avatar
Pedro Gonnet committed
565
                p->rho_dh = rho_dh = ( p->rho_dh - 3.0f*p->mass*kernel_root ) * ih4;
566
567
                wcount = ( p->density.wcount + kernel_root ) * ( 4.0f / 3.0 * M_PI * kernel_gamma3 );
                wcount_dh = p->density.wcount_dh * ih * ( 4.0f / 3.0 * M_PI * kernel_gamma3 );
Pedro Gonnet's avatar
Pedro Gonnet committed
568
                    
Pedro Gonnet's avatar
Pedro Gonnet committed
569
                /* If no derivative, double the smoothing length. */
570
                if ( wcount_dh == 0.0f )
571
                    h_corr = p->h;
Pedro Gonnet's avatar
Pedro Gonnet committed
572
573
                    
                /* Otherwise, compute the smoothing length update (Newton step). */
574
                else {
575
                    h_corr = ( kernel_nwneigh - wcount ) / wcount_dh;
576
577
578

                    /* Truncate to the range [ -p->h/2 , p->h ]. */
                    h_corr = fminf( h_corr , h );
579
                    h_corr = fmaxf( h_corr , -h/2.f );
Pedro Gonnet's avatar
Pedro Gonnet committed
580
                    
581
                    }
582
                
Pedro Gonnet's avatar
Pedro Gonnet committed
583
                /* Apply the correction to p->h and to the compact part. */
584
                p->h += h_corr;
585
586

                /* Did we get the right number density? */
587
588
                if ( wcount > kernel_nwneigh + const_delta_nwneigh ||
                     wcount < kernel_nwneigh - const_delta_nwneigh ) {
589
                    // message( "particle %lli (h=%e,depth=%i) has bad wcount=%.3f." , p->id , p->h , c->depth , wcount ); fflush(stdout);
590
                    // p->h += ( p->density.wcount + kernel_root - kernel_nwneigh ) / p->density.wcount_dh;
591
592
                    pid[redo] = pid[i];
                    redo += 1;
593
                    p->density.wcount = 0.0;
594
                    p->density.wcount_dh = 0.0;
595
596
                    p->rho = 0.0;
                    p->rho_dh = 0.0;
597
598
599
		            p->density.div_v = 0.0;
		            for ( k=0 ; k < 3 ; k++)
		                p->density.curl_v[k] = 0.0;
600
601
                    continue;
                    }
602

603
                /* Pre-compute some stuff for the balsara switch. */
604
605
		        normDiv_v = fabs( p->density.div_v / rho * ih4 );
		        normCurl_v = sqrtf( p->density.curl_v[0] * p->density.curl_v[0] + p->density.curl_v[1] * p->density.curl_v[1] + p->density.curl_v[2] * p->density.curl_v[2] ) / rho * ih4;
606
                
Pedro Gonnet's avatar
Pedro Gonnet committed
607
608
609
                /* As of here, particle force variables will be set. Do _NOT_
                   try to read any particle density variables! */
                
610
                /* Compute this particle's sound speed. */
611
                u = p->u;
612
                p->force.c = fc = sqrtf( const_hydro_gamma * ( const_hydro_gamma - 1.0f ) * u );
613
614

                /* Compute the P/Omega/rho2. */
615
                p->force.POrho2 = u * ( const_hydro_gamma - 1.0f ) / ( rho + h * rho_dh / 3.0f );
616

617
618
		        /* Balsara switch */
		        p->force.balsara = normDiv_v / ( normDiv_v + normCurl_v + 0.0001f * fc * ih );
619

620
621
622
                #ifndef LEGACY_GADGET2_SPH
		            /* Viscosity parameter decay time */
		            tau = h / ( 2.f * const_viscosity_length * p->force.c );
623

624
625
		            /* Viscosity source term */
		            S = fmaxf( -normDiv_v, 0.f );
626

627
628
		            /* Compute the particle's viscosity parameter time derivative */
		            alpha_dot = ( const_viscosity_alpha_min - p->alpha ) / tau + ( const_viscosity_alpha_max - p->alpha ) * S;
629

630
631
632
		            /* Update particle's viscosity paramter */
		            p->alpha += alpha_dot * p->dt; 
                #endif                
633

634
635
636
                /* Reset the acceleration. */
                for ( k = 0 ; k < 3 ; k++ )
                    p->a[k] = 0.0f;
Pedro Gonnet's avatar
Pedro Gonnet committed
637

638
                /* Reset the time derivatives. */
639
640
641
                p->force.u_dt = 0.0f;
                p->force.h_dt = 0.0f;
                p->force.v_sig = 0.0f;
Pedro Gonnet's avatar
Pedro Gonnet committed
642

643
                }
Pedro Gonnet's avatar
Pedro Gonnet committed
644
645

            }
646
            
Pedro Gonnet's avatar
Pedro Gonnet committed
647
648
        /* Re-set the counter for the next loop (potentially). */
        count = redo;
649
650
651
652
653
654
655
656
        if ( count > 0 ) {
        
            // error( "Bad smoothing length, fixing this isn't implemented yet." );
            
            /* Climb up the cell hierarchy. */
            for ( finger = c ; finger != NULL ; finger = finger->parent ) {
            
                /* Run through this cell's density interactions. */
657
                for ( struct link *l = finger->density ; l != NULL ; l = l->next ) {
658
659
                
                    /* Self-interaction? */
660
                    if ( l->t->type == task_type_self )
661
                        runner_doself_subset_density( r , finger , parts , pid , count );
662
663
                        
                    /* Otherwise, pair interaction? */
664
                    else if ( l->t->type == task_type_pair ) {
665
666
                    
                        /* Left or right? */
667
668
                        if ( l->t->ci == finger )
                            runner_dopair_subset_density( r , finger , parts , pid , count , l->t->cj );
669
                        else
670
                            runner_dopair_subset_density( r , finger , parts , pid , count , l->t->ci );
671
672
673
674
                        
                        }
                
                    /* Otherwise, sub interaction? */
675
                    else if ( l->t->type == task_type_sub ) {
676
677
                    
                        /* Left or right? */
678
679
                        if ( l->t->ci == finger )
                            runner_dosub_subset_density( r , finger , parts , pid , count , l->t->cj , -1 , 1 );
680
                        else
681
                            runner_dosub_subset_density( r , finger , parts , pid , count , l->t->ci , -1 , 1 );
682
683
                        
                        }
684
685
                
                    }
Pedro Gonnet's avatar
Pedro Gonnet committed
686
                    
687
688
689
                }
        
            }
690
691
692
693
            
        }

    #ifdef TIMER_VERBOSE
694
        message( "runner %02i: %i parts at depth %i took %.3f ms." ,
Pedro Gonnet's avatar
Pedro Gonnet committed
695
            r->id , c->count , c->depth ,
696
            ((double)TIMER_TOC(timer_doghost)) / CPU_TPS * 1000 ); fflush(stdout);
697
    #else
698
        TIMER_TOC(timer_doghost);
699
700
701
    #endif
    
    }
702
703
704
705
706
707
708
709
710
711
712
    
    
/**
 * @brief Compute the second kick of the given cell.
 *
 * @param r The runner thread.
 * @param c The cell.
 */
 
void runner_dokick2 ( struct runner *r , struct cell *c ) {

713
    int j, k, count = 0, nr_parts = c->count;
714
    float dt_min = FLT_MAX, dt_max = 0.0f;
Pedro Gonnet's avatar
Pedro Gonnet committed
715
    double ekin = 0.0, epot = 0.0;
716
    float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f };
717
    float x[3], v_hdt[3], u_hdt, h, pdt, m;
718
    float dt_step = r->e->dt_step, dt = r->e->dt, hdt, idt;
719
720
    float dt_cfl, dt_h_change, dt_u_change, dt_new;
    float h_dt, u_dt;
721
    struct part *restrict p, *restrict parts = c->parts;
722
    struct xpart *restrict xp, *restrict xparts = c->xparts;
723
724
725
    
    TIMER_TIC
    
726
727
    /* Init idt to avoid compiler stupidity. */
    idt = ( dt > 0 ) ? 1.0f / dt : 0.0f;
728
    hdt = dt / 2;
729
    
730
    /* Loop over the particles and kick them. */
Pedro Gonnet's avatar
Pedro Gonnet committed
731
732
733
734
735
736
737
738
739
    __builtin_prefetch( &parts[0] , 0 , 1 );
    __builtin_prefetch( &parts[0].rho_dh , 0 , 1 );
    __builtin_prefetch( &xparts[0] , 0 , 1 );
    __builtin_prefetch( &parts[1] , 0 , 1 );
    __builtin_prefetch( &parts[1].rho_dh , 0 , 1 );
    __builtin_prefetch( &xparts[1] , 0 , 1 );
    __builtin_prefetch( &parts[2] , 0 , 1 );
    __builtin_prefetch( &parts[2].rho_dh , 0 , 1 );
    __builtin_prefetch( &xparts[2] , 0 , 1 );
740
741
742
    for ( k = 0 ; k < nr_parts ; k++ ) {

        /* Get a handle on the part. */
Pedro Gonnet's avatar
Pedro Gonnet committed
743
744
745
        __builtin_prefetch( &parts[k+3] , 0 , 1 );
        __builtin_prefetch( &parts[k+3].rho_dh , 0 , 1 );
        __builtin_prefetch( &xparts[k+3] , 0 , 1 );
746
        p = &parts[k];
747
        xp = &xparts[k];
748
749
750
751
752

        /* Get local copies of particle data. */
        pdt = p->dt;
        m = p->mass;
        x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
753
754
        v_hdt[0] = xp->v_hdt[0]; v_hdt[1] = xp->v_hdt[1]; v_hdt[2] = xp->v_hdt[2];
        u_hdt = xp->u_hdt;
755

756
        /* Update the particle's data (if active). */
757
        if ( pdt <= dt_step ) {
758
759
            
            /* Increase the number of particles updated. */
760
            count += 1;
761
762
763
764
            
            /* Scale the derivatives as they're freshly computed. */
            h = p->h;
            h_dt = p->force.h_dt *= h * 0.333333333f;
Pedro Gonnet's avatar
Pedro Gonnet committed
765
            xp->omega = 1.0f + h * p->rho_dh / p->rho * 0.3333333333f;
766
767
768
769
770
771
772
773
774
775
776
777
778
            
            /* Compute the new time step. */
            u_dt = p->force.u_dt;
            dt_cfl = const_cfl * h / p->force.v_sig;
            dt_h_change = ( h_dt != 0.0f ) ? fabsf( const_ln_max_h_change * h / h_dt ) : FLT_MAX;
            dt_u_change = ( u_dt != 0.0f ) ? fabsf( const_max_u_change * p->u / u_dt ) : FLT_MAX;
            dt_new = fminf( dt_cfl , fminf( dt_h_change , dt_u_change ) );
            if ( pdt == 0.0f )
                p->dt = pdt = dt_new;
            else
                p->dt = pdt = fminf( dt_new , 2.0f*pdt );
                
            /* Update positions and energies at the full step. */
779
780
781
782
            p->v[0] = v_hdt[0] + hdt * p->a[0];
            p->v[1] = v_hdt[1] + hdt * p->a[1];
            p->v[2] = v_hdt[2] + hdt * p->a[2];
            p->u = u_hdt + hdt * u_dt;
783
784
785
            
            /* Set the new particle-specific time step. */
            if ( dt > 0.0f ) {
786
                float dt_curr = dt;
787
788
789
790
791
792
793
794
                j = (int)( pdt * idt );
                while ( j > 1 ) {
                    dt_curr *= 2.0f;
                    j >>= 1;
                    }
                xp->dt_curr = dt_curr;
                }
            
795
796
797
798
799
800
801
            }

        /* Get the smallest/largest dt. */
        dt_min = fminf( dt_min , pdt );
        dt_max = fmaxf( dt_max , pdt );

        /* Collect total energy. */
802
803
        ekin += 0.5 * m * ( v_hdt[0]*v_hdt[0] + v_hdt[1]*v_hdt[1] + v_hdt[2]*v_hdt[2] );
        epot += m * u_hdt;
804
805

        /* Collect momentum */
806
807
808
        mom[0] += m * v_hdt[0];
        mom[1] += m * v_hdt[1];
        mom[2] += m * v_hdt[2];
809
810

	    /* Collect angular momentum */
811
812
813
	    ang[0] += m * ( x[1]*v_hdt[2] - x[2]*v_hdt[1] );
	    ang[1] += m * ( x[2]*v_hdt[0] - x[0]*v_hdt[2] );
	    ang[2] += m * ( x[0]*v_hdt[1] - x[1]*v_hdt[0] );
814
815
816
817
818
819
820

	    /* Collect entropic function */
	    // lent += u * pow( p->rho, 1.f-const_gamma );

        }

    #ifdef TIMER_VERBOSE
821
        message( "runner %02i: %i parts at depth %i took %.3f ms." ,
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
            r->id , c->count , c->depth ,
            ((double)TIMER_TOC(timer_kick2)) / CPU_TPS * 1000 ); fflush(stdout);
    #else
        TIMER_TOC(timer_kick2);
    #endif
        
    /* Store the computed values in the cell. */
    c->dt_min = dt_min;
    c->dt_max = dt_max;
    c->updated = count;
    c->ekin = ekin;
    c->epot = epot;
    c->mom[0] = mom[0]; c->mom[1] = mom[1]; c->mom[2] = mom[2];
    c->ang[0] = ang[0]; c->ang[1] = ang[1]; c->ang[2] = ang[2];
        
    }
Pedro Gonnet's avatar
Pedro Gonnet committed
838
839


840
841
842
843
844
845
846
847
848
/**
 * @brief Mapping function to set dt_min and dt_max, do the first
 * kick.
 */

void runner_dokick1 ( struct runner *r , struct cell *c ) {

    int j, k;
    struct engine *e = r->e;
849
    float pdt, dt_step = e->dt_step, dt = e->dt, hdt = dt/2;
850
    float dt_min, dt_max, h_max, dx, dx_max;
851
    float a[3], v[3], u, u_dt, h, h_dt, w, rho;
852
    double x[3], x_old[3];
853
    struct part *restrict p, *restrict parts = c->parts;
854
    struct xpart *restrict xp, *restrict xparts = c->xparts;
855
856
857
858
859
860
861
862
863
864
865

    /* No children? */
    if ( !c->split ) {
    
        /* Init the min/max counters. */
        dt_min = FLT_MAX;
        dt_max = 0.0f;
        h_max = 0.0f;
        dx_max = 0.0f;
    
        /* Loop over parts. */
Pedro Gonnet's avatar
Pedro Gonnet committed
866
867
868
869
870
871
872
873
874
        __builtin_prefetch( &parts[0] , 0 , 1 );
        __builtin_prefetch( &parts[0].rho_dh , 0 , 1 );
        __builtin_prefetch( &xparts[0] , 0 , 1 );
        __builtin_prefetch( &parts[1] , 0 , 1 );
        __builtin_prefetch( &parts[1].rho_dh , 0 , 1 );
        __builtin_prefetch( &xparts[1] , 0 , 1 );
        __builtin_prefetch( &parts[2] , 0 , 1 );
        __builtin_prefetch( &parts[2].rho_dh , 0 , 1 );
        __builtin_prefetch( &xparts[2] , 0 , 1 );
875
876
877
        for ( k = 0 ; k < c->count ; k++ ) {
            
            /* Get a handle on the kth particle. */
Pedro Gonnet's avatar
Pedro Gonnet committed
878
879
880
            __builtin_prefetch( &parts[k+3] , 0 , 1 );
            __builtin_prefetch( &parts[k+3].rho_dh , 0 , 1 );
            __builtin_prefetch( &xparts[k+3] , 0 , 1 );
881
882
            p = &parts[k];
            xp = &xparts[k];
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
            
            /* Load the data locally. */
            a[0] = p->a[0]; a[1] = p->a[1]; a[2] = p->a[2];
            v[0] = p->v[0]; v[1] = p->v[1]; v[2] = p->v[2];
            x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
            x_old[0] = xp->x_old[0]; x_old[1] = xp->x_old[1]; x_old[2] = xp->x_old[2];
            h = p->h;
            u = p->u;
            h_dt = p->force.h_dt;
            u_dt = p->force.u_dt;
            pdt = p->dt;
            
            /* Store the min/max dt. */
            dt_min = fminf( dt_min , pdt );
            dt_max = fmaxf( dt_max , pdt );
            
899
900
901
902
903
904
            /* Update the half-step velocities from the current velocities. */
            xp->v_hdt[0] = v[0] + hdt * a[0];
            xp->v_hdt[1] = v[1] + hdt * a[1];
            xp->v_hdt[2] = v[2] + hdt * a[2];
            xp->u_hdt = u + hdt * u_dt;
            
905
906
907
908
            /* Move the particles with the velocities at the half-step. */
            p->x[0] = x[0] += dt * xp->v_hdt[0];
            p->x[1] = x[1] += dt * xp->v_hdt[1];
            p->x[2] = x[2] += dt * xp->v_hdt[2];
909
910
911
912
913
914
            dx = sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) +
                        (x[1] - x_old[1])*(x[1] - x_old[1]) +
                        (x[2] - x_old[2])*(x[2] - x_old[2]) );
            dx_max = fmaxf( dx_max , dx );

            /* Update positions and energies at the half-step. */
915
916
917
            p->v[0] = v[0] + dt * a[0];
            p->v[1] = v[1] + dt * a[1];
            p->v[2] = v[2] + dt * a[2];
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
            w = u_dt / u * dt;
            if ( fabsf( w ) < 0.01f )
                p->u = u *= 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) );
            else
                p->u = u *= expf( w );
            w = h_dt / h * dt;
            if ( fabsf( w ) < 0.01f )
                p->h = h *= 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) );
            else
                p->h = h *= expf( w );
            h_max = fmaxf( h_max , h );

        
            /* Integrate other values if this particle will not be updated. */
            /* Init fields for density calculation. */
            if ( pdt > dt_step ) {
                float w = -3.0f * h_dt / h * dt;
                if ( fabsf( w ) < 0.1f )
                    rho = p->rho *= 1.0f + w*( 1.0f + w*( 0.5f + w*(1.0f/6.0f + 1.0f/24.0f*w ) ) );
                else
                    rho = p->rho *= expf( w );
                p->force.POrho2 = u * ( const_hydro_gamma - 1.0f ) / ( rho * xp->omega );
                }
            else {
                p->density.wcount = 0.0f;
                p->density.wcount_dh = 0.0f;
                p->rho = 0.0f;
                p->rho_dh = 0.0f;
	            p->density.div_v = 0.0f;
	            for ( j = 0 ; j < 3 ; ++j)
	                p->density.curl_v[j] = 0.0f;
                }
                
            }
            
        }
        
    /* Otherwise, agregate data from children. */
    else {
    
        /* Init with the first non-null child. */
        dt_min = FLT_MAX;
        dt_max = 0.0f;
        h_max = 0.0f;
        dx_max = 0.0f;
        
        /* Loop over the progeny. */
        for ( k = 0 ; k < 8 ; k++ )
            if ( c->progeny[k] != NULL ) {
967
968
                if ( c->count < space_subsize )
                    runner_dokick1( r , c->progeny[k] );
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
                dt_min = fminf( dt_min , c->progeny[k]->dt_min );
                dt_max = fmaxf( dt_max , c->progeny[k]->dt_max );
                h_max = fmaxf( h_max , c->progeny[k]->h_max );
                dx_max = fmaxf( dx_max , c->progeny[k]->dx_max );
                }
    
        }

    /* Store the values. */
    c->dt_min = dt_min;
    c->dt_max = dt_max;
    c->h_max = h_max;
    c->dx_max = dx_max;
    
    }


986
987
988
989
990
/**
 * @brief Combined second and first kick for fixed dt.
 *
 * @param r The runner thread.
 * @param c The cell.
991
 * @param timer The timer 
992
993
994
995
996
997
998
999
1000
 */
 
void runner_dokick ( struct runner *r , struct cell *c , int timer ) {

    int k, count = 0, nr_parts = c->count, updated;
    float dt_min = FLT_MAX, dt_max = 0.0f;
    float h_max, dx, dx_max;
    double ekin = 0.0, epot = 0.0;
    float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f };
For faster browsing, not all history is shown. View entire blame