runner_doiact.h 125 KB
Newer Older
1
/*******************************************************************************
2
 * This file is part of SWIFT.
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
 * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 ******************************************************************************/


/* Before including this file, define FUNCTION, which is the
   name of the interaction function. This creates the interaction functions
   runner_dopair_FUNCTION, runner_dopair_FUNCTION_naive, runner_doself_FUNCTION,
   and runner_dosub_FUNCTION calling the pairwise interaction function
   runner_iact_FUNCTION. */

#define PASTE(x,y) x ## _ ## y

29
30
#define _DOPAIR1(f) PASTE(runner_dopair1,f)
#define DOPAIR1 _DOPAIR1(FUNCTION)
31

32
33
#define _DOPAIR2(f) PASTE(runner_dopair2,f)
#define DOPAIR2 _DOPAIR2(FUNCTION)
34

35
36
#define _DOPAIR_SUBSET(f) PASTE(runner_dopair_subset,f)
#define DOPAIR_SUBSET _DOPAIR_SUBSET(FUNCTION)
37

Pedro Gonnet's avatar
Pedro Gonnet committed
38
39
40
#define _DOPAIR_SUBSET_NAIVE(f) PASTE(runner_dopair_subset_naive,f)
#define DOPAIR_SUBSET_NAIVE _DOPAIR_SUBSET_NAIVE(FUNCTION)

41
42
#define _DOPAIR_NAIVE(f) PASTE(runner_dopair_naive,f)
#define DOPAIR_NAIVE _DOPAIR_NAIVE(FUNCTION)
43

44
45
#define _DOSELF_NAIVE(f) PASTE(runner_doself_naive,f)
#define DOSELF_NAIVE _DOSELF_NAIVE(FUNCTION)
46

47
48
#define _DOSELF1(f) PASTE(runner_doself1,f)
#define DOSELF1 _DOSELF1(FUNCTION)
49

50
51
#define _DOSELF2(f) PASTE(runner_doself2,f)
#define DOSELF2 _DOSELF2(FUNCTION)
52

53
54
#define _DOSELF_SUBSET(f) PASTE(runner_doself_subset,f)
#define DOSELF_SUBSET _DOSELF_SUBSET(FUNCTION)
55

56
57
#define _DOSUB1(f) PASTE(runner_dosub1,f)
#define DOSUB1 _DOSUB1(FUNCTION)
58

59
60
#define _DOSUB2(f) PASTE(runner_dosub2,f)
#define DOSUB2 _DOSUB2(FUNCTION)
61

62
63
#define _DOSUB_SUBSET(f) PASTE(runner_dosub_subset,f)
#define DOSUB_SUBSET _DOSUB_SUBSET(FUNCTION)
64

65
66
#define _IACT_NONSYM(f) PASTE(runner_iact_nonsym,f)
#define IACT_NONSYM _IACT_NONSYM(FUNCTION)
67

68
69
#define _IACT(f) PASTE(runner_iact,f)
#define IACT _IACT(FUNCTION)
70

71
#define _TIMER_DOSELF(f) PASTE(timer_doself,f)
72
#define TIMER_DOSELF _TIMER_DOSELF(FUNCTION)
73

74
#define _TIMER_DOPAIR(f) PASTE(timer_dopair,f)
75
#define TIMER_DOPAIR _TIMER_DOPAIR(FUNCTION)
Pedro Gonnet's avatar
Pedro Gonnet committed
76

77
#define _TIMER_DOSUB(f) PASTE(timer_dosub,f)
78
79
#define TIMER_DOSUB _TIMER_DOSUB(FUNCTION)

80
#define _TIMER_DOSELF_SUBSET(f) PASTE(timer_doself_subset,f)
81
82
#define TIMER_DOSELF_SUBSET _TIMER_DOSELF_SUBSET(FUNCTION)

83
#define _TIMER_DOPAIR_SUBSET(f) PASTE(timer_dopair_subset,f)
84
85
86
87
88
89
90
#define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION)

#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec,f)
#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION)

#define _IACT_VEC(f) PASTE(runner_iact_vec,f)
#define IACT_VEC _IACT_VEC(FUNCTION)
Pedro Gonnet's avatar
Pedro Gonnet committed
91
92


93
94
95
96
97
98
99
100
101

/**
 * @brief Compute the interactions between a cell pair.
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param cj The second #cell.
 */
 
102
void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) {
103

Pedro Gonnet's avatar
Pedro Gonnet committed
104
    struct engine *e = r->e;
105
106
    int pid, pjd, k, count_i = ci->count, count_j = cj->count;
    double shift[3] = { 0.0 , 0.0 , 0.0 };
107
    struct part *restrict parts_i = ci->parts, *restrict parts_j = cj->parts;
Pedro Gonnet's avatar
Pedro Gonnet committed
108
    struct part *restrict pi, *restrict pj;
109
    double pix[3];
110
    float dx[3], hi, hig2, r2;
111
    float dt_step = e->dt_step;
Pedro Gonnet's avatar
Pedro Gonnet committed
112
113
114
115
116
117
118
119
    #ifdef VECTORIZE
        int icount = 0;
        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
    #endif
120
121
    TIMER_TIC
    
122
    /* Anything to do here? */
123
    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
124
125
        return;
    
126
127
    /* Get the relative distance between the pairs, wrapping. */
    for ( k = 0 ; k < 3 ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
128
129
130
131
        if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 )
            shift[k] = e->s->dim[k];
        else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 )
            shift[k] = -e->s->dim[k];
132
133
134
135
136
137
138
139
140
141
142
        }
        
    /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
        ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] ,
        ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
    tic = getticks(); */
    
    /* Loop over the parts in ci. */
    for ( pid = 0 ; pid < count_i ; pid++ ) {
    
        /* Get a hold of the ith part in ci. */
Pedro Gonnet's avatar
Pedro Gonnet committed
143
        pi = &parts_i[ pid ];
144
        for ( k = 0 ; k < 3 ; k++ )
Pedro Gonnet's avatar
Pedro Gonnet committed
145
146
            pix[k] = pi->x[k] - shift[k];
        hi = pi->h;
147
        hig2 = hi * hi * kernel_gamma2;
148
149
150
151
152
        
        /* Loop over the parts in cj. */
        for ( pjd = 0 ; pjd < count_j ; pjd++ ) {
        
            /* Get a pointer to the jth particle. */
Pedro Gonnet's avatar
Pedro Gonnet committed
153
            pj = &parts_j[ pjd ];
154
155
        
            /* Compute the pairwise distance. */
Pedro Gonnet's avatar
Pedro Gonnet committed
156
            r2 = 0.0f;
157
            for ( k = 0 ; k < 3 ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
158
                dx[k] = pix[k] - pj->x[k];
159
160
161
162
                r2 += dx[k]*dx[k];
                }
                
            /* Hit or miss? */
Pedro Gonnet's avatar
Pedro Gonnet committed
163
            if ( r2 < hig2 || r2 < pj->h*pj->h*kernel_gamma2 ) {
164
            
Pedro Gonnet's avatar
Pedro Gonnet committed
165
166
                #ifndef VECTORIZE
                        
Pedro Gonnet's avatar
Pedro Gonnet committed
167
                    IACT( r2 , dx , hi , pj->h , pi , pj );
168
                
Pedro Gonnet's avatar
Pedro Gonnet committed
169
170
171
172
173
174
175
176
                #else

                    /* Add this interaction to the queue. */
                    r2q[icount] = r2;
                    dxq[3*icount+0] = dx[0];
                    dxq[3*icount+1] = dx[1];
                    dxq[3*icount+2] = dx[2];
                    hiq[icount] = hi;
Pedro Gonnet's avatar
Pedro Gonnet committed
177
178
179
                    hjq[icount] = pj->h;
                    piq[icount] = pi;
                    pjq[icount] = pj;
Pedro Gonnet's avatar
Pedro Gonnet committed
180
181
182
183
184
185
186
187
188
189
                    icount += 1;

                    /* Flush? */
                    if ( icount == VEC_SIZE ) {
                        IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
                        icount = 0;
                        }

                #endif
                    
190
191
192
193
194
195
                }
        
            } /* loop over the parts in cj. */
    
        } /* loop over the parts in ci. */
        
Pedro Gonnet's avatar
Pedro Gonnet committed
196
197
198
199
200
201
202
    #ifdef VECTORIZE
    /* Pick up any leftovers. */
    if ( icount > 0 )
        for ( k = 0 ; k < icount ; k++ )
            IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
    #endif
        
203
    #ifdef TIMER_VERBOSE
Pedro Gonnet's avatar
Pedro Gonnet committed
204
        printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
205
    #else
206
        TIMER_TOC(TIMER_DOPAIR);
207
208
209
210
211
212
    #endif


    }


213
214
215
216
void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) {

    int pid, pjd, k, count = c->count;
    struct part *restrict parts = c->parts;
Pedro Gonnet's avatar
Pedro Gonnet committed
217
    struct part *restrict pi, *restrict pj;
218
    double pix[3] = {0.0,0.0,0.0};
219
    float dx[3], hi, hig2, r2;
220
    float dt_step = r->e->dt_step;
221
222
223
224
225
226
227
228
229
230
231
    #ifdef VECTORIZE
        int icount = 0;
        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
    #endif
    TIMER_TIC
    
    /* Anything to do here? */
232
    if ( c->dt_min > dt_step )
233
234
235
236
237
238
239
240
241
242
243
        return;
    
    /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
        ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] ,
        ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
    tic = getticks(); */
    
    /* Loop over the parts in ci. */
    for ( pid = 0 ; pid < count ; pid++ ) {
    
        /* Get a hold of the ith part in ci. */
Pedro Gonnet's avatar
Pedro Gonnet committed
244
        pi = &parts[ pid ];
245
246
247
        pix[0] = pi->x[0];
        pix[1] = pi->x[1];
        pix[2] = pi->x[2];
Pedro Gonnet's avatar
Pedro Gonnet committed
248
        hi = pi->h;
249
        hig2 = hi * hi * kernel_gamma2;
250
251
252
253
254
        
        /* Loop over the parts in cj. */
        for ( pjd = pid+1 ; pjd < count ; pjd++ ) {
        
            /* Get a pointer to the jth particle. */
Pedro Gonnet's avatar
Pedro Gonnet committed
255
            pj = &parts[ pjd ];
256
257
258
259
        
            /* Compute the pairwise distance. */
            r2 = 0.0f;
            for ( k = 0 ; k < 3 ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
260
                dx[k] = pix[k] - pj->x[k];
261
262
263
264
                r2 += dx[k]*dx[k];
                }
                
            /* Hit or miss? */
Pedro Gonnet's avatar
Pedro Gonnet committed
265
            if ( r2 < hig2 || r2 < pj->h*pj->h*kernel_gamma2 ) {
266
267
268
            
                #ifndef VECTORIZE
                        
Pedro Gonnet's avatar
Pedro Gonnet committed
269
                    IACT( r2 , dx , hi , pj->h , pi , pj );
270
271
272
273
274
275
276
277
278
                
                #else

                    /* Add this interaction to the queue. */
                    r2q[icount] = r2;
                    dxq[3*icount+0] = dx[0];
                    dxq[3*icount+1] = dx[1];
                    dxq[3*icount+2] = dx[2];
                    hiq[icount] = hi;
Pedro Gonnet's avatar
Pedro Gonnet committed
279
280
281
                    hjq[icount] = pj->h;
                    piq[icount] = pi;
                    pjq[icount] = pj;
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
                    icount += 1;

                    /* Flush? */
                    if ( icount == VEC_SIZE ) {
                        IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
                        icount = 0;
                        }

                #endif
                    
                }
        
            } /* loop over the parts in cj. */
    
        } /* loop over the parts in ci. */
        
    #ifdef VECTORIZE
    /* Pick up any leftovers. */
    if ( icount > 0 )
        for ( k = 0 ; k < icount ; k++ )
            IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
    #endif
        
    #ifdef TIMER_VERBOSE
        printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 );
    #else
        TIMER_TOC(TIMER_DOSELF);
    #endif

    }


314
315
316
317
318
319
320
321
322
323
324
325
/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci.
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param parts_i The #parts to interact with @c cj.
 * @param ind The list of indices of particles in @c ci to interact with.
 * @param count The number of particles in @c ind.
 * @param cj The second #cell.
 */
 
326
void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *restrict parts_i , int *restrict ind , int count , struct cell *restrict cj ) {
327
328

    struct engine *e = r->e;
329
    int pid, pjd, sid, k, count_j = cj->count, flipped;
330
    double shift[3] = { 0.0 , 0.0 , 0.0 };
Pedro Gonnet's avatar
Pedro Gonnet committed
331
    struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
332
    double pix[3];
333
    float dx[3], hi, hig2, r2, di, dxj;
334
    struct entry *sort_j;
Pedro Gonnet's avatar
Pedro Gonnet committed
335
336
337
338
339
340
341
342
    #ifdef VECTORIZE
        int icount = 0;
        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
    #endif
343
344
345
346
347
348
349
350
351
352
    TIMER_TIC
    
    /* Get the relative distance between the pairs, wrapping. */
    for ( k = 0 ; k < 3 ; k++ ) {
        if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 )
            shift[k] = e->s->dim[k];
        else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 )
            shift[k] = -e->s->dim[k];
        }
        
353
354
355
356
357
358
359
360
    /* Get the sorting index. */
    for ( sid = 0 , k = 0 ; k < 3 ; k++ )
        sid = 3*sid + ( (cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1 );

    /* Switch the cells around? */
    flipped = runner_flip[sid];
    sid = sortlistID[sid];
    
361
362
363
364
    /* Have the cells been sorted? */
    if ( !(cj->sorted & (1 << sid) ) )
        error( "Trying to interact unsorted cells." );
    
365
366
367
368
369
    /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
        ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] ,
        ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
    tic = getticks(); */
    
370
371
    /* Pick-out the sorted lists. */
    sort_j = &cj->sort[ sid*(cj->count + 1) ];
372
    dxj = cj->dx_max;
373
    
374
375
376
377
378
379
380
381
382
383
384
    /* Parts are on the left? */
    if ( !flipped ) {
    
        /* Loop over the parts_i. */
        for ( pid = 0 ; pid < count ; pid++ ) {

            /* Get a hold of the ith part in ci. */
            pi = &parts_i[ ind[ pid ] ];
            for ( k = 0 ; k < 3 ; k++ )
                pix[k] = pi->x[k] - shift[k];
            hi = pi->h;
385
386
            hig2 = hi * hi * kernel_gamma2;
            di = hi*kernel_gamma + dxj + pix[0]*runner_shift[ 3*sid + 0 ] + pix[1]*runner_shift[ 3*sid + 1 ] + pix[2]*runner_shift[ 3*sid + 2 ];
387
388
389
390
391

            /* Loop over the parts in cj. */
            for ( pjd = 0 ; pjd < count_j && sort_j[ pjd ].d < di ; pjd++ ) {

                /* Get a pointer to the jth particle. */
Pedro Gonnet's avatar
Pedro Gonnet committed
392
                pj = &parts_j[ sort_j[ pjd ].i ];
393
394
395
396

                /* Compute the pairwise distance. */
                r2 = 0.0f;
                for ( k = 0 ; k < 3 ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
397
                    dx[k] = pix[k] - pj->x[k];
398
399
400
401
                    r2 += dx[k]*dx[k];
                    }

                /* Hit or miss? */
402
                if ( r2 < hig2 ) {
403

Pedro Gonnet's avatar
Pedro Gonnet committed
404
405
                    #ifndef VECTORIZE
                        
Pedro Gonnet's avatar
Pedro Gonnet committed
406
                        IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );
Pedro Gonnet's avatar
Pedro Gonnet committed
407
408
409
410
411
412
413
414
415
                    
                    #else
                    
                        /* Add this interaction to the queue. */
                        r2q[icount] = r2;
                        dxq[3*icount+0] = dx[0];
                        dxq[3*icount+1] = dx[1];
                        dxq[3*icount+2] = dx[2];
                        hiq[icount] = hi;
Pedro Gonnet's avatar
Pedro Gonnet committed
416
                        hjq[icount] = pj->h;
Pedro Gonnet's avatar
Pedro Gonnet committed
417
                        piq[icount] = pi;
Pedro Gonnet's avatar
Pedro Gonnet committed
418
                        pjq[icount] = pj;
Pedro Gonnet's avatar
Pedro Gonnet committed
419
420
421
422
423
424
425
426
427
                        icount += 1;
                        
                        /* Flush? */
                        if ( icount == VEC_SIZE ) {
                            IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
                            icount = 0;
                            }

                    #endif
Pedro Gonnet's avatar
Pedro Gonnet committed
428
                    
429
430
431
432
433
                    }

                } /* loop over the parts in cj. */

            } /* loop over the parts in ci. */
434
            
435
        }
436
        
437
438
    /* Parts are on the right. */
    else {
439
    
440
441
442
443
444
445
446
447
        /* Loop over the parts_i. */
        for ( pid = 0 ; pid < count ; pid++ ) {

            /* Get a hold of the ith part in ci. */
            pi = &parts_i[ ind[ pid ] ];
            for ( k = 0 ; k < 3 ; k++ )
                pix[k] = pi->x[k] - shift[k];
            hi = pi->h;
448
449
            hig2 = hi * hi * kernel_gamma2;
            di = -hi*kernel_gamma - dxj + pix[0]*runner_shift[ 3*sid + 0 ] + pix[1]*runner_shift[ 3*sid + 1 ] + pix[2]*runner_shift[ 3*sid + 2 ];
450
451
452
453
454

            /* Loop over the parts in cj. */
            for ( pjd = count_j-1 ; pjd >= 0 && di < sort_j[ pjd ].d ; pjd-- ) {

                /* Get a pointer to the jth particle. */
Pedro Gonnet's avatar
Pedro Gonnet committed
455
                pj = &parts_j[ sort_j[ pjd ].i ];
456
457
458
459

                /* Compute the pairwise distance. */
                r2 = 0.0f;
                for ( k = 0 ; k < 3 ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
460
                    dx[k] = pix[k] - pj->x[k];
461
462
463
464
                    r2 += dx[k]*dx[k];
                    }

                /* Hit or miss? */
465
                if ( r2 < hig2 ) {
466

Pedro Gonnet's avatar
Pedro Gonnet committed
467
468
                    #ifndef VECTORIZE
                        
Pedro Gonnet's avatar
Pedro Gonnet committed
469
                        IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );
Pedro Gonnet's avatar
Pedro Gonnet committed
470
471
472
473
474
475
476
477
478
                    
                    #else
                    
                        /* Add this interaction to the queue. */
                        r2q[icount] = r2;
                        dxq[3*icount+0] = dx[0];
                        dxq[3*icount+1] = dx[1];
                        dxq[3*icount+2] = dx[2];
                        hiq[icount] = hi;
Pedro Gonnet's avatar
Pedro Gonnet committed
479
                        hjq[icount] = pj->h;
Pedro Gonnet's avatar
Pedro Gonnet committed
480
                        piq[icount] = pi;
Pedro Gonnet's avatar
Pedro Gonnet committed
481
                        pjq[icount] = pj;
Pedro Gonnet's avatar
Pedro Gonnet committed
482
483
484
485
486
487
488
489
490
                        icount += 1;
                        
                        /* Flush? */
                        if ( icount == VEC_SIZE ) {
                            IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
                            icount = 0;
                            }

                    #endif
491
492
493
494
495
496
497
498

                    }

                } /* loop over the parts in cj. */

            } /* loop over the parts in ci. */
            
        }
499
        
Pedro Gonnet's avatar
Pedro Gonnet committed
500
501
502
503
504
505
506
    #ifdef VECTORIZE
    /* Pick up any leftovers. */
    if ( icount > 0 )
        for ( k = 0 ; k < icount ; k++ )
            IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
    #endif
        
507
    #ifdef TIMER_VERBOSE
508
        printf( "runner_dopair_subset[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
509
    #else
510
        TIMER_TOC(timer_dopair_subset);
511
512
513
514
515
516
    #endif


    }


Pedro Gonnet's avatar
Pedro Gonnet committed
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci.
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param parts_i The #parts to interact with @c cj.
 * @param ind The list of indices of particles in @c ci to interact with.
 * @param count The number of particles in @c ind.
 * @param cj The second #cell.
 */
 
void DOPAIR_SUBSET_NAIVE ( struct runner *r , struct cell *restrict ci , struct part *restrict parts_i , int *restrict ind , int count , struct cell *restrict cj ) {

    struct engine *e = r->e;
    int pid, pjd, k, count_j = cj->count;
    double shift[3] = { 0.0 , 0.0 , 0.0 };
    struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
    double pix[3];
    float dx[3], hi, hig2, r2;
    #ifdef VECTORIZE
        int icount = 0;
        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
    #endif
    TIMER_TIC
    
    /* Get the relative distance between the pairs, wrapping. */
    for ( k = 0 ; k < 3 ; k++ ) {
        if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 )
            shift[k] = e->s->dim[k];
        else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 )
            shift[k] = -e->s->dim[k];
        }
        
    /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
        ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] ,
        ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
    tic = getticks(); */
    
    /* Loop over the parts_i. */
    for ( pid = 0 ; pid < count ; pid++ ) {

        /* Get a hold of the ith part in ci. */
        pi = &parts_i[ ind[ pid ] ];
        for ( k = 0 ; k < 3 ; k++ )
            pix[k] = pi->x[k] - shift[k];
        hi = pi->h;
        hig2 = hi * hi * kernel_gamma2;

        /* Loop over the parts in cj. */
        for ( pjd = 0 ; pjd < count_j ; pjd++ ) {

            /* Get a pointer to the jth particle. */
            pj = &parts_j[ pjd ];

            /* Compute the pairwise distance. */
            r2 = 0.0f;
            for ( k = 0 ; k < 3 ; k++ ) {
                dx[k] = pix[k] - pj->x[k];
                r2 += dx[k]*dx[k];
                }

            /* Hit or miss? */
            if ( r2 < hig2 ) {

                #ifndef VECTORIZE

                    IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );

                #else

                    /* Add this interaction to the queue. */
                    r2q[icount] = r2;
                    dxq[3*icount+0] = dx[0];
                    dxq[3*icount+1] = dx[1];
                    dxq[3*icount+2] = dx[2];
                    hiq[icount] = hi;
                    hjq[icount] = pj->h;
                    piq[icount] = pi;
                    pjq[icount] = pj;
                    icount += 1;

                    /* Flush? */
                    if ( icount == VEC_SIZE ) {
                        IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
                        icount = 0;
                        }

                #endif

                }

            } /* loop over the parts in cj. */

        } /* loop over the parts in ci. */

    #ifdef VECTORIZE
    /* Pick up any leftovers. */
    if ( icount > 0 )
        for ( k = 0 ; k < icount ; k++ )
            IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
    #endif
        
    #ifdef TIMER_VERBOSE
        printf( "runner_dopair_subset[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
    #else
        TIMER_TOC(timer_dopair_subset);
    #endif


    }


634
635
636
637
638
639
/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci.
 *
 * @param r The #runner.
 * @param ci The first #cell.
640
 * @param parts The #parts to interact.
641
642
643
644
 * @param ind The list of indices of particles in @c ci to interact with.
 * @param count The number of particles in @c ind.
 */
 
645
void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *restrict parts , int *restrict ind , int count ) {
646
647

    int pid, pjd, k, count_i = ci->count;
Pedro Gonnet's avatar
Pedro Gonnet committed
648
649
    struct part *restrict parts_j = ci->parts;
    struct part *restrict pi, *restrict pj;
650
    double pix[3] = {0.0,0.0,0.0};
651
    float dx[3], hi, hig2, r2;
Pedro Gonnet's avatar
Pedro Gonnet committed
652
653
654
655
656
657
658
659
    #ifdef VECTORIZE
        int icount = 0;
        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
    #endif
660
661
662
663
664
665
666
667
668
669
670
671
    TIMER_TIC
    
    /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
        ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] ,
        ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
    tic = getticks(); */
    
    /* Loop over the parts in ci. */
    for ( pid = 0 ; pid < count ; pid++ ) {
    
        /* Get a hold of the ith part in ci. */
        pi = &parts[ ind[ pid ] ];
672
673
674
        pix[0] = pi->x[0];
        pix[1] = pi->x[1];
        pix[2] = pi->x[2];
675
        hi = pi->h;
676
        hig2 = hi * hi * kernel_gamma2;
677
678
679
680
681
        
        /* Loop over the parts in cj. */
        for ( pjd = 0 ; pjd < count_i ; pjd++ ) {
        
            /* Get a pointer to the jth particle. */
Pedro Gonnet's avatar
Pedro Gonnet committed
682
            pj = &parts_j[ pjd ];
683
684
685
686
            
            /* Compute the pairwise distance. */
            r2 = 0.0f;
            for ( k = 0 ; k < 3 ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
687
                dx[k] = pix[k] - pj->x[k];
688
689
690
691
                r2 += dx[k]*dx[k];
                }
                
            /* Hit or miss? */
692
            if ( r2 > 0.0f && r2 < hig2 ) {
693
            
Pedro Gonnet's avatar
Pedro Gonnet committed
694
695
                #ifndef VECTORIZE

Pedro Gonnet's avatar
Pedro Gonnet committed
696
                    IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );
Pedro Gonnet's avatar
Pedro Gonnet committed
697
698
699
700
701
702
703
704
705

                #else

                    /* Add this interaction to the queue. */
                    r2q[icount] = r2;
                    dxq[3*icount+0] = dx[0];
                    dxq[3*icount+1] = dx[1];
                    dxq[3*icount+2] = dx[2];
                    hiq[icount] = hi;
Pedro Gonnet's avatar
Pedro Gonnet committed
706
                    hjq[icount] = pj->h;
Pedro Gonnet's avatar
Pedro Gonnet committed
707
                    piq[icount] = pi;
Pedro Gonnet's avatar
Pedro Gonnet committed
708
                    pjq[icount] = pj;
Pedro Gonnet's avatar
Pedro Gonnet committed
709
710
711
712
713
714
715
716
717
                    icount += 1;

                    /* Flush? */
                    if ( icount == VEC_SIZE ) {
                        IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
                        icount = 0;
                        }

                #endif
718
719
720
721
722
723
724
            
                }
        
            } /* loop over the parts in cj. */
    
        } /* loop over the parts in ci. */
        
Pedro Gonnet's avatar
Pedro Gonnet committed
725
726
727
728
729
730
731
    #ifdef VECTORIZE
    /* Pick up any leftovers. */
    if ( icount > 0 )
        for ( k = 0 ; k < icount ; k++ )
            IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
    #endif
        
732
    #ifdef TIMER_VERBOSE
733
        printf( "runner_doself_subset[%02i]: %i/%i parts at depth %i took %.3f ms.\n" , r->id , count , ci->count , ci->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 );
734
    #else
735
        TIMER_TOC(timer_dopair_subset);
736
737
738
739
740
741
    #endif


    }


742
743
744
745
746
747
748
749
/**
 * @brief Compute the interactions between a cell pair.
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param cj The second #cell.
 */
 
750
void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) {
751

752
    struct engine *restrict e = r->e;
753
754
    int pid, pjd, k, sid;
    double rshift, shift[3] = { 0.0 , 0.0 , 0.0 };
755
    struct entry *restrict sort_i, *restrict sort_j;
Pedro Gonnet's avatar
Pedro Gonnet committed
756
    struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
757
    double pix[3], pjx[3], di, dj;
758
    float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
Pedro Gonnet's avatar
Pedro Gonnet committed
759
760
    double hi_max, hj_max;
    double di_max, dj_min;
761
    int count_i, count_j;
762
    float dt_step = e->dt_step;
Pedro Gonnet's avatar
Pedro Gonnet committed
763
764
765
766
767
768
769
770
    #ifdef VECTORIZE
        int icount = 0;
        float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
        float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
        float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
        float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
        struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
    #endif
771
772
    TIMER_TIC
    
773
    /* Anything to do here? */
774
    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
775
        return;
776
        
777
778
    /* Get the sort ID. */
    sid = space_getsid( e->s , &ci , &cj , shift );
779
    
780
781
782
783
    /* Have the cells been sorted? */
    if ( !(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid) ) )
        error( "Trying to interact unsorted cells." );
    
784
785
786
787
788
789
790
791
792
    /* Get the cutoff shift. */
    for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ )
        rshift += shift[k]*runner_shift[ 3*sid + k ];
        
    /* Pick-out the sorted lists. */
    sort_i = &ci->sort[ sid*(ci->count + 1) ];
    sort_j = &cj->sort[ sid*(cj->count + 1) ];
    
    /* Get some other useful values. */
793
    hi_max = ci->h_max*kernel_gamma - rshift; hj_max = cj->h_max*kernel_gamma;
794
795
796
797
    count_i = ci->count; count_j = cj->count;
    parts_i = ci->parts; parts_j = cj->parts;
    di_max = sort_i[count_i-1].d - rshift;
    dj_min = sort_j[0].d;
798
    dx_max = ( ci->dx_max + cj->dx_max );
799
    
800

801
    /* Loop over the parts in ci. */
802
    for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min ; pid-- ) {
803
804
    
        /* Get a hold of the ith part in ci. */
Pedro Gonnet's avatar
Pedro Gonnet committed
805
        pi = &parts_i[ sort_i[ pid ].i ];
Pedro Gonnet's avatar
Pedro Gonnet committed
806
        if ( pi->dt > dt_step )
807
            continue;
Pedro Gonnet's avatar
Pedro Gonnet committed
808
        hi = pi->h;
809
        di = sort_i[pid].d + hi*kernel_gamma + dx_max - rshift;
810
811
812
        if ( di < dj_min )
            continue;
            
813
        hig2 = hi * hi * kernel_gamma2;
814
        for ( k = 0 ; k < 3 ; k++ )
Pedro Gonnet's avatar
Pedro Gonnet committed
815
            pix[k] = pi->x[k] - shift[k];
816
817
818
819
820
        
        /* Loop over the parts in cj. */
        for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d < di ; pjd++ ) {
        
            /* Get a pointer to the jth particle. */
Pedro Gonnet's avatar
Pedro Gonnet committed
821
            pj = &parts_j[ sort_j[pjd].i ];
822
823
        
            /* Compute the pairwise distance. */
Pedro Gonnet's avatar
Pedro Gonnet committed
824
            r2 = 0.0f;
825
            for ( k = 0 ; k < 3 ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
826
                dx[k] = pix[k] - pj->x[k];
827
828
829
830
                r2 += dx[k]*dx[k];
                }
                
            /* Hit or miss? */
831
            if ( r2 < hig2 ) {
832
            
Pedro Gonnet's avatar
Pedro Gonnet committed
833
834
                #ifndef VECTORIZE
                        
Pedro Gonnet's avatar
Pedro Gonnet committed
835
                    IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );
Pedro Gonnet's avatar
Pedro Gonnet committed
836
837
838
839
840
841
842
843
844
                
                #else

                    /* Add this interaction to the queue. */
                    r2q[icount] = r2;
                    dxq[3*icount+0] = dx[0];
                    dxq[3*icount+1] = dx[1];
                    dxq[3*icount+2] = dx[2];
                    hiq[icount] = hi;
Pedro Gonnet's avatar
Pedro Gonnet committed
845
                    hjq[icount] = pj->h;
Pedro Gonnet's avatar
Pedro Gonnet committed
846
                    piq[icount] = pi;
Pedro Gonnet's avatar
Pedro Gonnet committed
847
                    pjq[icount] = pj;
Pedro Gonnet's avatar
Pedro Gonnet committed
848
849
850
851
                    icount += 1;

                    /* Flush? */
                    if ( icount == VEC_SIZE ) {
852
                        IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
Pedro Gonnet's avatar
Pedro Gonnet committed
853
854
855
856
                        icount = 0;
                        }

                #endif
857
858
859
860
861
862
863
864
865
866
867
            
                }
        
            } /* loop over the parts in cj. */
    
        } /* loop over the parts in ci. */
        
    /* printf( "runner_dopair: first half took %.3f ms...\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 );
    tic = getticks(); */

    /* Loop over the parts in cj. */
868
    for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max ; pjd++ ) {
869
870
    
        /* Get a hold of the jth part in cj. */
Pedro Gonnet's avatar
Pedro Gonnet committed
871
        pj = &parts_j[ sort_j[ pjd ].i ];
Pedro Gonnet's avatar
Pedro Gonnet committed
872
        if ( pj->dt > dt_step )
873
            continue;
Pedro Gonnet's avatar
Pedro Gonnet committed
874
        hj = pj->h;
875
        dj = sort_j[pjd].d - hj*kernel_gamma - dx_max - rshift;
876
877
878
879
        if ( dj > di_max )
            continue;
            
        for ( k = 0 ; k < 3 ; k++ )
Pedro Gonnet's avatar
Pedro Gonnet committed
880
            pjx[k] = pj->x[k] + shift[k];
881
        hjg2 = hj * hj * kernel_gamma2;
882
883
884
885
886
        
        /* Loop over the parts in ci. */
        for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d > dj ; pid-- ) {
        
            /* Get a pointer to the jth particle. */
Pedro Gonnet's avatar
Pedro Gonnet committed
887
            pi = &parts_i[ sort_i[pid].i ];
888
889
            
            /* Compute the pairwise distance. */
Pedro Gonnet's avatar
Pedro Gonnet committed
890
            r2 = 0.0f;
891
            for ( k = 0 ; k < 3 ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
892
                dx[k] = pjx[k] - pi->x[k];
893
894
895
896
                r2 += dx[k]*dx[k];
                }
                
            /* Hit or miss? */
897
            if ( r2 < hjg2 ) {
898
            
Pedro Gonnet's avatar
Pedro Gonnet committed
899
900
                #ifndef VECTORIZE
                        
Pedro Gonnet's avatar
Pedro Gonnet committed
901
                    IACT_NONSYM( r2 , dx , hj , pi->h , pj , pi );
Pedro Gonnet's avatar
Pedro Gonnet committed
902
903
904
905
906
907
908
909
910
                
                #else

                    /* Add this interaction to the queue. */
                    r2q[icount] = r2;
                    dxq[3*icount+0] = dx[0];
                    dxq[3*icount+1] = dx[1];
                    dxq[3*icount+2] = dx[2];
                    hiq[icount] = hj;
Pedro Gonnet's avatar
Pedro Gonnet committed
911
                    hjq[icount] = pi->h;
Pedro Gonnet's avatar
Pedro Gonnet committed
912
                    piq[icount] = pj;
Pedro Gonnet's avatar
Pedro Gonnet committed
913
                    pjq[icount] = pi;
Pedro Gonnet's avatar
Pedro Gonnet committed
914
915
916
917
                    icount += 1;

                    /* Flush? */
                    if ( icount == VEC_SIZE ) {
918
                        IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
Pedro Gonnet's avatar
Pedro Gonnet committed
919
920
921
922
                        icount = 0;
                        }

                #endif
923
924
925
926
927
928
929
            
                }
        
            } /* loop over the parts in cj. */
    
        } /* loop over the parts in ci. */

Pedro Gonnet's avatar
Pedro Gonnet committed
930
931
932
933
    #ifdef VECTORIZE
    /* Pick up any leftovers. */
    if ( icount > 0 )
        for ( k = 0 ; k < icount ; k++ )
934
            IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
Pedro Gonnet's avatar
Pedro Gonnet committed
935
936
    #endif
        
937
    #ifdef TIMER_VERBOSE
Pedro Gonnet's avatar
Pedro Gonnet committed
938
        printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 );
939
    #else
940
        TIMER_TOC(TIMER_DOPAIR);
941
942
943
944
945
    #endif

    }


946
void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) {
947

948
949
950
951
952
953
954
955
    struct engine *restrict e = r->e;
    int pid, pjd, k, sid;
    double rshift, shift[3] = { 0.0 , 0.0 , 0.0 };
    struct entry *restrict sort_i, *restrict sort_j;
    struct entry *restrict sortdt_i = NULL, *restrict sortdt_j = NULL;
    int countdt_i = 0, countdt_j = 0;
    struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
    double pix[3], pjx[3], di, dj;
956
    float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
957
958
959
    double hi_max, hj_max;
    double di_max, dj_min;
    int count_i, count_j;
960
    float dt_step = e->dt_step;
Pedro Gonnet's avatar
Pedro Gonnet committed
961
    #ifdef VECTORIZE
962
963
964
965
966
967
968
969
970
971
972
973
        int icount1 = 0;
        float r2q1[VEC_SIZE] __attribute__ ((aligned (16)));
        float hiq1[VEC_SIZE] __attribute__ ((aligned (16)));
        float hjq1[VEC_SIZE] __attribute__ ((aligned (16)));
        float dxq1[3*VEC_SIZE] __attribute__ ((aligned (16)));
        struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE];
        int icount2 = 0;
        float r2q2[VEC_SIZE] __attribute__ ((aligned (16)));
        float hiq2[VEC_SIZE] __attribute__ ((aligned (16)));
        float hjq2[VEC_SIZE] __attribute__ ((aligned (16)));
        float dxq2[3*VEC_SIZE] __attribute__ ((aligned (16)));
        struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
Pedro Gonnet's avatar
Pedro Gonnet committed
974
    #endif
975
976
    TIMER_TIC
    
977
    /* Anything to do here? */
978
    if ( ci->dt_min > dt_step && cj->dt_min > dt_step )
979
980
        return;
        
981
982
    /* Get the shift ID. */
    sid = space_getsid( e->s , &ci , &cj , shift );
983
    
984
985
986
987
    /* Have the cells been sorted? */
    if ( !(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid) ) )
        error( "Trying to interact unsorted cells." );
    
988
989
990
991
992
993
994
995
996
    /* Get the cutoff shift. */
    for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ )
        rshift += shift[k]*runner_shift[ 3*sid + k ];
        
    /* Pick-out the sorted lists. */
    sort_i = &ci->sort[ sid*(ci->count + 1) ];
    sort_j = &cj->sort[ sid*(cj->count + 1) ];
    
    /* Get some other useful values. */
997
    hi_max = ci->h_max*kernel_gamma - rshift; hj_max = cj->h_max*kernel_gamma;
998
999
1000
1001
    count_i = ci->count; count_j = cj->count;
    parts_i = ci->parts; parts_j = cj->parts;
    di_max = sort_i[count_i-1].d - rshift;
    dj_min = sort_j[0].d;
1002
    dx_max = ( ci->dx_max + cj->dx_max );
1003
1004
    
    /* Collect the number of parts left and right below dt. */
1005
    if ( ci->dt_max <= dt_step ) {
1006
1007
1008
        sortdt_i = sort_i;
        countdt_i = count_i;
        }
1009
    else if ( ci->dt_min <= dt_step ) {
1010
1011
1012
        if ( ( sortdt_i = (struct entry *)alloca( sizeof(struct entry) * count_i ) ) == NULL )
            error( "Failed to allocate dt sortlists." );
        for ( k = 0 ; k < count_i ; k++ )
Pedro Gonnet's avatar
Pedro Gonnet committed
1013
            if ( parts_i[ sort_i[ k ].i ].dt <= dt_step ) {
1014
1015
1016
1017
                sortdt_i[ countdt_i ] = sort_i[k];
                countdt_i += 1;
                }
        }
1018
    if ( cj->dt_max <= dt_step ) {
1019
1020
1021
        sortdt_j = sort_j;
        countdt_j = count_j;
        }
1022
    else if ( cj->dt_min <= dt_step ) {
1023
1024
1025
        if ( ( sortdt_j = (struct entry *)alloca( sizeof(struct entry) * count_j ) ) == NULL )
            error( "Failed to allocate dt sortlists." );
        for ( k = 0 ; k < count_j ; k++ )
Pedro Gonnet's avatar
Pedro Gonnet committed
1026
            if ( parts_j[ sort_j[ k ].i ].dt <= dt_step ) {
1027
1028
1029
1030
1031
1032
                sortdt_j[ countdt_j ] = sort_j[k];
                countdt_j += 1;
                }
        }
    
    /* Loop over the parts in ci. */
1033
    for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min ; pid-- ) {
1034
1035
1036
    
        /* Get a hold of the ith part in ci. */
        pi = &parts_i[ sort_i[ pid ].i ];
Pedro Gonnet's avatar
Pedro Gonnet committed
1037
        hi = pi->h;
1038
        di = sort_i[pid].d + hi*kernel_gamma + dx_max - rshift;
1039
1040
1041
        if ( di < dj_min )
            continue;
            
1042
        hig2 = hi * hi * kernel_gamma2;
1043
        for ( k = 0 ; k < 3 ; k++ )
Pedro Gonnet's avatar
Pedro Gonnet committed
1044
            pix[k] = pi->x[k] - shift[k];
1045
            
1046
        /* Look at valid dt parts only? */
Pedro Gonnet's avatar
Pedro Gonnet committed
1047
        if ( pi->dt > dt_step ) {
1048
        
1049
1050
            /* Loop over the parts in cj within dt. */
            for ( pjd = 0 ; pjd < countdt_j && sortdt_j[pjd].d < di ; pjd++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
1051

1052
                /* Get a pointer to the jth particle. */
Pedro Gonnet's avatar
Pedro Gonnet committed
1053
1054
                pj = &parts_j[ sortdt_j[pjd].i ];
                hj = pj->h;
Pedro Gonnet's avatar
Pedro Gonnet committed
1055

1056
1057
1058
                /* Compute the pairwise distance. */
                r2 = 0.0f;
                for ( k = 0 ; k < 3 ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
1059
                    dx[k] = pj->x[k] - pix[k];
1060
1061
1062
1063
                    r2 += dx[k]*dx[k];
                    }

                /* Hit or miss? */
1064
                if ( r2 < hig2 ) {
1065
1066
1067

                    #ifndef VECTORIZE

Pedro Gonnet's avatar
Pedro Gonnet committed
1068
                        IACT_NONSYM( r2 , dx , hj , hi , pj , pi );
1069
1070
1071
1072

                    #else

                        /* Add this interaction to the queue. */
1073
1074
1075
1076
                        r2q1[icount1] = r2;
                        dxq1[3*icount1+0] = dx[0];
                        dxq1[3*icount1+1] = dx[1];
                        dxq1[3*icount1+2] = dx[2];
1077
                        hiq1[icount1] = hj;
1078
                        hjq1[icount1] = hi;
Pedro Gonnet's avatar
Pedro Gonnet committed
1079
                        piq1[icount1] = pj;
1080
1081
                        pjq1[icount1] = pi;
                        icount1 += 1;
1082
1083

                        /* Flush? */
1084
1085
1086
                        if ( icount1 == VEC_SIZE ) {
                            IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 );
                            icount1 = 0;
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
                            }

                    #endif

                    }
        
                } /* loop over the parts in cj. */
                
            }
            
        /* Otherwise, look at all parts. */
        else {
        
            /* Loop over the parts in cj. */
            for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d < di ; pjd++ ) {

                /* Get a pointer to the jth particle. */
Pedro Gonnet's avatar
Pedro Gonnet committed
1104
1105
                pj = &parts_j[ sort_j[pjd].i ];
                hj = pj->h;
1106
1107
1108
1109

                /* Compute the pairwise distance. */
                r2 = 0.0f;
                for ( k = 0 ; k < 3 ; k++ ) {
Pedro Gonnet's avatar
Pedro Gonnet committed
1110
                    dx[k] = pix[k] - pj->x[k];
1111
1112
1113
1114
                    r2 += dx[k]*dx[k];
                    }

                /* Hit or miss? */
1115
                if ( r2 < hig2 ) {
1116
1117
1118

                    #ifndef VECTORIZE

1119
                        /* Does pj need to be updated too? */
Pedro Gonnet's avatar
Pedro Gonnet committed
1120
1121
                        if ( pj->dt <= dt_step )
                            IACT( r2 , dx , hi , hj , pi , pj );
1122
                        else
Pedro Gonnet's avatar
Pedro Gonnet committed
1123
                            IACT_NONSYM( r2 , dx , hi , hj , pi , pj );
1124
1125
1126

                    #else

1127
                        /* Does pj need to be updated too? */
Pedro Gonnet's avatar
Pedro Gonnet committed
1128
                        if ( pj->dt <= dt_step ) {
1129
1130
1131
1132
1133
1134
1135
                        
                            /* Add this interaction to the symmetric queue. */
                            r2q2[icount2] = r2;
                            dxq2[3*icount2+0] = dx[0];
                            dxq2[3*icount2+1] = dx[1];
                            dxq2[3*icount2+2] = dx[2];
                            hiq2[icount2] = hi;
1136
                            hjq2[icount2] = hj;
1137
                            piq2[icount2] = pi;